xref: /aosp_15_r20/external/libdav1d/src/x86/mc16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
32*c0909341SAndroid Build Coastguard Worker
; Byte-index tables named for the horizontal subpel filters (the consumers are
; outside this view). Every 16-byte row lists four overlapping 4-byte windows
; that advance by 2 bytes (e.g. 0-3, 2-5, 4-7, 6-9); each row starts at a
; different base offset.
; spel_h_shufA and spel_h_shufB define 32 bytes each, spel_h_shufC and
; spel_h_shufD define 64 bytes each, so a 64-byte read at A (or B) continues
; into the first two rows of C (or D).
; NOTE(review): the A, C, B, D ordering looks deliberate for that reason --
; confirm at the use sites before reordering or inserting data here.
33*c0909341SAndroid Build Coastguard Workerspel_h_shufA:  db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
34*c0909341SAndroid Build Coastguard Worker               db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
35*c0909341SAndroid Build Coastguard Workerspel_h_shufC:  db  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
36*c0909341SAndroid Build Coastguard Worker               db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
37*c0909341SAndroid Build Coastguard Worker               db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
38*c0909341SAndroid Build Coastguard Worker               db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
39*c0909341SAndroid Build Coastguard Workerspel_h_shufB:  db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
40*c0909341SAndroid Build Coastguard Worker               db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
41*c0909341SAndroid Build Coastguard Workerspel_h_shufD:  db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
42*c0909341SAndroid Build Coastguard Worker               db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
43*c0909341SAndroid Build Coastguard Worker               db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
44*c0909341SAndroid Build Coastguard Worker               db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
; Byte-index tables named for the vertical subpel filters (consumers are
; outside this view). Both interleave 2-byte elements taken from two
; positions a fixed distance apart:
;   spel_v_shuf8  pairs the element at byte offset k with the one at k+16
;   spel_v_shuf16 pairs the element at byte offset k with the one at k+32
; Each table is 64 bytes (four 16-byte rows).
45*c0909341SAndroid Build Coastguard Workerspel_v_shuf8:  db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
46*c0909341SAndroid Build Coastguard Worker               db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
47*c0909341SAndroid Build Coastguard Worker               db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
48*c0909341SAndroid Build Coastguard Worker               db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
49*c0909341SAndroid Build Coastguard Workerspel_v_shuf16: db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
50*c0909341SAndroid Build Coastguard Worker               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
51*c0909341SAndroid Build Coastguard Worker               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
52*c0909341SAndroid Build Coastguard Worker               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
; Three 64-byte byte-index tables. All of them pick bytes 1 and 2 of every
; 4-byte group (1,2, 5,6, 9,10, ...), i.e. bits 8..23 of each dword, over the
; index range 0..127. They differ only in the order in which the 8-byte
; output groups are drawn:
;   prep_endA  strictly ascending (1..126)
;   prep_endB  alternates 8-byte groups between offsets n and n+32
;   prep_endC  alternates 8-byte groups between offsets n and n+64
; NOTE(review): indices >= 64 presumably address a second 64-byte source of a
; two-source byte permute -- confirm at the use sites (outside this view).
53*c0909341SAndroid Build Coastguard Workerprep_endA:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
54*c0909341SAndroid Build Coastguard Worker               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
55*c0909341SAndroid Build Coastguard Worker               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
56*c0909341SAndroid Build Coastguard Worker               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
57*c0909341SAndroid Build Coastguard Workerprep_endB:     db  1,  2,  5,  6,  9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
58*c0909341SAndroid Build Coastguard Worker               db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
59*c0909341SAndroid Build Coastguard Worker               db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
60*c0909341SAndroid Build Coastguard Worker               db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
61*c0909341SAndroid Build Coastguard Workerprep_endC:     db  1,  2,  5,  6,  9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
62*c0909341SAndroid Build Coastguard Worker               db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
63*c0909341SAndroid Build Coastguard Worker               db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
64*c0909341SAndroid Build Coastguard Worker               db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
; Byte-index tables for the subpel code paths (consumers are outside this
; view). Most entries select bytes 1-2 of a dword (bits 8..23) and interleave
; such 2-byte values taken from two different source positions; a few rows of
; the "b" variants start with bytes 2-3 of a dword instead (e.g. 50, 51 or
; 18, 19).
; Sizes: spel_shuf4a/4b/8a/8b/16/32 are 64 bytes each. spel_h_shuf2b is 48
; bytes and is immediately followed by the 16-byte spel_shuf2, so the two
; together occupy 64 bytes. spel_h_shuf2a is 32 bytes.
; NOTE(review): indices >= 64 presumably address a second source register of
; a two-source byte permute -- confirm at the use sites.
65*c0909341SAndroid Build Coastguard Workerspel_shuf4a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
66*c0909341SAndroid Build Coastguard Worker               db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
67*c0909341SAndroid Build Coastguard Worker               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
68*c0909341SAndroid Build Coastguard Worker               db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
69*c0909341SAndroid Build Coastguard Workerspel_shuf4b:   db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
70*c0909341SAndroid Build Coastguard Worker               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
71*c0909341SAndroid Build Coastguard Worker               db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
72*c0909341SAndroid Build Coastguard Worker               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
73*c0909341SAndroid Build Coastguard Workerspel_shuf8a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
74*c0909341SAndroid Build Coastguard Worker               db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
75*c0909341SAndroid Build Coastguard Worker               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
76*c0909341SAndroid Build Coastguard Worker               db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
77*c0909341SAndroid Build Coastguard Workerspel_shuf8b:   db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
78*c0909341SAndroid Build Coastguard Worker               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
79*c0909341SAndroid Build Coastguard Worker               db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
80*c0909341SAndroid Build Coastguard Worker               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
81*c0909341SAndroid Build Coastguard Workerspel_shuf16:   db  1,  2, 33, 34,  5,  6, 37, 38,  9, 10, 41, 42, 13, 14, 45, 46
82*c0909341SAndroid Build Coastguard Worker               db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
83*c0909341SAndroid Build Coastguard Worker               db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
84*c0909341SAndroid Build Coastguard Worker               db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
85*c0909341SAndroid Build Coastguard Workerspel_shuf32:   db  1,  2, 65, 66,  5,  6, 69, 70,  9, 10, 73, 74, 13, 14, 77, 78
86*c0909341SAndroid Build Coastguard Worker               db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
87*c0909341SAndroid Build Coastguard Worker               db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
88*c0909341SAndroid Build Coastguard Worker               db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
89*c0909341SAndroid Build Coastguard Workerspel_h_shuf2b: db  1,  2, 17, 18,  5,  6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
90*c0909341SAndroid Build Coastguard Worker               db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50,  9, 10, 53, 54, 13, 14
91*c0909341SAndroid Build Coastguard Worker               db  9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
92*c0909341SAndroid Build Coastguard Workerspel_shuf2:    db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
93*c0909341SAndroid Build Coastguard Workerspel_h_shuf2a: db  0,  1,  2,  3,  2,  3,  4,  5, 16, 17, 18, 19, 18, 19, 20, 21
94*c0909341SAndroid Build Coastguard Worker               db  4,  5,  6,  7,  6,  7,  8,  9, 20, 21, 22, 23, 22, 23, 24, 25
; Byte-index tables named for the w_mask functions (consumers are outside
; this view).
;   w_mask_end42x  32 entries: byte 1 of every dword (1, 5, 9, ... 125)
;   w_mask_end444  64 entries: every even byte 0..126, in ascending order
;   w_mask_shuf4/8/16  64 entries each: the same even bytes (the low byte of
;                  every 16-bit word), but with the 2-entry pairs reordered:
;                  shuf4 pairs index k with k+8, shuf8 with k+16 and shuf16
;                  with k+32.
; NOTE(review): indices >= 64 presumably address a second source register --
; confirm at the use sites.
95*c0909341SAndroid Build Coastguard Workerw_mask_end42x: db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
96*c0909341SAndroid Build Coastguard Worker               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
97*c0909341SAndroid Build Coastguard Workerw_mask_end444: db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
98*c0909341SAndroid Build Coastguard Worker               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
99*c0909341SAndroid Build Coastguard Worker               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
100*c0909341SAndroid Build Coastguard Worker               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
101*c0909341SAndroid Build Coastguard Workerw_mask_shuf4:  db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
102*c0909341SAndroid Build Coastguard Worker               db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
103*c0909341SAndroid Build Coastguard Worker               db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
104*c0909341SAndroid Build Coastguard Worker               db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
105*c0909341SAndroid Build Coastguard Workerw_mask_shuf8:  db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
106*c0909341SAndroid Build Coastguard Worker               db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
107*c0909341SAndroid Build Coastguard Worker               db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
108*c0909341SAndroid Build Coastguard Worker               db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
109*c0909341SAndroid Build Coastguard Workerw_mask_shuf16: db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
110*c0909341SAndroid Build Coastguard Worker               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
111*c0909341SAndroid Build Coastguard Worker               db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
112*c0909341SAndroid Build Coastguard Worker               db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
; Byte-index tables named for the 8x8 warp function (consumers are outside
; this view), 64 bytes each.
;   warp8x8_permA  4-byte windows taken alternately from offset k and k+32,
;                  with k advancing by 2 bytes per pair (k = 0, 2, ... 14)
;   warp8x8_permB  the same pattern continued for k = 12, 14, ... 26; its
;                  first row is identical to the last row of permA
;   warp8x8_end    gathers 2-byte elements at a stride of 4 bytes within each
;                  16-byte group (0-1, 4-5, 16-17, 20-21, ...)
113*c0909341SAndroid Build Coastguard Workerwarp8x8_permA: db  0,  1,  2,  3, 32, 33, 34, 35,  2,  3,  4,  5, 34, 35, 36, 37
114*c0909341SAndroid Build Coastguard Worker               db  4,  5,  6,  7, 36, 37, 38, 39,  6,  7,  8,  9, 38, 39, 40, 41
115*c0909341SAndroid Build Coastguard Worker               db  8,  9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
116*c0909341SAndroid Build Coastguard Worker               db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
117*c0909341SAndroid Build Coastguard Workerwarp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
118*c0909341SAndroid Build Coastguard Worker               db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
119*c0909341SAndroid Build Coastguard Worker               db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
120*c0909341SAndroid Build Coastguard Worker               db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
121*c0909341SAndroid Build Coastguard Workerwarp8x8_end:   db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
122*c0909341SAndroid Build Coastguard Worker               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
123*c0909341SAndroid Build Coastguard Worker               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
124*c0909341SAndroid Build Coastguard Worker               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; LAYOUT-SENSITIVE: deint_q_shuf emits no data of its own (its dq list is
; commented out), so it labels the same address as pd_0to7. Read as eight
; qwords starting there, the data is
;   pd_0to7 (0|1, 2|3, 4|5, 6|7), {dd 1, pw_2048}, {dd 3, pw_8192},
;   avg_shift, {pw_27615, pw_32766}
; whose low three bits are 0, 2, 4, 6, 1, 3, 5, 7 -- exactly the commented-out
; list (27615 = 0x6BDF, low bits 111b = 7). The bare "dd 1" / "dd 3" fillers
; and the order of pw_2048, pw_8192, avg_shift and pw_27615 therefore must
; not be changed.
; NOTE(review): this relies on the consumer of deint_q_shuf reading only the
; low index bits of each qword (as a qword permute would) -- confirm at the
; use site, which is outside this view.
125*c0909341SAndroid Build Coastguard Workerdeint_q_shuf: ;dq  0,  2,  4,  6,  1,  3,  5,  7
126*c0909341SAndroid Build Coastguard Workerpd_0to7:       dd  0,  1,  2,  3,  4,  5,  6,  7
127*c0909341SAndroid Build Coastguard Worker               dd  1
128*c0909341SAndroid Build Coastguard Workerpw_2048:       times 2 dw 2048
129*c0909341SAndroid Build Coastguard Worker               dd  3
130*c0909341SAndroid Build Coastguard Workerpw_8192:       times 2 dw 8192
131*c0909341SAndroid Build Coastguard Workeravg_shift:     dw  5,  5,  3,  3
132*c0909341SAndroid Build Coastguard Workerpw_27615:      times 2 dw 27615
133*c0909341SAndroid Build Coastguard Workerpw_32766:      times 2 dw 32766
; warp8x8_permC/D, resize_shufA/B: every even position is -1 (0xFF) and every
; odd position is a byte index.
; NOTE(review): with pshufb-style shuffles an index with the top bit set
; zeroes the destination byte, which would place each selected byte in the
; high half of a 16-bit word -- confirm that this is how they are consumed.
134*c0909341SAndroid Build Coastguard Workerwarp8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
135*c0909341SAndroid Build Coastguard Workerwarp8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
136*c0909341SAndroid Build Coastguard Workerwarp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
; blend_shuf: replicates the 2-byte element 0-1 four times, then element 2-3
; four times.
137*c0909341SAndroid Build Coastguard Workerblend_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
; resize_permA/B hold 16 dword indices each, resize_permC/D 8 qword indices
; each and resize_permE 4 qword indices.
138*c0909341SAndroid Build Coastguard Workerresize_permA:  dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
139*c0909341SAndroid Build Coastguard Workerresize_permB:  dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
140*c0909341SAndroid Build Coastguard Workerresize_permC:  dq  0,  1,  4,  5,  8,  9, 12, 13
141*c0909341SAndroid Build Coastguard Workerresize_permD:  dq  2,  3,  6,  7, 10, 11, 14, 15
142*c0909341SAndroid Build Coastguard Workerresize_permE:  dq  0,  2,  4,  6
143*c0909341SAndroid Build Coastguard Workerresize_shufA:  db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
144*c0909341SAndroid Build Coastguard Workerresize_shufB:  db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
; rescale_mul: the dword sequence 0..15.
145*c0909341SAndroid Build Coastguard Workerrescale_mul:   dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
; resize_shuf: the first row begins with the 2-byte element 0-1 repeated four
; times, the second row ends with element 14-15 repeated four times; in
; between the bytes run 0..15 in order.
; NOTE(review): presumably read at a variable offset to replicate edge
; pixels -- confirm at the use site.
146*c0909341SAndroid Build Coastguard Workerresize_shuf:   db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
147*c0909341SAndroid Build Coastguard Worker               db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
148*c0909341SAndroid Build Coastguard Worker
; Rounding, shift and scale constants. Most are stored as two variants back
; to back. put_bilin_h_rnd is indexed in the .h path of put_bilin_16bpc with
; (bitdepth_max >> 11) * 4, so its first dword (8, 8) is used when that shift
; yields 0 and its second dword (10, 10) when it yields 1.
; NOTE(review): the other two-entry constants presumably use the same
; first/second layout keyed on bit depth -- confirm at their use sites, which
; are outside this view.
; The first dword of prep_mul (16, 16) doubles as pw_16 and the first dword of
; warp_8x8_rnd_h (512) doubles as pd_512 (see the %define aliases in this
; file), so those leading values must stay in place.
149*c0909341SAndroid Build Coastguard Workerprep_hv_shift:    dq  6,  4
150*c0909341SAndroid Build Coastguard Workerput_bilin_h_rnd:  dw  8,  8, 10, 10
151*c0909341SAndroid Build Coastguard Workerprep_mul:         dw 16, 16,  4,  4
152*c0909341SAndroid Build Coastguard Workerput_8tap_h_rnd:   dd 34, 40
153*c0909341SAndroid Build Coastguard Workerprep_8tap_rnd:    dd 128 - (8192 << 8)
154*c0909341SAndroid Build Coastguard Workerwarp_8x8_rnd_h:   dd 512, 2048
155*c0909341SAndroid Build Coastguard Workerwarp_8x8_rnd_v:   dd 262144, 65536
156*c0909341SAndroid Build Coastguard Workerwarp_8x8t_rnd_v:  dd 16384 - (8192 << 15)
157*c0909341SAndroid Build Coastguard Workeravg_round:        dw -16400, -16400, -16388, -16388
158*c0909341SAndroid Build Coastguard Workerw_avg_round:      dd 128 + (8192 << 4),  32 + (8192 << 4)
159*c0909341SAndroid Build Coastguard Workermask_round:       dd 512 + (8192 << 6), 128 + (8192 << 6)
160*c0909341SAndroid Build Coastguard Workerw_mask_round:     dd 128, 64
161*c0909341SAndroid Build Coastguard Workerbidir_shift:      dw  6,  6,  4,  4
162*c0909341SAndroid Build Coastguard Worker
; Small scalar constants. Every entry except pd_0_4 is exactly one dword
; (4 x byte, 2 x word or 1 x dword); pd_0_4 is two dwords.
; NOTE(review): the one-dword size presumably exists so each can be loaded
; with vpbroadcastd, as put_bilin_16bpc does for pw_16 -- the consumers of
; these particular labels are outside this view.
163*c0909341SAndroid Build Coastguard Workerpb_64:    times 4 db 64
164*c0909341SAndroid Build Coastguard Workerpw_m512:  times 2 dw -512
165*c0909341SAndroid Build Coastguard Workerpw_2:     times 2 dw 2
166*c0909341SAndroid Build Coastguard Workerpw_64:    times 2 dw 64
167*c0909341SAndroid Build Coastguard Workerpd_32:    dd 32
168*c0909341SAndroid Build Coastguard Workerpd_63:    dd 63
169*c0909341SAndroid Build Coastguard Workerpd_128:   dd 128
170*c0909341SAndroid Build Coastguard Workerpd_640:   dd 640
171*c0909341SAndroid Build Coastguard Workerpd_2176:  dd 2176
172*c0909341SAndroid Build Coastguard Workerpd_16384: dd 16384
173*c0909341SAndroid Build Coastguard Workerpd_0_4:   dd 0, 4
174*c0909341SAndroid Build Coastguard Worker
; Aliases that reuse existing data instead of emitting duplicates:
;   pw_16  -> prep_mul,       whose first dword is the word pair 16, 16
;   pd_512 -> warp_8x8_rnd_h, whose first dword is 512
; Both are only valid for a 4-byte (single dword) read at the label.
175*c0909341SAndroid Build Coastguard Worker%define pw_16 prep_mul
176*c0909341SAndroid Build Coastguard Worker%define pd_512 warp_8x8_rnd_h
177*c0909341SAndroid Build Coastguard Worker
; BASE_JMP_TABLE name, suffix, w0 [, w1 ...]
;
; Emits a table of 16-bit offsets, one per listed width, each computed as
;   <name>_<suffix>_w<width> - <name>_<suffix>
; <name>_<suffix> is itself a macro in this file (e.g. put_avx512icl is
; %xdefine'd to a mangled label), so the entries end up relative to that
; label.
;
; It also defines <name>_<suffix>_table as (table start - w0), i.e. the
; table address biased downwards by the first width, in bytes. The .put
; dispatch in put_bilin_16bpc indexes the table with tzcnt(w)*2; that lands
; on entry 0 for the first width only because 2*tzcnt(w0) == w0 holds for
; w0 = 2 and w0 = 4, the two first widths used in this file.
;
; The %rep body consumes one width per iteration: %rotate 1 shifts the
; parameter list so that the next width becomes parameter 3.
178*c0909341SAndroid Build Coastguard Worker%macro BASE_JMP_TABLE 3-*
179*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - %3)
180*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%2
181*c0909341SAndroid Build Coastguard Worker    %%table:
182*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
183*c0909341SAndroid Build Coastguard Worker        dw %%base %+ _w%3 - %%base
184*c0909341SAndroid Build Coastguard Worker        %rotate 1
185*c0909341SAndroid Build Coastguard Worker    %endrep
186*c0909341SAndroid Build Coastguard Worker%endmacro
187*c0909341SAndroid Build Coastguard Worker
; HV_JMP_TABLE fn, filter, suffix, types, w0 [, w1 ...]
;
;   fn      function family (put / prep in this file)
;   filter  filter name (bilin / 6tap / 8tap in this file)
;   suffix  ISA suffix (avx512icl in this file)
;   types   bit mask selecting which tables to emit: 1 = h, 2 = v, 4 = hv
;   w0...   block widths, one table entry per width
;
; For every selected type T the macro emits a table of 16-bit offsets
;   mangle(private_prefix_<fn>_<filter>_16bpc_<suffix>).T_w<width>
;       - <fn>_<suffix>
; and defines <fn>_<filter>_T_<suffix>_table as (table start - w0), the same
; first-width bias that BASE_JMP_TABLE applies. The offsets are relative to
; <fn>_<suffix> (e.g. put_avx512icl), not to the start of the function that
; contains the target labels.
;
; Parameter bookkeeping: each %rep loop rotates the parameter list once per
; width (argument count minus 4 in total); the following "%rotate 4" completes
; a full cycle so that the parameters are back in their original positions
; for the next table. The final (hv) table does not need to restore them.
188*c0909341SAndroid Build Coastguard Worker%macro HV_JMP_TABLE 5-*
189*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
190*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%3
191*c0909341SAndroid Build Coastguard Worker    %assign %%types %4
    ; horizontal-only table
192*c0909341SAndroid Build Coastguard Worker    %if %%types & 1
193*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_h_%3_table  (%%h  - %5)
194*c0909341SAndroid Build Coastguard Worker        %%h:
195*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
196*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .h_w%5 - %%base
197*c0909341SAndroid Build Coastguard Worker            %rotate 1
198*c0909341SAndroid Build Coastguard Worker        %endrep
199*c0909341SAndroid Build Coastguard Worker        %rotate 4
200*c0909341SAndroid Build Coastguard Worker    %endif
    ; vertical-only table
201*c0909341SAndroid Build Coastguard Worker    %if %%types & 2
202*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_v_%3_table  (%%v  - %5)
203*c0909341SAndroid Build Coastguard Worker        %%v:
204*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
205*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .v_w%5 - %%base
206*c0909341SAndroid Build Coastguard Worker            %rotate 1
207*c0909341SAndroid Build Coastguard Worker        %endrep
208*c0909341SAndroid Build Coastguard Worker        %rotate 4
209*c0909341SAndroid Build Coastguard Worker    %endif
    ; combined horizontal+vertical table (last one, so no restoring rotate)
210*c0909341SAndroid Build Coastguard Worker    %if %%types & 4
211*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_hv_%3_table (%%hv - %5)
212*c0909341SAndroid Build Coastguard Worker        %%hv:
213*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
214*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .hv_w%5 - %%base
215*c0909341SAndroid Build Coastguard Worker            %rotate 1
216*c0909341SAndroid Build Coastguard Worker        %endrep
217*c0909341SAndroid Build Coastguard Worker    %endif
218*c0909341SAndroid Build Coastguard Worker%endmacro
219*c0909341SAndroid Build Coastguard Worker
; BIDIR_JMP_TABLE fn, suffix, w0 [, w1 ...]
;
; Emits a table of 32-bit offsets, one per listed width, each computed as
;   mangle(private_prefix_<fn>_16bpc_<suffix>).w<width> - <fn>_<suffix>_table
; Unlike BASE_JMP_TABLE / HV_JMP_TABLE the entries are dwords and are
; relative to the (biased) table symbol itself rather than to a code label.
;
; <fn>_<suffix>_table is defined as (table start - 2*w0), i.e. the table
; address biased downwards by twice the first width, in bytes.
; NOTE(review): presumably this lets callers index with tzcnt(w)*4
; (4*tzcnt(w0) == 2*w0 for w0 = 2 and w0 = 4) -- the consumers are outside
; this view, confirm there.
;
; At least one width is required because parameter 3 is referenced
; unconditionally; the minimum parameter count is therefore 3, matching how
; BASE_JMP_TABLE and HV_JMP_TABLE declare theirs. The %rep body consumes one
; width per iteration and %rotate 1 moves the next width into parameter 3.
220*c0909341SAndroid Build Coastguard Worker%macro BIDIR_JMP_TABLE 3-*
221*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*%3)
222*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%2_table
223*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
224*c0909341SAndroid Build Coastguard Worker    %%table:
225*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
226*c0909341SAndroid Build Coastguard Worker        dd %%prefix %+ .w%3 - %%base
227*c0909341SAndroid Build Coastguard Worker        %rotate 1
228*c0909341SAndroid Build Coastguard Worker    %endrep
229*c0909341SAndroid Build Coastguard Worker%endmacro
230*c0909341SAndroid Build Coastguard Worker
; Base labels for the put/prep jump tables: put_avx512icl and prep_avx512icl
; expand to the mangled names of the local labels .put / .prep inside
; put_bilin_16bpc / prep_bilin_16bpc. BASE_JMP_TABLE and HV_JMP_TABLE store
; their offsets relative to these, and put_bilin_16bpc loads put_avx512icl
; into r7 to turn a table entry back into an address.
231*c0909341SAndroid Build Coastguard Worker%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
232*c0909341SAndroid Build Coastguard Worker%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
233*c0909341SAndroid Build Coastguard Worker
; Jump table instantiations; the trailing numbers are the block widths each
; table has an entry for.
;   BIDIR_JMP_TABLE  32-bit entries targeting .w<N> labels of the named
;                    function
;   BASE_JMP_TABLE   16-bit entries targeting the <name>_avx512icl_w<N>
;                    labels (.put_w<N> / .prep_w<N>)
;   HV_JMP_TABLE     16-bit entries; the 4th argument is the type mask:
;                    7 emits the h, v and hv tables (bilin), 2 emits only
;                    the v table (6tap / 8tap)
234*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE avg,        avx512icl,       4, 8, 16, 32, 64, 128
235*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_avg,      avx512icl,       4, 8, 16, 32, 64, 128
236*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE mask,       avx512icl,       4, 8, 16, 32, 64, 128
237*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_mask_420, avx512icl,       4, 8, 16, 32, 64, 128
238*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_mask_422, avx512icl,       4, 8, 16, 32, 64, 128
239*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_mask_444, avx512icl,       4, 8, 16, 32, 64, 128
240*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE blend,      avx512icl,       4, 8, 16, 32
241*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE blend_v,    avx512icl,    2, 4, 8, 16, 32
242*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE blend_h,    avx512icl,    2, 4, 8, 16, 32, 64, 128
243*c0909341SAndroid Build Coastguard WorkerBASE_JMP_TABLE put,         avx512icl,    2, 4, 8, 16, 32, 64, 128
244*c0909341SAndroid Build Coastguard WorkerBASE_JMP_TABLE prep,        avx512icl,       4, 8, 16, 32, 64, 128
245*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE   put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
246*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE   prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
247*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE   put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
248*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE   put,  8tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
249*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE   prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
250*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE   prep, 8tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
251*c0909341SAndroid Build Coastguard Worker
; table_offset(type, fn): distance from the <type><SUFFIX> base label to the
; <type><fn><SUFFIX>_table symbol. put_bilin_16bpc adds it to a register that
; holds the base label's address (r7) plus the scaled width index to load a
; 16-bit table entry, e.g. [r7+t0*2+table_offset(put, _bilin_h)].
; SUFFIX is not defined in the visible part of this file; presumably it is
; set by INIT_ZMM (x86inc.asm) to _avx512icl -- confirm.
252*c0909341SAndroid Build Coastguard Worker%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
253*c0909341SAndroid Build Coastguard Worker
; mc_subpel_filters is defined in another translation unit. subpel_filters is
; the address of that symbol minus 8 bytes.
; NOTE(review): the -8 bias presumably lets callers use a 1-based index with
; an 8-byte stride without adjusting it first -- confirm at the use sites,
; which are outside this view.
254*c0909341SAndroid Build Coastguard Workercextern mc_subpel_filters
255*c0909341SAndroid Build Coastguard Worker%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
256*c0909341SAndroid Build Coastguard Worker
; Further symbols declared via cextern, i.e. defined outside this file; none
; of them is referenced in the part of the file visible here.
; NOTE(review): obmc_masks_avx2 carries an avx2 suffix although this is the
; AVX-512 file -- presumably the table is shared with the AVX2 code; confirm.
257*c0909341SAndroid Build Coastguard Workercextern mc_warp_filter
258*c0909341SAndroid Build Coastguard Workercextern obmc_masks_avx2
259*c0909341SAndroid Build Coastguard Workercextern resize_filter
260*c0909341SAndroid Build Coastguard Worker
261*c0909341SAndroid Build Coastguard WorkerSECTION .text
262*c0909341SAndroid Build Coastguard Worker
; Selects the register behind the t0 temporary used by put_bilin_16bpc
; (t0d/t0 hold the tzcnt(w) table index and then the jump target).
; DECLARE_REG_TMP comes from x86inc.asm; presumably the argument is the
; x86inc register number, giving t0 = r4 on WIN64 and t0 = r8 elsewhere --
; confirm against x86inc.asm.
263*c0909341SAndroid Build Coastguard Worker%if WIN64
264*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4
265*c0909341SAndroid Build Coastguard Worker%else
266*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8
267*c0909341SAndroid Build Coastguard Worker%endif
268*c0909341SAndroid Build Coastguard Worker
269*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
270*c0909341SAndroid Build Coastguard Workercglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
271*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; mx
272*c0909341SAndroid Build Coastguard Worker    lea                  r7, [put_avx512icl]
273*c0909341SAndroid Build Coastguard Worker    tzcnt               t0d, wm
274*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
275*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
276*c0909341SAndroid Build Coastguard Worker    jnz .h
277*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
278*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
279*c0909341SAndroid Build Coastguard Worker    jnz .v
280*c0909341SAndroid Build Coastguard Worker.put:
281*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [r7+t0*2+table_offset(put,)]
282*c0909341SAndroid Build Coastguard Worker    add                  t0, r7
283*c0909341SAndroid Build Coastguard Worker    jmp                  t0
284*c0909341SAndroid Build Coastguard Worker.put_w2:
285*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [srcq+ssq*0]
286*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [srcq+ssq*1]
287*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
288*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6d
289*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7d
290*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
291*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
292*c0909341SAndroid Build Coastguard Worker    jg .put_w2
293*c0909341SAndroid Build Coastguard Worker    RET
294*c0909341SAndroid Build Coastguard Worker.put_w4:
295*c0909341SAndroid Build Coastguard Worker    mov                  r6, [srcq+ssq*0]
296*c0909341SAndroid Build Coastguard Worker    mov                  r7, [srcq+ssq*1]
297*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
298*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6
299*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7
300*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
301*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
302*c0909341SAndroid Build Coastguard Worker    jg .put_w4
303*c0909341SAndroid Build Coastguard Worker    RET
304*c0909341SAndroid Build Coastguard Worker.put_w8:
305*c0909341SAndroid Build Coastguard Worker    movu               xmm0, [srcq+ssq*0]
306*c0909341SAndroid Build Coastguard Worker    movu               xmm1, [srcq+ssq*1]
307*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
308*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], xmm0
309*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], xmm1
310*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
311*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
312*c0909341SAndroid Build Coastguard Worker    jg .put_w8
313*c0909341SAndroid Build Coastguard Worker    RET
314*c0909341SAndroid Build Coastguard Worker.put_w16:
315*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0]
316*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*1]
317*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
318*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], ym0
319*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], ym1
320*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
321*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
322*c0909341SAndroid Build Coastguard Worker    jg .put_w16
323*c0909341SAndroid Build Coastguard Worker    RET
324*c0909341SAndroid Build Coastguard Worker.put_w32:
325*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
326*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
327*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
328*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
329*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
330*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
331*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
332*c0909341SAndroid Build Coastguard Worker    jg .put_w32
333*c0909341SAndroid Build Coastguard Worker    RET
334*c0909341SAndroid Build Coastguard Worker.put_w64:
335*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+64*0]
336*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+64*1]
337*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+64*0]
338*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+64*1]
339*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
340*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*0], m0
341*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*1], m1
342*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*0], m2
343*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*1], m3
344*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
345*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
346*c0909341SAndroid Build Coastguard Worker    jg .put_w64
347*c0909341SAndroid Build Coastguard Worker    RET
; .put_w128: unfiltered copy of rows that are 4 x 64 bytes wide.
; One row per pass (unaligned loads, aligned stores); srcq and dstq advance
; by a single stride and hd is decremented by one until it reaches zero.
348*c0909341SAndroid Build Coastguard Worker.put_w128:
349*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+64*0]
350*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+64*1]
351*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+64*2]
352*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+64*3]
353*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
354*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
355*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
356*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*2], m2
357*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*3], m3
358*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
359*c0909341SAndroid Build Coastguard Worker    dec                  hd
360*c0909341SAndroid Build Coastguard Worker    jg .put_w128
361*c0909341SAndroid Build Coastguard Worker    RET
; .h: set-up shared by the horizontal-only and the horizontal+vertical paths.
;   m5 = mxyd broadcast to every word (NOTE(review): mxyd presumably holds mx
;        on entry; it is set before this chunk - confirm)
;   m4 = pw_16 - m5 (pw_16 is presumably a vector of 16-bit 16s, making m4/m5
;        a pair of weights that sum to 16)
; mxyd is then reloaded with my from r7m; a non-zero my diverts to .hv.
; Otherwise the width-specific .h_w* routine is reached through the
; put/_bilin_h table of 16-bit offsets relative to r7. t0 is the table index,
; computed before this chunk (presumably from the block width).
;   m6 = rounding term, read from put_bilin_h_rnd[bitdepth_max >> 11]
;        (4-byte entries).
362*c0909341SAndroid Build Coastguard Worker.h:
363*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, mxyd
364*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
365*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_16]
366*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
367*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
368*c0909341SAndroid Build Coastguard Worker    jnz .hv
369*c0909341SAndroid Build Coastguard Worker    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
370*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
371*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; bitdepth_max
372*c0909341SAndroid Build Coastguard Worker    add                  t0, r7
373*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
374*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
375*c0909341SAndroid Build Coastguard Worker    jmp                  t0
; .h_w2: horizontal filter, 2 output words per row, two rows per pass.
; Row 0 is loaded into the low qword of xmm1 and row 1 into the high qword.
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4      (m4/m5/m6 come from .h)
; The literal xmm0/xmm1 registers are scratch; the weights are read through
; the xm4/xm5/xm6 aliases.
376*c0909341SAndroid Build Coastguard Worker.h_w2:
377*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*0]
378*c0909341SAndroid Build Coastguard Worker    movhps             xmm1, [srcq+ssq*1]
379*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
380*c0909341SAndroid Build Coastguard Worker    pmullw             xmm0, xmm1, xm4
381*c0909341SAndroid Build Coastguard Worker    psrlq              xmm1, 16 ; slide px[x+1] into the word slot of px[x], per qword
382*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm5
383*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xm6
384*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
385*c0909341SAndroid Build Coastguard Worker    psrlw              xmm0, 4
386*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
387*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 2 ; dword 2 = first two words of the high qword (row 1)
388*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
389*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
390*c0909341SAndroid Build Coastguard Worker    jg .h_w2
391*c0909341SAndroid Build Coastguard Worker    RET
; .h_w4: horizontal filter, 4 output words per row, two rows per pass.
; xmm0 holds px[x] and xmm1 holds px[x+1] (the same rows re-read 2 bytes
; further on); row 0 sits in the low qword and row 1 in the high qword.
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4      (m4/m5/m6 come from .h)
392*c0909341SAndroid Build Coastguard Worker.h_w4:
393*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0+0]
394*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [srcq+ssq*1+0]
395*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*0+2]
396*c0909341SAndroid Build Coastguard Worker    movhps             xmm1, [srcq+ssq*1+2]
397*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
398*c0909341SAndroid Build Coastguard Worker    pmullw             xmm0, xm4
399*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm5
400*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xm6
401*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
402*c0909341SAndroid Build Coastguard Worker    psrlw              xmm0, 4
403*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
404*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
405*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
406*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
407*c0909341SAndroid Build Coastguard Worker    jg .h_w4
408*c0909341SAndroid Build Coastguard Worker    RET
; .h_w8: horizontal filter, 8 output words (16 bytes) per row, two rows per
; pass. Row 0 occupies the low 128-bit lane and row 1 the high lane of
; ym0 (px[x]) and ym1 (px[x+1], loaded from +2 bytes).
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4      (m4/m5/m6 come from .h)
409*c0909341SAndroid Build Coastguard Worker.h_w8:
410*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+0]
411*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+ssq*1+0], 1
412*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0+2]
413*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [srcq+ssq*1+2], 1
414*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
415*c0909341SAndroid Build Coastguard Worker    pmullw              ym0, ym4
416*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym5
417*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym6
418*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
419*c0909341SAndroid Build Coastguard Worker    psrlw               ym0, 4
420*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
421*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
422*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
423*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
424*c0909341SAndroid Build Coastguard Worker    jg .h_w8
425*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16: horizontal filter, 16 output words (32 bytes) per row, two rows per
; pass. Row 0 occupies the low 256-bit half and row 1 the high half of
; m0 (px[x]) and m1 (px[x+1], loaded from +2 bytes).
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4      (m4/m5/m6 come from .h)
426*c0909341SAndroid Build Coastguard Worker.h_w16:
427*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0+0]
428*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1+0], 1
429*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*0+2]
430*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+ssq*1+2], 1
431*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
432*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
433*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
434*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
435*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
436*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
437*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
438*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
439*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
440*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
441*c0909341SAndroid Build Coastguard Worker    jg .h_w16
442*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal filter, one 64-byte vector per row, two rows per pass.
; The multiplies take their pixel operand straight from memory:
; m0/m1 = px[x]*m4 for rows 0/1, m2/m3 = px[x+1]*m5 (source at +2 bytes).
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4      (m4/m5/m6 come from .h)
443*c0909341SAndroid Build Coastguard Worker.h_w32:
444*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ssq*0+0]
445*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ssq*0+2]
446*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+ssq*1+0]
447*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m5, [srcq+ssq*1+2]
448*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
449*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
450*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
451*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
452*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
453*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
454*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
455*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
456*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
457*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
458*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
459*c0909341SAndroid Build Coastguard Worker    jg .h_w32
460*c0909341SAndroid Build Coastguard Worker    RET
; .h_w64: horizontal filter, two 64-byte vectors per row, one row per pass.
; m0/m1 = px[x]*m4 for the two halves of the row, m2/m3 = px[x+1]*m5.
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4      (m4/m5/m6 come from .h)
; srcq/dstq advance by one stride and hd by one row per pass.
461*c0909341SAndroid Build Coastguard Worker.h_w64:
462*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+64*0+0]
463*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+64*0+2]
464*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+64*1+0]
465*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m5, [srcq+64*1+2]
466*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
467*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
468*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
469*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
470*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
471*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
472*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
473*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
474*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
475*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
476*c0909341SAndroid Build Coastguard Worker    dec                  hd
477*c0909341SAndroid Build Coastguard Worker    jg .h_w64
478*c0909341SAndroid Build Coastguard Worker    RET
; .h_w128: horizontal filter, four 64-byte vectors per row, one row per pass.
; m0-m3 = px[x]*m4 for the four quarters of the row; m7-m10 = px[x+1]*m5
; (m4-m6 are occupied by the weights and rounding term from .h).
;   out[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 4
; REPX is a macro defined outside this chunk; it is expected to emit the
; braced instruction once for each register listed after it.
479*c0909341SAndroid Build Coastguard Worker.h_w128:
480*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+64*0+0]
481*c0909341SAndroid Build Coastguard Worker    pmullw               m7, m5, [srcq+64*0+2]
482*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+64*1+0]
483*c0909341SAndroid Build Coastguard Worker    pmullw               m8, m5, [srcq+64*1+2]
484*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+64*2+0]
485*c0909341SAndroid Build Coastguard Worker    pmullw               m9, m5, [srcq+64*2+2]
486*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+64*3+0]
487*c0909341SAndroid Build Coastguard Worker    pmullw              m10, m5, [srcq+64*3+2]
488*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
489*c0909341SAndroid Build Coastguard Worker    REPX      {paddw x, m6}, m0, m1, m2, m3
490*c0909341SAndroid Build Coastguard Worker    paddw                m0, m7
491*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
492*c0909341SAndroid Build Coastguard Worker    paddw                m2, m9
493*c0909341SAndroid Build Coastguard Worker    paddw                m3, m10
494*c0909341SAndroid Build Coastguard Worker    REPX       {psrlw x, 4}, m0, m1, m2, m3
495*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
496*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
497*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*2], m2
498*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*3], m3
499*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
500*c0909341SAndroid Build Coastguard Worker    dec                  hd
501*c0909341SAndroid Build Coastguard Worker    jg .h_w128
502*c0909341SAndroid Build Coastguard Worker    RET
; .v: set-up for the vertical-only filter.
; Fetches the width-specific entry from the put/_bilin_v table of 16-bit
; offsets (relative to r7) and broadcasts mxyd << 11 to every word of m8.
; With that scale, pmulhrsw(d, m8) = (d*mxyd*2^11 + 2^14) >> 15
;                                  = (d*mxyd + 8) >> 4,
; so the .v_w* loops get a rounded multiply with a 4-bit fraction from a
; single instruction.
; NOTE(review): mxyd presumably holds my here; it is set before this chunk.
503*c0909341SAndroid Build Coastguard Worker.v:
504*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
505*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
506*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, mxyd
507*c0909341SAndroid Build Coastguard Worker    add                  t0, r7
508*c0909341SAndroid Build Coastguard Worker    jmp                  t0
; .v_w2: vertical filter, 2 words (one dword) per row, two output rows per
; pass. xmm0 always holds the newest source row and is carried into the next
; pass, so each source row is loaded only once.
;   xmm2 = {row n, row n+1}, xmm1 = {row n+1, row n+2} as adjacent dwords
;   out  = xmm2 + pmulhrsw(xmm1 - xmm2, m8)          (m8 is set up in .v)
509*c0909341SAndroid Build Coastguard Worker.v_w2:
510*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+ssq*0]
511*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
512*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [srcq+ssq*1]
513*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
514*c0909341SAndroid Build Coastguard Worker    punpckldq          xmm2, xmm0, xmm1
515*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+ssq*0]
516*c0909341SAndroid Build Coastguard Worker    punpckldq          xmm1, xmm0
517*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm2
518*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm8
519*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2
520*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm1
521*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm1, 1
522*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
523*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
524*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
525*c0909341SAndroid Build Coastguard Worker    RET
; .v_w4: vertical filter, 4 words (one qword) per row, two output rows per
; pass. xmm0 always holds the newest source row and is carried into the next
; pass.
;   xmm2 = {row n, row n+1}, xmm1 = {row n+1, row n+2} as low/high qwords
;   out  = xmm2 + pmulhrsw(xmm1 - xmm2, m8)          (m8 is set up in .v)
526*c0909341SAndroid Build Coastguard Worker.v_w4:
527*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0]
528*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
529*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*1]
530*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
531*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xmm2, xmm0, xmm1
532*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0]
533*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xmm1, xmm0
534*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm2
535*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm8
536*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2
537*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm1
538*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm1
539*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
540*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
541*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
542*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical filter, 8 words (16 bytes) per row, two output rows per
; pass, using the literal ymm0-ymm2 registers as scratch.
; Each new row is broadcast to both 128-bit lanes; vpblendd with mask 0xf0
; keeps the low lane of its first source and takes the high lane from the
; second, which yields
;   ymm2 = {row n | row n+1}, ymm1 = {row n+1 | row n+2}
;   out  = ymm2 + pmulhrsw(ymm1 - ymm2, m8)          (m8 is set up in .v)
; ymm0 (low lane) carries the newest row into the next pass.
543*c0909341SAndroid Build Coastguard Worker.v_w8:
544*c0909341SAndroid Build Coastguard Worker    movu               xmm0, [srcq+ssq*0]
545*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
546*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm1, [srcq+ssq*1]
547*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
548*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm0, ymm1, 0xf0
549*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm0, [srcq+ssq*0]
550*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm0, 0xf0
551*c0909341SAndroid Build Coastguard Worker    psubw              ymm1, ymm2
552*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm1, ym8
553*c0909341SAndroid Build Coastguard Worker    paddw              ymm1, ymm2
554*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xmm1
555*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], ymm1, 1
556*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
557*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
558*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
559*c0909341SAndroid Build Coastguard Worker    vzeroupper ; NOTE(review): presumably needed because ymm0-ymm2 are named directly here - confirm against RET's behaviour
560*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical filter, 16 words (32 bytes) per row, two output rows per
; pass. ym0 holds the newest source row and is carried into the next pass;
; ym3 is the row in between.
;   ym1 = ym0(old) + pmulhrsw(ym3 - ym0(old), m8)    -> first output row
;   ym2 = ym3      + pmulhrsw(ym0(new) - ym3, m8)    -> second output row
; m8 is the weight set up in .v.
561*c0909341SAndroid Build Coastguard Worker.v_w16:
562*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0]
563*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
564*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*1]
565*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
566*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym3, ym0
567*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym8
568*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym0
569*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0]
570*c0909341SAndroid Build Coastguard Worker    psubw               ym2, ym0, ym3
571*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym2, ym8
572*c0909341SAndroid Build Coastguard Worker    paddw               ym2, ym3
573*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], ym1
574*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], ym2
575*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
576*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
577*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
578*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32: vertical filter, one 64-byte vector per row, two output rows per
; pass. m0 holds the newest source row and is carried into the next pass;
; m3 is the row in between.
;   m1 = m0(old) + pmulhrsw(m3 - m0(old), m8)        -> first output row
;   m2 = m3      + pmulhrsw(m0(new) - m3, m8)        -> second output row
; m8 is the weight set up in .v.
579*c0909341SAndroid Build Coastguard Worker.v_w32:
580*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
581*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
582*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
583*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
584*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m0
585*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
586*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
587*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
588*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m3
589*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8
590*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
591*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
592*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
593*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
594*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
595*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
596*c0909341SAndroid Build Coastguard Worker    RET
; .v_w64: vertical filter, two 64-byte vectors per row, two output rows per
; pass.
;   m0/m1 = newest source row (carried into the next pass)
;   m2/m3 = the row in between
;   m4/m5 = first output row, m6/m7 = second output row
; Each output is prev + pmulhrsw(next - prev, m8), m8 being the weight set up
; in .v. The reloads of m0/m1 are placed as soon as their old values have
; been consumed.
597*c0909341SAndroid Build Coastguard Worker.v_w64:
598*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+64*0]
599*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+64*1]
600*c0909341SAndroid Build Coastguard Worker.v_w64_loop:
601*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+64*0]
602*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+64*1]
603*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
604*c0909341SAndroid Build Coastguard Worker    psubw                m4, m2, m0
605*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8
606*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
607*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+64*0]
608*c0909341SAndroid Build Coastguard Worker    psubw                m5, m3, m1
609*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m8
610*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
611*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+64*1]
612*c0909341SAndroid Build Coastguard Worker    psubw                m6, m0, m2
613*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m8
614*c0909341SAndroid Build Coastguard Worker    psubw                m7, m1, m3
615*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m8
616*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*0], m4
617*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*1], m5
618*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2
619*c0909341SAndroid Build Coastguard Worker    paddw                m7, m3
620*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*0], m6
621*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*1], m7
622*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
623*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
624*c0909341SAndroid Build Coastguard Worker    jg .v_w64_loop
625*c0909341SAndroid Build Coastguard Worker    RET
; .v_w128: vertical filter, four 64-byte vectors per row, two output rows per
; pass.
;   m0-m3  = newest source row (carried into the next pass)
;   m4-m7  = the row in between
;   m9-m12 = scratch, used first for output row 0 and then reused for row 1
;   m8     = weight set up in .v
; Each output is prev + pmulhrsw(next - prev, m8). The stores of output row 0
; are interleaved with the arithmetic for output row 1 so that m9-m12 can be
; recycled immediately after each store.
626*c0909341SAndroid Build Coastguard Worker.v_w128:
627*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+64*0]
628*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+64*1]
629*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+64*2]
630*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0+64*3]
631*c0909341SAndroid Build Coastguard Worker.v_w128_loop:
632*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1+64*0]
633*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+64*1]
634*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1+64*2]
635*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1+64*3]
636*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
637*c0909341SAndroid Build Coastguard Worker    psubw                m9, m4, m0
638*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m8
639*c0909341SAndroid Build Coastguard Worker    paddw                m9, m0
640*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+64*0]
641*c0909341SAndroid Build Coastguard Worker    psubw               m10, m5, m1
642*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m8
643*c0909341SAndroid Build Coastguard Worker    paddw               m10, m1
644*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+64*1]
645*c0909341SAndroid Build Coastguard Worker    psubw               m11, m6, m2
646*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m8
647*c0909341SAndroid Build Coastguard Worker    paddw               m11, m2
648*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+64*2]
649*c0909341SAndroid Build Coastguard Worker    psubw               m12, m7, m3
650*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m8
651*c0909341SAndroid Build Coastguard Worker    paddw               m12, m3
652*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0+64*3]
653*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*0], m9
654*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0, m4
655*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m8
656*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*1], m10
657*c0909341SAndroid Build Coastguard Worker    psubw               m10, m1, m5
658*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m8
659*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*2], m11
660*c0909341SAndroid Build Coastguard Worker    psubw               m11, m2, m6
661*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m8
662*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*3], m12
663*c0909341SAndroid Build Coastguard Worker    psubw               m12, m3, m7
664*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m8
665*c0909341SAndroid Build Coastguard Worker    paddw                m9, m4
666*c0909341SAndroid Build Coastguard Worker    paddw               m10, m5
667*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*0], m9
668*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*1], m10
669*c0909341SAndroid Build Coastguard Worker    paddw               m11, m6
670*c0909341SAndroid Build Coastguard Worker    paddw               m12, m7
671*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*2], m11
672*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*3], m12
673*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
674*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
675*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop
676*c0909341SAndroid Build Coastguard Worker    RET
; .hv: set-up for the combined horizontal + vertical filter. It is reached
; from .h when my != 0, so m4/m5 already hold the horizontal weights and mxyd
; holds my.
;   m6 = pw_2             rounding term added before the >> 2 that ends the
;                         horizontal pass in the .hv_w* routines
;   m7 = my << 11 (words) vertical weight, consumed by pmulhw
;   m8 = multiplier for the final pmulhrsw
; Bit 0x800 of bitdepth_max (r8m) selects the variant:
;   set   (.hv_12bpc): m4/m5 unchanged, m8 = pw_8192
;   clear            : m4/m5 are multiplied by 4, m8 = pw_2048
; Going by the constant names, pmulhrsw by 8192 amounts to (x + 2) >> 2 and
; by 2048 to (x + 8) >> 4 - i.e. the second path carries two extra fractional
; bits through the vertical pass. NOTE(review): confirm the constant values.
677*c0909341SAndroid Build Coastguard Worker.hv:
678*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
679*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
680*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [pw_2]
681*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, mxyd
682*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_8192]
683*c0909341SAndroid Build Coastguard Worker    add                  t0, r7
684*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
685*c0909341SAndroid Build Coastguard Worker    jnz .hv_12bpc
686*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
687*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
688*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_2048]
689*c0909341SAndroid Build Coastguard Worker.hv_12bpc:
690*c0909341SAndroid Build Coastguard Worker    jmp                  t0
; .hv_w2: horizontal + vertical filter, 2 output words per row, two output
; rows per pass. m4-m8 are prepared in .h / .hv.
; Prologue: row 0 is broadcast to both qwords and filtered horizontally,
;   t[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 2
; Loop: the next two rows go through the same horizontal step into xmm1
; (low qword = first new row, high qword = second new row). shufpd 0x01 then
; pairs the high qword of xmm0 (the previous filtered row) with the low qword
; of xmm1, giving each lane its "row above" in xmm2.
; Vertical step: out = pmulhrsw(prev + pmulhw(2*(cur - prev), m7), m8)
; xmm0 keeps the newly filtered rows for the next pass.
691*c0909341SAndroid Build Coastguard Worker.hv_w2:
692*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       xmm1, [srcq+ssq*0]
693*c0909341SAndroid Build Coastguard Worker    pmullw             xmm0, xmm1, xm4
694*c0909341SAndroid Build Coastguard Worker    psrlq              xmm1, 16
695*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm5
696*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xm6
697*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
698*c0909341SAndroid Build Coastguard Worker    psrlw              xmm0, 2
699*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
700*c0909341SAndroid Build Coastguard Worker    movq               xmm2, [srcq+ssq*1]
701*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
702*c0909341SAndroid Build Coastguard Worker    movhps             xmm2, [srcq+ssq*0]
703*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xmm2, xm4
704*c0909341SAndroid Build Coastguard Worker    psrlq              xmm2, 16
705*c0909341SAndroid Build Coastguard Worker    pmullw             xmm2, xm5
706*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xm6
707*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2
708*c0909341SAndroid Build Coastguard Worker    psrlw              xmm1, 2                ; 1 _ 2 _
709*c0909341SAndroid Build Coastguard Worker    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
710*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xmm1
711*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm2
712*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm1
713*c0909341SAndroid Build Coastguard Worker    pmulhw             xmm1, xm7
714*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2
715*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm8
716*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm1
717*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm1, 2
718*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
719*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
720*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
721*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: horizontal + vertical filter, 4 output words per row, two output
; rows per pass. m4-m8 are prepared in .h / .hv.
; Prologue: the 16-byte loads start 8 bytes *before* srcq (offsets -8 / -6)
; so that row 0 lands in the high qword of xmm0 - the half that shufpd 0x01
; picks up in the loop. The low qword is computed from the bytes in front of
; the row and is never used.
; Horizontal step: t[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 2
; Vertical step:   out  = pmulhrsw(prev + pmulhw(2*(cur - prev), m7), m8)
; xmm0 keeps the newly filtered rows for the next pass.
722*c0909341SAndroid Build Coastguard Worker.hv_w4:
723*c0909341SAndroid Build Coastguard Worker    pmullw             xmm0, xm4, [srcq+ssq*0-8]
724*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm5, [srcq+ssq*0-6]
725*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xm6
726*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
727*c0909341SAndroid Build Coastguard Worker    psrlw              xmm0, 2
728*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
729*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*1+0]
730*c0909341SAndroid Build Coastguard Worker    movq               xmm2, [srcq+ssq*1+2]
731*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
732*c0909341SAndroid Build Coastguard Worker    movhps             xmm1, [srcq+ssq*0+0]
733*c0909341SAndroid Build Coastguard Worker    movhps             xmm2, [srcq+ssq*0+2]
734*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm4
735*c0909341SAndroid Build Coastguard Worker    pmullw             xmm2, xm5
736*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xm6
737*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2
738*c0909341SAndroid Build Coastguard Worker    psrlw              xmm1, 2                ; 1 2
739*c0909341SAndroid Build Coastguard Worker    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 1
740*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xmm1
741*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm2
742*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm1
743*c0909341SAndroid Build Coastguard Worker    pmulhw             xmm1, xm7
744*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2
745*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm8
746*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm1
747*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm1
748*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
749*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
750*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
751*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: horizontal + vertical filter, 8 output words (16 bytes) per row,
; two output rows per pass. m4-m8 are prepared in .h / .hv.
; Prologue: row 0 is filtered horizontally in xmm0 and then copied into the
; upper 128-bit lane of ym0, which is the lane vshufi32x4 0x01 consumes.
; Loop: ym1 = {first new row | second new row} after the horizontal step
;   t[x] = (px[x]*m4 + px[x+1]*m5 + m6) >> 2
; vshufi32x4 0x01 builds ym2 = {upper lane of ym0 | lower lane of ym1}, the
; "row above" for each lane.
; Vertical step: out = pmulhrsw(prev + pmulhw(2*(cur - prev), m7), m8)
; ym0 keeps the newly filtered rows for the next pass.
752*c0909341SAndroid Build Coastguard Worker.hv_w8:
753*c0909341SAndroid Build Coastguard Worker    pmullw             xmm0, xm4, [srcq+ssq*0+0]
754*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm5, [srcq+ssq*0+2]
755*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xm6
756*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
757*c0909341SAndroid Build Coastguard Worker    psrlw              xmm0, 2
758*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, xmm0, 1
759*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
760*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1+0]
761*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*1+2]
762*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
763*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [srcq+ssq*0+0], 1
764*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+ssq*0+2], 1
765*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym4
766*c0909341SAndroid Build Coastguard Worker    pmullw              ym2, ym5
767*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym6
768*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
769*c0909341SAndroid Build Coastguard Worker    psrlw               ym1, 2              ; 1 2
770*c0909341SAndroid Build Coastguard Worker    vshufi32x4          ym2, ym0, ym1, 0x01 ; 0 1
771*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1
772*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym2
773*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym1
774*c0909341SAndroid Build Coastguard Worker    pmulhw              ym1, ym7
775*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
776*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym8
777*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm1
778*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym1, 1
779*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
780*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
781*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
782*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w16: horizontal+vertical bilinear path for rows of 16 16-bit samples
; (one 256-bit half per row); each loop iteration emits two output rows.
; NOTE(review): m4/m5 (multipliers for sample x and x+1), m6 (bias added before
; the >>2), m7 (vertical multiplier) and m8 (final pmulhrsw scale) are set up
; before this chunk of the file - confirm their values there.
783*c0909341SAndroid Build Coastguard Worker.hv_w16:
784*c0909341SAndroid Build Coastguard Worker    pmullw              ym0, ym4, [srcq+ssq*0+0]
785*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym5, [srcq+ssq*0+2]
786*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym6
787*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
788*c0909341SAndroid Build Coastguard Worker    psrlw               ym0, 2
; Put the h-filtered row 0 in the upper 256 bits, where the loop's
; vshufi32x4 (q1032) picks up the "previous" row.
789*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym0, 1
790*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
; Load the next two rows into the low/high 256-bit halves (at +0 and +2
; bytes, i.e. samples x and x+1) and h-filter both rows at once.
791*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*1+0]
792*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+ssq*1+2]
793*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
794*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+ssq*0+0], 1
795*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+ssq*0+2], 1
796*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
797*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
798*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
799*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
800*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2             ; 1 2
801*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m1, q1032 ; 0 1
; Keep rows {1,2} for the next iteration; its upper half becomes the new
; "previous" row.
802*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
; Vertical step: prev + high16(2*(cur - prev) * m7), then pmulhrsw by m8.
803*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
804*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
805*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m7
806*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
807*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
; Low half -> first output row, high half -> second output row.
808*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym1
809*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m1, 1
810*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
811*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
812*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
813*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w32/.hv_w64/.hv_w128: horizontal+vertical bilinear path for wide blocks.
; The block is processed in vertical strips of 32 samples (64 bytes, one zmm
; per row); the inner loop emits two rows per iteration and the outer loop
; steps to the next strip.
; NOTE(review): m4/m5 (multipliers for sample x and x+1), m6 (bias added before
; the >>2), m7 (vertical multiplier) and m8 (final pmulhrsw scale) are set up
; before this chunk of the file - confirm their values there.
814*c0909341SAndroid Build Coastguard Worker.hv_w32:
815*c0909341SAndroid Build Coastguard Worker.hv_w64:
816*c0909341SAndroid Build Coastguard Worker.hv_w128:
817*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
; r6d packs both loop counters: w*8-256 == (w/32 - 1) << 8, so bits 8 and up
; count the strips still to do after the current one, while the low byte
; keeps h so it can be restored per strip (this relies on h < 256).
818*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq*8-256]
; r4/r7 remember the top of the current source/destination strip.
819*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
820*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
821*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0:
; Prime m0 with the h-filtered first row of this strip.
822*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ssq*0+0]
823*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+ssq*0+2]
824*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
825*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
826*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
827*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
; m3 = h-filtered row n+1; m1 = vertical blend of rows n (m0) and n+1 (m3).
828*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+ssq*1+0]
829*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+ssq*1+2]
830*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
831*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
832*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
833*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
834*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m0
835*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
836*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m7
837*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
; m0 = h-filtered row n+2 (also the "previous" row of the next iteration);
; m2 = vertical blend of rows n+1 (m3) and n+2 (m0).
838*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ssq*0+0]
839*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ssq*0+2]
840*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
841*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
842*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
843*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m3
844*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
845*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m7
846*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
847*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
848*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8
849*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
850*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
851*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
852*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
853*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
; Next strip: advance both base pointers by 64 bytes, restore h from the low
; byte of r6d and decrement the strip count held in the upper bits.
854*c0909341SAndroid Build Coastguard Worker    add                  r4, 64
855*c0909341SAndroid Build Coastguard Worker    add                  r7, 64
856*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
857*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
858*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
859*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
860*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop0
861*c0909341SAndroid Build Coastguard Worker    RET
862*c0909341SAndroid Build Coastguard Worker
; prep_bilin_16bpc: entry point and dispatch.
; Named arguments (from the cglobal line): tmp, src, stride, w, h, mxy, stride3.
; Per x86inc convention the numbers 3, 7, 16 are presumably: arguments loaded
; into registers on entry, GPRs used, vector registers used - confirm against
; x86inc.asm.
; Dispatch: mx != 0 -> .h (which itself branches to .hv when my != 0),
;           mx == 0 and my != 0 -> .v,
;           otherwise fall through to .prep (no filtering).
863*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
864*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r5m ; mx
; r6 = base address that the per-width jump-table offsets are relative to.
865*c0909341SAndroid Build Coastguard Worker    lea                  r6, [prep_avx512icl]
; wd = number of trailing zero bits of the width; used as the table index.
866*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
867*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
868*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
869*c0909341SAndroid Build Coastguard Worker    jnz .h
870*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
871*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
872*c0909341SAndroid Build Coastguard Worker    jnz .v
; .prep: unfiltered path. Every width variant computes out = px * m4 - m5.
873*c0909341SAndroid Build Coastguard Worker.prep:
; Fetch the 16-bit table entry for this width; adding r6 below turns it into
; the absolute address of the matching .prep_w* label.
874*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep,)]
875*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m ; bitdepth_max
; m5 = broadcast of the pw_8192 constant (subtracted from every product).
876*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [r6-prep_avx512icl+pw_8192]
877*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
; r5d = bitdepth_max >> 11 selects the prep_mul entry (0 for a 10-bit maximum
; of 0x3ff, 1 for a 12-bit maximum of 0xfff). The prep_mul values themselves
; are defined outside this chunk.
878*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
879*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [r6-prep_avx512icl+prep_mul+r5*4]
880*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
881*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .prep_w4: unfiltered prep for rows of 4 16-bit samples (8 bytes per row).
; Four rows are gathered into one ymm per iteration and written to tmp as
; px * m4 - m5 (m4/m5 are loaded in .prep).
882*c0909341SAndroid Build Coastguard Worker.prep_w4:
; k1 = 0b1100: write-mask covering qword elements 2 and 3 of a ymm.
883*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x0c
884*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
885*c0909341SAndroid Build Coastguard Worker.prep_w4_loop:
; Rows 0 and 1 go to the low and high qword of xm0.
886*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*0]
887*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+strideq*1]
; Row 2 is broadcast into ym1; the masked unpack then interleaves it with the
; broadcast row 3 and writes only qwords 2 and 3, so ym0 = rows 0 1 2 3.
888*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym1, [srcq+strideq*2]
889*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq     ym0{k1}, ym1, [srcq+stride3q] {1to4}
890*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
891*c0909341SAndroid Build Coastguard Worker    pmullw              ym0, ym4
892*c0909341SAndroid Build Coastguard Worker    psubw               ym0, ym5
893*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym0
; 4 rows * 4 samples * 2 bytes = 32 bytes of output per iteration.
894*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
895*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
896*c0909341SAndroid Build Coastguard Worker    jg .prep_w4_loop
897*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w8: unfiltered prep for rows of 8 16-bit samples (16 bytes per row).
; Four rows are packed into the four 128-bit lanes of m0 per iteration and
; written to tmp as px * m4 - m5 (m4/m5 are loaded in .prep).
898*c0909341SAndroid Build Coastguard Worker.prep_w8:
899*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
900*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+strideq*1], 1
901*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+strideq*2], 2
902*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+stride3q ], 3
903*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
904*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
905*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
906*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
; 4 rows * 16 bytes = 64 bytes of output per iteration.
907*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
908*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
909*c0909341SAndroid Build Coastguard Worker    jg .prep_w8
910*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w16: unfiltered prep for rows of 16 16-bit samples (32 bytes per row).
; Four rows per iteration: rows 0/1 share m0, rows 2/3 share m1; each is
; written to tmp as px * m4 - m5 (m4/m5 are loaded in .prep).
911*c0909341SAndroid Build Coastguard Worker.prep_w16:
912*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
913*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*1], 1
914*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+strideq*2]
915*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+stride3q ], 1
916*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
917*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
918*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
919*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
920*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
921*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
922*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
; 4 rows * 32 bytes = 128 bytes of output per iteration.
923*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
924*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
925*c0909341SAndroid Build Coastguard Worker    jg .prep_w16
926*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w32: unfiltered prep for rows of 32 16-bit samples (64 bytes, exactly
; one zmm per row). Four rows per iteration, each written to tmp as
; px * m4 - m5 (m4/m5 are loaded in .prep).
927*c0909341SAndroid Build Coastguard Worker.prep_w32:
; The multiply takes the source row directly as its memory operand.
928*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0]
929*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*1]
930*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+strideq*2]
931*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+stride3q ]
932*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
933*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m5}, m0, m1, m2, m3
934*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
935*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
936*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
937*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
938*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
939*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
940*c0909341SAndroid Build Coastguard Worker    jg .prep_w32
941*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w64: unfiltered prep for rows of 64 16-bit samples (128 bytes, two
; zmm per row). Two rows per iteration, each written to tmp as px * m4 - m5
; (m4/m5 are loaded in .prep).
942*c0909341SAndroid Build Coastguard Worker.prep_w64:
; m0/m1 = left/right half of row 0, m2/m3 = left/right half of row 1.
943*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0+64*0]
944*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*0+64*1]
945*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+strideq*1+64*0]
946*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+strideq*1+64*1]
947*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
948*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m5}, m0, m1, m2, m3
949*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
950*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
951*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
952*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
953*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
954*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
955*c0909341SAndroid Build Coastguard Worker    jg .prep_w64
956*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w128: unfiltered prep for rows of 128 16-bit samples (256 bytes, four
; zmm per row). One row per iteration, written to tmp as px * m4 - m5
; (m4/m5 are loaded in .prep).
957*c0909341SAndroid Build Coastguard Worker.prep_w128:
958*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+64*0]
959*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+64*1]
960*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+64*2]
961*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+64*3]
962*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
963*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m5}, m0, m1, m2, m3
964*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
965*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
966*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
967*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
968*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
969*c0909341SAndroid Build Coastguard Worker    dec                  hd
970*c0909341SAndroid Build Coastguard Worker    jg .prep_w128
971*c0909341SAndroid Build Coastguard Worker    RET
; .h: setup for the horizontal filter (reached when mx != 0).
; Leaves m5 = mx and m4 = 16 - mx as per-word multipliers for samples x+1 and
; x, and m6 = broadcast of pw_32766 (subtracted by every .h_w* variant before
; the final >> 2). Branches to .hv when my is also non-zero, otherwise jumps
; through the prep/_bilin_h table to the matching .h_w* label.
972*c0909341SAndroid Build Coastguard Worker.h:
973*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, mxyd
974*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
975*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_16]
976*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [pw_32766]
977*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
; r7m is bitdepth_max. When bit 0x800 is clear (not the 12-bit case) both
; multipliers are scaled by 4; the 12-bit case skips the scaling.
978*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
979*c0909341SAndroid Build Coastguard Worker    jnz .h_12bpc
980*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
981*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
982*c0909341SAndroid Build Coastguard Worker.h_12bpc:
983*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
984*c0909341SAndroid Build Coastguard Worker    jnz .hv
; Horizontal-only: look up the 16-bit table entry for this width and jump.
985*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
986*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
987*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
988*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w4: horizontal-only prep for rows of 4 16-bit samples, four rows per
; iteration. out = (px[x]*m4 - m6 + px[x+1]*m5) >> 2 (arithmetic shift), with
; m4/m5/m6 as set up in .h.
989*c0909341SAndroid Build Coastguard Worker.h_w4:
; ym1 = rows 0 and 2, ym2 = rows 1 and 3 (16 bytes loaded from each row), so
; that the per-lane unpack below yields rows in order 0 1 | 2 3.
990*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0]
991*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [srcq+strideq*2], 1
992*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*1]
993*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+stride3q ], 1
994*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
; ym0 = samples x..x+3 of each row.
995*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym0, ym1, ym2
; Shift every lane right by 2 bytes (one sample) to obtain samples x+1..x+4.
996*c0909341SAndroid Build Coastguard Worker    psrldq              ym1, 2
997*c0909341SAndroid Build Coastguard Worker    psrldq              ym2, 2
998*c0909341SAndroid Build Coastguard Worker    pmullw              ym0, ym4
999*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym1, ym2
1000*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym5
1001*c0909341SAndroid Build Coastguard Worker    psubw               ym0, ym6
1002*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
1003*c0909341SAndroid Build Coastguard Worker    psraw               ym0, 2
1004*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym0
1005*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1006*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1007*c0909341SAndroid Build Coastguard Worker    jg .h_w4
1008*c0909341SAndroid Build Coastguard Worker    RET
; .h_w8: horizontal-only prep for rows of 8 16-bit samples, four rows per
; iteration (one row per 128-bit lane). out = (px[x]*m4 - m6 + px[x+1]*m5) >> 2
; (arithmetic shift), with m4/m5/m6 as set up in .h.
1009*c0909341SAndroid Build Coastguard Worker.h_w8:
; m0 collects samples x of rows 0..3; m1 collects samples x+1 (loads offset
; by 2 bytes).
1010*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+0]
1011*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0+2]
1012*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+strideq*1+0], 1
1013*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [srcq+strideq*1+2], 1
1014*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+strideq*2+0], 2
1015*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+strideq*2+2], 2
1016*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+stride3q +0], 3
1017*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+stride3q +2], 3
1018*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1019*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
1020*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
1021*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1022*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1023*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1024*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
1025*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1026*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1027*c0909341SAndroid Build Coastguard Worker    jg .h_w8
1028*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16: horizontal-only prep for rows of 16 16-bit samples, two rows per
; iteration (one row per 256-bit half). out = (px[x]*m4 - m6 + px[x+1]*m5) >> 2
; (arithmetic shift), with m4/m5/m6 as set up in .h.
1029*c0909341SAndroid Build Coastguard Worker.h_w16:
; m0 = samples x of rows 0 and 1; m1 = samples x+1 (loads offset by 2 bytes).
1030*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0+0]
1031*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*1+0], 1
1032*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+strideq*0+2]
1033*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+strideq*1+2], 1
1034*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1035*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
1036*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
1037*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1038*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1039*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1040*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
1041*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1042*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1043*c0909341SAndroid Build Coastguard Worker    jg .h_w16
1044*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal-only prep for rows of 32 16-bit samples (one zmm per
; row), two rows per iteration. out = (px[x]*m4 - m6 + px[x+1]*m5) >> 2
; (arithmetic shift), with m4/m5/m6 as set up in .h.
1045*c0909341SAndroid Build Coastguard Worker.h_w32:
; m0/m1 = px[x]*m4 for rows 0/1; m2/m3 = px[x+1]*m5 for rows 0/1.
1046*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0+0]
1047*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+strideq*0+2]
1048*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*1+0]
1049*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m5, [srcq+strideq*1+2]
1050*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1051*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1052*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
1053*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1054*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1055*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1056*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1057*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1058*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1059*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1060*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1061*c0909341SAndroid Build Coastguard Worker    jg .h_w32
1062*c0909341SAndroid Build Coastguard Worker    RET
; .h_w64: horizontal-only prep for rows of 64 16-bit samples (two zmm per
; row), one row per iteration. out = (px[x]*m4 - m6 + px[x+1]*m5) >> 2
; (arithmetic shift), with m4/m5/m6 as set up in .h.
1063*c0909341SAndroid Build Coastguard Worker.h_w64:
; m0/m1 = px[x]*m4 for the left/right half; m2/m3 = px[x+1]*m5 (loads offset
; by 2 bytes).
1064*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ 0]
1065*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ 2]
1066*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+64]
1067*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m5, [srcq+66]
1068*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1069*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1070*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
1071*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1072*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1073*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1074*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1075*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1076*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1077*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1078*c0909341SAndroid Build Coastguard Worker    dec                  hd
1079*c0909341SAndroid Build Coastguard Worker    jg .h_w64
1080*c0909341SAndroid Build Coastguard Worker    RET
; .h_w128: horizontal-only prep for rows of 128 16-bit samples (four zmm per
; row), one row per iteration. out = (px[x]*m4 - m6 + px[x+1]*m5) >> 2
; (arithmetic shift), with m4/m5/m6 as set up in .h.
1081*c0909341SAndroid Build Coastguard Worker.h_w128:
; m0..m3 = px[x]*m4 for the four 64-byte segments; m7..m10 are scratch
; registers holding the matching px[x+1]*m5 products.
1082*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+  0]
1083*c0909341SAndroid Build Coastguard Worker    pmullw               m7, m5, [srcq+  2]
1084*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+ 64]
1085*c0909341SAndroid Build Coastguard Worker    pmullw               m8, m5, [srcq+ 66]
1086*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+128]
1087*c0909341SAndroid Build Coastguard Worker    pmullw               m9, m5, [srcq+130]
1088*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+192]
1089*c0909341SAndroid Build Coastguard Worker    pmullw              m10, m5, [srcq+194]
1090*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1091*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m6}, m0, m1, m2, m3
1092*c0909341SAndroid Build Coastguard Worker    paddw                m0, m7
1093*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
1094*c0909341SAndroid Build Coastguard Worker    paddw                m2, m9
1095*c0909341SAndroid Build Coastguard Worker    paddw                m3, m10
1096*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m0, m1, m2, m3
1097*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1098*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1099*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
1100*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
1101*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1102*c0909341SAndroid Build Coastguard Worker    dec                  hd
1103*c0909341SAndroid Build Coastguard Worker    jg .h_w128
1104*c0909341SAndroid Build Coastguard Worker    RET
; .v: setup for the vertical-only filter (reached when mx == 0 and my != 0).
; Leaves m9 = my and m8 = 16 - my as per-word multipliers for rows y+1 and y,
; and m10 = broadcast of pw_32766 (subtracted by every .v_w* variant before
; the final >> 2), then jumps through the prep/_bilin_v table to the matching
; .v_w* label.
1105*c0909341SAndroid Build Coastguard Worker.v:
1106*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
; mxyd holds my here (loaded from r6m by the entry code before jumping to .v).
1107*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, mxyd
1108*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_16]
1109*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pw_32766]
1110*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1111*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1112*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9
; r7m is bitdepth_max. When bit 0x800 is clear (not the 12-bit case) both
; multipliers are scaled by 4; the 12-bit case skips the scaling.
1113*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
1114*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
1115*c0909341SAndroid Build Coastguard Worker    psllw                m8, 2
1116*c0909341SAndroid Build Coastguard Worker    psllw                m9, 2
1117*c0909341SAndroid Build Coastguard Worker.v_12bpc:
1118*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .v_w4: vertical-only prep for rows of 4 16-bit samples (8 bytes per row),
; four output rows per iteration.
; out = (row[y]*m8 - m10 + row[y+1]*m9) >> 2 (arithmetic shift), with
; m8/m9/m10 as set up in .v. xmm0 carries the last loaded row into the next
; iteration so each source row is read once.
; This path addresses registers 0-3 by their native xmm/ymm names and issues
; an explicit vzeroupper before returning.
1119*c0909341SAndroid Build Coastguard Worker.v_w4:
1120*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+strideq*0]
1121*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
; Broadcast rows 1, 2 and 3, then blend so ymm2 holds row 1 in qword 1,
; row 2 in qword 2 and row 3 in qword 3.
1122*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       xmm2, [srcq+strideq*1]
1123*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm1, [srcq+strideq*2]
1124*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+stride3q ]
1125*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1126*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm1, 0x30
1127*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm3, 0xc0
1128*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
; Load row 4 (row 0 of the next iteration) and shift it in behind rows 1-3.
1129*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+strideq*0]
1130*c0909341SAndroid Build Coastguard Worker    valignq            ymm2, ymm0, ymm2, 1    ; 1 2 3 4
1131*c0909341SAndroid Build Coastguard Worker    pmullw             ymm1, ym8
1132*c0909341SAndroid Build Coastguard Worker    pmullw             ymm2, ym9
1133*c0909341SAndroid Build Coastguard Worker    psubw              ymm1, ym10
1134*c0909341SAndroid Build Coastguard Worker    paddw              ymm1, ymm2
1135*c0909341SAndroid Build Coastguard Worker    psraw              ymm1, 2
1136*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ymm1
1137*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1138*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1139*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1140*c0909341SAndroid Build Coastguard Worker    vzeroupper
1141*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only prep for rows of 8 16-bit samples (one 128-bit lane
; per row), four output rows per iteration.
; out = (row[y]*m8 - m10 + row[y+1]*m9) >> 2 (arithmetic shift), with
; m8/m9/m10 as set up in .v. xm0 carries the last loaded row into the next
; iteration so each source row is read once.
1142*c0909341SAndroid Build Coastguard Worker.v_w8:
1143*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
1144*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
; m1 = carried row 0 in lane 0, then rows 1..3 inserted into lanes 1..3.
1145*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
1146*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+strideq*2], 2
1147*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+stride3q ], 3 ; 0 1 2 3
1148*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
; Load row 4 (row 0 of the next iteration); valignq by 2 qwords (one lane)
; shifts it in behind rows 1-3.
1149*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
1150*c0909341SAndroid Build Coastguard Worker    valignq              m2, m0, m1, 2           ; 1 2 3 4
1151*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m8
1152*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m9
1153*c0909341SAndroid Build Coastguard Worker    psubw                m1, m10
1154*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1155*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1156*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1157*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1158*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1159*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1160*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical-only prep for rows of 16 16-bit samples (one 256-bit half
; per row), four output rows per iteration.
; out = (row[y]*m8 - m10 + row[y+1]*m9) >> 2 (arithmetic shift), with
; m8/m9/m10 as set up in .v. ym0 carries the last loaded row into the next
; iteration so each source row is read once. m3 and m4 are used as scratch.
1161*c0909341SAndroid Build Coastguard Worker.v_w16:
1162*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
1163*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
1164*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m0, [srcq+strideq*1], 1 ; 0 1
1165*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+strideq*2]
1166*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m3, [srcq+stride3q ], 1 ; 2 3
1167*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
; Row 4 (row 0 of the next iteration).
1168*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
; q1032 takes the upper half of the first source and the lower half of the
; second, producing each row pair shifted down by one row.
1169*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m3, q1032           ; 1 2
1170*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m2, m0, q1032           ; 3 4
1171*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m8
1172*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m8
1173*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m9
1174*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m9
1175*c0909341SAndroid Build Coastguard Worker    psubw                m1, m10
1176*c0909341SAndroid Build Coastguard Worker    psubw                m2, m10
1177*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1178*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
1179*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1180*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2
1181*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m1
1182*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1183*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1184*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1185*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1186*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32: vertical-only path, one 64-byte zmm load per row, two output rows
; per iteration. m0 holds the newest row on loop entry and is reloaded with
; the row two lines further down, so no source row is fetched twice.
; Per 16-bit word: out = (upper*m8 - m10 + lower*m9) >> 2 (arithmetic shift).
; NOTE(review): m8/m9/m10 come from code outside this excerpt -- confirm.
1187*c0909341SAndroid Build Coastguard Worker.v_w32:
1188*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1189*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
1190*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+strideq*1]
1191*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1192*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m8, m0
1193*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
; The middle row (m3) is used twice: as "upper" for the second output row
; (m2) and as "lower" for the first output row (m3 itself, scaled by m9).
1194*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m8, m3
1195*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m9
1196*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m9, m0
1197*c0909341SAndroid Build Coastguard Worker    psubw                m1, m10
1198*c0909341SAndroid Build Coastguard Worker    psubw                m2, m10
1199*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1200*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
1201*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1202*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2
1203*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m1
1204*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1205*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1206*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1207*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
1208*c0909341SAndroid Build Coastguard Worker    RET
; .v_w64: vertical-only path, two 64-byte zmm loads per row, one output row
; (128 bytes at tmpq) per iteration. m0/m1 hold the previous row; they are
; multiplied by m8 before being overwritten with the next row, which is then
; multiplied by m9. Per word: out = (prev*m8 - m10 + next*m9) >> 2.
; NOTE(review): m8/m9/m10 come from code outside this excerpt -- confirm.
1209*c0909341SAndroid Build Coastguard Worker.v_w64:
1210*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+64*0]
1211*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+64*1]
1212*c0909341SAndroid Build Coastguard Worker.v_w64_loop:
1213*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1214*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m8, m0
1215*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+64*0]
1216*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m8, m1
1217*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+64*1]
1218*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m9, m0
1219*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m9, m1
1220*c0909341SAndroid Build Coastguard Worker    psubw                m2, m10
1221*c0909341SAndroid Build Coastguard Worker    psubw                m3, m10
1222*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
1223*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
1224*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2
1225*c0909341SAndroid Build Coastguard Worker    psraw                m3, 2
1226*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m2
1227*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m3
1228*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1229*c0909341SAndroid Build Coastguard Worker    dec                  hd
1230*c0909341SAndroid Build Coastguard Worker    jg .v_w64_loop
1231*c0909341SAndroid Build Coastguard Worker    RET
; .v_w128: vertical-only path, four 64-byte zmm loads per row, one output row
; (256 bytes at tmpq) per iteration. m0-m3 hold the previous row, m4-m7 the
; m8-scaled previous row, m11-m14 the m9-scaled next row.
; Per word: out = (prev*m8 - m10 + next*m9) >> 2 (arithmetic shift).
; NOTE(review): m8/m9/m10 come from code outside this excerpt -- confirm.
1232*c0909341SAndroid Build Coastguard Worker.v_w128:
1233*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+64*0]
1234*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+64*1]
1235*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+64*2]
1236*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+64*3]
1237*c0909341SAndroid Build Coastguard Worker.v_w128_loop:
1238*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
; Each previous-row register is consumed by pmullw before being reloaded.
1239*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m8, m0
1240*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+64*0]
1241*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m8, m1
1242*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+64*1]
1243*c0909341SAndroid Build Coastguard Worker    pmullw               m6, m8, m2
1244*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+64*2]
1245*c0909341SAndroid Build Coastguard Worker    pmullw               m7, m8, m3
1246*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+64*3]
1247*c0909341SAndroid Build Coastguard Worker    pmullw              m11, m9, m0
1248*c0909341SAndroid Build Coastguard Worker    pmullw              m12, m9, m1
1249*c0909341SAndroid Build Coastguard Worker    pmullw              m13, m9, m2
1250*c0909341SAndroid Build Coastguard Worker    pmullw              m14, m9, m3
1251*c0909341SAndroid Build Coastguard Worker    REPX     {psubw x, m10}, m4, m5, m6, m7
1252*c0909341SAndroid Build Coastguard Worker    paddw                m4, m11
1253*c0909341SAndroid Build Coastguard Worker    paddw                m5, m12
1254*c0909341SAndroid Build Coastguard Worker    paddw                m6, m13
1255*c0909341SAndroid Build Coastguard Worker    paddw                m7, m14
1256*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m4, m5, m6, m7
1257*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m4
1258*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m5
1259*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m6
1260*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m7
1261*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1262*c0909341SAndroid Build Coastguard Worker    dec                  hd
1263*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop
1264*c0909341SAndroid Build Coastguard Worker    RET
; .hv: dispatch for the combined horizontal+vertical path.
; Reads a 16-bit entry from the prep/_bilin_hv jump table at index wq, adds
; the table base r6 to form the target address and jumps to it. Before
; jumping it prepares m7 = broadcast16(mxyd << 11), which the .hv_w* loops
; use as the pmulhrsw multiplier (pmulhrsw by x<<11 yields (d*x + 8) >> 4),
; and stride3q = 3*stride.
; NOTE(review): wq is presumably a log2-style width index and mxyd the
; vertical fraction at this point; both are set outside this excerpt.
1265*c0909341SAndroid Build Coastguard Worker.hv:
1266*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
1267*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
1268*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, mxyd
1269*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1270*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1271*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .hv_w4: horizontal+vertical path for 4-pixel rows (8 bytes of output per
; row), four output rows (32 bytes at tmpq) per iteration.
; Horizontal stage, per word: hrow = (src[x]*m4 - m6 + src[x+1]*m5) >> 2,
; where src[x+1] is the sample 2 bytes further on.
; Vertical stage: out = prev + pmulhrsw(cur - prev, m7).
; NOTE(review): m4/m5/m6 are set outside this excerpt; presumably the two
; horizontal weights and a bias term -- confirm.
1272*c0909341SAndroid Build Coastguard Worker.hv_w4:
; Filter row 0 once and replicate it into every qword of ym0 so that valignq
; below can pick it up as "row 0".
1273*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+strideq*0+0]
1274*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+strideq*0+2]
1275*c0909341SAndroid Build Coastguard Worker    pmullw             xmm0, xm4
1276*c0909341SAndroid Build Coastguard Worker    pmullw             xmm1, xm5
1277*c0909341SAndroid Build Coastguard Worker    psubw              xmm0, xm6
1278*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
1279*c0909341SAndroid Build Coastguard Worker    psraw              xmm0, 2
1280*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym0, xmm0
1281*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
; ym1 = rows 1 (low lane) and 3 (high lane); ym2 = rows 2 and 4.
1282*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1]
1283*c0909341SAndroid Build Coastguard Worker    vinserti128         ym1, [srcq+stride3q ], 1
1284*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*2]
1285*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1286*c0909341SAndroid Build Coastguard Worker    vinserti128         ym2, [srcq+strideq*0], 1
; ym3 = samples [x] of rows 1 2 3 4; after the 2-byte shifts the second
; punpcklqdq gathers samples [x+1] of the same rows into ym1.
1287*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym3, ym1, ym2
1288*c0909341SAndroid Build Coastguard Worker    psrldq              ym1, 2
1289*c0909341SAndroid Build Coastguard Worker    psrldq              ym2, 2
1290*c0909341SAndroid Build Coastguard Worker    pmullw              ym3, ym4
1291*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym1, ym2
1292*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym5
1293*c0909341SAndroid Build Coastguard Worker    psubw               ym3, ym6
1294*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym3
1295*c0909341SAndroid Build Coastguard Worker    psraw               ym1, 2           ; 1 2 3 4
1296*c0909341SAndroid Build Coastguard Worker    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
1297*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1
1298*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym2
1299*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym7
1300*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
1301*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym1
1302*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1303*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1304*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1305*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: horizontal+vertical path for rows held in one 128-bit lane each;
; four output rows (64 bytes at tmpq) per iteration.
; Horizontal stage, per word: hrow = (src[x]*m4 - m6 + src[x+1]*m5) >> 2.
; Vertical stage: out = prev + pmulhrsw(cur - prev, m7).
; NOTE(review): m4/m5/m6 are set outside this excerpt -- confirm their roles.
1306*c0909341SAndroid Build Coastguard Worker.hv_w8:
; Filter row 0 and park it in the top 128-bit lane of m0, which is where
; valignq (shift by 6 qwords) expects the carried row.
1307*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, [srcq+strideq*0+0]
1308*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5, [srcq+strideq*0+2]
1309*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm6
1310*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
1311*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 2
1312*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, xm0, 3
1313*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
; m1 gathers samples [x] of rows 1..4 (one row per lane); m2 gathers the
; same rows offset by one sample (+2 bytes).
1314*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1+0]
1315*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*1+2]
1316*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [srcq+strideq*2+0], 1
1317*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+strideq*2+2], 1
1318*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+stride3q +0], 2
1319*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+stride3q +2], 2
1320*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1321*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+strideq*0+0], 3
1322*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+strideq*0+2], 3
1323*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1324*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
1325*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
1326*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1327*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2         ; 1 2 3 4
1328*c0909341SAndroid Build Coastguard Worker    valignq              m2, m1, m0, 6 ; 0 1 2 3
1329*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1330*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1331*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
1332*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1333*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1334*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1335*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1336*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1337*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w16: horizontal+vertical path for rows held in one 256-bit half each;
; two output rows (64 bytes at tmpq) per iteration.
; Horizontal stage, per word: hrow = (src[x]*m4 - m6 + src[x+1]*m5) >> 2.
; Vertical stage: out = prev + pmulhrsw(cur - prev, m7).
; NOTE(review): m4/m5/m6 are set outside this excerpt -- confirm their roles.
1338*c0909341SAndroid Build Coastguard Worker.hv_w16:
; Filter row 0 and place it in the upper half of m0; the q1032 shuffle in
; the loop takes the carried row from there.
1339*c0909341SAndroid Build Coastguard Worker    pmullw              ym0, ym4, [srcq+strideq*0+0]
1340*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym5, [srcq+strideq*0+2]
1341*c0909341SAndroid Build Coastguard Worker    psubw               ym0, ym6
1342*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
1343*c0909341SAndroid Build Coastguard Worker    psraw               ym0, 2
1344*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym0, 1
1345*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
1346*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+strideq*1+0]
1347*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+strideq*1+2]
1348*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1349*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+strideq*0+0], 1
1350*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+strideq*0+2], 1
1351*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1352*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
1353*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
1354*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1355*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2             ; 1 2
1356*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m1, q1032 ; 0 1
1357*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1358*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1359*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
1360*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1361*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1362*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1363*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1364*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
1365*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w32: horizontal+vertical path, one 64-byte zmm per row, two output rows
; per iteration. The loop is unrolled by two so that m0 and m3 alternate as
; "previous filtered row" without a register copy.
; Horizontal stage, per word: hrow = (src[x]*m4 - m6 + src[x+1]*m5) >> 2.
; Vertical stage: out = prev + pmulhrsw(cur - prev, m7).
; NOTE(review): m4/m5/m6 are set outside this excerpt -- confirm their roles.
1366*c0909341SAndroid Build Coastguard Worker.hv_w32:
1367*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0+0]
1368*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+strideq*0+2]
1369*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1370*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1371*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1372*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
; First output row: m3 = filtered row n+1, blended against m0 (row n).
1373*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+strideq*1+0]
1374*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+strideq*1+2]
1375*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1376*c0909341SAndroid Build Coastguard Worker    psubw                m3, m6
1377*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1378*c0909341SAndroid Build Coastguard Worker    psraw                m3, 2
1379*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m0
1380*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
1381*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
; Second output row: m0 = filtered row n+2, blended against m3 (row n+1);
; m0 then serves as the carried row for the next iteration.
1382*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0+0]
1383*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+strideq*0+2]
1384*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1385*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1386*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1387*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m3
1388*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7
1389*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
1390*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m1
1391*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1392*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1393*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1394*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
1395*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w64: horizontal+vertical path, two 64-byte zmm per row, one output row
; (128 bytes at tmpq) per iteration. m0/m1 carry the previous filtered row.
; Horizontal stage, per word: hrow = (src[x]*m4 - m6 + src[x+1]*m5) >> 2.
; Vertical stage: out = prev + pmulhrsw(cur - prev, m7).
; m8 and m9 are overwritten here and used as scratch registers.
; NOTE(review): m4/m5/m6 are set outside this excerpt -- confirm their roles.
1396*c0909341SAndroid Build Coastguard Worker.hv_w64:
1397*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ 0]
1398*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ 2]
1399*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+64]
1400*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m5, [srcq+66]
1401*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
1402*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
1403*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1404*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1405*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1406*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1407*c0909341SAndroid Build Coastguard Worker.hv_w64_loop:
1408*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1409*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+ 0]
1410*c0909341SAndroid Build Coastguard Worker    pmullw               m8, m5, [srcq+ 2]
1411*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+64]
1412*c0909341SAndroid Build Coastguard Worker    pmullw               m9, m5, [srcq+66]
1413*c0909341SAndroid Build Coastguard Worker    psubw                m2, m6
1414*c0909341SAndroid Build Coastguard Worker    psubw                m3, m6
1415*c0909341SAndroid Build Coastguard Worker    paddw                m2, m8
1416*c0909341SAndroid Build Coastguard Worker    paddw                m3, m9
1417*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2
1418*c0909341SAndroid Build Coastguard Worker    psraw                m3, 2
1419*c0909341SAndroid Build Coastguard Worker    psubw                m8, m2, m0
1420*c0909341SAndroid Build Coastguard Worker    psubw                m9, m3, m1
1421*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m7
1422*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m7
; Add the previous row back, then promote the current row to "previous".
1423*c0909341SAndroid Build Coastguard Worker    paddw                m8, m0
1424*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
1425*c0909341SAndroid Build Coastguard Worker    paddw                m9, m1
1426*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1427*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m8
1428*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m9
1429*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1430*c0909341SAndroid Build Coastguard Worker    dec                  hd
1431*c0909341SAndroid Build Coastguard Worker    jg .hv_w64_loop
1432*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w128: horizontal+vertical path, four 64-byte zmm per row, one output
; row (256 bytes at tmpq) per iteration. m0-m3 carry the previous filtered
; row; m8-m15 are overwritten and used as scratch.
; Horizontal stage, per word: hrow = (src[x]*m4 - m6 + src[x+1]*m5) >> 2.
; Vertical stage: out = prev + pmulhrsw(cur - prev, m7).
; NOTE(review): m4/m5/m6 are set outside this excerpt -- confirm their roles.
1433*c0909341SAndroid Build Coastguard Worker.hv_w128:
1434*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+  0]
1435*c0909341SAndroid Build Coastguard Worker    pmullw               m8, m5, [srcq+  2]
1436*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+ 64]
1437*c0909341SAndroid Build Coastguard Worker    pmullw               m9, m5, [srcq+ 66]
1438*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+128]
1439*c0909341SAndroid Build Coastguard Worker    pmullw              m10, m5, [srcq+130]
1440*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+192]
1441*c0909341SAndroid Build Coastguard Worker    pmullw              m11, m5, [srcq+194]
1442*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m6}, m0, m1, m2, m3
1443*c0909341SAndroid Build Coastguard Worker    paddw                m0, m8
1444*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
1445*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
1446*c0909341SAndroid Build Coastguard Worker    paddw                m3, m11
1447*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m0, m1, m2, m3
1448*c0909341SAndroid Build Coastguard Worker.hv_w128_loop:
1449*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
; m8-m11 = src[x]*m4 products, m12-m15 = src[x+1]*m5 products for this row.
1450*c0909341SAndroid Build Coastguard Worker    pmullw               m8, m4, [srcq+  0]
1451*c0909341SAndroid Build Coastguard Worker    pmullw              m12, m5, [srcq+  2]
1452*c0909341SAndroid Build Coastguard Worker    pmullw               m9, m4, [srcq+ 64]
1453*c0909341SAndroid Build Coastguard Worker    pmullw              m13, m5, [srcq+ 66]
1454*c0909341SAndroid Build Coastguard Worker    pmullw              m10, m4, [srcq+128]
1455*c0909341SAndroid Build Coastguard Worker    pmullw              m14, m5, [srcq+130]
1456*c0909341SAndroid Build Coastguard Worker    pmullw              m11, m4, [srcq+192]
1457*c0909341SAndroid Build Coastguard Worker    pmullw              m15, m5, [srcq+194]
1458*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m6}, m8, m9, m10, m11
1459*c0909341SAndroid Build Coastguard Worker    paddw                m8, m12
1460*c0909341SAndroid Build Coastguard Worker    paddw                m9, m13
1461*c0909341SAndroid Build Coastguard Worker    paddw               m10, m14
1462*c0909341SAndroid Build Coastguard Worker    paddw               m11, m15
1463*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m8, m9, m10, m11
; m12-m15 are reused for the vertical difference (cur - prev).
1464*c0909341SAndroid Build Coastguard Worker    psubw               m12, m8, m0
1465*c0909341SAndroid Build Coastguard Worker    psubw               m13, m9, m1
1466*c0909341SAndroid Build Coastguard Worker    psubw               m14, m10, m2
1467*c0909341SAndroid Build Coastguard Worker    psubw               m15, m11, m3
1468*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m12, m13, m14, m15
; Add the previous row back, then promote the current row to "previous".
1469*c0909341SAndroid Build Coastguard Worker    paddw               m12, m0
1470*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
1471*c0909341SAndroid Build Coastguard Worker    paddw               m13, m1
1472*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
1473*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m12
1474*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m13
1475*c0909341SAndroid Build Coastguard Worker    paddw               m14, m2
1476*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
1477*c0909341SAndroid Build Coastguard Worker    paddw               m15, m3
1478*c0909341SAndroid Build Coastguard Worker    mova                 m3, m11
1479*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m14
1480*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m15
1481*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1482*c0909341SAndroid Build Coastguard Worker    dec                  hd
1483*c0909341SAndroid Build Coastguard Worker    jg .hv_w128_loop
1484*c0909341SAndroid Build Coastguard Worker    RET
1485*c0909341SAndroid Build Coastguard Worker
; Filter-type selectors. Each constant packs two values that are multiples of
; 15 (the middle dimension of subpel_filters): one in bits 16 and up, one in
; the low 16 bits. put_6tap_16bpc adds the constant to mx*0x010101 (resp.
; my*0x010101); its .h path then uses (value >> 16) as the row index into
; subpel_filters (8 bytes per row).
; NOTE(review): REGULAR and SHARP share the same low half (3*15); the low half
; is presumably the small-block (4-tap) filter set -- confirm against users.
1486*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8]
1487*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15
1488*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1489*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP   (2*15 << 16) | 3*15
1490*c0909341SAndroid Build Coastguard Worker
; FN prefix, type, type_h, type_v [, jmp_to]
; Emits a small entry point named <prefix>_<type>_16bpc that only selects the
; filter types and then hands over to a shared implementation:
;   t0d = FILTER_<type_h>
;   t1d = FILTER_<type_v>  (copied from t0d when both types are identical)
; When jmp_to is given, the stub jumps to that function (name mangled with
; the private prefix and SUFFIX). When it is omitted, no jump is emitted and
; execution falls through into whatever code follows the macro invocation.
1491*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
1492*c0909341SAndroid Build Coastguard Workercglobal %1_%2_16bpc
1493*c0909341SAndroid Build Coastguard Worker    mov                 t0d, FILTER_%3
1494*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4
1495*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d
1496*c0909341SAndroid Build Coastguard Worker%else
1497*c0909341SAndroid Build Coastguard Worker    mov                 t1d, FILTER_%4
1498*c0909341SAndroid Build Coastguard Worker%endif
1499*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; skip the jump in the last filter
1500*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1501*c0909341SAndroid Build Coastguard Worker%endif
1502*c0909341SAndroid Build Coastguard Worker%endmacro
1503*c0909341SAndroid Build Coastguard Worker
; Per-ABI choice of the two temporaries (t0/t1, written by the FN stubs) and
; of "buf", a small scratch area addressed relative to rsp.
;   WIN64: temporaries 4 and 5; buf lives in the caller-provided shadow space.
;   other: temporaries 7 and 8; buf lives below rsp in the red zone.
; NOTE(review): the numbers are x86inc register indices, presumably chosen so
; they do not collide with argument registers still in use -- confirm.
1504*c0909341SAndroid Build Coastguard Worker%if WIN64
1505*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5
1506*c0909341SAndroid Build Coastguard Worker%define buf rsp+stack_offset+8 ; shadow space
1507*c0909341SAndroid Build Coastguard Worker%else
1508*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8
1509*c0909341SAndroid Build Coastguard Worker%define buf rsp-40 ; red zone
1510*c0909341SAndroid Build Coastguard Worker%endif
1511*c0909341SAndroid Build Coastguard Worker
; Entry stubs put_8tap_<type>_16bpc for the filter combinations listed here,
; all routed to put_6tap_16bpc. The last one (regular) passes no jump
; target, so it falls straight through into the function defined right after
; these invocations.
1512*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap,
1513*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
1514*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
1515*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
1516*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular,        REGULAR, REGULAR
1517*c0909341SAndroid Build Coastguard Worker
; put_6tap_16bpc(dst, ds, src, ss, w, h, mx, my, <r8m>) -- entry and dispatch.
; Expects t0d/t1d to hold the horizontal/vertical FILTER_* selectors (set by
; the FN stubs above). mx and my are replicated into three bytes each and
; combined with the selectors; bits 8-11 of the result are non-zero exactly
; when the low nibble of the fraction is non-zero, which selects the path:
;   mx fraction != 0 -> .h (which itself checks my for .hv)
;   my fraction != 0 -> .v
;   neither          -> .put, a table jump indexed by tzcnt(w)
; r8 holds the address of put_avx512icl and is the base for "base+sym"
; references.
1518*c0909341SAndroid Build Coastguard Workercglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
1519*c0909341SAndroid Build Coastguard Worker%define base r8-put_avx512icl
1520*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1521*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1522*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1523*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
1524*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx512icl]
1525*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
1526*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1527*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1528*c0909341SAndroid Build Coastguard Worker    jnz .h
1529*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1530*c0909341SAndroid Build Coastguard Worker    jnz .v
1531*c0909341SAndroid Build Coastguard Worker.put:
1532*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1533*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put,)]
1534*c0909341SAndroid Build Coastguard Worker    add                  wq, r8
; NOTE(review): the jump target is in the "put" table, i.e. code outside this
; function; the pop presumably makes the stack match what that code's
; epilogue expects on WIN64 -- confirm against the target's register count.
1535*c0909341SAndroid Build Coastguard Worker%if WIN64
1536*c0909341SAndroid Build Coastguard Worker    pop                  r8
1537*c0909341SAndroid Build Coastguard Worker%endif
1538*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w8: horizontal-only 6-tap path writing 16 bytes per row, two rows per
; iteration. Entered from .h with:
;   m8        = rounding constant (dword broadcast from put_8tap_h_rnd)
;   m10/11/12 = the three coefficient pairs (dword broadcasts)
;   m15       = word broadcast of r8m, used as the upper clamp
; Each vpermb gathers the sample pairs for one coefficient pair and vpdpwssd
; accumulates the pairwise products into the dword sums in m0. The sums are
; shifted right by 6, packed to words with unsigned saturation and clamped
; with pminsw against m15.
1539*c0909341SAndroid Build Coastguard Worker.h_w8:
1540*c0909341SAndroid Build Coastguard Worker    mova                 m4, [spel_h_shufA]
1541*c0909341SAndroid Build Coastguard Worker    movu                 m5, [spel_h_shufB]
1542*c0909341SAndroid Build Coastguard Worker    movu                 m6, [spel_h_shufC]
1543*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
; Row 0 in the low 256 bits of m2, row 1 in the high 256 bits.
1544*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+ssq*0]
1545*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+ssq*1], 1
1546*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1547*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
1548*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, m2
1549*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m1
1550*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m5, m2
1551*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m1
1552*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m2
1553*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m1
1554*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
1555*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m0, 1
1556*c0909341SAndroid Build Coastguard Worker    packusdw            ym0, ym1
1557*c0909341SAndroid Build Coastguard Worker    pminsw              ym0, ym15
1558*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
1559*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
1560*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1561*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1562*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
1563*c0909341SAndroid Build Coastguard Worker    RET
; .h: setup for the horizontal paths (mx fraction is non-zero).
;   m15 = word broadcast of the 9th argument (r8m), later used as a clamp.
;   If my also has a fraction, control goes to .hv instead.
;   m8  = rounding constant, selected from put_8tap_h_rnd by (r8m >> 11).
;   w <= 4 is handled by the .h_w4 code of put_8tap_16bpc.
;   Otherwise (mxd >> 16) indexes subpel_filters; eight bytes starting at
;   byte 1 of that row are sign-extended to words and spilled to buf, from
;   which the coefficient pairs are broadcast: m10 = words 0-1,
;   m11 = words 2-3, m12 = words 4-5. srcq is moved back 4 bytes.
;   Finally w-16 selects .h_w8 (negative), .h_w32 (positive) or falls through
;   to the loop that follows (zero).
; NOTE(review): r8m is presumably the pixel maximum for the bit depth, so
; (r8m >> 11) would distinguish two bit depths -- confirm against callers.
1564*c0909341SAndroid Build Coastguard Worker.h:
1565*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m
1566*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1567*c0909341SAndroid Build Coastguard Worker    jnz .hv
1568*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r8m
1569*c0909341SAndroid Build Coastguard Worker    shr                 r7d, 11
1570*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
1571*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1572*c0909341SAndroid Build Coastguard Worker    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
1573*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1574*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
1575*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
1576*c0909341SAndroid Build Coastguard Worker    mova              [buf], xmm0
1577*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, xmm0
1578*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [buf+8]
1579*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [buf+4]
; The flags from this sub drive both the jl below and the jg after the two
; broadcast loads.
1580*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
1581*c0909341SAndroid Build Coastguard Worker    jl .h_w8
1582*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufA]
1583*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [spel_h_shufB]
1584*c0909341SAndroid Build Coastguard Worker    jg .h_w32
1585*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
1586*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+ssq*0+ 0]
1587*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
1588*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*0+12]
1589*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*1+12], 1
1590*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1591*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
1592*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
1593*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
1594*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m4 ; a0  b0
1595*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m7
1596*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m4 ; a2' b2'
1597*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
1598*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6
1599*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m2 ; a1  b1
1600*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m3 ; a1' b1'
1601*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m3, 0x55
1602*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m2 ; a2  b2
1603*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m2 ; a0' b0'
1604*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
1605*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
1606*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1
1607*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m15
1608*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
1609*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
1610*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1611*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1612*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
1613*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal-only path for w > 16. It is reached from .h after
; "sub wd, 16", so wq holds w-16 here. src/dst are advanced by wq*2 bytes and
; wq is negated, letting r6 count upward from -(w-16); each inner iteration
; produces one 64-byte zmm of output and steps r6 by 32 elements. One row is
; completed per pass of .h_w32_loop0. Uses m6/m7 (shuffles), m8 (accumulator
; seed), m10-m12 (coefficient pairs) and m15 (clamp) as set up in .h.
1614*c0909341SAndroid Build Coastguard Worker.h_w32:
1615*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2]
1616*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+wq*2]
1617*c0909341SAndroid Build Coastguard Worker    neg                  wq
1618*c0909341SAndroid Build Coastguard Worker.h_w32_loop0:
1619*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq ; restart the column counter for this row
1620*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
1621*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+ 0]
1622*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6*2+12] ; same row, 12 bytes further right
1623*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
1624*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
1625*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
1626*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m4 ; a0
1627*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m7
1628*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m4 ; b2
1629*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
1630*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6
1631*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m2 ; a1
1632*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m3 ; b1
1633*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m3, 0x55
1634*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m2 ; a2
1635*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m2 ; b0
1636*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
1637*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
1638*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1 ; dwords -> words with unsigned saturation
1639*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m15 ; clamp to the value broadcast from r8m
1640*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r6*2], m0
1641*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
1642*c0909341SAndroid Build Coastguard Worker    jl .h_w32_loop ; continue while the column counter is still negative
1643*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
1644*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
1645*c0909341SAndroid Build Coastguard Worker    dec                  hd
1646*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop0
1647*c0909341SAndroid Build Coastguard Worker    RET
; .v: vertical-only entry point. Picks the filter index (low byte of myd when
; h < 6, otherwise myd >> 16), loads the coefficient pairs into m12/m13/m14,
; and jumps through the put/_6tap_v offset table indexed by tzcnt(w).
; r6 is left holding -ssq so the width handlers can address the rows above
; srcq as [srcq+r6*1] and [srcq+r6*2].
1648*c0909341SAndroid Build Coastguard Worker.v:
1649*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd = low byte of myd (alternative filter index)
1650*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1651*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1652*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the low-byte index instead
1653*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pd_32] ; accumulator seed for the >>6 below (presumably dword 32, i.e. the rounding term - confirm)
1654*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8] ; 8 bytes starting at byte 1 of the table entry, sign-extended to words
1655*c0909341SAndroid Build Coastguard Worker    tzcnt               r7d, wd
1656*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m ; clamp value for pminsw in the width handlers
1657*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1658*c0909341SAndroid Build Coastguard Worker    movzx               r7d, word [r8+r7*2+table_offset(put, _6tap_v)] ; 16-bit table entry, made absolute by "add r7, r8" below
1659*c0909341SAndroid Build Coastguard Worker    neg                  r6
1660*c0909341SAndroid Build Coastguard Worker    mova [rsp+stack_offset+8], xmm0 ; spill so single dwords can be broadcast from memory
1661*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0 ; words 0-1
1662*c0909341SAndroid Build Coastguard Worker    add                  r7, r8
1663*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [rsp+stack_offset+12] ; words 2-3
1664*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [rsp+stack_offset+16] ; words 4-5
1665*c0909341SAndroid Build Coastguard Worker    jmp                  r7
; .v_w2: vertical-only path reached via the .v jump table (presumably w == 2:
; each row contributes one dword, i.e. two 16-bit pixels). Rows are numbered
; 0..6 in the comments, with row 0 at srcq-2*ss (r6 = -ssq). The prologue
; builds the interleaved row pairs 01/12 (xmm1) and 23/34 (xmm2); each loop
; iteration loads rows 5 and 6, forms 45/56 and emits two output rows.
; xmm0 carries the newest row across iterations.
1666*c0909341SAndroid Build Coastguard Worker.v_w2:
1667*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+r6 *2]
1668*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+r6 *1], 1
1669*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*0], 2
1670*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
1671*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1672*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+ssq*0]
1673*c0909341SAndroid Build Coastguard Worker    palignr            xmm3, xmm0, xmm2, 4   ; 1 2 3 4
1674*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm1, xmm2, xmm3      ; 01 12
1675*c0909341SAndroid Build Coastguard Worker    punpckhwd          xmm2, xmm3            ; 23 34
1676*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
1677*c0909341SAndroid Build Coastguard Worker    movd               xmm3, [srcq+ssq*1]
1678*c0909341SAndroid Build Coastguard Worker    mova               xmm4, xm11 ; accumulator starts from the pd_32 seed
1679*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm4, xmm1, xm12      ; a0 b0
1680*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1681*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2 ; slide the window: 23/34 becomes next iteration's first pair
1682*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm4, xmm2, xm13      ; a1 b1
1683*c0909341SAndroid Build Coastguard Worker    punpckldq          xmm2, xmm0, xmm3      ; 4 5
1684*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+ssq*0]
1685*c0909341SAndroid Build Coastguard Worker    punpckldq          xmm3, xmm0            ; 5 6
1686*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm2, xmm3            ; 45 56
1687*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm4, xmm2, xm14      ; a2 b2
1688*c0909341SAndroid Build Coastguard Worker    psrad              xmm4, 6
1689*c0909341SAndroid Build Coastguard Worker    packusdw           xmm4, xmm4 ; dwords -> words with unsigned saturation
1690*c0909341SAndroid Build Coastguard Worker    pminsw             xmm4, xm15 ; clamp to the value broadcast from r8m
1691*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm4
1692*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm4, 1
1693*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1694*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1695*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1696*c0909341SAndroid Build Coastguard Worker    RET
; .v_w4: vertical-only path reached via the .v jump table (presumably w == 4:
; each row contributes one qword, i.e. four 16-bit pixels). Works on ymm
; registers with one row pair per 128-bit lane. The 0x30 blends replace
; dwords 4-5 (the low qword of the upper lane) with the next row before the
; word interleave. ymm1 = 01/12, ymm2 = 23/34 entering the loop; ymm0 carries
; the newest row across iterations. Two output rows per iteration.
1697*c0909341SAndroid Build Coastguard Worker.v_w4:
1698*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+r6 *2]
1699*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+r6 *1]
1700*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm2, [srcq+ssq*0]
1701*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*1]
1702*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1703*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
1704*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm3, 0x30
1705*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm2, 0x30
1706*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm1, ymm3       ; 01 12
1707*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm4, 0x30
1708*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm4, ymm0, 0x30
1709*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm2, ymm4       ; 23 34
1710*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
1711*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+ssq*1]
1712*c0909341SAndroid Build Coastguard Worker    mova               ymm4, ym11 ; accumulator starts from the pd_32 seed
1713*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ymm1, ym12 ; a0 b0
1714*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1715*c0909341SAndroid Build Coastguard Worker    mova               ymm1, ymm2 ; slide the window: 23/34 becomes next iteration's first pair
1716*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ymm2, ym13 ; a1 b1
1717*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm0, ymm3, 0x30
1718*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
1719*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, 0x30
1720*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm2, ymm3       ; 45 56
1721*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ymm2, ym14 ; a2 b2
1722*c0909341SAndroid Build Coastguard Worker    psrad              ymm4, 6
1723*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm3, ymm4, 1
1724*c0909341SAndroid Build Coastguard Worker    packusdw           xmm4, xmm3 ; low qword = row a, high qword = row b
1725*c0909341SAndroid Build Coastguard Worker    pminsw             xmm4, xm15 ; clamp to the value broadcast from r8m
1726*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm4
1727*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm4
1728*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1729*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1730*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1731*c0909341SAndroid Build Coastguard Worker    vzeroupper
1732*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only path reached via the .v jump table (presumably w == 8:
; each row is one 16-byte load). Whole rows are gathered into the 128-bit
; lanes of m0/m1 and turned into interleaved row pairs with vpermb using the
; spel_v_shuf8 index table (m5). m1 = 01/12 and m2 = 23/34 entering the loop;
; m0 keeps the most recent rows so each iteration only loads two new ones.
; Two output rows per iteration.
1733*c0909341SAndroid Build Coastguard Worker.v_w8:
1734*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*0]
1735*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m0, [srcq+r6 *2], 0
1736*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+r6 *1], 1 ; 0 1 2
1737*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+ssq*1], 1
1738*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1739*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_v_shuf8]
1740*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*0], 2 ; 2 3 4
1741*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m5, m1          ; 01 12
1742*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m5, m0          ; 23 34
1743*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1744*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*1], 3 ; m0 lanes now hold rows 2 3 4 5
1745*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1746*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*0] ; row 6
1747*c0909341SAndroid Build Coastguard Worker    mova                 m4, m11 ; accumulator starts from the pd_32 seed
1748*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m1         ; a0 b0
1749*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q1032       ; 4 5 6
1750*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2 ; slide the window: 23/34 becomes next iteration's first pair
1751*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m2         ; a1 b1
1752*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m5, m0          ; 45 56
1753*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m2         ; a2 b2
1754*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
1755*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m4, 1
1756*c0909341SAndroid Build Coastguard Worker    packusdw            ym4, ym3 ; dwords -> words with unsigned saturation
1757*c0909341SAndroid Build Coastguard Worker    pminsw              ym4, ym15 ; clamp to the value broadcast from r8m
1758*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm4
1759*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym4, 1
1760*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1761*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1762*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1763*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical-only path reached via the .v jump table (presumably
; w == 16: each row is one 32-byte load). Two adjacent rows are packed into
; one zmm and interleaved into word pairs with vpermb (spel_v_shuf16 in m6).
; The odd-offset pairs (23, 45) are derived from their neighbours with a
; 16-bit funnel shift (vpshrdd) instead of a second permute. m4 accumulates
; output row a and m5 output row b; vpermq with deint_q_shuf (m7) reorders
; qwords after the pack so each row can be stored as one 256-bit half
; (presumably undoing the per-lane interleave of packusdw - confirm).
1764*c0909341SAndroid Build Coastguard Worker.v_w16:
1765*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+r6 *1]
1766*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m0, [srcq+ssq*0], 1
1767*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+r6*2], 0
1768*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_v_shuf16]
1769*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*1]
1770*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1771*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*0], 1
1772*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m1     ; 12
1773*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m6, m0     ; 01
1774*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m3     ; 34
1775*c0909341SAndroid Build Coastguard Worker    mova                 m7, [deint_q_shuf]
1776*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16 ; 23
1777*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
1778*c0909341SAndroid Build Coastguard Worker    mova                 m5, m11 ; both accumulators start from the pd_32 seed
1779*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m1    ; b0
1780*c0909341SAndroid Build Coastguard Worker    mova                 m4, m11
1781*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m0    ; a0
1782*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; slide the window for row b
1783*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m3    ; b1
1784*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; slide the window for row a
1785*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m2    ; a1
1786*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*1]
1787*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1788*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*0], 1
1789*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m3     ; 56
1790*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16 ; 45
1791*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m3    ; b2
1792*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m2    ; a2
1793*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
1794*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
1795*c0909341SAndroid Build Coastguard Worker    packusdw             m4, m5 ; dwords -> words with unsigned saturation
1796*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m15 ; clamp to the value broadcast from r8m
1797*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m7, m4
1798*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym4
1799*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m4, 1
1800*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1801*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1802*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1803*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32 / .v_w64 / .v_w128: vertical-only path shared by the three widest
; sizes. The image is processed in 64-byte-wide column strips (one zmm per
; row); .v_w32_loop0 runs once per strip and .v_w32_loop emits two rows per
; iteration. wd is repacked as h + w*8 - 256: the low byte holds the row
; count (restored with "movzx hd, wb", so this assumes h fits in 8 bits) and
; the bits above it count the strips still to do.
; Within a strip r7 walks the source rows and r8 walks the destination rows,
; leaving srcq/dstq pointing at the top of the strip; r8, which .v used as
; the jump-table base, is overwritten here.
; punpcklwd/punpckhwd interleave adjacent rows per 128-bit lane: the low
; halves live in m0-m3 and the high halves in m16-m19. m4/m6 accumulate
; output row a, m5/m7 output row b.
1804*c0909341SAndroid Build Coastguard Worker.v_w32:
1805*c0909341SAndroid Build Coastguard Worker.v_w64:
1806*c0909341SAndroid Build Coastguard Worker.v_w128:
1807*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq*8-256]
1808*c0909341SAndroid Build Coastguard Worker.v_w32_loop0:
1809*c0909341SAndroid Build Coastguard Worker    movu                m16, [srcq+r6 *2]
1810*c0909341SAndroid Build Coastguard Worker    movu                m17, [srcq+r6 *1]
1811*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2]
1812*c0909341SAndroid Build Coastguard Worker    movu                m18, [srcq+ssq*0]
1813*c0909341SAndroid Build Coastguard Worker    movu                m19, [srcq+ssq*1]
1814*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
1815*c0909341SAndroid Build Coastguard Worker    movu                m20, [r7  +ssq*0]
1816*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m16, m17 ; 01
1817*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m17
1818*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m17, m18 ; 12
1819*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m18
1820*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m18, m19 ; 23
1821*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m19
1822*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m19, m20 ; 34
1823*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m20
1824*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
1825*c0909341SAndroid Build Coastguard Worker    mova                 m4, m11 ; all four accumulators start from the pd_32 seed
1826*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m0  ; a0
1827*c0909341SAndroid Build Coastguard Worker    mova                 m6, m11
1828*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m16
1829*c0909341SAndroid Build Coastguard Worker    mova                 m5, m11
1830*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m1  ; b0
1831*c0909341SAndroid Build Coastguard Worker    mova                 m7, m11
1832*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m17
1833*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; slide the row-pair window down by two rows
1834*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m2  ; a1
1835*c0909341SAndroid Build Coastguard Worker    mova                m16, m18
1836*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m18
1837*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1838*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m3  ; b1
1839*c0909341SAndroid Build Coastguard Worker    mova                m17, m19
1840*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m19
1841*c0909341SAndroid Build Coastguard Worker    movu                m19, [r7+ssq*1]
1842*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
1843*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m20, m19 ; 45
1844*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m20, m19
1845*c0909341SAndroid Build Coastguard Worker    movu                m20, [r7+ssq*0]
1846*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m2  ; a2
1847*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m18
1848*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m19, m20 ; 56
1849*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m20
1850*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m3  ; b2
1851*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m19
1852*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 6}, m4, m6, m5, m7
1853*c0909341SAndroid Build Coastguard Worker    packusdw             m4, m6 ; recombine the low/high interleave halves of row a
1854*c0909341SAndroid Build Coastguard Worker    packusdw             m5, m7 ; same for row b
1855*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m15 ; clamp to the value broadcast from r8m
1856*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m15
1857*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], m4
1858*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*1], m5
1859*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
1860*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1861*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
1862*c0909341SAndroid Build Coastguard Worker    add                srcq, 64 ; advance to the next 64-byte column strip
1863*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
1864*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb ; reload the row count from the low byte of wd
1865*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8 ; one strip done
1866*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop0
1867*c0909341SAndroid Build Coastguard Worker    vzeroupper
1868*c0909341SAndroid Build Coastguard Worker    RET
; .hv: combined horizontal+vertical entry point (reached from .h, which has
; already broadcast r8m into m15). Widths above 4 go to .hv_w8. For the
; narrow case this loads the horizontal taps from the start of the table
; entry (no +1 byte offset) into xmm0 and the vertical taps from byte 1 into
; xmm1, using the low byte of myd as the vertical index when h < 6 and
; myd >> 16 otherwise. It then selects the bit-depth dependent setup: when
; bit 11 (0x800) of the value at r8m is set, the .hv_12bit constants are used.
; r6 is left holding -ssq and srcq is moved back 2 bytes. Falls through into
; .hv_main.
1869*c0909341SAndroid Build Coastguard Worker.hv:
1870*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1871*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
1872*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; horizontal filter index = low byte of mxd
1873*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
1874*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1875*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1876*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1877*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the low-byte index instead
1878*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
1879*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1880*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2 ; step src back 2 bytes (one 16-bit pixel)
1881*c0909341SAndroid Build Coastguard Worker    neg                  r6
1882*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
1883*c0909341SAndroid Build Coastguard Worker    jnz .hv_12bit
1884*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_2176] ; m10 seeds the horizontal accumulators
1885*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 6 ; pre-scale the horizontal taps
1886*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1887*c0909341SAndroid Build Coastguard Worker.hv_12bit:
1888*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_640]
1889*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 4 ; smaller horizontal pre-scale on this path...
1890*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2 ; ...with the vertical taps scaled by 4 as well
; .hv_main: shared setup for the narrow (w <= 4) horizontal+vertical paths,
; followed by the remaining narrow case (w != 4; presumably w == 2).
; Inputs from .hv: xmm0 = pre-scaled horizontal taps, xmm1 = vertical taps,
; m10 = horizontal accumulator seed, r6 = -ssq, m15 = clamp value.
; Rows 0-3 (row 0 at srcq-2*ss) are gathered into the four 128-bit lanes of
; m4 and row 4 into xm5. The taps are spilled to buf so dword pairs can be
; broadcast: m8/m9 = horizontal words 2-3 / 4-5, ym12/ym13/ym14 = vertical
; words 0-1 / 2-3 / 4-5. m6 = spel_h_shufA is loaded before the width test,
; so it is live on both the fall-through path and in .hv_w4.
1891*c0909341SAndroid Build Coastguard Worker.hv_main:
1892*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r6 *2]
1893*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, [srcq+r6 *1], 1
1894*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [srcq+ssq*0], 2
1895*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufA]
1896*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [srcq+ssq*1], 3 ; 0 1 2 3
1897*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1898*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*0]    ; 4
1899*c0909341SAndroid Build Coastguard Worker    mova           [buf+ 0], xmm0
1900*c0909341SAndroid Build Coastguard Worker    mova           [buf+16], xmm1
1901*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [buf+ 4]
1902*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [buf+ 8]
1903*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, xmm1
1904*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym13, [buf+20]
1905*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym14, [buf+24]
1906*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1907*c0909341SAndroid Build Coastguard Worker    je .hv_w4
; Narrow path prologue: horizontally filter rows 0-4 in one go. m2 is written
; for the first time by the 3-operand punpcklqdq below, so it needs no
; preload; the shuffle itself comes from m6.
1909*c0909341SAndroid Build Coastguard Worker    mova                 m3, [spel_h_shuf2b]
1910*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10 ; accumulator starts from the horizontal seed
1911*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6
1912*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm6
1913*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m4, m5
1914*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m8, m2    ; 04 1_ 2_ 3_
1915*c0909341SAndroid Build Coastguard Worker    mova                ym6, [spel_h_shuf2a] ; m6 is repurposed: loop-time row shuffle
1916*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m5
1917*c0909341SAndroid Build Coastguard Worker    mova                xm5, [spel_shuf2] ; xm5 is repurposed: vpermt2b index for the row-pair update
1918*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m4
1919*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m3, m1    ; 01 12
1920*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym1, 1    ; 23 34
; Two output rows per iteration: rows 5 and 6 are filtered horizontally into
; xm4 while xmm0 accumulates the vertical filter over the row pairs in
; xm1/xm2. The vertical accumulator starts from pmaddwd (no separate seed)
; and is shifted right by 10 at the end.
1921*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
1922*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
1923*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1924*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, [srcq+ssq*0], 1
1925*c0909341SAndroid Build Coastguard Worker    vpermb              ym3, ym6, ym3
1926*c0909341SAndroid Build Coastguard Worker    pmaddwd            xmm0, xm12, xm1 ; a0 b0
1927*c0909341SAndroid Build Coastguard Worker    mova                xm4, xm10
1928*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm4, xm8, xm3
1929*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, ym3, 1
1930*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2 ; slide the window: 23/34 becomes next iteration's first pair
1931*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xm13, xm2 ; a1 b1
1932*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm4, xm9, xm3  ; 5 6
1933*c0909341SAndroid Build Coastguard Worker    vpermt2b            xm2, xm5, xm4  ; 45 56
1934*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xm14, xm2 ; a2 b2
1935*c0909341SAndroid Build Coastguard Worker    psrad              xmm0, 10
1936*c0909341SAndroid Build Coastguard Worker    packusdw           xmm0, xmm0 ; dwords -> words with unsigned saturation
1937*c0909341SAndroid Build Coastguard Worker    pminsw             xmm0, xm15 ; clamp to the value broadcast from r8m
1938*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
1939*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
1940*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1941*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1942*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
1943*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: horizontal+vertical path for w == 4, entered from .hv_main with
; m4 = rows 0-3 (one per 128-bit lane), xm5 = row 4, m6 = spel_h_shufA,
; m8/m9 = horizontal coefficient pairs, ym12-ym14 = vertical coefficient
; pairs, m10 = horizontal accumulator seed and m15 = clamp value.
; The prologue filters rows 0-4 horizontally (shufA feeds m8, shufB feeds m9)
; and packs the results into row pairs 01/12 (m1) and 23/34 (m2) via vpermb
; with spel_shuf4a. Each loop iteration filters two new rows, merges them
; with the previous pair through vpermt2b (spel_shuf4b in ym5) and emits two
; output rows. ym0 first holds the spel_shuf4a indices and is then reused as
; the vertical accumulator inside the loop.
1944*c0909341SAndroid Build Coastguard Worker.hv_w4:
1945*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [spel_h_shufB]
1946*c0909341SAndroid Build Coastguard Worker    mova                ym0, [spel_shuf4a]
1947*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m6
1948*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10 ; rows 0-3 accumulator, seeded from m10
1949*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m8, m1
1950*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm5, xm6
1951*c0909341SAndroid Build Coastguard Worker    mova                xm3, xm10 ; row 4 accumulator, seeded from m10
1952*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm3, xm8, xm1
1953*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7
1954*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm7
1955*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m4    ; 0 1 2 3
1956*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm3, xm9, xm5  ; 4
1957*c0909341SAndroid Build Coastguard Worker    mova                ym5, [spel_shuf4b]
1958*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m0, m2    ; 01 12
1959*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m3, q1032 ; 2 3 4
1960*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m0, m2    ; 23 34
1961*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1962*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
1963*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1964*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, [srcq+ssq*0], 1
1965*c0909341SAndroid Build Coastguard Worker    pmaddwd             ym0, ym12, ym1 ; a0 b0
1966*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym2 ; slide the window: 23/34 becomes next iteration's first pair
1967*c0909341SAndroid Build Coastguard Worker    pshufb              ym4, ym3, ym6
1968*c0909341SAndroid Build Coastguard Worker    mova                ym2, ym10 ; horizontal accumulator for the two new rows
1969*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym2, ym8, ym4
1970*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym7
1971*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym0, ym13, ym1 ; a1 b1
1972*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym2, ym9, ym3  ; 5 6
1973*c0909341SAndroid Build Coastguard Worker    vpermt2b            ym2, ym5, ym1  ; 45 56
1974*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym0, ym14, ym2 ; a2 b2
1975*c0909341SAndroid Build Coastguard Worker    psrad               ym0, 10
1976*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm4, ym0, 1
1977*c0909341SAndroid Build Coastguard Worker    packusdw            xm0, xm4 ; low qword = row a, high qword = row b
1978*c0909341SAndroid Build Coastguard Worker    pminsw             xmm0, xm0, xm15 ; clamp to the value broadcast from r8m
1979*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
1980*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
1981*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1982*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1983*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1984*c0909341SAndroid Build Coastguard Worker    RET
1985*c0909341SAndroid Build Coastguard Worker.hv_w8:
1986*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1987*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
1988*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1989*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1990*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1991*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1992*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
1993*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1994*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
1995*c0909341SAndroid Build Coastguard Worker    neg                  r6
1996*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
1997*c0909341SAndroid Build Coastguard Worker    jnz .hv_w8_12bit
1998*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pd_2176]
1999*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 6
2000*c0909341SAndroid Build Coastguard Worker    jmp .hv_w8_main
2001*c0909341SAndroid Build Coastguard Worker.hv_w8_12bit:
2002*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pd_640]
2003*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 4
2004*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2
2005*c0909341SAndroid Build Coastguard Worker.hv_w8_main:
2006*c0909341SAndroid Build Coastguard Worker    mova           [buf+ 0], xmm0
2007*c0909341SAndroid Build Coastguard Worker    mova           [buf+16], xmm1
2008*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, xmm0
2009*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [buf+ 4]
2010*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [buf+ 8]
2011*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm1
2012*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [buf+20]
2013*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [buf+24]
2014*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
2015*c0909341SAndroid Build Coastguard Worker    jge .hv_w16
2016*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_h_shufA]
2017*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+r6 *2]
2018*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, [srcq+r6 *1], 1 ; 0 1
2019*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+ssq*0]
2020*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, [srcq+ssq*1], 1 ; 2 3
2021*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2022*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*0]    ; 4
2023*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_h_shufC]
2024*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m16
2025*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
2026*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m6, m17
2027*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m3   ; a0 b0
2028*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
2029*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m6, m18
2030*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m4   ; c0 d0
2031*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2032*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m7, m16
2033*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m9, m5   ; e0
2034*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m7, m17
2035*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m16 ; a2 b2
2036*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m7, m18
2037*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m11, m17 ; c2 d2
2038*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m16, 0x55
2039*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m18 ; e2
2040*c0909341SAndroid Build Coastguard Worker    mova                m16, [spel_shuf8a]
2041*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m17, 0x55
2042*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m3  ; a1 b1
2043*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m18, 0x55
2044*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m10, m4  ; c1 d1
2045*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m5  ; e1
2046*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_shuf8b]
2047*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m16, m2  ; 01 12
2048*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m16, m0  ; 23 34
2049*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
2050*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*1]
2051*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2052*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+ssq*0], 1
2053*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2054*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m6, m18
2055*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m9, m17  ; f0 g0
2056*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m7, m18
2057*c0909341SAndroid Build Coastguard Worker    pmaddwd             m16, m12, m1  ; A0 B0
2058*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m18 ; f2 g2
2059*c0909341SAndroid Build Coastguard Worker    shufpd              m17, m18, 0x55
2060*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2061*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m13, m2  ; A1 B1
2062*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m17 ; f1 g1
2063*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m5, m0   ; 45 56
2064*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m14, m2  ; A2 B2
2065*c0909341SAndroid Build Coastguard Worker    psrad               m16, 10
2066*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym17, m16, 1
2067*c0909341SAndroid Build Coastguard Worker    packusdw           ym16, ym17
2068*c0909341SAndroid Build Coastguard Worker    pminsw             ym16, ym15
2069*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm16
2070*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], ym16, 1
2071*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2072*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2073*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
2074*c0909341SAndroid Build Coastguard Worker    vzeroupper
2075*c0909341SAndroid Build Coastguard Worker    RET
2076*c0909341SAndroid Build Coastguard Worker.hv_w16:
2077*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [spel_h_shufA]
2078*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m21, [spel_h_shufB]
2079*c0909341SAndroid Build Coastguard Worker    jg .hv_w32
2080*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
2081*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
2082*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
2083*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+r6 *1+ 0]
2084*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+r6 *1+12]
2085*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, [srcq+ssq*0+ 0], 1
2086*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, [srcq+ssq*0+12], 1 ; 1 2
2087*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*1+ 0]
2088*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*1+12]
2089*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2090*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
2091*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 3 4
2092*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m20
2093*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
2094*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m16, m20
2095*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m2    ; a2
2096*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
2097*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m17, m21
2098*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m3     ; b0  c0
2099*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
2100*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m18, m20
2101*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m11, m4    ; b2' c2'
2102*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8
2103*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m19, m21
2104*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m9, m5     ; d0  e0
2105*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
2106*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6, m20
2107*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m11, m7    ; d2' e2'
2108*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_shuf16]
2109*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m21
2110*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m0     ; a0
2111*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m20
2112*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m10, m16   ; b1  c1
2113*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m21
2114*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m10, m17   ; b1' c1'
2115*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m20
2116*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m10, m18   ; d1  e1
2117*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m21
2118*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m10, m19   ; d1' e1'
2119*c0909341SAndroid Build Coastguard Worker    shufpd              m16, m17, 0x55
2120*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m6    ; a1
2121*c0909341SAndroid Build Coastguard Worker    shufpd              m18, m19, 0x55
2122*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m11, m16   ; b2  c2
2123*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m9, m16    ; b0' c0'
2124*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m11, m18   ; d2  e2
2125*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m9, m18    ; d0' e0'
2126*c0909341SAndroid Build Coastguard Worker    pslldq               m1, 1
2127*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m7, m3     ; 12
2128*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m7, m5     ; 34
2129*c0909341SAndroid Build Coastguard Worker    vpshrdd              m1, m2, 16     ; 01
2130*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 23
2131*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
2132*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*1+ 0]
2133*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*1+12]
2134*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2135*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
2136*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*0+12], 1
2137*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
2138*c0909341SAndroid Build Coastguard Worker    mova                 m6, m8
2139*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m18, m20
2140*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m9, m17    ; f0  g0
2141*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m19, m21
2142*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m11, m16   ; f2' g2'
2143*c0909341SAndroid Build Coastguard Worker    pmaddwd             m17, m12, m2    ; B0
2144*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2145*c0909341SAndroid Build Coastguard Worker    pmaddwd             m16, m12, m1    ; A0
2146*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2147*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m21
2148*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m10, m18   ; f1  g1
2149*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m20
2150*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m10, m19   ; f1' g1'
2151*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m13, m4    ; B1
2152*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m13, m3    ; A1
2153*c0909341SAndroid Build Coastguard Worker    shufpd              m18, m19, 0x55
2154*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m11, m18   ; f2  g2
2155*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m9, m18    ; f0' g0'
2156*c0909341SAndroid Build Coastguard Worker    mova                 m4, m7
2157*c0909341SAndroid Build Coastguard Worker    vpermi2b             m4, m5, m6     ; 56
2158*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 45
2159*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m14, m4    ; B2
2160*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m14, m3    ; A2
2161*c0909341SAndroid Build Coastguard Worker    psrad               m16, 10
2162*c0909341SAndroid Build Coastguard Worker    psrad               m17, 10
2163*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m16, m17, q3232
2164*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, ym17, 1
2165*c0909341SAndroid Build Coastguard Worker    packusdw            m16, m18
2166*c0909341SAndroid Build Coastguard Worker    pminsw              m16, m15
2167*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym16
2168*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m16, 1
2169*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2170*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2171*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
2172*c0909341SAndroid Build Coastguard Worker    vzeroupper
2173*c0909341SAndroid Build Coastguard Worker    RET
2174*c0909341SAndroid Build Coastguard Worker.hv_w32:
2175*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      28
2176*c0909341SAndroid Build Coastguard Worker    mova                m27, [spel_shuf32]
2177*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq*8-256]
2178*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0:
2179*c0909341SAndroid Build Coastguard Worker    movu                m16, [srcq+r6 *2+ 0]
2180*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+r6 *2+12]
2181*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r6 *1+ 0]
2182*c0909341SAndroid Build Coastguard Worker    movu                m18, [srcq+r6 *1+12]
2183*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2]
2184*c0909341SAndroid Build Coastguard Worker    movu                m17, [srcq+ssq*0+ 0]
2185*c0909341SAndroid Build Coastguard Worker    movu                m19, [srcq+ssq*0+12]
2186*c0909341SAndroid Build Coastguard Worker    movu                m22, [srcq+ssq*1+ 0]
2187*c0909341SAndroid Build Coastguard Worker    movu                m24, [srcq+ssq*1+12]
2188*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
2189*c0909341SAndroid Build Coastguard Worker    movu                m23, [r7  +ssq*0+ 0]
2190*c0909341SAndroid Build Coastguard Worker    movu                m25, [r7  +ssq*0+12]
2191*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m16, m20
2192*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2193*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7, m21
2194*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m9, m1     ; a0
2195*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
2196*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6, m20
2197*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m2    ; a2'
2198*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
2199*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m17, m20
2200*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m4     ; b0
2201*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8
2202*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m18, m21
2203*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m9, m3     ; c0
2204*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
2205*c0909341SAndroid Build Coastguard Worker    pshufb              m26, m19, m21
2206*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m11, m5    ; b2'
2207*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
2208*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m21
2209*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m11, m26   ; c2'
2210*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m20
2211*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m16   ; a1
2212*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m21
2213*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m7    ; a1'
2214*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m21
2215*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m10, m6    ; b1
2216*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m20
2217*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m10, m17   ; c1
2218*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m20
2219*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m10, m18   ; b1'
2220*c0909341SAndroid Build Coastguard Worker    shufpd              m16, m7, 0x55
2221*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m10, m19   ; c1'
2222*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m18, 0x55
2223*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m16   ; a2
2224*c0909341SAndroid Build Coastguard Worker    shufpd              m17, m19, 0x55
2225*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m16    ; a0'
2226*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m22, m20
2227*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m11, m6    ; b2
2228*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m23, m20
2229*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m11, m17   ; c2
2230*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m9, m6     ; b0'
2231*c0909341SAndroid Build Coastguard Worker    mova                 m6, m8
2232*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m9, m17    ; c0'
2233*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m24, m21
2234*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m9, m16    ; d0
2235*c0909341SAndroid Build Coastguard Worker    mova                m16, m8
2236*c0909341SAndroid Build Coastguard Worker    pshufb              m26, m25, m21
2237*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m9, m7     ; e0
2238*c0909341SAndroid Build Coastguard Worker    mova                 m7, m8
2239*c0909341SAndroid Build Coastguard Worker    pshufb              m22, m21
2240*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m11, m17   ; d2'
2241*c0909341SAndroid Build Coastguard Worker    mova                m17, m8
2242*c0909341SAndroid Build Coastguard Worker    pshufb              m23, m21
2243*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m11, m26   ; e2'
2244*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m20
2245*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m10, m22   ; d1
2246*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m20
2247*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m10, m23   ; e1
2248*c0909341SAndroid Build Coastguard Worker    shufpd              m22, m24, 0x55
2249*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m10, m24   ; d1'
2250*c0909341SAndroid Build Coastguard Worker    shufpd              m23, m25, 0x55
2251*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m10, m25   ; e1'
2252*c0909341SAndroid Build Coastguard Worker    pslldq               m0, 1
2253*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m11, m22   ; d2
2254*c0909341SAndroid Build Coastguard Worker    pslldq               m1, 1
2255*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m11, m23   ; e2
2256*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m27, m4    ; 12
2257*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m9, m22    ; d0'
2258*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m27, m5    ; 12'
2259*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m9, m23    ; e0'
2260*c0909341SAndroid Build Coastguard Worker    vpshrdd              m0, m2, 16     ; 01
2261*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m27, m16   ; 34
2262*c0909341SAndroid Build Coastguard Worker    vpshrdd              m1, m3, 16     ; 01'
2263*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m27, m17   ; 34'
2264*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m2, m6, 16 ; 23
2265*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m3, m7, 16 ; 23'
2266*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
2267*c0909341SAndroid Build Coastguard Worker    movu                m22, [r7+ssq*1+ 0]
2268*c0909341SAndroid Build Coastguard Worker    movu                m24, [r7+ssq*1+12]
2269*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
2270*c0909341SAndroid Build Coastguard Worker    movu                m23, [r7+ssq*0+ 0]
2271*c0909341SAndroid Build Coastguard Worker    movu                m25, [r7+ssq*0+12]
2272*c0909341SAndroid Build Coastguard Worker    pmaddwd             m17, m12, m2    ; B0
2273*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
2274*c0909341SAndroid Build Coastguard Worker    pmaddwd             m19, m12, m3    ; B0'
2275*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
2276*c0909341SAndroid Build Coastguard Worker    pmaddwd             m16, m12, m0    ; A0
2277*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
2278*c0909341SAndroid Build Coastguard Worker    pmaddwd             m18, m12, m1    ; A0'
2279*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5
2280*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m13, m6    ; B1
2281*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m13, m7    ; B1'
2282*c0909341SAndroid Build Coastguard Worker    mova                 m6, m8
2283*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m13, m4    ; A1
2284*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m22, m20
2285*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m13, m5    ; A1'
2286*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m23, m20
2287*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m9, m4     ; f0
2288*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8
2289*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m24, m21
2290*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m9, m7     ; g0
2291*c0909341SAndroid Build Coastguard Worker    mova                 m7, m8
2292*c0909341SAndroid Build Coastguard Worker    pshufb              m26, m25, m21
2293*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m11, m5    ; f2'
2294*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
2295*c0909341SAndroid Build Coastguard Worker    pshufb              m22, m21
2296*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m11, m26   ; g2'
2297*c0909341SAndroid Build Coastguard Worker    pshufb              m23, m21
2298*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m10, m22   ; f1
2299*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m20
2300*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m10, m23   ; g1
2301*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m20
2302*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m10, m24   ; f1'
2303*c0909341SAndroid Build Coastguard Worker    shufpd              m22, m24, 0x55
2304*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m10, m25   ; g1'
2305*c0909341SAndroid Build Coastguard Worker    shufpd              m23, m25, 0x55
2306*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m11, m22   ; f2
2307*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m11, m23   ; g2
2308*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m9, m22    ; f0'
2309*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m9, m23    ; g0'
2310*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m27, m4    ; 56
2311*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m27, m5    ; 56'
2312*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m14, m6    ; B2
2313*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m2, m6, 16 ; 45
2314*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m14, m7    ; B2'
2315*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m3, m7, 16 ; 45'
2316*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m14, m4    ; A2
2317*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m14, m5    ; A2'
2318*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 10}, m17, m19, m16, m18
2319*c0909341SAndroid Build Coastguard Worker    packusdw            m17, m19
2320*c0909341SAndroid Build Coastguard Worker    packusdw            m16, m18
2321*c0909341SAndroid Build Coastguard Worker    pminsw              m17, m15
2322*c0909341SAndroid Build Coastguard Worker    pminsw              m16, m15
2323*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], m16
2324*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*1], m17
2325*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
2326*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2327*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
2328*c0909341SAndroid Build Coastguard Worker    add                srcq, 64
2329*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
2330*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2331*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2332*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop0
2333*c0909341SAndroid Build Coastguard Worker    RET
2334*c0909341SAndroid Build Coastguard Worker
; Filter-type variants of put_8tap_16bpc in which at least one of the two
; filter-type arguments is SHARP.
; The first four invocations name put_8tap_16bpc as their fourth argument;
; the last one (sharp/sharp) omits it and is placed directly in front of the
; `cglobal put_8tap_16bpc` definition that follows.
; NOTE(review): the PUT_8TAP_FN macro body is not visible in this part of the
; file. Presumably each invocation emits a per-variant entry point that loads
; the filter-type selectors into t0d/t1d (both are added to mxd/myd at the top
; of put_8tap_16bpc) and then jumps to the fourth argument, with the final
; variant falling through instead of jumping -- confirm against the macro
; definition before reordering or inserting anything between these lines and
; the cglobal.
2335*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
2336*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
2337*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
2338*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
2339*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp,          SHARP,   SHARP
2340*c0909341SAndroid Build Coastguard Worker
; put_8tap_16bpc -- shared entry point and vertical-only (.v) setup.
; Declared arguments: dst, ds, src, ss, w, h, mx, my. r8m is also read below
; and is later used as the upper clamp for the output
; (NOTE(review): presumably bitdepth_max -- confirm against the C prototype).
; t0d/t1d are consumed here but not assigned anywhere in this part of the
; file (presumably set by the PUT_8TAP_FN stubs -- TODO confirm).
; Dispatch: (mx & 0xf00) != 0 -> .h; else (my & 0xf00) != 0 -> .v (fall
; through); otherwise -> the .put label of the 6-tap function.
2341*c0909341SAndroid Build Coastguard Workercglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
2342*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101 ; replicate mx into bytes 0-2
2343*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2344*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101 ; replicate my into bytes 0-2
2345*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
2346*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx512icl] ; base for table lookups and the jump below
2347*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
2348*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2349*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00 ; bits 8-11 = low nibble of the middle ("mx") byte
2350*c0909341SAndroid Build Coastguard Worker    jnz .h
2351*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; same check for my
2352*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put
; Vertical-only path: choose the filter row, broadcast its four coefficient
; pairs into m12-m15, rewind src by three rows and jump to the width handler.
2353*c0909341SAndroid Build Coastguard Worker.v:
2354*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; low byte of my (the "4tap_v" byte)
2355*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; byte 2 of my (the "8tap_v" byte)
2356*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2357*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; hd - 6 negative: use the low-byte index instead
2358*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_32] ; initial accumulator value (rounding for the >> 6)
2359*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+myq*8] ; 8 signed byte taps -> words
2360*c0909341SAndroid Build Coastguard Worker    tzcnt               r7d, wd ; table index derived from w
2361*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, r8m ; per-word upper clamp applied with pminsw
2362*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2363*c0909341SAndroid Build Coastguard Worker    movzx               r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
2364*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; src -= 3 * stride
; The taps are spilled to the stack so that dwords 1-3 (tap pairs) can be
; broadcast from memory; dword 0 is broadcast straight from xmm0.
2365*c0909341SAndroid Build Coastguard Worker    mova [rsp+stack_offset+8], xmm0
2366*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0
2367*c0909341SAndroid Build Coastguard Worker    add                  r7, r8 ; table entry is an offset relative to r8
2368*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [rsp+stack_offset+12]
2369*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [rsp+stack_offset+16]
2370*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+stack_offset+20]
2371*c0909341SAndroid Build Coastguard Worker    jmp                  r7
; Vertical 8-tap filter, 4 bytes (two 16-bit pixels) per row.
; On entry (set up at .v): m10 = rounding constant, m11 = upper clamp,
; m12-m15 = the four tap pairs, r6 = 3 * ss, src already moved up 3 rows.
; The preamble loads rows 0-6 and interleaves neighbouring rows word-wise:
;   xmm1 = 01 12, xmm2 = 23 34, xmm3 = 45 56, xmm0 = row 6 (broadcast).
; Each loop iteration loads two more rows and writes two output rows.
2372*c0909341SAndroid Build Coastguard Worker.v_w2:
2373*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+ssq*0]
2374*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*1], 1
2375*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*2], 2
2376*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2377*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
2378*c0909341SAndroid Build Coastguard Worker    movd               xmm3, [srcq+ssq*1]
2379*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm1, [srcq+ssq*2]
2380*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2381*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
2382*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm3, xmm1, 0x02       ; 4 5
2383*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm1, xmm0, 0x02       ; 5 6
2384*c0909341SAndroid Build Coastguard Worker    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
2385*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm3, xmm1             ; 45 56
2386*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm1, xmm2, xmm4       ; 01 12
2387*c0909341SAndroid Build Coastguard Worker    punpckhwd          xmm2, xmm4             ; 23 34
; Loop: xmm5 accumulates both output rows (a = first, b = second); the
; interleaved row pairs slide down by two rows per iteration.
2388*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
2389*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm4, [srcq+ssq*1]
2390*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2391*c0909341SAndroid Build Coastguard Worker    mova               xmm5, xm10             ; start from the rounding constant
2392*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xm12, xmm1       ; a0 b0
2393*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2
2394*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xm13, xmm2       ; a1 b1
2395*c0909341SAndroid Build Coastguard Worker    mova               xmm2, xmm3
2396*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xm14, xmm3       ; a2 b2
2397*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
2398*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
2399*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm4, xmm0, 0x02       ; 7 8
2400*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm3, xmm4             ; 67 78
2401*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xm15, xmm3       ; a3 b3
2402*c0909341SAndroid Build Coastguard Worker    psrad              xmm5, 6
2403*c0909341SAndroid Build Coastguard Worker    packusdw           xmm5, xmm5             ; dwords -> words, unsigned saturation
2404*c0909341SAndroid Build Coastguard Worker    pminsw             xmm5, xm11             ; clamp to the upper bound
2405*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm5
2406*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm5, 1
2407*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2408*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2409*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
2410*c0909341SAndroid Build Coastguard Worker    RET
; Vertical 8-tap filter, 8 bytes (four 16-bit pixels) per row.
; Same scheme as .v_w2 but with one row per 128-bit lane of a ymm register:
; vpblendd 0x30 takes dwords 4-5 from the second source, which places the
; next row in the upper lane before the word interleave.
;   ymm1 = 01 12, ymm2 = 23 34, ymm3 = 45 56, ymm0 = row 6 (broadcast).
; Uses m10 (rounding), m11 (clamp) and m12-m15 (tap pairs) from .v.
2411*c0909341SAndroid Build Coastguard Worker.v_w4:
2412*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*0]
2413*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*1]
2414*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm2, [srcq+ssq*2]
2415*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2416*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*0]
2417*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+ssq*1]
2418*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm5, [srcq+ssq*2]
2419*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2420*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm0, 0x30
2421*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm0, ymm2, 0x30
2422*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm1, ymm0       ; 01 12
2423*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
2424*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm4, 0x30
2425*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm4, ymm3, 0x30
2426*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm2, ymm4       ; 23 34
2427*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm5, 0x30
2428*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm5, ymm0, 0x30
2429*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm3, ymm5       ; 45 56
; Loop: two output rows per iteration (a in the low lane, b in the high lane).
2430*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2431*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm5, [srcq+ssq*1]
2432*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2433*c0909341SAndroid Build Coastguard Worker    mova               ymm4, ym10       ; start from the rounding constant
2434*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ym12, ymm1 ; a0 b0
2435*c0909341SAndroid Build Coastguard Worker    mova               ymm1, ymm2
2436*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ym13, ymm2 ; a1 b1
2437*c0909341SAndroid Build Coastguard Worker    mova               ymm2, ymm3
2438*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ym14, ymm3 ; a2 b2
2439*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, ymm5, 0x30
2440*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
2441*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm5, ymm0, 0x30
2442*c0909341SAndroid Build Coastguard Worker    punpcklwd          ymm3, ymm5       ; 67 78
2443*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ymm4, ym15, ymm3 ; a3 b3
2444*c0909341SAndroid Build Coastguard Worker    psrad              ymm4, 6
2445*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm5, ymm4, 1
2446*c0909341SAndroid Build Coastguard Worker    packusdw           xmm4, xmm5       ; row a in the low qword, row b in the high qword
2447*c0909341SAndroid Build Coastguard Worker    pminsw             xmm4, xm11       ; clamp to the upper bound
2448*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm4
2449*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm4
2450*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2451*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2452*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
; NOTE(review): explicit vzeroupper, presumably because this path writes the
; physical ymm0-ymm5 directly rather than through the mN aliases -- confirm
; against x86inc's RET/vzeroupper handling for AVX-512 functions.
2453*c0909341SAndroid Build Coastguard Worker    vzeroupper
2454*c0909341SAndroid Build Coastguard Worker    RET
; Vertical 8-tap filter, 16 bytes (eight 16-bit pixels) per row.
; Three consecutive rows are gathered into the 128-bit lanes of one zmm
; register and vpermb with the spel_v_shuf8 table (m6) turns them into the
; interleaved row pairs named in the per-line comments:
;   m1 = 01 12, m2 = 23 34, m3 = 45 56, m0 = rows 4 5 6 (raw).
; Uses m10 (rounding), m11 (clamp) and m12-m15 (tap pairs) from .v.
2455*c0909341SAndroid Build Coastguard Worker.v_w8:
2456*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ssq*2]
2457*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m2, [srcq+ssq*0], 0
2458*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+ssq*1], 1 ; 0 1 2
2459*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2460*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+ssq*0], 1
2461*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+ssq*1], 2 ; 2 3 4
2462*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_v_shuf8]
2463*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*1]
2464*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+ssq*2], 1
2465*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2466*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
2467*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m1          ; 01 12
2468*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m2          ; 23 34
2469*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m0          ; 45 56
; Loop: two new rows are appended to m0, the window is rotated by two lanes
; (vshufi32x4 q1032) and two output rows are produced per iteration.
2470*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
2471*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*1], 3
2472*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2473*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*0]
2474*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10             ; start from the rounding constant
2475*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m1         ; a0 b0
2476*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2477*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m5, q1032       ; 6 7 8
2478*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m2         ; a1 b1
2479*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2480*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m3         ; a2 b2
2481*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m0          ; 67 78
2482*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m15, m3         ; a3 b3
2483*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
2484*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym5, m4, 1
2485*c0909341SAndroid Build Coastguard Worker    packusdw            ym4, ym5             ; dwords -> words, unsigned saturation
2486*c0909341SAndroid Build Coastguard Worker    pminsw              ym4, ym11            ; clamp to the upper bound
2487*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm4
2488*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym4, 1
2489*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2490*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2491*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
2492*c0909341SAndroid Build Coastguard Worker    RET
; Vertical 8-tap filter, 32 bytes (sixteen 16-bit pixels) per row.
; Two rows are held per zmm register (one per 256-bit half); vpermb with
; spel_v_shuf16 (m8) produces the interleaved pairs 01, 12, 34, 56 and
; vpshrdd by 16 derives the in-between pairs (23, 45) from their neighbours.
; m6 accumulates output row a, m7 output row b.
; Uses m10 (rounding), m11 (clamp) and m12-m15 (tap pairs) from .v.
2493*c0909341SAndroid Build Coastguard Worker.v_w16:
2494*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+ssq*1]
2495*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m0, [srcq+ssq*2], 1
2496*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 0
2497*c0909341SAndroid Build Coastguard Worker    mova                 m8, [spel_v_shuf16]
2498*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2499*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*0]
2500*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*1], 1
2501*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+ssq*2]
2502*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2503*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+ssq*0], 1
2504*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m8, m1     ; 12
2505*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0     ; 01
2506*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, m3     ; 34
2507*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, m5     ; 56
2508*c0909341SAndroid Build Coastguard Worker    mova                 m9, [deint_q_shuf] ; qword order used by vpermq before the store
2509*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16 ; 23
2510*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m3, m5, 16 ; 45
; Loop: two output rows per iteration; the pair registers shift down by two
; rows while the dot products are being accumulated.
2511*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
2512*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
2513*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m1    ; b0
2514*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
2515*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m0    ; a0
2516*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2517*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m3    ; b1
2518*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
2519*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m2    ; a1
2520*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2521*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m5    ; b2
2522*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2523*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m4    ; a2
2524*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+ssq*1]
2525*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2526*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+ssq*0], 1
2527*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, m5     ; 78
2528*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m3, m5, 16 ; 67
2529*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m15, m5    ; b3
2530*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m15, m4    ; a3
2531*c0909341SAndroid Build Coastguard Worker    psrad                m7, 6
2532*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
2533*c0909341SAndroid Build Coastguard Worker    packusdw             m6, m7         ; dwords -> words, unsigned saturation
2534*c0909341SAndroid Build Coastguard Worker    pminsw               m6, m11        ; clamp to the upper bound
2535*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m9, m6     ; reorder qwords so each 256-bit half is one row
2536*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym6
2537*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m6, 1
2538*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2539*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2540*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
2541*c0909341SAndroid Build Coastguard Worker    RET
; Vertical 8-tap filter for widths 32, 64 and 128, processed as 64-byte
; (32-pixel) wide column strips.
; wd is repacked as h + (w*8 - 256): the low byte keeps the row count so it
; can be restored per strip (movzx hd, wb), and the bits above it count the
; strips still to do after the current one (decremented by 1<<8 per strip).
; r7 / r8 are the per-strip source / destination row pointers; srcq / dstq
; advance by 64 bytes between strips.
; Interleaved row pairs are split into low-word (m0-m5) and high-word
; (m16-m21) halves; m22 holds the most recent raw row.
; Uses m10 (rounding), m11 (clamp) and m12-m15 (tap pairs) from .v.
2542*c0909341SAndroid Build Coastguard Worker.v_w32:
2543*c0909341SAndroid Build Coastguard Worker.v_w64:
2544*c0909341SAndroid Build Coastguard Worker.v_w128:
2545*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      23
2546*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq*8-256]
; Outer loop: one 64-byte column strip; load rows 0-6 and build the pairs.
2547*c0909341SAndroid Build Coastguard Worker.v_w32_loop0:
2548*c0909341SAndroid Build Coastguard Worker    movu                m16, [srcq+ssq*0]
2549*c0909341SAndroid Build Coastguard Worker    movu                m17, [srcq+ssq*1]
2550*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+r6   ]
2551*c0909341SAndroid Build Coastguard Worker    movu                m18, [srcq+ssq*2]
2552*c0909341SAndroid Build Coastguard Worker    movu                m19, [r7  +ssq*0]
2553*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
2554*c0909341SAndroid Build Coastguard Worker    movu                m20, [r7  +ssq*1]
2555*c0909341SAndroid Build Coastguard Worker    movu                m21, [r7  +ssq*2]
2556*c0909341SAndroid Build Coastguard Worker    add                  r7, r6
2557*c0909341SAndroid Build Coastguard Worker    movu                m22, [r7  +ssq*0]
2558*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m16, m17 ; 01l
2559*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m17      ; 01h
2560*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m17, m18 ; 12l
2561*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m18      ; 12h
2562*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m18, m19 ; 23l
2563*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m19      ; 23h
2564*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m19, m20 ; 34l
2565*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m20      ; 34h
2566*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m20, m21 ; 45l
2567*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m21      ; 45h
2568*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m21, m22 ; 56l
2569*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m22      ; 56h
; Inner loop: two output rows per iteration.
; m6/m8 = row a (low/high halves), m7/m9 = row b (low/high halves).
2570*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
2571*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
2572*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m0  ; a0l
2573*c0909341SAndroid Build Coastguard Worker    mova                 m8, m10
2574*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m12, m16 ; a0h
2575*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
2576*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m1  ; b0l
2577*c0909341SAndroid Build Coastguard Worker    mova                 m9, m10
2578*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m12, m17 ; b0h
2579*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
2580*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m2  ; a1l
2581*c0909341SAndroid Build Coastguard Worker    mova                m16, m18
2582*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m13, m18 ; a1h
2583*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2584*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m3  ; b1l
2585*c0909341SAndroid Build Coastguard Worker    mova                m17, m19
2586*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m13, m19 ; b1h
2587*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2588*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m4  ; a2l
2589*c0909341SAndroid Build Coastguard Worker    mova                m18, m20
2590*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m14, m20 ; a2h
2591*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2592*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m5  ; b2l
2593*c0909341SAndroid Build Coastguard Worker    mova                m19, m21
2594*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m14, m21 ; b2h
2595*c0909341SAndroid Build Coastguard Worker    movu                m21, [r7+ssq*1]
2596*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
2597*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m22, m21 ; 67l
2598*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m22, m21 ; 67h
2599*c0909341SAndroid Build Coastguard Worker    movu                m22, [r7+ssq*0]
2600*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m15, m4  ; a3l
2601*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m15, m20 ; a3h
2602*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m21, m22 ; 78l
2603*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m22      ; 78h
2604*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m15, m5  ; b3l
2605*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m15, m21 ; b3h
2606*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 6}, m6, m8, m7, m9
2607*c0909341SAndroid Build Coastguard Worker    packusdw             m6, m8       ; recombine low/high halves of row a
2608*c0909341SAndroid Build Coastguard Worker    packusdw             m7, m9       ; recombine low/high halves of row b
2609*c0909341SAndroid Build Coastguard Worker    pminsw               m6, m11
2610*c0909341SAndroid Build Coastguard Worker    pminsw               m7, m11
2611*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], m6
2612*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*1], m7
2613*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
2614*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2615*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
; Advance to the next 64-byte strip, restore h and count the strip off.
2616*c0909341SAndroid Build Coastguard Worker    add                srcq, 64
2617*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
2618*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2619*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2620*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop0
2621*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal filter, two 16-bit pixels per row, two rows per iteration.
; Reached only through the "jl .h_w2" in .h_w4, so on entry xmm0 holds the
; sign-extended taps of the filter row selected there, m8 the rounding
; constant and m15 the upper clamp (both loaded at .h).
; Only tap pairs 1 and 2 (the middle four taps) are used, which is why src
; is moved back by a single pixel (2 bytes).
2622*c0909341SAndroid Build Coastguard Worker.h_w2:
2623*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2624*c0909341SAndroid Build Coastguard Worker    mova                ym2, [spel_h_shuf2a]
2625*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2626*c0909341SAndroid Build Coastguard Worker    pshufd             xmm3, xmm0, q1111 ; tap pair 1 in every dword
2627*c0909341SAndroid Build Coastguard Worker    pshufd             xmm4, xmm0, q2222 ; tap pair 2 in every dword
2628*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
2629*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0]
2630*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [srcq+ssq*1], 1
2631*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2632*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xm8         ; start from the rounding constant
2633*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym2, ym1    ; arrange both rows per spel_h_shuf2a
2634*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xmm3, xm1
2635*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym1, 1
2636*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xmm4, xm1
2637*c0909341SAndroid Build Coastguard Worker    psrad              xmm0, 6
2638*c0909341SAndroid Build Coastguard Worker    packusdw           xmm0, xmm0        ; dwords -> words, unsigned saturation
2639*c0909341SAndroid Build Coastguard Worker    pminsw             xmm0, xm15        ; clamp to the upper bound
2640*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
2641*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
2642*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2643*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2644*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop
2645*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal filter for w <= 4 (entered from .h via "cmp wd, 4 / jle").
; The filter row is selected by the low byte of mx (the "4tap_h" byte) and
; only tap pairs 1 and 2 are applied, with src moved back one pixel.
; The flags from "cmp wd, 4" are still live here: movzx and pmovsxbw do not
; modify them, so "jl .h_w2" diverts w < 4 with xmm0 already loaded.
; Uses m8 (rounding constant) and m15 (upper clamp) loaded at .h.
2646*c0909341SAndroid Build Coastguard Worker.h_w4:
2647*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2648*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
2649*c0909341SAndroid Build Coastguard Worker    jl .h_w2
2650*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [spel_h_shufA]
2651*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym5, [spel_h_shufB]
2652*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2653*c0909341SAndroid Build Coastguard Worker    pshufd             xmm0, xmm0, q2211 ; dwords = pair1, pair1, pair2, pair2
2654*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym6, xmm0        ; tap pair 1 in every dword
2655*c0909341SAndroid Build Coastguard Worker    vpermq              ym7, ymm0, q1111 ; tap pair 2 in every dword
; Loop: one row per 128-bit lane, two rows per iteration.
2656*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
2657*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0]
2658*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+ssq*1], 1
2659*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2660*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym8         ; start from the rounding constant
2661*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym2, ym4
2662*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym0, ym6, ym1
2663*c0909341SAndroid Build Coastguard Worker    pshufb              ym2, ym5
2664*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym0, ym7, ym2
2665*c0909341SAndroid Build Coastguard Worker    psrad               ym0, 6
2666*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
2667*c0909341SAndroid Build Coastguard Worker    packusdw            xm0, xm1         ; row 0 in the low qword, row 1 in the high qword
2668*c0909341SAndroid Build Coastguard Worker    pminsw             xmm0, xm0, xm15   ; clamp; result lands in the physical xmm0
2669*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
2670*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
2671*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2672*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2673*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
2674*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal 8-tap filter for the width that reaches here from .h
; (4 < w < 16), eight 16-bit pixels per row, two rows per iteration.
; On entry (set up at .h): src already moved back 6 bytes (three pixels),
; m8 = rounding constant, m10-m13 = the four tap pairs, m15 = upper clamp.
; The four spel_h_shufA..D tables select, per output pixel, the source word
; pair that matches each tap pair.
2675*c0909341SAndroid Build Coastguard Worker.h_w8:
2676*c0909341SAndroid Build Coastguard Worker    mova                 m4, [spel_h_shufA]
2677*c0909341SAndroid Build Coastguard Worker    movu                 m5, [spel_h_shufB]
2678*c0909341SAndroid Build Coastguard Worker    movu                 m6, [spel_h_shufC]
2679*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_h_shufD]
2680*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
2681*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+ssq*0]
2682*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+ssq*1], 1 ; row 0 in the low half, row 1 in the high half
2683*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2684*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8              ; start from the rounding constant
2685*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, m2
2686*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m1         ; tap pair 0
2687*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m5, m2
2688*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m1         ; tap pair 1
2689*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m2
2690*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m1         ; tap pair 2
2691*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m7, m2
2692*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m1         ; tap pair 3
2693*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
2694*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m0, 1
2695*c0909341SAndroid Build Coastguard Worker    packusdw            ym0, ym1             ; dwords -> words, unsigned saturation
2696*c0909341SAndroid Build Coastguard Worker    pminsw              ym0, ym15            ; clamp to the upper bound
2697*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
2698*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
2699*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2700*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2701*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
2702*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal path setup (taken when (mx & 0xf00) != 0) and the w == 16 loop.
;   m15 = r8m broadcast per word, used as the upper clamp.
;   (my & 0xf00) != 0 -> .hv (2-D filter, outside this section).
;   m8  = put_8tap_h_rnd[r8m >> 11], the initial accumulator value
;         (NOTE(review): presumably one rounding constant per bit depth --
;         confirm against the table definition).
;   w <= 4 -> .h_w4 (which diverts w < 4 to .h_w2).
; For wider blocks the filter row comes from byte 2 of mx (the "8tap_h"
; byte), src is moved back 6 bytes (three pixels) and the four tap pairs are
; broadcast into m10-m13 via the scratch location "buf" (defined elsewhere).
; "sub wd, 16" then selects: w < 16 -> .h_w8, w > 16 -> .h_w32, and w == 16
; falls through. The vbroadcasti32x4 loads between the sub and the jg do not
; modify flags.
2703*c0909341SAndroid Build Coastguard Worker.h:
2704*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m
2705*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2706*c0909341SAndroid Build Coastguard Worker    jnz .hv
2707*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r8m
2708*c0909341SAndroid Build Coastguard Worker    shr                 r7d, 11
2709*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
2710*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2711*c0909341SAndroid Build Coastguard Worker    jle .h_w4
2712*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2713*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
2714*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
2715*c0909341SAndroid Build Coastguard Worker    mova              [buf], xmm0
2716*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, xmm0
2717*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [buf+ 4]
2718*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [buf+ 8]
2719*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [buf+12]
2720*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2721*c0909341SAndroid Build Coastguard Worker    jl .h_w8
2722*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufA]
2723*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [spel_h_shufB]
2724*c0909341SAndroid Build Coastguard Worker    jg .h_w32
; w == 16: two rows per iteration (row 0 in the low 256 bits, row 1 in the
; high 256 bits). m2 holds source bytes 0-31 and m3 bytes 16-47 of each row;
; shufpd 0x55 takes the high qword of m2 and the low qword of m3 in every
; 128-bit lane, giving the window that starts at byte 8. m0 and m1 are the
; two accumulators labelled a and b in the per-line comments.
2725*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
2726*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+ssq*0+ 0]
2727*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
2728*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*0+16]
2729*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*1+16], 1
2730*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2731*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2732*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
2733*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
2734*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m4 ; a0
2735*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m6
2736*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m4 ; b2
2737*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m7
2738*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m4 ; a1
2739*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m7
2740*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m4 ; b3
2741*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m3, 0x55
2742*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
2743*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m4 ; a2
2744*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m4 ; b0
2745*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
2746*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m2 ; a3
2747*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m2 ; b1
2748*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
2749*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
2750*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1      ; dwords -> words, unsigned saturation
2751*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m15     ; clamp to the upper bound
2752*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
2753*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
2754*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2755*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2756*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
2757*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal-only path that walks each row in chunks of 32 pixels
; (r6 advances by 32 and every address scales it by 2 bytes per pixel).
; srcq/dstq are moved to the end of the row and wq is negated so that r6 can
; count upwards from -w and the inner loop can terminate on the sign flag.
; m6/m7 (pshufb masks), m8 (accumulator seed), m10-m13 (vpdpwssd multipliers)
; and m15 (upper clamp) are set up before this label, outside this excerpt --
; presumably shuffle tables, rounding bias, filter tap pairs and pixel max;
; confirm against the code preceding this label.
; The existing aN/bN tags name the accumulator (a = m0, b = m1) and which of
; the four multipliers is applied (0..3 = m10..m13).
2758*c0909341SAndroid Build Coastguard Worker.h_w32:
2759*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2] ; srcq -> end of row (w * 2 bytes)
2760*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+wq*2] ; dstq -> end of row
2761*c0909341SAndroid Build Coastguard Worker    neg                  wq              ; wq = -w
2762*c0909341SAndroid Build Coastguard Worker.h_w32_loop0:
2763*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq          ; r6 = -w: restart column index for this row
2764*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
    ; three overlapping unaligned loads at byte offsets +0, +8 and +16
2765*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+ 0]
2766*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6*2+ 8]
2767*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8          ; seed accumulator a
2768*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8          ; seed accumulator b
2769*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
2770*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m10, m4 ; a0
2771*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m6      ; this shuffle result feeds both b0 and a2
2772*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m10, m4 ; b0
2773*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m4 ; a2
2774*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r6*2+16]
2775*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7          ; this shuffle result feeds both b1 and a3
2776*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m3 ; b1
2777*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m3 ; a3
2778*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4, m6
2779*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m3 ; b2
2780*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
2781*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m11, m2 ; a1
2782*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7
2783*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m4 ; b3
2784*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6           ; arithmetic >> 6 on the 32-bit sums
2785*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
2786*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1          ; dwords -> words, unsigned saturation
2787*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m15         ; clamp from above with m15
2788*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r6*2], m0          ; aligned full-vector store
2789*c0909341SAndroid Build Coastguard Worker    add                  r6, 32          ; next 32 pixels
2790*c0909341SAndroid Build Coastguard Worker    jl .h_w32_loop                       ; loop while r6 is still negative
2791*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq         ; next source row
2792*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq         ; next destination row
2793*c0909341SAndroid Build Coastguard Worker    dec                  hd
2794*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop0                      ; loop while rows remain
2795*c0909341SAndroid Build Coastguard Worker    RET
; .hv: entry for the combined horizontal+vertical path. Widths above 4 branch
; to .hv_w8; the code below serves widths <= 4 (w == 4 jumps to .hv_w4 after
; the common setup, anything narrower falls through to the w2 prologue).
; Setup performed here:
;   xmm0 = 8 horizontal taps, xmm1 = 8 vertical taps (bytes sign-extended to
;          words from subpel_filters); only the dwords at buf+4 and buf+8 of
;          the horizontal filter (taps 2-3 and 4-5) are broadcast and used.
;   r6   = 3 * source stride; srcq is moved back by 3 rows and 2 bytes.
;   m10  = accumulator seed for the horizontal pass, chosen per bit depth.
; NOTE(review): r8m is presumably the bitdepth_max argument, with bit 0x800
; distinguishing 12-bit from 10-bit input -- confirm against the prototype.
2796*c0909341SAndroid Build Coastguard Worker.hv:
2797*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2798*c0909341SAndroid Build Coastguard Worker    jg .hv_w8                            ; w > 4 is handled elsewhere
2799*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb         ; filter index = low byte of mx
2800*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8] ; 8 bytes per filter entry
2801*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb         ; candidate index: low byte of my
2802*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16          ; candidate index: my >> 16
2803*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2804*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd         ; h - 6 negative (h < 6): use the low-byte index
2805*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
2806*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]     ; r6 = 3 * source stride
2807*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2           ; back up 2 bytes horizontally
2808*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6          ; back up 3 rows vertically
2809*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
2810*c0909341SAndroid Build Coastguard Worker    jnz .hv_12bit
    ; bit 0x800 clear: only the horizontal taps are pre-scaled (<< 6)
2811*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_2176]
2812*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 6
2813*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
2814*c0909341SAndroid Build Coastguard Worker.hv_12bit:
    ; bit 0x800 set: horizontal taps << 4, vertical taps << 2, different seed
2815*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_640]
2816*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 4
2817*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2
2818*c0909341SAndroid Build Coastguard Worker.hv_main:
    ; spill both filters to buf so that individual tap pairs (one dword = two
    ; words) can be broadcast from memory
2819*c0909341SAndroid Build Coastguard Worker    mova           [buf+ 0], xmm0
2820*c0909341SAndroid Build Coastguard Worker    mova           [buf+16], xmm1
2821*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [buf+ 4]    ; horizontal taps 2-3
2822*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [buf+ 8]    ; horizontal taps 4-5
2823*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym11, xmm1        ; vertical taps 0-1
2824*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, [buf+20]    ; vertical taps 2-3
2825*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym13, [buf+24]    ; vertical taps 4-5
2826*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym14, [buf+28]    ; vertical taps 6-7
    ; gather 16 bytes from each of the first seven source rows, one row per
    ; 128-bit lane: m4 = rows 0-3, m0 = rows 4-6
2827*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]
2828*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, [srcq+ssq*1], 1
2829*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [srcq+ssq*2], 2
2830*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2831*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [srcq+ssq*0], 3 ; 0 1 2 3
2832*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*1]
2833*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+ssq*2], 1
2834*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2835*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
2836*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2837*c0909341SAndroid Build Coastguard Worker    je .hv_w4
    ; narrow-width prologue (falls through into .hv_w2_loop): horizontally
    ; filter rows 0-6 in one pass and arrange them as the row pairs that the
    ; vertical filter consumes (xm1 = 01 12, xm2 = 23 34, xm3 = 45 56)
2838*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [spel_h_shufA]
2839*c0909341SAndroid Build Coastguard Worker    mova                 m3, [spel_h_shuf2b]
2840*c0909341SAndroid Build Coastguard Worker    mova                ym6, [spel_h_shuf2a] ; permutation used by the loop on new rows
2841*c0909341SAndroid Build Coastguard Worker    mova                xm7, [spel_shuf2]    ; permutation used by the loop to build 67 78
2842*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10         ; seed the horizontal accumulator
2843*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2
2844*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2          ; last use of m2 as a mask; it is overwritten next
2845*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m4, m0
2846*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m8, m2    ; 04 15 26 3_
2847*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m0
2848*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m4
2849*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m3, m1    ; 01 12
2850*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym1, 1    ; 23 34
2851*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m1, 2     ; 45 56
; .hv_w2_loop: produces two output rows per iteration, 4 bytes stored per row.
; On entry xm1/xm2/xm3 hold the horizontally filtered row pairs 01 12, 23 34
; and 45 56 (per the tags on the code that precedes this label). Each pass:
;   - loads two new source rows and filters them horizontally into xm4
;     (seeded from xm10, multipliers xm8/xm9),
;   - accumulates the vertical filter into xmm0 using xm11-xm14,
;   - rotates the row-pair window: xm1 <- xm2, xm2 <- xm3, xm3 <- 67 78.
; The aN bN tags: a/b are the two output rows, N is the vertical multiplier
; index (0..3 = xm11..xm14).
; xm15 is the upper clamp, set up outside this excerpt (presumably pixel max).
2852*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
2853*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*1]
2854*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]  ; advance source by two rows
2855*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym5, [srcq+ssq*0], 1
2856*c0909341SAndroid Build Coastguard Worker    mova                xm4, xm10          ; seed the horizontal accumulator
2857*c0909341SAndroid Build Coastguard Worker    vpermb              ym5, ym6, ym5      ; byte permutation of the new rows by ym6
2858*c0909341SAndroid Build Coastguard Worker    pmaddwd            xmm0, xm11, xm1 ; a0 b0
2859*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm4, xm8, xm5
2860*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm5, ym5, 1
2861*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2           ; rotate window: 01 12 <- 23 34
2862*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xm12, xm2 ; a1 b1
2863*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm4, xm9, xm5  ; 7 8
2864*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3           ; rotate window: 23 34 <- 45 56
2865*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xm13, xm3 ; a2 b2
2866*c0909341SAndroid Build Coastguard Worker    vpermt2b            xm3, xm7, xm4  ; 67 78
2867*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm0, xm14, xm3 ; a3 b3
2868*c0909341SAndroid Build Coastguard Worker    psrad              xmm0, 10            ; arithmetic >> 10 on the 32-bit sums
2869*c0909341SAndroid Build Coastguard Worker    packusdw           xmm0, xmm0          ; dwords -> words, unsigned saturation
2870*c0909341SAndroid Build Coastguard Worker    pminsw             xmm0, xm15          ; clamp from above with xm15
2871*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0          ; first output row (dword 0)
2872*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1       ; second output row (dword 1)
2873*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2874*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2875*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop                         ; loop while rows remain
2876*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: horizontal+vertical path for w == 4.
; Expects, from the code preceding this label: m4 = source rows 0-3 and
; m0 = source rows 4-6 (one row per 128-bit lane), m8/m9 = horizontal
; multipliers, m10 = accumulator seed, ym11-ym14 = vertical multipliers.
; The prologue filters rows 0-6 horizontally and arranges them as row pairs
; (m1 = 01 12, m2 = 23 34, m3 = 45 56); the loop only reads their low 256 bits.
; xm15 is the upper clamp, set up outside this excerpt (presumably pixel max).
2877*c0909341SAndroid Build Coastguard Worker.hv_w4:
2878*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m19, [spel_h_shufA] ; pshufb mask, replicated to every lane
2879*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [spel_h_shufB] ; second pshufb mask
2880*c0909341SAndroid Build Coastguard Worker    mova                ym6, [spel_shuf4a]  ; permutation that forms the row pairs
2881*c0909341SAndroid Build Coastguard Worker    mova                ym7, [spel_shuf4b]  ; permutation used by the loop to build 67 78
2882*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10            ; accumulator for rows 0-3
2883*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10            ; accumulator for rows 4-6
2884*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m19
2885*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m8, m1
2886*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m19
2887*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m8, m1
2888*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m20
2889*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m4
2890*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m20
2891*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m9, m0
2892*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m2    ; 01 12
2893*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m3, q1032 ; lanes 2,3 of m2 followed by lanes 0,1 of m3
2894*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m3    ; 45 56
2895*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m2    ; 23 34
; Loop: two output rows per iteration, 8 bytes stored per row. ym16 is the
; vertical accumulator, ym3 the horizontal accumulator for the two new rows.
; The aN bN tags: a/b are the two output rows, N is the vertical multiplier
; index (0..3 = ym11..ym14).
2896*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2897*c0909341SAndroid Build Coastguard Worker    movu               xm18, [srcq+ssq*1]
2898*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]  ; advance source by two rows
2899*c0909341SAndroid Build Coastguard Worker    vinserti128        ym18, [srcq+ssq*0], 1
2900*c0909341SAndroid Build Coastguard Worker    pmaddwd            ym16, ym11, ym1 ; a0 b0
2901*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym2           ; rotate window: 01 12 <- 23 34
2902*c0909341SAndroid Build Coastguard Worker    mova                ym2, ym3           ; rotate window: 23 34 <- 45 56
2903*c0909341SAndroid Build Coastguard Worker    pshufb             ym17, ym18, ym19
2904*c0909341SAndroid Build Coastguard Worker    mova                ym3, ym10          ; seed the horizontal accumulator
2905*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym3, ym8, ym17
2906*c0909341SAndroid Build Coastguard Worker    pshufb             ym18, ym20
2907*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym16, ym12, ym1 ; a1 b1
2908*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym3, ym9, ym18 ; 7 8
2909*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym16, ym13, ym2 ; a2 b2
2910*c0909341SAndroid Build Coastguard Worker    vpermt2b            ym3, ym7, ym2  ; 67 78
2911*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym16, ym14, ym3 ; a3 b3
2912*c0909341SAndroid Build Coastguard Worker    psrad              ym16, 10            ; arithmetic >> 10 on the 32-bit sums
2913*c0909341SAndroid Build Coastguard Worker    vextracti128       xm17, ym16, 1
2914*c0909341SAndroid Build Coastguard Worker    packusdw           xm16, xm17          ; dwords -> words, unsigned saturation
2915*c0909341SAndroid Build Coastguard Worker    pminsw             xm16, xm15          ; clamp from above with xm15
2916*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm16          ; first output row (low 8 bytes)
2917*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm16          ; second output row (high 8 bytes)
2918*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2919*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2920*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop                         ; loop while rows remain
    ; NOTE(review): explicit vzeroupper, presumably needed because this path
    ; uses m16 and above -- confirm against the RET macro's own handling.
2921*c0909341SAndroid Build Coastguard Worker    vzeroupper
2922*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: setup for the horizontal+vertical path when w > 4. Widths above 8
; branch to .hv_w16 once the filters are loaded; the remainder of this block
; is the w == 8 prologue, which falls through into .hv_w8_loop.
; Setup performed here:
;   xmm0 = 8 horizontal taps (index = mx >> 16), xmm1 = 8 vertical taps
;          (index = my >> 16, or the low byte of my when h < 6).
;   r6   = 3 * source stride; srcq is moved back by 3 rows and 6 bytes.
;   m10  = accumulator seed for the horizontal pass, chosen per bit depth.
;   m11-m14 = horizontal tap pairs 0-1 .. 6-7, m16-m19 = vertical tap pairs.
; NOTE(review): r8m is presumably the bitdepth_max argument, with bit 0x800
; distinguishing 12-bit from 10-bit input -- confirm against the prototype.
2923*c0909341SAndroid Build Coastguard Worker.hv_w8:
2924*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16          ; filter index = mx >> 16
2925*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8] ; 8 bytes per filter entry
2926*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb         ; candidate index: low byte of my
2927*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16          ; candidate index: my >> 16
2928*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2929*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd         ; h - 6 negative (h < 6): use the low-byte index
2930*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
2931*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]     ; r6 = 3 * source stride
2932*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6           ; back up 6 bytes horizontally
2933*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6          ; back up 3 rows vertically
2934*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
2935*c0909341SAndroid Build Coastguard Worker    jnz .hv_w8_12bit
    ; bit 0x800 clear: only the horizontal taps are pre-scaled (<< 6)
2936*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_2176]
2937*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 6
2938*c0909341SAndroid Build Coastguard Worker    jmp .hv_w8_main
2939*c0909341SAndroid Build Coastguard Worker.hv_w8_12bit:
    ; bit 0x800 set: horizontal taps << 4, vertical taps << 2, different seed
2940*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_640]
2941*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, 4
2942*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2
2943*c0909341SAndroid Build Coastguard Worker.hv_w8_main:
    ; spill both filters to buf so that individual tap pairs (one dword = two
    ; words) can be broadcast from memory
2944*c0909341SAndroid Build Coastguard Worker    mova           [buf+ 0], xmm0
2945*c0909341SAndroid Build Coastguard Worker    mova           [buf+16], xmm1
2946*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, xmm0        ; horizontal taps 0-1
2947*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [buf+ 4]    ; horizontal taps 2-3
2948*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [buf+ 8]    ; horizontal taps 4-5
2949*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [buf+12]    ; horizontal taps 6-7
2950*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, xmm1        ; vertical taps 0-1
2951*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [buf+20]    ; vertical taps 2-3
2952*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [buf+24]    ; vertical taps 4-5
2953*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [buf+28]    ; vertical taps 6-7
2954*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2955*c0909341SAndroid Build Coastguard Worker    jg .hv_w16                           ; w > 8 is handled elsewhere
    ; w == 8 prologue: load 32 bytes from each of source rows 0-6, two rows
    ; per register (m0 = 0 1, m9 = 2 3, m20 = 4 5, ym21 = 6)
2956*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_shufA] ; vpermb index table for multiplier 0
2957*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0]
2958*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1 ; 0 1
2959*c0909341SAndroid Build Coastguard Worker    movu                ym9, [srcq+ssq*2]
2960*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2961*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, [srcq+ssq*0], 1 ; 2 3
2962*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+ssq*1]
2963*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, [srcq+ssq*2], 1 ; 4 5
2964*c0909341SAndroid Build Coastguard Worker    add srcq, r6
2965*c0909341SAndroid Build Coastguard Worker    movu               ym21, [srcq+ssq*0]    ; 6
2966*c0909341SAndroid Build Coastguard Worker    movu                 m6, [spel_h_shufB] ; vpermb index table for multiplier 1
2967*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_h_shufC] ; vpermb index table for multiplier 2
    ; horizontal pass: four accumulators (m1..m4, rows ab / cd / ef / g), each
    ; seeded from m10 and fed four permute + multiply-accumulate steps; m8 is
    ; the scratch register for the permuted source
2968*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m0
2969*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
2970*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m8  ; a0 b0
2971*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m9
2972*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
2973*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m11, m8  ; c0 d0
2974*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m20
2975*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
2976*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m11, m8  ; e0 f0
2977*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m21
2978*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
2979*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m11, m8  ; g0
2980*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m0
2981*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m8  ; a1 b1
2982*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m9
2983*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m8  ; c1 d1
2984*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m20
2985*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m8  ; e1 f1
2986*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m21
2987*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m8  ; g1
2988*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m0
2989*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m8  ; a2 b2
2990*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m9
2991*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m8  ; c2 d2
2992*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m20
2993*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m8  ; e2 f2
2994*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m21
2995*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m8  ; g2
    ; from here m8 holds the fourth index table; the source registers are
    ; permuted in place because they are not needed afterwards, and m0/m9
    ; are then reused for the row-pair permutation tables
2996*c0909341SAndroid Build Coastguard Worker    mova                 m8, [spel_h_shufD]
2997*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0
2998*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m0  ; a3 b3
2999*c0909341SAndroid Build Coastguard Worker    mova                 m0, [spel_shuf8a]
3000*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m8, m9
3001*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m9  ; c3 d3
3002*c0909341SAndroid Build Coastguard Worker    mova                 m9, [spel_shuf8b] ; used by the loop to build 67 78
3003*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m8, m20
3004*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m20 ; e3 f3
3005*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m8, m21
3006*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m21 ; g3
    ; arrange the filtered rows as the row pairs consumed by the vertical pass
3007*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m0, m2   ; 01 12
3008*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m0, m3   ; 23 34
3009*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m0, m4   ; 45 56
; .hv_w8_loop: produces two output rows per iteration, 16 bytes stored per row.
; On entry m1/m2/m3 hold the horizontally filtered row pairs 01 12, 23 34 and
; 45 56 (per the tags on the code that precedes this label); m5-m8 are the
; four vpermb index tables, m9 the permutation that builds the next row pair,
; m10 the accumulator seed, m11-m14 the horizontal and m16-m19 the vertical
; multipliers. Each pass:
;   - loads two new source rows and filters them horizontally into m4
;     (tags hN iN: the two new rows, N = horizontal multiplier index),
;   - accumulates the vertical filter into m20
;     (tags AN BN: the two output rows, N = vertical multiplier index),
;   - rotates the row-pair window: m1 <- m2, m2 <- m3, m3 <- 67 78.
; ym15 is the upper clamp, set up outside this excerpt (presumably pixel max).
3010*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3011*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*1]
3012*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance source by two rows
3013*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 1
3014*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10          ; seed the horizontal accumulator
3015*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m5, m0
3016*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m11, m21 ; h0 i0
3017*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m6, m0
3018*c0909341SAndroid Build Coastguard Worker    pmaddwd             m20, m16, m1  ; A0 B0
3019*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m21 ; h1 i1
3020*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m7, m0
3021*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2           ; rotate window: 01 12 <- 23 34
3022*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m17, m2  ; A1 B1
3023*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m21 ; h2 i2
3024*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m8, m0
3025*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3           ; rotate window: 23 34 <- 45 56
3026*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m18, m3  ; A2 B2
3027*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m21 ; h3 i3
3028*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m9, m4   ; 67 78
3029*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m19, m3  ; A3 B3
3030*c0909341SAndroid Build Coastguard Worker    psrad               m20, 10           ; arithmetic >> 10 on the 32-bit sums
3031*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym21, m20, 1
3032*c0909341SAndroid Build Coastguard Worker    packusdw           ym20, ym21         ; dwords -> words, unsigned saturation
3033*c0909341SAndroid Build Coastguard Worker    pminsw             ym20, ym15         ; clamp from above with ym15
3034*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm20       ; first output row (low 128 bits)
3035*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], ym20, 1    ; second output row (high 128 bits)
3036*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
3037*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3038*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop                        ; loop while rows remain
    ; NOTE(review): explicit vzeroupper, presumably needed because this path
    ; uses m16 and above -- confirm against the RET macro's own handling.
3039*c0909341SAndroid Build Coastguard Worker    vzeroupper
3040*c0909341SAndroid Build Coastguard Worker    RET
3041*c0909341SAndroid Build Coastguard Worker.hv_w16:
3042*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM 26
3043*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [spel_h_shufA]
3044*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m21, [spel_h_shufB]
3045*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
3046*c0909341SAndroid Build Coastguard Worker    mova                 m9, [spel_shuf16]
3047*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq*8-256]
3048*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
3049*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m5, [srcq+ssq*0+ 8]
3050*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, m5, [srcq+ssq*0+ 0], 0
3051*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+ssq*0+16], 1 ; 0
3052*c0909341SAndroid Build Coastguard Worker    movu                ym6, [srcq+ssq*1+ 0]
3053*c0909341SAndroid Build Coastguard Worker    movu                ym7, [srcq+ssq*1+16]
3054*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+r6]
3055*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [srcq+ssq*2+ 0], 1
3056*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, [srcq+ssq*2+16], 1 ; 1 2
3057*c0909341SAndroid Build Coastguard Worker    movu               ym22, [r7  +ssq*0+ 0]
3058*c0909341SAndroid Build Coastguard Worker    movu               ym23, [r7  +ssq*0+16]
3059*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
3060*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, [r7  +ssq*1+ 0], 1
3061*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m23, [r7  +ssq*1+16], 1 ; 3 4
3062*c0909341SAndroid Build Coastguard Worker    movu               ym24, [r7  +ssq*2+ 0]
3063*c0909341SAndroid Build Coastguard Worker    movu               ym25, [r7  +ssq*2+16]
3064*c0909341SAndroid Build Coastguard Worker    add                  r7, r6
3065*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, [r7  +ssq*0+ 0], 1
3066*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, [r7  +ssq*0+16], 1 ; 5 6
3067*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m20
3068*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3069*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m11, m0    ; a0
3070*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6, m20
3071*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
3072*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m11, m0    ; b0
3073*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7, m20
3074*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
3075*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m0    ; c2
3076*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m21
3077*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m0    ; a1
3078*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6, m21
3079*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m0    ; b1
3080*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7, m21
3081*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m0    ; c3
3082*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m20
3083*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m0    ; a2
3084*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m7, 0x55
3085*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6, m20
3086*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m7    ; b2
3087*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m11, m7    ; c0
3088*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m21
3089*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m5    ; a3
3090*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m21
3091*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m6    ; b3
3092*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m6    ; c1
3093*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m22, m20
3094*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
3095*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m11, m0    ; d0
3096*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m23, m20
3097*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3098*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m0    ; e2
3099*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m20
3100*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
3101*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m11, m0    ; f0
3102*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m20
3103*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
3104*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m0    ; g2
3105*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m22, m21
3106*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m0    ; d1
3107*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m23, m21
3108*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m0    ; e3
3109*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m21
3110*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m0    ; f1
3111*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m21
3112*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m0    ; g3
3113*c0909341SAndroid Build Coastguard Worker    shufpd              m22, m23, 0x55
3114*c0909341SAndroid Build Coastguard Worker    pshufb              m23, m22, m20
3115*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m23   ; d2
3116*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m11, m23   ; e0
3117*c0909341SAndroid Build Coastguard Worker    shufpd              m24, m25, 0x55
3118*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m24, m20
3119*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m25   ; f2
3120*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m11, m25   ; g0
3121*c0909341SAndroid Build Coastguard Worker    pshufb              m22, m21
3122*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m22   ; d3
3123*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m22   ; e1
3124*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m21
3125*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m24   ; f3
3126*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m24   ; g1
3127*c0909341SAndroid Build Coastguard Worker    pslldq               m1, 1
3128*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m9, m3     ; 12
3129*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m9, m5     ; 34
3130*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m9, m7     ; 56
3131*c0909341SAndroid Build Coastguard Worker    vpshrdd              m1, m2, 16     ; 01
3132*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 23
3133*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m4, m6, 16 ; 45
3134*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
3135*c0909341SAndroid Build Coastguard Worker    movu               ym24, [r7+ssq*1+ 0]
3136*c0909341SAndroid Build Coastguard Worker    movu               ym25, [r7+ssq*1+16]
3137*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
3138*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, [r7+ssq*0+ 0], 1
3139*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, [r7+ssq*0+16], 1
3140*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
3141*c0909341SAndroid Build Coastguard Worker    mova                 m8, m10
3142*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m20
3143*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m11, m0    ; h0
3144*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m20
3145*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m13, m0    ; i2
3146*c0909341SAndroid Build Coastguard Worker    pmaddwd             m22, m16, m1    ; A0
3147*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3148*c0909341SAndroid Build Coastguard Worker    pmaddwd             m23, m16, m2    ; B0
3149*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3150*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m21
3151*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m0    ; h1
3152*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m21
3153*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m14, m0    ; i3
3154*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m17, m3    ; A1
3155*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3156*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m17, m4    ; B1
3157*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3158*c0909341SAndroid Build Coastguard Worker    shufpd              m24, m25, 0x55
3159*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m24, m20
3160*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m25   ; h2
3161*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m11, m25   ; i0
3162*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m18, m5    ; A2
3163*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m18, m6    ; B2
3164*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m21
3165*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m24   ; h3
3166*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m12, m24   ; i1
3167*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m9, m8     ; 78
3168*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m6, m7, 16 ; 67
3169*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m19, m5    ; A3
3170*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m19, m7    ; B3
3171*c0909341SAndroid Build Coastguard Worker    mova                 m6, m7
3172*c0909341SAndroid Build Coastguard Worker    psrad               m22, 10
3173*c0909341SAndroid Build Coastguard Worker    psrad               m23, 10
3174*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m22, m23, q3232
3175*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, ym23, 1
3176*c0909341SAndroid Build Coastguard Worker    packusdw            m22, m0
3177*c0909341SAndroid Build Coastguard Worker    pminsw              m22, m15
3178*c0909341SAndroid Build Coastguard Worker    mova          [r8+dsq*0], ym22
3179*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r8+dsq*1], m22, 1
3180*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
3181*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3182*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
3183*c0909341SAndroid Build Coastguard Worker    add                srcq, 32
3184*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
3185*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
3186*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
3187*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
3188*c0909341SAndroid Build Coastguard Worker    RET
3189*c0909341SAndroid Build Coastguard Worker
; Per-ABI scratch register selection for the prep entry points below.
; t0d and t1d are added to the replicated mx/my values at the top of
; prep_6tap_16bpc.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; maps t0/t1 to r6/r4 on WIN64 and r6/r7 elsewhere - confirm.
3190*c0909341SAndroid Build Coastguard Worker%if WIN64
3191*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4
3192*c0909341SAndroid Build Coastguard Worker%else
3193*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
3194*c0909341SAndroid Build Coastguard Worker%endif
3195*c0909341SAndroid Build Coastguard Worker
; prep_8tap entry points for the four SMOOTH/REGULAR filter combinations.
; The first three name prep_6tap_16bpc as their target; the last one names
; no target and is immediately followed by the prep_6tap_16bpc definition.
; NOTE(review): the FN macro is defined outside this chunk; presumably each
; invocation loads the filter-type constants into t0d/t1d and then jumps to
; (or, with no target, falls through into) the shared body - confirm.
3196*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap,
3197*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
3198*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
3199*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
3200*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular,        REGULAR, REGULAR
3201*c0909341SAndroid Build Coastguard Worker
; prep_6tap_16bpc: shared body of the 6-tap "prep" filter for 16-bit pixels.
; Named arguments: tmp, src, ss, w, h, mx, my; bitdepth_max is read from r7m.
; The entry code packs the filter selectors, then dispatches:
;   mx fraction != 0 -> .h (which itself goes to .hv if my is also != 0)
;   my fraction != 0 -> .v
;   otherwise        -> .prep (jump through the plain prep table)
; r7 holds the address of prep_avx512icl and serves as the base for all
; table lookups ("base" below).
3202*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
3203*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx512icl
3204*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101 ; replicate mx into bytes 0, 1 and 2
3205*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
3206*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101 ; replicate my into bytes 0, 1 and 2
3207*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
3208*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep_avx512icl]
3209*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
3210*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3211*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00 ; low 4 bits of byte 1: is mx nonzero?
3212*c0909341SAndroid Build Coastguard Worker    jnz .h
3213*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; low 4 bits of byte 1: is my nonzero?
3214*c0909341SAndroid Build Coastguard Worker    jnz .v
; .prep: neither mx nor my has a fractional part. Sets up m4/m5/r6 and jumps
; to the width-specific entry found in the plain prep jump table.
; NOTE(review): the jump targets are outside this chunk; m4, m5 and r6 are
; presumably the inputs they expect - confirm against the prep function.
3215*c0909341SAndroid Build Coastguard Worker.prep:
3216*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; table index from the trailing-zero count of w
3217*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m ; bitdepth_max
3218*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_8192]
3219*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep,)] ; 16-bit offset relative to r7
3220*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
3221*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [r7-prep_avx512icl+prep_mul+r5*4] ; prep_mul entry indexed by bitdepth_max >> 11
3222*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; wq = absolute jump target
3223*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
; NOTE(review): presumably this undoes a prologue push of r7 on WIN64, since
; control leaves through a tail jump rather than this function's RET - confirm.
3224*c0909341SAndroid Build Coastguard Worker%if WIN64
3225*c0909341SAndroid Build Coastguard Worker    pop                  r7
3226*c0909341SAndroid Build Coastguard Worker%endif
3227*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w8: horizontal-only filter for w == 8. Reached from .h through
; `jl .h_w8` after `cmp wd, 16` (w == 4 has already been dispatched there).
; Inputs prepared by .h: m10 = rounding constant, m12/m13/m14 = broadcast
; coefficient pairs, r6 = ssq*3, srcq already moved back by 4 bytes.
; Each iteration consumes 4 source rows (two rows per zmm register) and
; writes 64 bytes to tmp.
3228*c0909341SAndroid Build Coastguard Worker.h_w8:
3229*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_h_shufA]
3230*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_h_shufC]
3231*c0909341SAndroid Build Coastguard Worker    mova                 m8, [prep_endB]
3232*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
3233*c0909341SAndroid Build Coastguard Worker    movu                ym4, [srcq+ssq*0]
3234*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [srcq+ssq*1], 1 ; m4 = rows 0 and 1
3235*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+ssq*2]
3236*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+r6   ], 1 ; m5 = rows 2 and 3
3237*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
3238*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10 ; start both accumulators from the rounding constant
3239*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3240*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m4
3241*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m5
3242*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m2 ; a0 b0
3243*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m3 ; c0 d0
3244*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m7, m4
3245*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m7, m5
3246*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m4 ; a2 b2
3247*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m5 ; c2 d2
3248*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m4, 0x55 ; middle tap pair built from the two shuffles above
3249*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m5, 0x55
3250*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m2 ; a1 b1
3251*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m3 ; c1 d1
; Merge both accumulators into one register through the prep_endB byte
; permutation. NOTE(review): the table is not visible here; presumably it
; keeps the upper 16 bits of every 32-bit sum - confirm.
3252*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m8, m1
3253*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3254*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3255*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3256*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
3257*c0909341SAndroid Build Coastguard Worker    RET
; .h: setup for horizontal filtering (mx has a nonzero fraction).
; Goes to .hv when my is nonzero too, and to prep_8tap's .h_w4 when w == 4.
; Otherwise loads the horizontal coefficients, broadcasts them pairwise into
; m12/m13/m14 and dispatches on w: .h_w8 (w < 16), .h_w32 (w > 16), or falls
; through into .h_w16_loop (w == 16).
3258*c0909341SAndroid Build Coastguard Worker.h:
3259*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [prep_8tap_rnd] ; m10 = initial accumulator value for every filter sum
3260*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3261*c0909341SAndroid Build Coastguard Worker    jnz .hv
3262*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
3263*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3264*c0909341SAndroid Build Coastguard Worker    je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
3265*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; keep byte 2 = 6tap_h filter index
3266*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8] ; 8 coefficients as words, starting at byte 1 of the 8-byte row
3267*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m
3268*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4 ; move back 4 bytes (two 16-bit pixels)
3269*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
3270*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8] ; shift count selected by bitdepth_max >> 11
3271*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xmm0 ; tmp doubles as scratch so dwords 1 and 2 can be broadcast from memory
3272*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0 ; coefficient words 0-1
3273*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+ 4] ; coefficient words 2-3
3274*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+ 8] ; coefficient words 4-5
3275*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
3276*c0909341SAndroid Build Coastguard Worker    jl .h_w8
3277*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [spel_h_shufA]
3278*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufB]
3279*c0909341SAndroid Build Coastguard Worker    mova                 m7, [prep_endC]
3280*c0909341SAndroid Build Coastguard Worker    jg .h_w32 ; flags still come from `cmp wd, 16`; the vector loads above leave them untouched
; w == 16: two rows per iteration (one per 256-bit half), 64 bytes of output.
3281*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
3282*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+ssq*0+ 0]
3283*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
3284*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*0+12]
3285*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*1+12], 1
3286*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3287*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
3288*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3289*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m5   ; 01
3290*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m4  ; a0  b0
3291*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m6   ; 89
3292*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m4  ; a2' b2'
3293*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6       ; 23
3294*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5       ; 67
3295*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m2  ; a1  b1
3296*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m3  ; a1' b1'
3297*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m3, 0x55 ; 45
3298*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m2  ; a2  b2
3299*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m2  ; a0' b0'
3300*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m7, m1 ; merge both accumulators via the prep_endC byte permutation
3301*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3302*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3303*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3304*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
3305*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal-only filter for w > 16 (reached from .h via `jg .h_w32`).
; Uses m5/m6 (shuffles), m7 (prep_endC), m10 (rounding) and m12/m13/m14
; (coefficient pairs) as set up in .h.
; srcq is advanced to the end of the row and wq is negated, so r6 runs from
; -w up to 0 in steps of 32 pixels; the inner loop ends when r6 reaches 0.
; The outer loop handles one row per iteration.
3306*c0909341SAndroid Build Coastguard Worker.h_w32:
3307*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2] ; srcq += w * 2 bytes
3308*c0909341SAndroid Build Coastguard Worker    neg                  wq
3309*c0909341SAndroid Build Coastguard Worker.h_w32_loop0:
3310*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq ; restart the column counter at -w for this row
3311*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
3312*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+ 0]
3313*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6*2+12]
3314*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
3315*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
; Same shuffle / multiply-accumulate sequence as .h_w16_loop, applied to
; 64 contiguous bytes of a single row instead of two 32-byte row halves.
3316*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m5
3317*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m4
3318*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m6
3319*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m4
3320*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
3321*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5
3322*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m2
3323*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m3
3324*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m3, 0x55
3325*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m2
3326*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m2
3327*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m7, m1
3328*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3329*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3330*c0909341SAndroid Build Coastguard Worker    add                  r6, 32 ; next 32 pixels of this row
3331*c0909341SAndroid Build Coastguard Worker    jl .h_w32_loop
3332*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next source row
3333*c0909341SAndroid Build Coastguard Worker    dec                  hd
3334*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop0
3335*c0909341SAndroid Build Coastguard Worker    RET
; .v: setup for vertical-only filtering (mx fraction is zero, my is not).
; Chooses the filter index (byte 0 of myd when h == 4, byte 2 otherwise),
; loads and scales the coefficients, broadcasts them pairwise into
; m12/m13/m14, sets r6 = -ssq, and jumps to the width-specific .v_w* routine
; through the prep _6tap_v jump table.
3336*c0909341SAndroid Build Coastguard Worker.v:
3337*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; byte 0 of myd (4tap_v index)
3338*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; byte 2 of myd (6tap_v index)
3339*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3340*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd ; h == 4: use the 4tap_v index instead
3341*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m
3342*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [prep_8tap_rnd] ; m10 = initial accumulator value for every filter sum
3343*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8] ; 8 coefficients as words, starting at byte 1 of the 8-byte row
3344*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, wd
3345*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
3346*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
3347*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8] ; last use of r7 as table base before it becomes the jump target
3348*c0909341SAndroid Build Coastguard Worker    add                  r7, r6 ; r7 = address of the .v_w* routine
3349*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xmm0 ; tmp doubles as scratch so dwords 1 and 2 can be broadcast from memory
3350*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0 ; coefficient words 0-1
3351*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
3352*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+ 4] ; coefficient words 2-3
3353*c0909341SAndroid Build Coastguard Worker    neg                  r6 ; r6 = -ssq, used to address the rows above srcq
3354*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+ 8] ; coefficient words 4-5
3355*c0909341SAndroid Build Coastguard Worker    jmp                  r7
; .v_w4: vertical-only filter for w == 4.
; Inputs prepared by .v: r6 = -ssq, m10 = rounding constant,
; m12/m13/m14 = coefficient pairs.
; The preamble loads five rows (srcq-2*ss .. srcq+2*ss, 8 bytes each) and
; interleaves adjacent rows into word pairs. Each loop iteration loads four
; more rows and writes 32 bytes to tmp.
; The single mask 0x330c is shared by two masked instructions: vpbroadcastq
; reads it per qword, vinserti32x4 reads it per dword.
3356*c0909341SAndroid Build Coastguard Worker.v_w4:
3357*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x330c
3358*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+r6 *2]
3359*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r3d
3360*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym1{k1}, [srcq+r6 *1]
3361*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
3362*c0909341SAndroid Build Coastguard Worker    vinserti32x4     m1{k1}, m2, [srcq+ssq*1], 3
3363*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*2]
3364*c0909341SAndroid Build Coastguard Worker    mova                ym4, [prep_endA]
3365*c0909341SAndroid Build Coastguard Worker    valignq              m0, m1, 2
3366*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0        ; 01 12 23 34
3367*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
3368*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
3369*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+r6 *1]
3370*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym2{k1}, [srcq+ssq*0]
3371*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
3372*c0909341SAndroid Build Coastguard Worker    vinserti32x4     m2{k1}, m3, [srcq+ssq*2], 3
3373*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10 ; start the accumulator from the rounding constant
3374*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
3375*c0909341SAndroid Build Coastguard Worker    valignq              m0, m2, m0, 6 ; 4 5 6 7
3376*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2        ; 45 56 67 78
3377*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m0   ; a2 b2 c2 d2
3378*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m0, q1032 ; 23 34 45 56
3379*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m1   ; a1 b1 c1 d1
3380*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; newest row pairs become the first-tap input next iteration
3381*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; keep the freshly loaded rows for the next valignq
3382*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, m3 ; reduce the 32-bit sums to 32 bytes via the prep_endA permutation
3383*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym3
3384*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3385*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3386*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
3387*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only filter for w == 8.
; Inputs prepared by .v: r6 = -ssq, m10 = rounding constant,
; m12/m13/m14 = coefficient pairs.
; Rows are 16 bytes each. The preamble gathers rows 0-4 and permutes them
; into adjacent-row pairs with spel_v_shuf8. Each loop iteration loads four
; more rows, produces four output rows, and writes 64 bytes to tmp.
3388*c0909341SAndroid Build Coastguard Worker.v_w8:
3389*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym1, [srcq+r6 *1]
3390*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x33
3391*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ssq*0]
3392*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
3393*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_v_shuf8]
3394*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
3395*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [srcq+ssq*1]
3396*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
3397*c0909341SAndroid Build Coastguard Worker    mova                 m7, [prep_endB]
3398*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m1  ; 01 12
3399*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m0  ; 23 34
3400*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
3401*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
3402*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [srcq+r6 *1]
3403*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]
3404*c0909341SAndroid Build Coastguard Worker    vshufi64x2       m3{k1}, m0, m4, q1032       ; 4 5 6
3405*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [srcq+ssq*1]
3406*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
3407*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10 ; accumulator for output rows a and b
3408*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m1 ; a0 b0
3409*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10 ; accumulator for output rows c and d
3410*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m2 ; c0 d0
3411*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m3  ; 45 56
3412*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m2 ; a1 b1
3413*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m0  ; 67 78
3414*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m1 ; c1 d1
3415*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m1 ; a2 b2
3416*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m2 ; c2 d2
3417*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m7, m5 ; merge both accumulators via the prep_endB byte permutation
3418*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
3419*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3420*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3421*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
3422*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical-only filter for w == 16.
; Inputs prepared by .v: r6 = -ssq, m10 = rounding constant,
; m12/m13/m14 = coefficient pairs.
; Rows are 32 bytes each, two rows per zmm. The preamble builds the row
; pairs 01, 12, 23 and 34. Each loop iteration loads two new rows, produces
; output rows a and b, and writes 64 bytes to tmp.
; Register roles inside the loop: m0/m1 = oldest pairs (first-tap input for
; rows a/b), m2/m3 = next pairs; they are rotated as new rows arrive.
3423*c0909341SAndroid Build Coastguard Worker.v_w16:
3424*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+r6 *1]
3425*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m0, [srcq+ssq*0], 1 ; 1 2
3426*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+r6 *2], 0     ; 0 1
3427*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_v_shuf16]
3428*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*1]
3429*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3430*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*0], 1     ; 3 4
3431*c0909341SAndroid Build Coastguard Worker    mova                 m7, [prep_endA]
3432*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m1     ; 12
3433*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m6, m0     ; 01
3434*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m3     ; 34
3435*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16 ; 23
3436*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
3437*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3438*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m1    ; b0
3439*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
3440*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m0    ; a0
3441*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; current newest pair becomes the next iteration's b0 input
3442*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m3    ; b1
3443*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+ssq*1]
3444*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3445*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m2    ; a1
3446*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+ssq*0], 1
3447*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; m2 becomes the next iteration's a0 input
3448*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m3     ; 56
3449*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16 ; 45
3450*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m3    ; b2
3451*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m2    ; a2
3452*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m7, m5 ; merge both accumulators via the prep_endA byte permutation
3453*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
3454*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3455*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3456*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
3457*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32 / .v_w64 / .v_w128: vertical-only filter for w >= 32, processed in
; column strips of 32 pixels (64 bytes).
; Inputs prepared by .v: r6 = -ssq, m10 = rounding constant,
; m12/m13/m14 = coefficient pairs.
; r5 packs two counters: its low byte holds h (reloaded into hd for every
; strip) and the bits above hold the number of remaining strips, i.e.
; (w*8 - 256) >> 8 = w/32 - 1.
; r7 (no longer needed as a table base) is the source row pointer and r8 the
; output pointer inside a strip; srcq and tmpq stay at the top of the strip.
; NOTE(review): r8 is saved/restored by hand on WIN64 only; presumably it is
; outside the registers covered by the cglobal prologue there - confirm.
3458*c0909341SAndroid Build Coastguard Worker.v_w32:
3459*c0909341SAndroid Build Coastguard Worker.v_w64:
3460*c0909341SAndroid Build Coastguard Worker.v_w128:
3461*c0909341SAndroid Build Coastguard Worker%if WIN64
3462*c0909341SAndroid Build Coastguard Worker    push                 r8
3463*c0909341SAndroid Build Coastguard Worker%endif
3464*c0909341SAndroid Build Coastguard Worker    mova                m11, [prep_endC]
3465*c0909341SAndroid Build Coastguard Worker    lea                  r5, [hq+wq*8-256]
; Per-strip preamble: load rows 0-4 and interleave adjacent rows. The low
; halves go to m0-m3 and the high halves stay in m4-m7.
3466*c0909341SAndroid Build Coastguard Worker.v_w32_loop0:
3467*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r6 *2]
3468*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+r6 *1]
3469*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2]
3470*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0]
3471*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
3472*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
3473*c0909341SAndroid Build Coastguard Worker    movu                 m8, [r7  +ssq*0]
3474*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4, m5  ; 01
3475*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5
3476*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m6  ; 12
3477*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6
3478*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m7  ; 23
3479*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7
3480*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7, m8  ; 34
3481*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8
; Two output rows per iteration: m16/m18 accumulate the low/high halves of
; row a, m17/m19 those of row b. Loads and register rotation for the next
; iteration are interleaved with the multiply-accumulates.
3482*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
3483*c0909341SAndroid Build Coastguard Worker    mova                m16, m10
3484*c0909341SAndroid Build Coastguard Worker    movu                 m9, [r7+ssq*1]
3485*c0909341SAndroid Build Coastguard Worker    mova                m18, m10
3486*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m12, m0 ; a0
3487*c0909341SAndroid Build Coastguard Worker    mova                m17, m10
3488*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m12, m4
3489*c0909341SAndroid Build Coastguard Worker    mova                m19, m10
3490*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m12, m1 ; b0
3491*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
3492*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m12, m5
3493*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
3494*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m13, m2 ; a1
3495*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m8, m9  ; 45
3496*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3497*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m13, m6
3498*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m8, m9
3499*c0909341SAndroid Build Coastguard Worker    movu                 m8, [r7+ssq*0]
3500*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m13, m3 ; b1
3501*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3502*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m13, m7
3503*c0909341SAndroid Build Coastguard Worker    mova                 m5, m7
3504*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m14, m2 ; a2
3505*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m9, m8  ; 56
3506*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m14, m6
3507*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m9, m8
3508*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m14, m3 ; b2
3509*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m14, m7
3510*c0909341SAndroid Build Coastguard Worker    vpermt2b            m16, m11, m18 ; merge low/high accumulators of row a via prep_endC
3511*c0909341SAndroid Build Coastguard Worker    vpermt2b            m17, m11, m19 ; merge low/high accumulators of row b via prep_endC
3512*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*0], m16
3513*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*2], m17 ; next output row is w*2 bytes further on
3514*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+wq*4] ; advance two output rows
3515*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3516*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
3517*c0909341SAndroid Build Coastguard Worker    add                srcq, 64 ; next 32-pixel strip of the source
3518*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64 ; next 32-pixel strip of the output
3519*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5b ; reload h from the low byte of r5
3520*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<8 ; one strip fewer; h in the low byte is unchanged
3521*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop0
3522*c0909341SAndroid Build Coastguard Worker%if WIN64
3523*c0909341SAndroid Build Coastguard Worker    pop                  r8
3524*c0909341SAndroid Build Coastguard Worker%endif
3525*c0909341SAndroid Build Coastguard Worker    vzeroupper
3526*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: combined horizontal + vertical subpel filtering for w == 4.
; Reached from .hv (cmp wd, 4 / je .hv_w4) right after m11 has been loaded
; with the broadcast pd_128 constant. m10 is also read here but is set up
; outside this chunk (presumably another rounding/bias constant - confirm).
; Horizontal pass: 2 vpdpwssd steps per row (coefficient words 2-3 and 4-5).
; Vertical pass:   3 vpdpwssd steps per row (coefficient words 0-1, 2-3, 4-5).
; Each loop iteration consumes 4 new source rows and stores 32 bytes to tmpq.
3527*c0909341SAndroid Build Coastguard Worker.hv_w4:
3528*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb  ; filter index = low byte of mx
3529*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]  ; horizontal coefs, int8 -> int16
3530*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb  ; mxd is free again: low byte of my
3531*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16  ; myd = index stored in bits 16+ of my
3532*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3533*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd  ; h == 4: use the low-byte index instead
3534*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m  ; 8th argument (presumably bitdepth_max - declared outside this chunk)
3535*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]  ; vertical coefs, starting at byte 1 of the 8-byte entry
3536*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
3537*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2  ; step back 2 bytes (one pixel, assuming 16-bit samples)
3538*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11  ; reduce the argument to a prep_hv_shift table index
3539*c0909341SAndroid Build Coastguard Worker    neg                  r6  ; r6 = -ss, used to address the rows above srcq
3540*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8]  ; scale horizontal coefs by the table-selected shift
3541*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2  ; scale vertical coefs by 4
3542*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], xmm0  ; spill coefs into the output buffer so dwords can be
3543*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+16], xmm1  ; broadcast from memory; overwritten by the first store below
3544*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [tmpq+ 4]  ; horizontal coef words 2-3
3545*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0xf0
3546*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [tmpq+ 8]  ; horizontal coef words 4-5
3547*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm1  ; vertical coef words 0-1
3548*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+r6 *2]  ; row 0 (src - 2*ss)
3549*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d  ; k1 = 0xf0: selects the upper four qwords of a zmm
3550*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, [srcq+r6 *1], 1  ; row 1 (src - ss)
3551*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ssq*0]  ; row 2 in every 128-bit lane
3552*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m3{k1}, m2, [srcq+ssq*1], 3  ; upper half = rows 2 3; k1 keeps rows 0 1 in the lower half
3553*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*2]  ; row 4
3554*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [spel_h_shufA]
3555*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufB]
3556*c0909341SAndroid Build Coastguard Worker    mova                 m1, m11  ; horizontal accumulators start from pd_128
3557*c0909341SAndroid Build Coastguard Worker    mova                m15, [spel_shuf4a]
3558*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm11
3559*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, m5
3560*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m8, m0
3561*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4, xm5
3562*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm2, xm8, xm0
3563*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+20]  ; vertical coef words 2-3
3564*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6
3565*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+24]  ; vertical coef words 4-5
3566*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm6
3567*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_shuf4b]
3568*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m3    ; 0 1 2 3
3569*c0909341SAndroid Build Coastguard Worker    vpdpwssd            xm2, xm9, xm4  ; 4
3570*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m15, m2   ; 01 12 23 34
3571*c0909341SAndroid Build Coastguard Worker    mova               ym15, [prep_endA]  ; spel_shuf4a is no longer needed; m15 becomes the output shuffle
3572*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3573*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]  ; advance 4 rows; r6*1 now addresses row 5
3574*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r6 *1]
3575*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, [srcq+ssq*0], 1
3576*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [srcq+ssq*1]
3577*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m4{k1}, m3, [srcq+ssq*2], 3  ; same masked-insert row packing as in the setup above
3578*c0909341SAndroid Build Coastguard Worker    mova                 m2, m11
3579*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4, m5
3580*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m8, m3
3581*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10  ; vertical accumulator starts from m10 (set outside this chunk)
3582*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
3583*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6
3584*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m4    ; 5 6 7 8
3585*c0909341SAndroid Build Coastguard Worker    mova                 m4, m1  ; keep the old row pairs for the middle taps
3586*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m7, m2    ; 45 56 67 78
3587*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m1   ; a2 b2 c2 d2
3588*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m1, q1032 ; 23 34 45 56
3589*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m4   ; a1 b1 c1 d1
3590*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m15, m3  ; byte-select the output via prep_endA; only the low 32 bytes are stored
3591*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym3
3592*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3593*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3594*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
3595*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: combined horizontal + vertical subpel filtering, reached from .hv
; via `cmp wd, 16 / jl .hv_w8` (w == 4 was already dispatched to .hv_w4, so
; presumably w == 8 here).
; State prepared by .hv: m11 = pd_128, m12-m14 = horizontal coefficient pairs,
; m15-m17 = vertical coefficient pairs, r6 = -ss, srcq moved back by 4 bytes.
; m10 is read as the horizontal accumulator seed but is set outside this chunk.
; Each loop iteration consumes 4 new source rows and stores 64 bytes to tmpq.
3596*c0909341SAndroid Build Coastguard Worker.hv_w8:
3597*c0909341SAndroid Build Coastguard Worker    mova                 m8, [spel_h_shufA]
3598*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+r6 *2]
3599*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+r6 *1], 1 ; 0 1
3600*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*0]
3601*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*1], 1 ; 2 3
3602*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+ssq*2]    ; 4
3603*c0909341SAndroid Build Coastguard Worker    movu                 m9, [spel_h_shufC]
3604*c0909341SAndroid Build Coastguard Worker    mova                m21, [spel_shuf8a]  ; only needed before the loop; m21 is reused as an accumulator inside it
3605*c0909341SAndroid Build Coastguard Worker    mova                 m0, [spel_shuf8b]
3606*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m18
3607*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3608*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, m19
3609*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m4  ; a0 b0
3610*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
3611*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m8, m20
3612*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m5  ; c0 d0
3613*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
3614*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m9, m18
3615*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m6  ; e0
3616*c0909341SAndroid Build Coastguard Worker    mova                 m7, [prep_endB]
3617*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m9, m19
3618*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m18 ; a2 b2
3619*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m9, m20
3620*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m19 ; c2 d2
3621*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m18, 0x55  ; per lane: high qword of the shufA data, low qword of the shufC data
3622*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m20 ; e2
3623*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m19, 0x55
3624*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m4  ; a1 b1
3625*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m20, 0x55
3626*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m5  ; c1 d1
3627*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m6  ; e1
3628*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m21, m2  ; 01 12
3629*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m21, m3  ; 23 34
3630*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3631*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]  ; advance 4 rows; r6*1 now addresses row 5
3632*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+r6 *1]
3633*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+ssq*0], 1  ; rows 5 6
3634*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*1]
3635*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*2], 1  ; rows 7 8
3636*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
3637*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, m18
3638*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
3639*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m8, m19
3640*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m5  ; f0 g0
3641*c0909341SAndroid Build Coastguard Worker    mova                m20, m11  ; vertical accumulators start from pd_128
3642*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m6  ; h0 i0
3643*c0909341SAndroid Build Coastguard Worker    mova                m21, m11
3644*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m15, m1  ; A0 B0
3645*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m9, m18
3646*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m15, m2  ; C0 D0
3647*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m9, m19
3648*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m18 ; f2 g2
3649*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m19 ; h2 i2
3650*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m18, 0x55
3651*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m16, m2  ; A1 B1
3652*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m19, 0x55
3653*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m5  ; f1 g1
3654*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m6  ; h1 i1
3655*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m0, m3   ; 45 56
3656*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m16, m2  ; C1 D1
3657*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2  ; 45 56 become the oldest pairs for the next iteration
3658*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m0, m4   ; 67 78
3659*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m17, m1  ; A2 B2
3660*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m17, m2  ; C2 D2
3661*c0909341SAndroid Build Coastguard Worker    vpermt2b            m20, m7, m21  ; combine rows A-D through the prep_endB byte shuffle
3662*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m20
3663*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3664*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3665*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
3666*c0909341SAndroid Build Coastguard Worker    vzeroupper
3667*c0909341SAndroid Build Coastguard Worker    RET
; .hv: entry point for combined horizontal + vertical subpel filtering.
; Loads m11 = pd_128, dispatches w == 4 to .hv_w4, then builds the shared
; coefficient registers used by the wider paths:
;   m12-m14 = horizontal coefficient word pairs 0-1, 2-3, 4-5
;   m15-m17 = vertical   coefficient word pairs 0-1, 2-3, 4-5
; and r6 = -ss. w < 16 goes to .hv_w8, w > 16 to .hv_w32, and w == 16 falls
; through into the code below (2 rows / 64 bytes stored per loop iteration).
; m10 is read as the horizontal accumulator seed but is set outside this chunk.
3668*c0909341SAndroid Build Coastguard Worker.hv:
3669*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pd_128]
3670*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3671*c0909341SAndroid Build Coastguard Worker    je .hv_w4
3672*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16  ; filter index stored in bits 16+ of mx
3673*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]  ; horizontal coefs from byte 1 of the entry, int8 -> int16
3674*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb  ; mxd is free again: low byte of my
3675*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16  ; myd = index stored in bits 16+ of my
3676*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3677*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd  ; h - 6 negative (h < 6): use the low-byte index instead
3678*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m  ; 8th argument (presumably bitdepth_max - declared outside this chunk)
3679*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]  ; vertical coefs
3680*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
3681*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4  ; step back 4 bytes (two pixels, assuming 16-bit samples)
3682*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11  ; reduce the argument to a prep_hv_shift table index
3683*c0909341SAndroid Build Coastguard Worker    neg                  r6  ; r6 = -ss, used to address the rows above srcq
3684*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8]  ; scale horizontal coefs by the table-selected shift
3685*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2  ; scale vertical coefs by 4
3686*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], xmm0  ; spill coefs into the output buffer so dwords can be
3687*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+16], xmm1  ; broadcast from memory; overwritten by later output stores
3688*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0
3689*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+ 4]
3690*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+ 8]
3691*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xmm1
3692*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [tmpq+20]
3693*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [tmpq+24]
3694*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
3695*c0909341SAndroid Build Coastguard Worker    jl .hv_w8
3696*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [spel_h_shufA]
3697*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [spel_h_shufB]
3698*c0909341SAndroid Build Coastguard Worker    jg .hv_w32  ; still testing the flags of `cmp wd, 16`; the vector loads leave them intact
3699*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]  ; row 0, bytes 8.. in both 256-bit halves
3700*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1  ; m2 = row 0 at byte offsets +8 | +16
3701*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
3702*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+r6 *1+ 0]
3703*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+r6 *1+12]
3704*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
3705*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 1 2
3706*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+ssq*1+ 0]
3707*c0909341SAndroid Build Coastguard Worker    movu               ym21, [srcq+ssq*1+12]
3708*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]  ; srcq now points at row 4
3709*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, [srcq+ssq*0+ 0], 1
3710*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m21, [srcq+ssq*0+12], 1 ; 3 4
3711*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m8
3712*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3713*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m18, m8
3714*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m2    ; a2
3715*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
3716*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m19, m9
3717*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m3    ; b0  c0
3718*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
3719*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m20, m8
3720*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m4    ; b2' c2'
3721*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
3722*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m21, m9
3723*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m5    ; d0  e0
3724*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3725*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6, m8
3726*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m7    ; d2' e2'
3727*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_shuf16]
3728*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m9
3729*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m0    ; a0
3730*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m8
3731*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m18   ; b1  c1
3732*c0909341SAndroid Build Coastguard Worker    pshufb              m20, m9
3733*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m19   ; b1' c1'
3734*c0909341SAndroid Build Coastguard Worker    pshufb              m21, m8
3735*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m20   ; d1  e1
3736*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m9
3737*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m21   ; d1' e1'
3738*c0909341SAndroid Build Coastguard Worker    mova                 m0, [prep_endB]
3739*c0909341SAndroid Build Coastguard Worker    shufpd              m18, m19, 0x55  ; per lane: high qword of m18, low qword of m19
3740*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m6    ; a1
3741*c0909341SAndroid Build Coastguard Worker    shufpd              m20, m21, 0x55
3742*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m18   ; b2  c2
3743*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m18   ; b0' c0'
3744*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m20   ; d2  e2
3745*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m20   ; d0' e0'
3746*c0909341SAndroid Build Coastguard Worker    pslldq               m1, 1  ; presumably aligns row 0 with the bytes spel_shuf16 selects for the other rows - confirm
3747*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m7, m3     ; 12
3748*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m7, m5     ; 34
3749*c0909341SAndroid Build Coastguard Worker    vpshrdd              m1, m2, 16     ; 01
3750*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 23
3751*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
3752*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*1+ 0]
3753*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*1+12]
3754*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]  ; advance 2 rows
3755*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
3756*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*0+12], 1
3757*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3758*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
3759*c0909341SAndroid Build Coastguard Worker    pshufb              m21, m18, m8
3760*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m21   ; f0  g0
3761*c0909341SAndroid Build Coastguard Worker    pshufb              m20, m19, m9
3762*c0909341SAndroid Build Coastguard Worker    mova                m21, m11  ; vertical accumulators start from pd_128
3763*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m20   ; f2' g2'
3764*c0909341SAndroid Build Coastguard Worker    mova                m20, m11
3765*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m15, m2    ; B0
3766*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4  ; slide the row-pair history down by two rows
3767*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m15, m1    ; A0
3768*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3769*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m9
3770*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m18   ; f1  g1
3771*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m8
3772*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m19   ; f1' g1'
3773*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m16, m4    ; B1
3774*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m16, m3    ; A1
3775*c0909341SAndroid Build Coastguard Worker    shufpd              m18, m19, 0x55
3776*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m18   ; f2  g2
3777*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m18   ; f0' g0'
3778*c0909341SAndroid Build Coastguard Worker    mova                 m4, m7  ; vpermi2b overwrites its index operand, so copy spel_shuf16 first
3779*c0909341SAndroid Build Coastguard Worker    vpermi2b             m4, m5, m6     ; 56
3780*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 45
3781*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m17, m4    ; B2
3782*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m17, m3    ; A2
3783*c0909341SAndroid Build Coastguard Worker    vpermt2b            m20, m0, m21  ; combine rows A and B through the prep_endB byte shuffle
3784*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m20
3785*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3786*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3787*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
3788*c0909341SAndroid Build Coastguard Worker    vzeroupper
3789*c0909341SAndroid Build Coastguard Worker    RET
3790*c0909341SAndroid Build Coastguard Worker.hv_w32:
3791*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      29
3792*c0909341SAndroid Build Coastguard Worker%if WIN64
3793*c0909341SAndroid Build Coastguard Worker    push                 r8
3794*c0909341SAndroid Build Coastguard Worker%endif
3795*c0909341SAndroid Build Coastguard Worker    mova                m27, [spel_shuf32]
3796*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq+wq*8-256]
3797*c0909341SAndroid Build Coastguard Worker    mova                m28, [prep_endC]
3798*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0:
3799*c0909341SAndroid Build Coastguard Worker    movu                m18, [srcq+r6 *2+ 0]
3800*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+r6 *2+12]
3801*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r6 *1+ 0]
3802*c0909341SAndroid Build Coastguard Worker    movu                m20, [srcq+r6 *1+12]
3803*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2]
3804*c0909341SAndroid Build Coastguard Worker    movu                m19, [srcq+ssq*0+ 0]
3805*c0909341SAndroid Build Coastguard Worker    movu                m21, [srcq+ssq*0+12]
3806*c0909341SAndroid Build Coastguard Worker    movu                m22, [srcq+ssq*1+ 0]
3807*c0909341SAndroid Build Coastguard Worker    movu                m24, [srcq+ssq*1+12]
3808*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
3809*c0909341SAndroid Build Coastguard Worker    movu                m23, [r7  +ssq*0+ 0]
3810*c0909341SAndroid Build Coastguard Worker    movu                m25, [r7  +ssq*0+12]
3811*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m18, m8
3812*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
3813*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7, m9
3814*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m1    ; a0
3815*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3816*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6, m8
3817*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m2    ; a2'
3818*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
3819*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m19, m8
3820*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m4    ; b0
3821*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
3822*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m20, m9
3823*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m3    ; c0
3824*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
3825*c0909341SAndroid Build Coastguard Worker    pshufb              m26, m21, m9
3826*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m5    ; b2'
3827*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3828*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m9
3829*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m26   ; c2'
3830*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m8
3831*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m18   ; a1
3832*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m9
3833*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m7    ; a1'
3834*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m9
3835*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m6    ; b1
3836*c0909341SAndroid Build Coastguard Worker    pshufb              m20, m8
3837*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m19   ; c1
3838*c0909341SAndroid Build Coastguard Worker    pshufb              m21, m8
3839*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m20   ; b1'
3840*c0909341SAndroid Build Coastguard Worker    shufpd              m18, m7, 0x55
3841*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m21   ; c1'
3842*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m20, 0x55
3843*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m18   ; a2
3844*c0909341SAndroid Build Coastguard Worker    shufpd              m19, m21, 0x55
3845*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m18   ; a0'
3846*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m22, m8
3847*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m6    ; b2
3848*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m23, m8
3849*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m19   ; c2
3850*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m6    ; b0'
3851*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
3852*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m19   ; c0'
3853*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m24, m9
3854*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m18   ; d0
3855*c0909341SAndroid Build Coastguard Worker    mova                m18, m10
3856*c0909341SAndroid Build Coastguard Worker    pshufb              m26, m25, m9
3857*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m12, m7    ; e0
3858*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
3859*c0909341SAndroid Build Coastguard Worker    pshufb              m22, m9
3860*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m19   ; d2'
3861*c0909341SAndroid Build Coastguard Worker    mova                m19, m10
3862*c0909341SAndroid Build Coastguard Worker    pshufb              m23, m9
3863*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m14, m26   ; e2'
3864*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m8
3865*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m22   ; d1
3866*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m8
3867*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m13, m23   ; e1
3868*c0909341SAndroid Build Coastguard Worker    shufpd              m22, m24, 0x55
3869*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m24   ; d1'
3870*c0909341SAndroid Build Coastguard Worker    shufpd              m23, m25, 0x55
3871*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m13, m25   ; e1'
3872*c0909341SAndroid Build Coastguard Worker    pslldq               m0, 1
3873*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m22   ; d2
3874*c0909341SAndroid Build Coastguard Worker    pslldq               m1, 1
3875*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m14, m23   ; e2
3876*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m27, m4    ; 12
3877*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m22   ; d0'
3878*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m27, m5    ; 12'
3879*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m12, m23   ; e0'
3880*c0909341SAndroid Build Coastguard Worker    vpshrdd              m0, m2, 16     ; 01
3881*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m27, m18   ; 34
3882*c0909341SAndroid Build Coastguard Worker    vpshrdd              m1, m3, 16     ; 01'
3883*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m27, m19   ; 34'
3884*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m2, m6, 16 ; 23
3885*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m3, m7, 16 ; 23'
3886*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
3887*c0909341SAndroid Build Coastguard Worker    movu                m22, [r7+ssq*1+ 0]
3888*c0909341SAndroid Build Coastguard Worker    movu                m24, [r7+ssq*1+12]
3889*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
3890*c0909341SAndroid Build Coastguard Worker    movu                m23, [r7+ssq*0+ 0]
3891*c0909341SAndroid Build Coastguard Worker    movu                m25, [r7+ssq*0+12]
3892*c0909341SAndroid Build Coastguard Worker    mova                m19, m11
3893*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m15, m2    ; B0
3894*c0909341SAndroid Build Coastguard Worker    mova                m21, m11
3895*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m15, m3    ; B0'
3896*c0909341SAndroid Build Coastguard Worker    mova                m18, m11
3897*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m15, m0    ; A0
3898*c0909341SAndroid Build Coastguard Worker    mova                m20, m11
3899*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m15, m1    ; A0'
3900*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
3901*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m16, m6    ; B1
3902*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
3903*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m16, m7    ; B1'
3904*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
3905*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m16, m4    ; A1
3906*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5
3907*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m22, m8
3908*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m16, m5    ; A1'
3909*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
3910*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m23, m8
3911*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m4    ; f0
3912*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
3913*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m24, m9
3914*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m7    ; g0
3915*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
3916*c0909341SAndroid Build Coastguard Worker    pshufb              m26, m25, m9
3917*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m5    ; f2'
3918*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3919*c0909341SAndroid Build Coastguard Worker    pshufb              m22, m9
3920*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m26   ; g2'
3921*c0909341SAndroid Build Coastguard Worker    pshufb              m23, m9
3922*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m22   ; f1
3923*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m8
3924*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m23   ; g1
3925*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m8
3926*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m24   ; f1'
3927*c0909341SAndroid Build Coastguard Worker    shufpd              m22, m24, 0x55
3928*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m25   ; g1'
3929*c0909341SAndroid Build Coastguard Worker    shufpd              m23, m25, 0x55
3930*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m22   ; f2
3931*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m23   ; g2
3932*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m22   ; f0'
3933*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m23   ; g0'
3934*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m27, m4    ; 56
3935*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m27, m5    ; 56'
3936*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m19, m17, m6    ; B2
3937*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m2, m6, 16 ; 45
3938*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m17, m7    ; B2'
3939*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m3, m7, 16 ; 45'
3940*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m17, m4    ; A2
3941*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m17, m5    ; A2'
3942*c0909341SAndroid Build Coastguard Worker    vpermt2b            m19, m28, m21
3943*c0909341SAndroid Build Coastguard Worker    vpermt2b            m18, m28, m20
3944*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*0], m18
3945*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*2], m19
3946*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+wq*4]
3947*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3948*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
3949*c0909341SAndroid Build Coastguard Worker    add                srcq, 64
3950*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3951*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5b
3952*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<8
3953*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop0
3954*c0909341SAndroid Build Coastguard Worker%if WIN64
3955*c0909341SAndroid Build Coastguard Worker    pop                  r8
3956*c0909341SAndroid Build Coastguard Worker%endif
3957*c0909341SAndroid Build Coastguard Worker    RET
3958*c0909341SAndroid Build Coastguard Worker
; Filter-combination instantiations of PREP_8TAP_FN (the macro itself is defined
; outside this chunk). Each line passes a name suffix and two filter-type
; tokens; the first four also pass prep_8tap_16bpc as a final argument, while
; the last one (sharp) omits it and is immediately followed by the
; prep_8tap_16bpc body.
; NOTE(review): presumably each instantiation emits an entry point that loads
; its filter-type constants into t0d/t1d and then jumps to (or, for the last
; one, falls through into) prep_8tap_16bpc -- confirm against the macro.
3959*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
3960*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
3961*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
3962*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
3963*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp,          SHARP,   SHARP
3964*c0909341SAndroid Build Coastguard Worker
3965*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
; Entry point plus vertical-only (.v) setup of the 8-tap 16bpc prep function.
; Dispatch performed here, using mxd/myd after the packing below:
;   (mxd & 0xf00) != 0                      -> .h
;   (mxd & 0xf00) == 0, (myd & 0xf00) == 0  -> tail-jump to .prep of the
;                                              6-tap prep function
;   otherwise                               -> .v, then a jump table indexed
;                                              by tzcnt(w)
; NOTE(review): t0d/t1d are read before being written in this function;
; presumably the PREP_8TAP_FN entry points set them -- confirm.
3966*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx512icl
    ; x*0x010101 copies a byte-sized x into each of the low three bytes; after
    ; adding t0d/t1d the bytes hold the values named in the trailing comments
    ; (highest byte first).
3967*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
3968*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3969*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
3970*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
3971*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep_avx512icl]
3972*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
3973*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; Bits 8-11 are the low nibble of the middle byte, i.e. of the raw mx/my.
3974*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
3975*c0909341SAndroid Build Coastguard Worker    jnz .h
3976*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3977*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep
3978*c0909341SAndroid Build Coastguard Worker.v:
    ; Filter index: byte 2 of myd (8tap_v) by default, byte 0 (4tap_v) when
    ; h == 4. mxd is only used as a temporary here.
3979*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3980*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3981*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3982*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
    ; r7m is the argument following my. NOTE(review): presumably bitdepth_max,
    ; in which case the >>11 below yields 0 for 10-bit and 1 for 12-bit --
    ; confirm against the C prototype.
3983*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m
    ; m10 seeds every accumulator in the loops of this function.
3984*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [prep_8tap_rnd]
    ; Load 8 signed byte coefficients and sign-extend them to words.
3985*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+myq*8]
    ; Width dispatch: tzcnt(w) indexes a table of 16-bit offsets that are added
    ; to r7 (which still holds the prep_avx512icl address at this point).
3986*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, wd
3987*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
3988*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
    ; Pre-scale the coefficients by the shift count selected through r5.
3989*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8]
3990*c0909341SAndroid Build Coastguard Worker    add                  r7, r6
    ; r6 = 3*stride; the source pointer is moved up by three rows.
3991*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
3992*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
    ; The first 16 bytes of tmp serve as scratch so that the coefficient word
    ; pairs can be broadcast from memory: m12..m15 = taps 0-1, 2-3, 4-5, 6-7.
    ; The width-specific code overwrites this area with output afterwards.
3993*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xmm0
3994*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0
3995*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+ 4]
3996*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+ 8]
3997*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [tmpq+12]
3998*c0909341SAndroid Build Coastguard Worker    jmp                  r7
3999*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Vertical-only filter for the w == 4 jump-table slot.
    ; In the trailing comments a digit/letter is a source row number (a = 10)
    ; relative to the adjusted srcq, and a two-character token is a pair of
    ; consecutive rows interleaved word by word.
    ; Loop-carried state: m1 = 01 12 23 34, m2 = 23 34 45 56, m0 = newest rows.
    ; k1 = 0x330c is the write mask for the merging broadcasts/inserts below.
4000*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x330c
4001*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*0]
4002*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r3d
4003*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym1{k1}, [srcq+strideq*1]
4004*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+r6       ]
4005*c0909341SAndroid Build Coastguard Worker    vinserti32x4     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3
4006*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
4007*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym0{k1}, [srcq+strideq*0]
4008*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+strideq*1]
4009*c0909341SAndroid Build Coastguard Worker    vinserti32x4     m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6
    ; ym5 = byte-permutation indices applied to the sums before the store.
    ; NOTE(review): presumably prep_endA picks the bytes of each 32-bit sum
    ; that form the 16-bit result -- the table is defined outside this chunk.
4010*c0909341SAndroid Build Coastguard Worker    mova                ym5, [prep_endA]
4011*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m0, q1021 ; 1 2 3 4
4012*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m1, m0, q2132 ; 2 3 4 5
4013*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3            ; 01 12 23 34
4014*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m0            ; 23 34 45 56
4015*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
    ; One iteration loads four new source rows and emits four output rows
    ; (32 bytes of tmp). Each vpdpwssd adds a two-tap partial sum per dword.
4016*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+r6       ]
4017*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
4018*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym4{k1}, [srcq+strideq*0]
4019*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+strideq*1]
4020*c0909341SAndroid Build Coastguard Worker    vinserti32x4     m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a
    ; Accumulator starts from the rounding constant in m10.
4021*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
4022*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m1       ; a0 b0 c0 d0
4023*c0909341SAndroid Build Coastguard Worker    valignq              m1, m4, m0, 6     ; 6 7 8 9
4024*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m2       ; a1 b1 c1 d1
4025*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
4026*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m1, m4        ; 67 78 89 9a
4027*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m15, m4       ; a3 b3 c3 d3
    ; The new m1 doubles as the tap 4-5 operand now and as the next
    ; iteration's tap 0-1 operand.
4028*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m2, m4, q1032 ; 45 56 67 78
4029*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m1       ; a2 b2 c2 d2
4030*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4031*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m5, m3
4032*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym3
4033*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
4034*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4035*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
4036*c0909341SAndroid Build Coastguard Worker    RET
4037*c0909341SAndroid Build Coastguard Worker.v_w8:
    ; Vertical-only filter for the w == 8 jump-table slot.
    ; In the trailing comments a digit/letter is a source row number (a = 10)
    ; relative to the adjusted srcq, and a two-character token is a row pair
    ; produced by the spel_v_shuf8 permutation.
    ; Loop-carried state: m1 = 01 12, m2 = 23 34, m3 = 45 56, m0 = newest rows.
4038*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
    ; k1 = 0x33 is the write mask for the merging inserts/shuffles below.
4039*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x33
4040*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym1, [srcq+strideq*1]
4041*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
4042*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_v_shuf8]
4043*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
4044*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4045*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym2, [srcq+strideq*0]
4046*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [srcq+strideq*1]
4047*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [srcq+strideq*2]
4048*c0909341SAndroid Build Coastguard Worker    vshufi64x2       m2{k1}, m1, m3, q1032    ; 2 3 4
4049*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
    ; m8 = byte-permutation indices used to merge both accumulators into one
    ; 64-byte store. NOTE(review): presumably prep_endB selects the bytes of
    ; each 32-bit sum that form the 16-bit result -- table defined elsewhere.
4050*c0909341SAndroid Build Coastguard Worker    mova                 m8, [prep_endB]
4051*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m7, m1  ; 01 12
4052*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m7, m2  ; 23 34
4053*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m7, m0  ; 45 56
4054*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
    ; One iteration loads four new source rows and emits four output rows
    ; (64 bytes of tmp): m5 accumulates rows a/b, m6 accumulates rows c/d.
4055*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
4056*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+strideq*0]
4057*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+strideq*1]
4058*c0909341SAndroid Build Coastguard Worker    vshufi64x2       m4{k1}, m0, m5, q1032    ; 6 7 8
4059*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [srcq+strideq*2]
4060*c0909341SAndroid Build Coastguard Worker    vinserti64x2     m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
    ; Both accumulators start from the rounding constant in m10.
4061*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
4062*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m1 ; a0 b0
4063*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
4064*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m2 ; c0 d0
4065*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4066*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m2 ; a1 b1
4067*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m3 ; c1 d1
4068*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m7, m4  ; 67 78
4069*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m3 ; a2 b2
4070*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m7, m0  ; 89 9a
4071*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m2 ; c2 d2
4072*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m15, m2 ; a3 b3
4073*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m15, m3 ; c3 d3
4074*c0909341SAndroid Build Coastguard Worker    vpermt2b             m5, m8, m6
4075*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m5
4076*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4077*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4078*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
4079*c0909341SAndroid Build Coastguard Worker    RET
4080*c0909341SAndroid Build Coastguard Worker.v_w16:
    ; Vertical-only filter for the w == 16 jump-table slot.
    ; In the trailing comments a two-digit token is a pair of consecutive
    ; source rows (numbered from the adjusted srcq) after the spel_v_shuf16
    ; permutation.
    ; Loop-carried state: m0..m5 = 01 12 23 34 45 56.
    ; m0 temporarily holds row 1 in both halves so that it can seed both the
    ; "12" and the "01" register.
4081*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+strideq*1]
4082*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m0, [srcq+strideq*2], 1
4083*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*0], 0
4084*c0909341SAndroid Build Coastguard Worker    mova                 m8, [spel_v_shuf16]
4085*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4086*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+strideq*0]
4087*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+strideq*1], 1
4088*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+strideq*2]
4089*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4090*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+strideq*0], 1
    ; m11 = byte-permutation indices used to merge both accumulators into one
    ; 64-byte store. NOTE(review): presumably prep_endA selects the bytes of
    ; each 32-bit sum that form the 16-bit result -- table defined elsewhere.
4091*c0909341SAndroid Build Coastguard Worker    mova                m11, [prep_endA]
4092*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m8, m1     ; 12
4093*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0     ; 01
4094*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, m3     ; 34
4095*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, m5     ; 56
    ; The in-between pairs are derived with a 16-bit funnel shift across two
    ; neighbouring pair registers instead of being loaded again.
4096*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16 ; 23
4097*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m3, m5, 16 ; 45
4098*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
    ; One iteration loads two new source rows and emits two output rows
    ; (64 bytes of tmp): m6 accumulates row a, m7 accumulates row b. The mova
    ; instructions interleaved with the dot products slide the row-pair
    ; window down by two rows for the next iteration.
4099*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
4100*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m1    ; b0
4101*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
4102*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m0    ; a0
4103*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4104*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m3    ; b1
4105*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
4106*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m2    ; a1
4107*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
4108*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m5    ; b2
4109*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4110*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m4    ; a2
4111*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+strideq*1]
4112*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4113*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+strideq*0], 1
4114*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, m5     ; 78
4115*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m3, m5, 16 ; 67
4116*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m15, m5    ; b3
4117*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m15, m4    ; a3
4118*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m11, m7
4119*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m6
4120*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4121*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4122*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
4123*c0909341SAndroid Build Coastguard Worker    RET
4124*c0909341SAndroid Build Coastguard Worker.v_w32:
4125*c0909341SAndroid Build Coastguard Worker.v_w64:
4126*c0909341SAndroid Build Coastguard Worker.v_w128:
    ; Vertical-only filter shared by the three widest jump-table slots. The
    ; block is processed in 64-byte wide column strips: .v_w32_loop0 runs once
    ; per strip, .v_w32_loop once per pair of output rows.
    ; Registers up to m22 are used, hence WIN64_PUSH_XMM 23.
    ; NOTE(review): r8 lies outside the 8 GPRs declared by cglobal; presumably
    ; x86inc maps it to a callee-saved register on WIN64, which would explain
    ; the manual push/pop -- confirm against x86inc.asm.
4127*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       23
4128*c0909341SAndroid Build Coastguard Worker%if WIN64
4129*c0909341SAndroid Build Coastguard Worker    push                 r8
4130*c0909341SAndroid Build Coastguard Worker%endif
    ; m11 = byte-permutation indices used to merge the low/high accumulators.
    ; NOTE(review): presumably prep_endC selects the bytes of each 32-bit sum
    ; that form the 16-bit result -- table defined elsewhere.
4131*c0909341SAndroid Build Coastguard Worker    mova                m11, [prep_endC]
    ; r5 packs two counters: the low byte keeps h (restored with movzx after
    ; each strip), the bits above it hold (w*8-256)>>8, i.e. the number of
    ; additional strips. This assumes 0 < h < 256.
4132*c0909341SAndroid Build Coastguard Worker    lea                  r5, [hq+wq*8-256]
4133*c0909341SAndroid Build Coastguard Worker.v_w32_loop0:
    ; Strip prologue: load rows 0-6 and build the six interleaved row pairs.
    ; r7 = running source row pointer, r8 = running output pointer.
4134*c0909341SAndroid Build Coastguard Worker    movu                m16, [srcq+strideq*0]
4135*c0909341SAndroid Build Coastguard Worker    movu                m17, [srcq+strideq*1]
4136*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+r6]
4137*c0909341SAndroid Build Coastguard Worker    movu                m18, [srcq+strideq*2]
4138*c0909341SAndroid Build Coastguard Worker    movu                m19, [r7  +strideq*0]
4139*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
4140*c0909341SAndroid Build Coastguard Worker    movu                m20, [r7  +strideq*1]
4141*c0909341SAndroid Build Coastguard Worker    movu                m21, [r7  +strideq*2]
4142*c0909341SAndroid Build Coastguard Worker    add                  r7, r6
4143*c0909341SAndroid Build Coastguard Worker    movu                m22, [r7  +strideq*0]
    ; The "l"/"h" suffixes mark the punpcklwd/punpckhwd halves of a row pair.
4144*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m16, m17 ; 01l
4145*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m17      ; 01h
4146*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m17, m18 ; 12l
4147*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m18      ; 12h
4148*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m18, m19 ; 23l
4149*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m19      ; 23h
4150*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m19, m20 ; 34l
4151*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m20      ; 34h
4152*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m20, m21 ; 45l
4153*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m21      ; 45h
4154*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m21, m22 ; 56l
4155*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m22      ; 56h
4156*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
    ; Two output rows per iteration: m6/m8 = row a (low/high halves),
    ; m7/m9 = row b. All four accumulators start from the rounding constant
    ; in m10; the interleaved mova instructions slide the row-pair window
    ; down by two rows. m22 always holds the most recently loaded row.
4157*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
4158*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m0  ; a0l
4159*c0909341SAndroid Build Coastguard Worker    mova                 m8, m10
4160*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m12, m16 ; a0h
4161*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
4162*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m1  ; b0l
4163*c0909341SAndroid Build Coastguard Worker    mova                 m9, m10
4164*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m12, m17 ; b0h
4165*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
4166*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m2  ; a1l
4167*c0909341SAndroid Build Coastguard Worker    mova                m16, m18
4168*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m13, m18 ; a1h
4169*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4170*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m3  ; b1l
4171*c0909341SAndroid Build Coastguard Worker    mova                m17, m19
4172*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m13, m19 ; b1h
4173*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4174*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m4  ; a2l
4175*c0909341SAndroid Build Coastguard Worker    mova                m18, m20
4176*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m14, m20 ; a2h
4177*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
4178*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m5  ; b2l
4179*c0909341SAndroid Build Coastguard Worker    mova                m19, m21
4180*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m14, m21 ; b2h
4181*c0909341SAndroid Build Coastguard Worker    movu                m21, [r7+strideq*1]
4182*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+strideq*2]
4183*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m22, m21 ; 67l
4184*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m22, m21 ; 67h
4185*c0909341SAndroid Build Coastguard Worker    movu                m22, [r7+strideq*0]
4186*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m15, m4  ; a3l
4187*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m15, m20 ; a3h
4188*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m21, m22 ; 78l
4189*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m22      ; 78h
4190*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m15, m5  ; b3l
4191*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m15, m21 ; b3h
4192*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m11, m8
4193*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m11, m9
    ; Output rows are w*2 bytes apart, so each iteration advances r8 by w*4.
4194*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*0], m6
4195*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*2], m7
4196*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+wq*4]
4197*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4198*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
    ; Next strip: step 64 bytes right in both buffers, reload h from the low
    ; byte of r5 and decrement the strip counter kept above it.
4199*c0909341SAndroid Build Coastguard Worker    add                srcq, 64
4200*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4201*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5b
4202*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<8
4203*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop0
4204*c0909341SAndroid Build Coastguard Worker%if WIN64
4205*c0909341SAndroid Build Coastguard Worker    pop                  r8
4206*c0909341SAndroid Build Coastguard Worker%endif
4207*c0909341SAndroid Build Coastguard Worker    RET
4208*c0909341SAndroid Build Coastguard Worker.h_w4:
    ; Horizontal-only filter for w == 4, entered from .h with r6 = 3*stride
    ; and m10 = rounding constant already set up there.
    ; Only coefficient words 2-5 are used here (m6 = words 2-3, m7 = words
    ; 4-5), i.e. the middle four taps of the 8-entry filter row.
    ; NOTE(review): RESET_STACK_STATE is an x86inc macro; presumably it
    ; resets the assembler's stack bookkeeping after the preceding RET path
    ; that pushed r8/xmm registers -- confirm against x86inc.asm.
4209*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
    ; Filter index = low byte of mxd (4tap_h in the packing done at entry).
4210*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
    ; Move the source pointer 2 bytes to the left.
4211*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
    ; Load 8 signed byte coefficients and sign-extend them to words.
4212*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    ; r7m is the argument following my. NOTE(review): presumably
    ; bitdepth_max, so >>11 selects between two shift entries -- confirm.
4213*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m
4214*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [spel_h_shufA]
4215*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [spel_h_shufB]
4216*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
    ; ym9 = byte-permutation indices applied to the sums before the store.
4217*c0909341SAndroid Build Coastguard Worker    mova                ym9, [prep_endA]
4218*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8]
    ; tmp is used as scratch for the coefficient broadcasts; the loop below
    ; overwrites it with output.
4219*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xmm0
4220*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [tmpq+4]
4221*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [tmpq+8]
4222*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
    ; One iteration gathers 16 bytes from each of four source rows into the
    ; four 128-bit lanes of m2 and emits four output rows (32 bytes of tmp).
4223*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*0]
4224*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+strideq*1], 1
4225*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+strideq*2], 2
4226*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+r6       ], 3
4227*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; Accumulator starts from the rounding constant; each vpdpwssd adds a
    ; two-tap partial sum per dword.
4228*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
4229*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m4
4230*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m6, m1
4231*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
4232*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m7, m2
4233*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m9, m0
4234*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym0
4235*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
4236*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4237*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
4238*c0909341SAndroid Build Coastguard Worker    RET
4239*c0909341SAndroid Build Coastguard Worker.h_w8:
    ; Horizontal-only filter taken from .h when w < 16 (w == 4 has already
    ; been routed to .h_w4). On entry r6 = 3*stride, m10 = rounding constant
    ; and m12..m15 = coefficient word pairs 0-1, 2-3, 4-5, 6-7, all set up
    ; in .h.
    ; m6..m9 = one full-width byte-permutation table per coefficient pair.
    ; NOTE(review): shufB/shufC are loaded with movu while shufA/shufD use
    ; mova; presumably the former are not 64-byte aligned -- confirm against
    ; the table definitions.
4240*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_h_shufA]
4241*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_h_shufB]
4242*c0909341SAndroid Build Coastguard Worker    movu                 m8, [spel_h_shufC]
4243*c0909341SAndroid Build Coastguard Worker    mova                 m9, [spel_h_shufD]
    ; m11 = byte-permutation indices used to merge both accumulators.
4244*c0909341SAndroid Build Coastguard Worker    mova                m11, [prep_endB]
4245*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
    ; One iteration reads 32 bytes from each of four source rows (m4 = rows
    ; 0/1, m5 = rows 2/3) and emits four output rows (64 bytes of tmp).
4246*c0909341SAndroid Build Coastguard Worker    movu                ym4, [srcq+strideq*0]
4247*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [srcq+strideq*1], 1
4248*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+strideq*2]
4249*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+r6       ], 1
4250*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; m0 accumulates rows 0/1, m1 accumulates rows 2/3; both start from the
    ; rounding constant.
4251*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
4252*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
    ; Taps 0-1.
4253*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m4
4254*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m6, m5
4255*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m2
4256*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m3
    ; Taps 2-3.
4257*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m7, m4
4258*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m7, m5
4259*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m2
4260*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m3
    ; Taps 4-5.
4261*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m8, m4
4262*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, m5
4263*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m2
4264*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m3
    ; Taps 6-7.
4265*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m9, m4
4266*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m9, m5
4267*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m15, m2
4268*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m15, m3
4269*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m11, m1
4270*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
4271*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4272*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4273*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
4274*c0909341SAndroid Build Coastguard Worker    RET
4275*c0909341SAndroid Build Coastguard Worker.h:
    ; Horizontal path, taken from the entry code when (mxd & 0xf00) != 0.
    ; Routes to .hv when my is non-zero as well, to .h_w4 for w == 4, to
    ; .h_w8 for w < 16 and to .h_w32 for w > 16; w == 16 falls through into
    ; .h_w16_loop.
    ; m10 seeds every accumulator in the horizontal loops.
4276*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [prep_8tap_rnd]
4277*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
4278*c0909341SAndroid Build Coastguard Worker    jnz .hv
4279*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
4280*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
4281*c0909341SAndroid Build Coastguard Worker    je .h_w4
    ; Filter index = byte 2 of mxd (8tap_h in the packing done at entry).
4282*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
    ; Load 8 signed byte coefficients and sign-extend them to words.
4283*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    ; r7m is the argument following my. NOTE(review): presumably
    ; bitdepth_max, so >>11 selects between two shift entries -- confirm.
4284*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m
    ; Move the source pointer 6 bytes to the left.
4285*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
4286*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
4287*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8]
    ; tmp is used as scratch so the coefficient word pairs can be broadcast
    ; from memory: m12..m15 = taps 0-1, 2-3, 4-5, 6-7. The loops overwrite
    ; this area with output afterwards.
4288*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xmm0
4289*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0
4290*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+ 4]
4291*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+ 8]
4292*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [tmpq+12]
4293*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
4294*c0909341SAndroid Build Coastguard Worker    jl .h_w8
4295*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufA]
4296*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [spel_h_shufB]
4297*c0909341SAndroid Build Coastguard Worker    mova                m11, [prep_endC]
    ; Still consumes the flags of "cmp wd, 16" above: the vector loads in
    ; between do not modify EFLAGS.
4298*c0909341SAndroid Build Coastguard Worker    jg .h_w32
4299*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
    ; w == 16: two source rows per iteration, 64 bytes of tmp written.
    ; m2 holds the rows read at byte offset 0, m3 the same rows read at byte
    ; offset 16; m0/m1 are the two accumulators (a*/b* in the comments), each
    ; seeded with the rounding constant.
4300*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+strideq*0+ 0]
4301*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+strideq*1+ 0], 1
4302*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+strideq*0+16]
4303*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [srcq+strideq*1+16], 1
4304*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4305*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
4306*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
4307*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
4308*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m4 ; a0
4309*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m6
4310*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m4 ; b2
4311*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m7
4312*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m4 ; a1
4313*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m7
4314*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m15, m4 ; b3
    ; Combine the upper qword of each m2 lane with the lower qword of the
    ; matching m3 lane; the shuffled result feeds both accumulators.
4315*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m3, 0x55
4316*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
4317*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m4 ; a2
4318*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m4 ; b0
4319*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
4320*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m15, m2 ; a3
4321*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m2 ; b1
4322*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m11, m1
4323*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
4324*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4325*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4326*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
4327*c0909341SAndroid Build Coastguard Worker    RET
4328*c0909341SAndroid Build Coastguard Worker.h_w32:
; Horizontal-only filter for w > 16. One row is processed at a time, 32
; outputs (64 stored bytes) per inner iteration. srcq is moved to the end of
; the row and wq is negated, so r6 counts upward from -w to 0 and
; [srcq+r6*2] walks the row from left to right.
; Register roles are the same as in .h_w16_loop: m6/m7 shuffles, m12-m15
; coefficient dwords, m11 = prep_endC, m10 = initial accumulator value
; (set outside this excerpt).
4329*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2] ; srcq += w * 2 bytes
4330*c0909341SAndroid Build Coastguard Worker    neg                  wq
4331*c0909341SAndroid Build Coastguard Worker.h_w32_loop0:
4332*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq ; restart the column counter for this row
4333*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
4334*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+ 0]
4335*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6*2+ 8] ; same data, 8 bytes further right
4336*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10 ; accumulator a
4337*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10 ; accumulator b
4338*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m6
4339*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m12, m4 ; a0
4340*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m3, m6
4341*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m4 ; b0
4342*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m14, m4 ; a2
4343*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r6*2+16] ; 16 bytes further right than m2
4344*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
4345*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m3 ; b1
4346*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m15, m3 ; a3
4347*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4, m6
4348*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m3 ; b2
4349*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
4350*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m13, m2 ; a1
4351*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7
4352*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m15, m4 ; b3
4353*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m11, m1 ; pick the output bytes from a and b
4354*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
4355*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4356*c0909341SAndroid Build Coastguard Worker    add                  r6, 32 ; 32 columns done; loop while r6 is still negative
4357*c0909341SAndroid Build Coastguard Worker    jl .h_w32_loop
4358*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq ; next row
4359*c0909341SAndroid Build Coastguard Worker    dec                  hd
4360*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop0
4361*c0909341SAndroid Build Coastguard Worker    RET
4362*c0909341SAndroid Build Coastguard Worker.hv:
; Combined horizontal + vertical filtering. The code up to the RET below
; handles w <= 4; wider blocks branch to .hv_w8.
; The horizontal pass here uses only coefficient dwords 1 and 2 of the
; horizontal filter (m8/m9); the vertical pass uses all four dwords of the
; vertical filter (m12-m15). m10 (set outside this excerpt) seeds the
; horizontal accumulators, m11 = pd_128 seeds the vertical ones.
4363*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pd_128]
4364*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
4365*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
4366*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; horizontal filter index = low byte of mx
4367*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8] ; 8 signed bytes -> 8 words
4368*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd reused as scratch: low byte of my
4369*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
4370*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4371*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd ; h == 4: use the low-byte index instead of my >> 16
4372*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m ; argument 7 -- presumably bitdepth_max, confirm against the prototype
4373*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+myq*8] ; vertical coefficients as words
4374*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3] ; r6 = 3 * stride
4375*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2 ; step back 2 bytes horizontally ...
4376*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11 ; index of the 8-byte prep_hv_shift entry
4377*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; ... and 3 rows vertically
4378*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8] ; scale horizontal coefficients by the table-selected shift
4379*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2 ; vertical coefficients << 2
; The output buffer doubles as scratch: both coefficient vectors are stored
; so individual dwords can be broadcast from memory. The loop below later
; overwrites these bytes with output.
4380*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], xmm0
4381*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+16], xmm1
4382*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm1 ; vertical coefficient dword 0
4383*c0909341SAndroid Build Coastguard Worker    movu               xm16, [srcq+strideq*0] ; row 0 -> lane 0
; k1 = 0xff0 serves two masked inserts: as a qword mask (vinserti64x2) bits
; 4-7 select the upper 256 bits; as a dword mask (vinserti32x4) bits 4-11
; select the two middle 128-bit lanes.
4384*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0xff0
4385*c0909341SAndroid Build Coastguard Worker    vinserti128        ym16, [srcq+strideq*1], 1 ; row 1 -> lane 1
4386*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r3d
4387*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m18, [srcq+strideq*2] ; row 2 in every lane
4388*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4389*c0909341SAndroid Build Coastguard Worker    vinserti64x2    m16{k1}, m18, [srcq+strideq*0], 3 ; lanes 2-3 = rows 2, 3; lanes 0-1 kept
4390*c0909341SAndroid Build Coastguard Worker    movu               xm17, [srcq+strideq*1] ; row 4 -> lane 0
4391*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4    ym18, [srcq+strideq*2] ; row 5 in the two low lanes
4392*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4393*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m17{k1}, m18, [srcq+strideq*0], 2 ; lanes 1-2 = rows 5, 6; lane 0 kept
4394*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [spel_h_shufA]
4395*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [spel_h_shufB]
4396*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [tmpq+ 4] ; horizontal coefficient dword 1
4397*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [tmpq+ 8] ; horizontal coefficient dword 2
4398*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
4399*c0909341SAndroid Build Coastguard Worker    mova                m19, [spel_shuf4a]
4400*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
4401*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m16, m5
4402*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m8, m0
4403*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m17, m5
4404*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m8, m0
4405*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+20] ; vertical coefficient dword 1
4406*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m6
4407*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+24] ; vertical coefficient dword 2
4408*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m6
4409*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [tmpq+28] ; vertical coefficient dword 3
4410*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m9, m16       ; 0 1 2 3
4411*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, m17       ; 4 5 6
4412*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_shuf4b]
4413*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m19, m2       ; 01 12 23 34
4414*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m19, m2       ; 45 56
4415*c0909341SAndroid Build Coastguard Worker    mova               ym19, [prep_endA] ; m19 reused: output byte permute for the loop
4416*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m1, m2, q1032 ; 23 34 45 56
4417*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
; Four new source rows in, 32 bytes out per iteration. On entry srcq points
; at the last row already consumed, so the new rows are at +1, +2, +3 and +4
; strides. m1/m2 carry the interleaved row pairs between iterations.
4418*c0909341SAndroid Build Coastguard Worker    movu               xm17, [srcq+strideq*1]
4419*c0909341SAndroid Build Coastguard Worker    vinserti128        ym17, [srcq+strideq*2], 1
4420*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m16, [srcq+r6       ]
4421*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
4422*c0909341SAndroid Build Coastguard Worker    vinserti64x2    m17{k1}, m16, [srcq+strideq*0], 3 ; upper lanes = rows +3, +4; lower lanes kept
4423*c0909341SAndroid Build Coastguard Worker    mova                m18, m10 ; horizontal accumulator
4424*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m17, m5
4425*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m8, m16
4426*c0909341SAndroid Build Coastguard Worker    mova                m16, m11 ; vertical accumulator
4427*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m12, m1       ; a0 b0 c0 d0
4428*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m6
4429*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m9, m17       ; 7 8 9 a
4430*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4431*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m13, m2       ; a1 b1 c1 d1
4432*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m7, m18       ; 67 78 89 9a
4433*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m15, m2       ; a3 b3 c3 d3
4434*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
4435*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m14, m1       ; a2 b2 c2 d2
4436*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m19, m16 ; gather the output bytes into the low 256 bits
4437*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym16
4438*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
4439*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows per iteration
4440*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
4441*c0909341SAndroid Build Coastguard Worker    vzeroupper
4442*c0909341SAndroid Build Coastguard Worker    RET
4443*c0909341SAndroid Build Coastguard Worker.hv_w8:
; Horizontal + vertical filtering for w >= 8. The coefficient setup down to
; "cmp wd, 8" is shared with .hv_w16; the remainder handles w == 8.
;   m12-m15 = horizontal coefficient dwords 0-3
;   m16-m19 = vertical coefficient dwords 0-3
;   m10     = initial horizontal accumulator value (set outside this excerpt)
;   m11     = initial vertical accumulator value (pd_128, loaded at .hv)
4444*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; horizontal filter index = mx >> 16
4445*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm0, [base+subpel_filters+mxq*8] ; 8 signed bytes -> 8 words
4446*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd reused as scratch: low byte of my
4447*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
4448*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4449*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the low-byte index instead of my >> 16
4450*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m ; argument 7 -- presumably bitdepth_max, confirm against the prototype
4451*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xmm1, [base+subpel_filters+myq*8] ; vertical coefficients as words
4452*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3] ; r6 = 3 * stride
4453*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6 ; step back 6 bytes horizontally ...
4454*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11 ; index of the 8-byte prep_hv_shift entry
4455*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; ... and 3 rows vertically
4456*c0909341SAndroid Build Coastguard Worker    psllw              xmm0, [base+prep_hv_shift+r5*8] ; scale horizontal coefficients by the table-selected shift
4457*c0909341SAndroid Build Coastguard Worker    psllw              xmm1, 2 ; vertical coefficients << 2
; The output buffer doubles as scratch so single coefficient dwords can be
; broadcast from memory; it is overwritten with real output later.
4458*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], xmm0
4459*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+16], xmm1
4460*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, xmm0
4461*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [tmpq+ 4]
4462*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [tmpq+ 8]
4463*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [tmpq+12]
4464*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, xmm1
4465*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [tmpq+20]
4466*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [tmpq+24]
4467*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [tmpq+28]
4468*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
4469*c0909341SAndroid Build Coastguard Worker    jg .hv_w16
4470*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      23
; Prime the vertical pipeline: load rows 0-6 (two rows per register, row 6
; alone) and run the horizontal filter on them. m5-m8 hold the
; spel_h_shufA..D tables, used here as vpermb indices.
4471*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_shufA]
4472*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
4473*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*1], 1 ; 0 1
4474*c0909341SAndroid Build Coastguard Worker    movu                ym9, [srcq+strideq*2]
4475*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4476*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, [srcq+strideq*0], 1 ; 2 3
4477*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+strideq*1]
4478*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, [srcq+strideq*2], 1 ; 4 5
4479*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
4480*c0909341SAndroid Build Coastguard Worker    movu               ym21, [srcq+strideq*0]    ; 6
4481*c0909341SAndroid Build Coastguard Worker    movu                 m6, [spel_h_shufB]
4482*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_h_shufC]
4483*c0909341SAndroid Build Coastguard Worker    mova               ym22, [prep_endB] ; output byte permute used in the loop
4484*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m0
4485*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
4486*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m8  ; a0 b0
4487*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m9
4488*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
4489*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m8  ; c0 d0
4490*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m20
4491*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
4492*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m8  ; e0 f0
4493*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m21
4494*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
4495*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m8  ; g0
4496*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m0
4497*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m8  ; a1 b1
4498*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m9
4499*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m8  ; c1 d1
4500*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m20
4501*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m8  ; e1 f1
4502*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m21
4503*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m8  ; g1
4504*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m0
4505*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m8  ; a2 b2
4506*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m9
4507*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m8  ; c2 d2
4508*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m20
4509*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m8  ; e2 f2
4510*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m7, m21
4511*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m8  ; g2
4512*c0909341SAndroid Build Coastguard Worker    mova                 m8, [spel_h_shufD] ; m8 stops being scratch: it holds shufD from here on
4513*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0
4514*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m15, m0  ; a3 b3
4515*c0909341SAndroid Build Coastguard Worker    mova                 m0, [spel_shuf8a] ; m0 reused: row-pair interleave permute
4516*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m8, m9
4517*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m15, m9  ; c3 d3
4518*c0909341SAndroid Build Coastguard Worker    mova                 m9, [spel_shuf8b] ; m9 reused: interleave permute for the loop
4519*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m8, m20
4520*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m15, m20 ; e3 f3
4521*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m8, m21
4522*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m15, m21 ; g3
4523*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m0, m2   ; 01 12
4524*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m0, m3   ; 23 34
4525*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m0, m4   ; 45 56
4526*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
; Two new source rows in, 32 bytes out per iteration. m4 accumulates the
; horizontal filter of the new rows, m20 the vertical filter over the row
; pairs held in m1-m3, which slide down by one register each iteration.
4527*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*1]
4528*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4529*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*0], 1
4530*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
4531*c0909341SAndroid Build Coastguard Worker    mova                m20, m11
4532*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m5, m0
4533*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m21 ; h0 i0
4534*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m6, m0
4535*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m16, m1  ; A0 B0
4536*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m21 ; h1 i1
4537*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m7, m0
4538*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4539*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m17, m2  ; A1 B1
4540*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m21 ; h2 i2
4541*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m8, m0
4542*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4543*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m18, m3  ; A2 B2
4544*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m15, m21 ; h3 i3
4545*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m9, m4   ; 67 78
4546*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m19, m3  ; A3 B3
4547*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m22, m20 ; gather the output bytes into the low 256 bits
4548*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym20
4549*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
4550*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows per iteration
4551*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
4552*c0909341SAndroid Build Coastguard Worker    RET
4553*c0909341SAndroid Build Coastguard Worker.hv_w16:
; Horizontal + vertical filtering for w >= 16, done in column strips of 16
; outputs (32 bytes of source and of output per row). For each strip,
; .hv_w16_loop0 primes the pipeline with seven horizontally filtered rows
; and .hv_w16_loop then emits two output rows per iteration.
;   m12-m15 / m16-m19 = horizontal / vertical coefficient dwords (set at .hv_w8)
;   m20/m21 = spel_h_shufA/B, m9 = spel_shuf16, m26 = prep_endB
;   r7 = source pointer inside the strip, r8 = output pointer inside the strip
;   r5 = packed counter: low byte = h, bits 8 and up = strips still to do
4554*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      27
4555*c0909341SAndroid Build Coastguard Worker%if WIN64
4556*c0909341SAndroid Build Coastguard Worker    push                 r8 ; r8 is used as scratch below; restored before RET
4557*c0909341SAndroid Build Coastguard Worker%endif
4558*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [spel_h_shufA]
4559*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m21, [spel_h_shufB]
4560*c0909341SAndroid Build Coastguard Worker    add                  wd, wd ; wq = 2*w, used below as the output row pitch in bytes
4561*c0909341SAndroid Build Coastguard Worker    mova                 m9, [spel_shuf16]
4562*c0909341SAndroid Build Coastguard Worker    mova                m26, [prep_endB]
4563*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq+wq*8-256] ; h + (w/16 - 1) * 256, since wq was doubled above
4564*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
; Row 0 fills one register pair on its own: m4 = {+0, +8} and m5 = {+8, +16}
; byte offsets in the low/high halves. The remaining rows are loaded two
; per register with the +0 and +16 byte variants in separate registers.
4565*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m5, [srcq+strideq*0+ 8]
4566*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, m5, [srcq+strideq*0+ 0], 0
4567*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+strideq*0+16], 1 ; 0
4568*c0909341SAndroid Build Coastguard Worker    movu                ym6, [srcq+strideq*1+ 0]
4569*c0909341SAndroid Build Coastguard Worker    movu                ym7, [srcq+strideq*1+16]
4570*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+r6] ; r7 = row 3 of this strip
4571*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [srcq+strideq*2+ 0], 1
4572*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, [srcq+strideq*2+16], 1 ; 1 2
4573*c0909341SAndroid Build Coastguard Worker    movu               ym22, [r7  +strideq*0+ 0]
4574*c0909341SAndroid Build Coastguard Worker    movu               ym23, [r7  +strideq*0+16]
4575*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq ; output pointer for this strip
4576*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, [r7  +strideq*1+ 0], 1
4577*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m23, [r7  +strideq*1+16], 1 ; 3 4
4578*c0909341SAndroid Build Coastguard Worker    movu               ym24, [r7  +strideq*2+ 0]
4579*c0909341SAndroid Build Coastguard Worker    movu               ym25, [r7  +strideq*2+16]
4580*c0909341SAndroid Build Coastguard Worker    add                  r7, r6 ; r7 = row 6
4581*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, [r7  +strideq*0+ 0], 1
4582*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, [r7  +strideq*0+16], 1 ; 5 6
; Horizontal filter of rows 0-6. Accumulators m1-m7 start from m10 (set
; outside this excerpt).
4583*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m20
4584*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
4585*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m12, m0    ; a0
4586*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6, m20
4587*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
4588*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m12, m0    ; b0
4589*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7, m20
4590*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
4591*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m14, m0    ; c2
4592*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m21
4593*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m13, m0    ; a1
4594*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6, m21
4595*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m13, m0    ; b1
4596*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7, m21
4597*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m15, m0    ; c3
4598*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m20
4599*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m14, m0    ; a2
4600*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m7, 0x55 ; per lane: high qword of m6, low qword of m7 (the +8 byte data)
4601*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6, m20
4602*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m7    ; b2
4603*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m12, m7    ; c0
4604*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m21
4605*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m15, m5    ; a3
4606*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m21
4607*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m15, m6    ; b3
4608*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m13, m6    ; c1
4609*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m22, m20
4610*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
4611*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m12, m0    ; d0
4612*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m23, m20
4613*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
4614*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m0    ; e2
4615*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m20
4616*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
4617*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m12, m0    ; f0
4618*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m20
4619*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
4620*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m0    ; g2
4621*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m22, m21
4622*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m13, m0    ; d1
4623*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m23, m21
4624*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m15, m0    ; e3
4625*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m21
4626*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m13, m0    ; f1
4627*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m21
4628*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m15, m0    ; g3
4629*c0909341SAndroid Build Coastguard Worker    shufpd              m22, m23, 0x55
4630*c0909341SAndroid Build Coastguard Worker    pshufb              m23, m22, m20
4631*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m14, m23   ; d2
4632*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m12, m23   ; e0
4633*c0909341SAndroid Build Coastguard Worker    shufpd              m24, m25, 0x55
4634*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m24, m20
4635*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m14, m25   ; f2
4636*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m25   ; g0
4637*c0909341SAndroid Build Coastguard Worker    pshufb              m22, m21
4638*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m15, m22   ; d3
4639*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m13, m22   ; e1
4640*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m21
4641*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m15, m24   ; f3
4642*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m24   ; g1
; Turn the filtered rows into the row-pair registers consumed by the
; vertical pass (01, 12, 23, 34, 45, 56).
4643*c0909341SAndroid Build Coastguard Worker    pslldq               m1, 1
4644*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m9, m3     ; 12
4645*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m9, m5     ; 34
4646*c0909341SAndroid Build Coastguard Worker    vpermt2b             m6, m9, m7     ; 56
4647*c0909341SAndroid Build Coastguard Worker    vpshrdd              m1, m2, 16     ; 01
4648*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 23
4649*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m4, m6, 16 ; 45
4650*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
; Two new source rows in, two 32-byte output rows out per iteration.
; m7/m8 accumulate the horizontal filter of the new rows, m22/m23 the
; vertical filter for the two output rows; m1-m6 slide down two rows.
4651*c0909341SAndroid Build Coastguard Worker    movu               ym24, [r7+strideq*1+ 0]
4652*c0909341SAndroid Build Coastguard Worker    movu               ym25, [r7+strideq*1+16]
4653*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+strideq*2]
4654*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, [r7+strideq*0+ 0], 1
4655*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, [r7+strideq*0+16], 1
4656*c0909341SAndroid Build Coastguard Worker    mova                 m7, m10
4657*c0909341SAndroid Build Coastguard Worker    mova                 m8, m10
4658*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m20
4659*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m12, m0    ; h0
4660*c0909341SAndroid Build Coastguard Worker    mova                m22, m11
4661*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m20
4662*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m14, m0    ; i2
4663*c0909341SAndroid Build Coastguard Worker    mova                m23, m11
4664*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m16, m1    ; A0
4665*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4666*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m16, m2    ; B0
4667*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4668*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m24, m21
4669*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m13, m0    ; h1
4670*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m25, m21
4671*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m15, m0    ; i3
4672*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m17, m3    ; A1
4673*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
4674*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m17, m4    ; B1
4675*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
4676*c0909341SAndroid Build Coastguard Worker    shufpd              m24, m25, 0x55
4677*c0909341SAndroid Build Coastguard Worker    pshufb              m25, m24, m20
4678*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m14, m25   ; h2
4679*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m12, m25   ; i0
4680*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m18, m5    ; A2
4681*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m18, m6    ; B2
4682*c0909341SAndroid Build Coastguard Worker    pshufb              m24, m21
4683*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m7, m15, m24   ; h3
4684*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m13, m24   ; i1
4685*c0909341SAndroid Build Coastguard Worker    vpermt2b             m7, m9, m8     ; 78
4686*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m6, m7, 16 ; 67
4687*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m22, m19, m5    ; A3
4688*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m23, m19, m7    ; B3
4689*c0909341SAndroid Build Coastguard Worker    mova                 m6, m7
4690*c0909341SAndroid Build Coastguard Worker    vpermt2b            m22, m26, m23 ; pick the output bytes from both accumulators
4691*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*0], ym22 ; first output row of this strip
4692*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r8+wq*1], m22, 1 ; second output row, one row pitch further
4693*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+wq*2]
4694*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows per iteration
4695*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
4696*c0909341SAndroid Build Coastguard Worker    add                srcq, 32 ; next strip: 32 bytes to the right in the source ...
4697*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32 ; ... and in the output
4698*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5b ; reload h from the low byte of the packed counter
4699*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<8 ; one strip done; result stays positive while strips remain
4700*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
4701*c0909341SAndroid Build Coastguard Worker%if WIN64
4702*c0909341SAndroid Build Coastguard Worker    pop                  r8
4703*c0909341SAndroid Build Coastguard Worker%endif
4704*c0909341SAndroid Build Coastguard Worker    RET
4705*c0909341SAndroid Build Coastguard Worker
; Select the register behind the t0 scratch alias used by the warp_affine
; functions that follow: register 5 on WIN64 builds, register 7 elsewhere.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk (presumably in
; x86inc.asm) - confirm the exact aliasing there.
4706*c0909341SAndroid Build Coastguard Worker%if WIN64
4707*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
4708*c0909341SAndroid Build Coastguard Worker%else
4709*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
4710*c0909341SAndroid Build Coastguard Worker%endif
4711*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8t_16bpc(tmp, ts, ...)
; Variant of warp_affine_8x8_16bpc (defined right below) that reuses that
; function's .main/.main2 filtering code and its .end store tail.
; Differences visible here:
;   - m8 (vertical rounding) comes from warp_8x8t_rnd_v and is not indexed
;     by t0, whereas m1 (horizontal rounding) is taken from
;     warp_8x8_rnd_h[t0]
;   - every m16 result is shifted right by 15 and pairs are packed with
;     signed saturation (packssdw) into m14 / m15
;   - ts is doubled before jumping to .end, which addresses the output rows
;     through the second argument register
; t0 = r7m >> 11 is the table index.
; NOTE(review): r7m carries no comment here; warp_affine_8x8_16bpc labels
; the same operand pixel_max - presumably identical, confirm.
4712*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
4713*c0909341SAndroid Build Coastguard Worker%define base r6-pd_0to7
4714*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r7m
4715*c0909341SAndroid Build Coastguard Worker    lea                  r6, [pd_0to7]
4716*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
4717*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+warp_8x8t_rnd_v]
4718*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
    ; .main performs the setup and returns the first result in m16; each
    ; following .main2 call returns the next result in m16.
4719*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
4720*c0909341SAndroid Build Coastguard Worker    psrad               m14, m16, 15
4721*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
4722*c0909341SAndroid Build Coastguard Worker    psrad               m16, 15
4723*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m16
4724*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
4725*c0909341SAndroid Build Coastguard Worker    psrad               m15, m16, 15
4726*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    ; ts *= 2 (presumably converts a stride in 16-bit elements to bytes;
    ; TODO confirm against the C prototype)
4727*c0909341SAndroid Build Coastguard Worker    add                 tsq, tsq
4728*c0909341SAndroid Build Coastguard Worker    psrad               m16, 15
4729*c0909341SAndroid Build Coastguard Worker    packssdw            m15, m16
    ; tail-jump into the shared store code; m14 / m15 hold the packed output
4730*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
4731*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8_16bpc(dst, ds, src, ss, abcd, ...)
; Filters one block through .main/.main2 and stores it as 8 rows of
; 16 bytes each.
;   t0 = pixel_max >> 11, index into the rounding and shift tables
;   m1 = warp_8x8_rnd_h[t0] (horizontal rounding, consumed by .main)
;   m8 = warp_8x8_rnd_v[t0] (vertical rounding, consumed by .main2)
; .main and each .main2 call return 32-bit results in m16. They are shifted
; right by 13, packed pairwise with unsigned saturation into m14 / m15 and
; finally shifted right per word by bidir_shift[t0].
; `base` is the r6-relative %define made in warp_affine_8x8t_16bpc above.
4732*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
4733*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r7m ; pixel_max
4734*c0909341SAndroid Build Coastguard Worker    lea                  r6, [pd_0to7]
4735*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
4736*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
4737*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+warp_8x8_rnd_v+t0*4]
4738*c0909341SAndroid Build Coastguard Worker    call .main
4739*c0909341SAndroid Build Coastguard Worker    psrad               m14, m16, 13
4740*c0909341SAndroid Build Coastguard Worker    call .main2
4741*c0909341SAndroid Build Coastguard Worker    psrad               m16, 13
4742*c0909341SAndroid Build Coastguard Worker    packusdw            m14, m16
4743*c0909341SAndroid Build Coastguard Worker    call .main2
4744*c0909341SAndroid Build Coastguard Worker    psrad               m15, m16, 13
4745*c0909341SAndroid Build Coastguard Worker    call .main2
    ; m0 is loaded only after the last call because .h (reached through
    ; .main2) returns its result in m0
4746*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+bidir_shift+t0*4]
4747*c0909341SAndroid Build Coastguard Worker    vpsrlvw             m14, m0
4748*c0909341SAndroid Build Coastguard Worker    psrad               m16, 13
4749*c0909341SAndroid Build Coastguard Worker    packusdw            m15, m16
4750*c0909341SAndroid Build Coastguard Worker    vpsrlvw             m15, m0
    ; Shared store tail, also the jump target of warp_affine_8x8t_16bpc.
    ; In: m14 = first packed half, m15 = second packed half.
    ; Each half is byte-permuted with warp8x8_end and written as four
    ; 128-bit rows; r2 is overwritten with 3*ds to address the fourth row.
4751*c0909341SAndroid Build Coastguard Worker.end:
4752*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+warp8x8_end]
4753*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m0, m14
4754*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dsq*3]
4755*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm16
4756*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+dsq*1], ym16, 1
4757*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*2], m16, 2
4758*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2   ], m16, 3
4759*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m0, m15
    ; advance dst by four rows for the second half
4760*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
4761*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm16
4762*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+dsq*1], ym16, 1
4763*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*2], m16, 2
4764*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2   ], m16, 3
4765*c0909341SAndroid Build Coastguard Worker    RET
; .main: one-time setup, horizontal filtering of the first source row
; inline, three .h calls, then falls through into .main2.
; In:  m1 = horizontal rounding, m8 = vertical rounding, t0 = table index,
;      srcq/ssq = source pointer / stride, abcd (register or stack), mx, my
; Out: m16 = vertical accumulator produced by .main2
; State left for later .main2/.h calls:
;      ym18 = running mx, ym19 = beta, ym20 = running my, ym21 = delta,
;      m1/m2/m3 = window of horizontally filtered row pairs,
;      m7 = 2*m1 as passed in (accumulator seed for .h), m9 = shift control,
;      m10/m11 = source byte permutes, m12/m13 = coefficient shuffles,
;      r4 = filter table pointer, k2 = all-ones gather mask
; NOTE(review): the entry point reads pixel_max from r7m, yet r6m/r7m are
; annotated mx/my below. This code is reached through `call`, so the rNm
; operands presumably resolve one stack slot lower than at function entry
; (return address pushed) - confirm against the rNm definition in x86inc.
4766*c0909341SAndroid Build Coastguard Worker.main:
4767*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym3, [base+pd_512]
4768*c0909341SAndroid Build Coastguard Worker%if WIN64
4769*c0909341SAndroid Build Coastguard Worker    mov               abcdq, r5mp
4770*c0909341SAndroid Build Coastguard Worker    vpaddd             ym18, ym3, r6m {1to8} ; mx
4771*c0909341SAndroid Build Coastguard Worker%else
4772*c0909341SAndroid Build Coastguard Worker    add                 r5d, 512
4773*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym18, r5d
4774*c0909341SAndroid Build Coastguard Worker%endif
4775*c0909341SAndroid Build Coastguard Worker    vpaddd             ym20, ym3, r7m {1to8} ; my
4776*c0909341SAndroid Build Coastguard Worker    mova               ym16, [base+pd_0to7]
4777*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym19, [abcdq+4*0]     ; alpha
4778*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym21, [abcdq+4*1]     ; gamma
    ; r4 = 3 rows + 6 bytes; src is moved back by that amount below
4779*c0909341SAndroid Build Coastguard Worker    lea                  r4, [ssq*3+6]
    ; word dot products against pd_0to7 add a per-lane multiple of the low
    ; word of ym19 / ym21 (presumably lane index 0..7 - depends on the
    ; contents of pd_0to7, which is defined outside this chunk)
4780*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym18, ym19, ym16      ; tmx
4781*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym20, ym21, ym16      ; tmy
4782*c0909341SAndroid Build Coastguard Worker    sub                srcq, r4
4783*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+warp8x8_permA]
    ; r4 now points 64 eight-byte entries into mc_warp_filter; the gathers
    ; index it with (mx or my) >> 10 scaled by 8
4784*c0909341SAndroid Build Coastguard Worker    lea                  r4, [mc_warp_filter+64*8]
4785*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m12, [base+warp8x8_permC]
    ; k1 = all ones (8 lanes)
4786*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1
4787*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m13, [base+warp8x8_permD]
    ; m5 = 32 source bytes at src+0 (low half) and at src+8 (high half)
4788*c0909341SAndroid Build Coastguard Worker    movu                ym5, [srcq+0]
4789*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [srcq+8], 1
4790*c0909341SAndroid Build Coastguard Worker    psrad              ym17, ym18, 10
4791*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+warp8x8_permB]
    ; keep an intact copy of the mask in k2: a gather clears the mask
    ; register it is given
4792*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
4793*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m3{k1}, [r4+ym17*8]    ; filter_x0
4794*c0909341SAndroid Build Coastguard Worker    psrad              ym19, 16             ; beta
4795*c0909341SAndroid Build Coastguard Worker    psrad              ym21, 16             ; delta
4796*c0909341SAndroid Build Coastguard Worker    paddd              ym18, ym19
4797*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m10, m5
4798*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m9, [base+warp_shift_h+t0*8]
4799*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3120
    ; m7 = 2 * horizontal rounding; .h starts every accumulation from m7,
    ; while the first row below accumulates directly onto m1
4800*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1, m1
4801*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m12
4802*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, m2
4803*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m11, m5
4804*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m5, q1021
4805*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m13
4806*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, m3
    ; each .h call returns the next two horizontally filtered rows in m0
4807*c0909341SAndroid Build Coastguard Worker    call .h
4808*c0909341SAndroid Build Coastguard Worker    psllq                m2, m1, 32
4809*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
4810*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m1, m9, m1
4811*c0909341SAndroid Build Coastguard Worker    vpshrdq              m1, m0, 48          ; 01 12
4812*c0909341SAndroid Build Coastguard Worker    call .h
4813*c0909341SAndroid Build Coastguard Worker    vpshrdq              m2, m1, m0, 48      ; 23 34
4814*c0909341SAndroid Build Coastguard Worker    call .h
4815*c0909341SAndroid Build Coastguard Worker    vpshrdq              m3, m2, m0, 48      ; 45 56
    ; .main2: filter two more source rows horizontally (.h), then apply the
    ; vertical filter over the row window m1/m2/m3 plus the new rows.
    ; Out: m16 = m8 + four word dot products (a0..a3 / b0..b3).
    ; Side effects: ym20 advances by 2*delta, the window shifts
    ; (m1 <- m2, m2 <- m3, m3 <- newest rows), k2 is restored to all ones.
4816*c0909341SAndroid Build Coastguard Worker.main2:
4817*c0909341SAndroid Build Coastguard Worker    call .h
4818*c0909341SAndroid Build Coastguard Worker    psrad               ym6, ym20, 10
4819*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
    ; ym17 must be computed before the gather below overwrites m20
4820*c0909341SAndroid Build Coastguard Worker    paddd              ym17, ym20, ym21      ; my += delta
4821*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m20{k2}, [r4+ym6*8]      ; filter_y0
4822*c0909341SAndroid Build Coastguard Worker    psrad              ym16, ym17, 10
4823*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
4824*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_y1
4825*c0909341SAndroid Build Coastguard Worker    shufps               m5, m20, m6, q2020
    ; start the accumulator from the vertical rounding constant
4826*c0909341SAndroid Build Coastguard Worker    mova                m16, m8
4827*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5, m12
4828*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m1, m4          ; a0 b0
4829*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m13
4830*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4831*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m2, m5          ; a1 b1
4832*c0909341SAndroid Build Coastguard Worker    shufps               m6, m20, m6, q3131
    ; ym20 = my advanced by a second delta, ready for the next call
4833*c0909341SAndroid Build Coastguard Worker    paddd              ym20, ym17, ym21
4834*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6, m12
4835*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4836*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m3, m4          ; a2 b2
4837*c0909341SAndroid Build Coastguard Worker    vpshrdq              m3, m0, 48          ; 67 78
4838*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m13
4839*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m3, m6          ; a3 b3
4840*c0909341SAndroid Build Coastguard Worker    ret
; .h: horizontal filter for the next two source rows.
; In:  srcq/ssq, ym18 = running mx, ym19 = beta, m7 = accumulator seed,
;      m9 = vpmultishiftqb control, m10/m11 = source byte permutes,
;      m12/m13 = coefficient shuffles, r4 = filter table pointer,
;      k2 = all-ones gather mask
; Out: m0 = filtered result after vpmultishiftqb; srcq advanced by two
;      rows; ym18 advanced by 2*beta; k2 restored to all ones
; Clobbers: m4, m5, m6, m16, m17, k1
4841*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4842*c0909341SAndroid Build Coastguard Worker.h:
    ; m5 = row at src+ss (low 256 bits) and row at src+2*ss (high 256 bits)
4843*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+ssq*1]
4844*c0909341SAndroid Build Coastguard Worker    psrad               ym6, ym18, 10
4845*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4846*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, m16, [srcq+ssq*0], 1
    ; save the mask in k1: the gather below clears k2
4847*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
4848*c0909341SAndroid Build Coastguard Worker    paddd              ym17, ym18, ym19      ; mx += beta
4849*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m18{k2}, [r4+ym6*8]      ; filter_x1
4850*c0909341SAndroid Build Coastguard Worker    psrad              ym16, ym17, 10
    ; restore k2 for the next caller; this gather consumes k1 instead
4851*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
4852*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_x2
4853*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m10, m5
4854*c0909341SAndroid Build Coastguard Worker    shufps              m16, m18, m6, q2020
4855*c0909341SAndroid Build Coastguard Worker    shufps               m6, m18, m6, q3131
    ; accumulate on top of the seed prepared by .main
4856*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
4857*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m16, m12
4858*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m4, m18         ; a0 b0
4859*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m11, m5
4860*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m6, m13
4861*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m5, m18         ; a3 b3
    ; m18 was used as scratch above; reload ym18 with mx advanced by a
    ; second beta
4862*c0909341SAndroid Build Coastguard Worker    paddd              ym18, ym17, ym19
4863*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m4, m5, q1021
4864*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m13
4865*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m17, m16        ; a1 b1
4866*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m5, q2132
4867*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
4868*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m4, m6          ; a2 b2
4869*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m0, m9, m0          ; a a b b
4870*c0909341SAndroid Build Coastguard Worker    ret
4871*c0909341SAndroid Build Coastguard Worker
; BIDIR_FN: dispatch-and-store body shared by the functions that invoke it
; (avg_16bpc, w_avg_16bpc and mask_16bpc in this part of the file).
; Requirements on the invoking function:
;   - a local .main that returns 128 bytes of output in m0 (first 64) and
;     m1 (next 64) and advances its own input pointers
;   - wq = address of the .wN label to jump to, hd = block height
;   - dstq / strideq = destination pointer and row stride; stride3q is set
;     to 3*stride here
; Row sizes written per path: .w4 = 8 bytes, .w8 = 16, .w16 = 32, .w32 = 64,
; .w64 = 128, .w128 = 256 (two .main calls per row).
4872*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 0
4873*c0909341SAndroid Build Coastguard Worker    call .main
4874*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4875*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; .w4: m0 holds rows 0-7 and m1 rows 8-15 (8 bytes each); 4, 8 or 16
    ; rows are stored depending on h, with a single .main call
4876*c0909341SAndroid Build Coastguard Worker.w4:
4877*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm0
4878*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
4879*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
4880*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
4881*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
4882*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
    ; h < 8: only four rows to write
4883*c0909341SAndroid Build Coastguard Worker    jl .w4_end
4884*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
4885*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4886*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm2
4887*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
4888*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 3
4889*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
4890*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
    ; flags are still those of `cmp hd, 8` (nothing in between writes
    ; them): h == 8 is finished here
4891*c0909341SAndroid Build Coastguard Worker    je .w4_end
4892*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4893*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm1
4894*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
4895*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, ym1, 1
4896*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
4897*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
4898*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m1, 2
4899*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4900*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm0
4901*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
4902*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m1, 3
4903*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
4904*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
4905*c0909341SAndroid Build Coastguard Worker.w4_end:
4906*c0909341SAndroid Build Coastguard Worker    RET
    ; .w8: eight 16-byte rows per .main call (four from m0, four from m1)
4907*c0909341SAndroid Build Coastguard Worker.w8_loop:
4908*c0909341SAndroid Build Coastguard Worker    call .main
4909*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4910*c0909341SAndroid Build Coastguard Worker.w8:
4911*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
4912*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
4913*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
4914*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
4915*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
    ; negative result: fewer than eight rows remained, m1 is not stored
4916*c0909341SAndroid Build Coastguard Worker    jl .w8_end
4917*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4918*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm1
4919*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym1, 1
4920*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m1, 2
4921*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m1, 3
    ; flags are still those of `sub hd, 8` above
4922*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4923*c0909341SAndroid Build Coastguard Worker.w8_end:
4924*c0909341SAndroid Build Coastguard Worker    RET
    ; .w16: four 32-byte rows per .main call
4925*c0909341SAndroid Build Coastguard Worker.w16_loop:
4926*c0909341SAndroid Build Coastguard Worker    call .main
4927*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4928*c0909341SAndroid Build Coastguard Worker.w16:
4929*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
4930*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
4931*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
4932*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
4933*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4934*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4935*c0909341SAndroid Build Coastguard Worker    RET
    ; .w32: two 64-byte rows per .main call
4936*c0909341SAndroid Build Coastguard Worker.w32_loop:
4937*c0909341SAndroid Build Coastguard Worker    call .main
4938*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4939*c0909341SAndroid Build Coastguard Worker.w32:
4940*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
4941*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
4942*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4943*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
4944*c0909341SAndroid Build Coastguard Worker    RET
    ; .w64: one 128-byte row per .main call
4945*c0909341SAndroid Build Coastguard Worker.w64_loop:
4946*c0909341SAndroid Build Coastguard Worker    call .main
4947*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4948*c0909341SAndroid Build Coastguard Worker.w64:
4949*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
4950*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
4951*c0909341SAndroid Build Coastguard Worker    dec                  hd
4952*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
4953*c0909341SAndroid Build Coastguard Worker    RET
    ; .w128: one 256-byte row per iteration, filled by two .main calls
4954*c0909341SAndroid Build Coastguard Worker.w128_loop:
4955*c0909341SAndroid Build Coastguard Worker    call .main
4956*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4957*c0909341SAndroid Build Coastguard Worker.w128:
4958*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
4959*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
4960*c0909341SAndroid Build Coastguard Worker    call .main
4961*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*2], m0
4962*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*3], m1
4963*c0909341SAndroid Build Coastguard Worker    dec                  hd
4964*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
4965*c0909341SAndroid Build Coastguard Worker    RET
4966*c0909341SAndroid Build Coastguard Worker%endmacro
4967*c0909341SAndroid Build Coastguard Worker
; Select the register behind the t0 scratch alias for the functions that
; follow (avg_16bpc and w_avg_16bpc use t0): register 5 on WIN64 builds,
; register 7 elsewhere.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk (presumably in
; x86inc.asm) - confirm the exact aliasing there.
4968*c0909341SAndroid Build Coastguard Worker%if WIN64
4969*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
4970*c0909341SAndroid Build Coastguard Worker%else
4971*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
4972*c0909341SAndroid Build Coastguard Worker%endif
4973*c0909341SAndroid Build Coastguard Worker
; avg_16bpc(dst, stride, tmp1, tmp2, w, h, pixel_max)
; Combines two buffers of 16-bit intermediates (tmp1, tmp2) into dst.
;   wq  = jump target: 32-bit table entry at avg_avx512icl_table[tzcnt(w)],
;         sign-extended and added to the table address
;   t0  = pixel_max >> 11, index into avg_round / avg_shift
;   m2  = avg_round[t0], m3 = avg_shift[t0]
; BIDIR_FN expands to the width dispatch and the store loops; it calls the
; .main defined after it.
4974*c0909341SAndroid Build Coastguard Workercglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
4975*c0909341SAndroid Build Coastguard Worker%define base r6-avg_avx512icl_table
4976*c0909341SAndroid Build Coastguard Worker    lea                  r6, [avg_avx512icl_table]
4977*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4978*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r6m ; pixel_max
4979*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
4980*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
4981*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+avg_round+t0*4]
4982*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+avg_shift+t0*4]
4983*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4984*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
4985*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
    ; .main: produce the next 64 output words.
    ; In:  tmp1q / tmp2q, m2 = rounding constant, m3 = per-word shift counts
    ; Out: m0 = words 0-31, m1 = words 32-63; tmp1q / tmp2q advanced by 128
    ; Per word: ((max(sat(tmp1 + tmp2), m2) - m2) >> m3), logical shift
4986*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4987*c0909341SAndroid Build Coastguard Worker.main:
4988*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+64*0]
4989*c0909341SAndroid Build Coastguard Worker    paddsw               m0, [tmp2q+64*0]
4990*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+64*1]
4991*c0909341SAndroid Build Coastguard Worker    paddsw               m1, [tmp2q+64*1]
4992*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 64*2
4993*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 64*2
    ; clamp from below at m2 so that the subtraction cannot go negative
    ; before the logical shift
4994*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m2
4995*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m2
4996*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m2
4997*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2
4998*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m0, m3
4999*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m1, m3
5000*c0909341SAndroid Build Coastguard Worker    ret
5001*c0909341SAndroid Build Coastguard Worker
; w_avg_16bpc(dst, stride, tmp1, tmp2, w, h, weight, pixel_max)
; Weighted combination of two buffers of 16-bit intermediates.
;   wq = jump target: 32-bit table entry at w_avg_avx512icl_table[tzcnt(w)],
;        sign-extended and added to the table address
;   m5 = w_avg_round[t0], m7 = bidir_shift[t0], with t0 = pixel_max >> 11
;   m6 = dword (weight << 16) + (16 - weight) broadcast to all lanes
; BIDIR_FN expands to the width dispatch and the store loops; it calls the
; .main defined after it.
5002*c0909341SAndroid Build Coastguard Workercglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
5003*c0909341SAndroid Build Coastguard Worker%define base r6-w_avg_avx512icl_table
5004*c0909341SAndroid Build Coastguard Worker    lea                  r6, [w_avg_avx512icl_table]
5005*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5006*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r7m ; pixel_max
5007*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
5008*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
5009*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+w_avg_round+t0*4]
5010*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+bidir_shift+t0*4]
5011*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; r6 and t0 are reused as scratch from here on, so every base-relative
    ; and t0-indexed load has to precede this point
5012*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r6m ; weight
5013*c0909341SAndroid Build Coastguard Worker    lea                 t0d, [r6-16]
5014*c0909341SAndroid Build Coastguard Worker    shl                 r6d, 16
5015*c0909341SAndroid Build Coastguard Worker    sub                 r6d, t0d ; 16-weight, weight
5016*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5017*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, r6d
5018*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
    ; .main: produce the next 64 output words.
    ; In:  tmp1q / tmp2q, m5 = rounding, m6 = word pairs as built above,
    ;      m7 = per-word shift counts
    ; Out: m0 = words 0-31, m1 = words 32-63; tmp1q / tmp2q advanced by 128
    ; Clobbers: m2, m3, m4
    ; Word pairs are interleaved as (tmp2, tmp1) and dotted with m6, i.e.
    ; m5 + tmp2*(low word of m6) + tmp1*(high word of m6), then >> 2,
    ; packed with unsigned saturation and shifted right by m7.
    ; punpck{l,h}wd and packusdw all work per 128-bit lane, so packing the
    ; low-half and high-half results restores the original word order.
5019*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5020*c0909341SAndroid Build Coastguard Worker.main:
5021*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp1q+64*0]
5022*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp2q+64*0]
5023*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+64*1]
5024*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp2q+64*1]
5025*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 64*2
5026*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 64*2
5027*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m3
5028*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3
5029*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0
5030*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
    ; each accumulator starts from the rounding constant in m5
5031*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
5032*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m6, m2
5033*c0909341SAndroid Build Coastguard Worker    mova                 m2, m5
5034*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m6, m1
5035*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5
5036*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m6, m3
5037*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
5038*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m6, m4
5039*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m2, m1, m3
5040*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m2
5041*c0909341SAndroid Build Coastguard Worker    packusdw             m1, m3
5042*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m0, m7
5043*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m1, m7
5044*c0909341SAndroid Build Coastguard Worker    ret
5045*c0909341SAndroid Build Coastguard Worker
; mask_16bpc(dst, stride, tmp1, tmp2, w, h, mask, pixel_max)
; Blends two buffers of 16-bit intermediates with a per-pixel byte mask.
;   wq  = jump target: 32-bit table entry at mask_avx512icl_table[tzcnt(w)],
;         sign-extended and added to the table address
;   r6  = pixel_max >> 11, index into mask_round / bidir_shift
;   m8  = pw_64, m9 = mask_round[r6], m10 = bidir_shift[r6]
; BIDIR_FN expands to the width dispatch and the store loops; it calls the
; .main defined after it.
5046*c0909341SAndroid Build Coastguard Workercglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
5047*c0909341SAndroid Build Coastguard Worker%define base r7-mask_avx512icl_table
5048*c0909341SAndroid Build Coastguard Worker    lea                  r7, [mask_avx512icl_table]
5049*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5050*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; pixel_max
5051*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5052*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
5053*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
5054*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pw_64]
5055*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+mask_round+r6*4]
5056*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+bidir_shift+r6*4]
    ; r6 (maskq) is overwritten only after the two r6-indexed table loads
5057*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5058*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5059*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
    ; .main: produce the next 64 output words.
    ; In:  maskq, tmp1q, tmp2q, m8 = 64 per word, m9 = rounding,
    ;      m10 = per-word shift counts
    ; Out: m0 = words 0-31, m1 = words 32-63; maskq advanced by 64 bytes,
    ;      tmp1q / tmp2q by 128 bytes
    ; Clobbers: m2 - m7
    ; Per word: (m9 + tmp1*m + tmp2*(64-m)) >> 4, packed with unsigned
    ; saturation, then shifted right by m10.
5060*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5061*c0909341SAndroid Build Coastguard Worker.main:
    ; 32 mask bytes are zero-extended to 32 words per load
5062*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [maskq+32*0]
5063*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+64*0]
5064*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp2q+64*0]
5065*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m6, [maskq+32*1]
5066*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tmp1q+64*1]
5067*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp2q+64*1]
5068*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*2
5069*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 64*2
5070*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 64*2
    ; first 32 words: interleave (tmp1, tmp2) and (m, 64-m) word pairs
5071*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4, m2
5072*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2
5073*c0909341SAndroid Build Coastguard Worker    psubw                m0, m8, m1
5074*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0 ; m, 64-m
5075*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
5076*c0909341SAndroid Build Coastguard Worker    mova                 m0, m9
5077*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m7, m2
5078*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9
5079*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
    ; second 32 words, same scheme with m5 / m3 / m6
5080*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m5, m3
5081*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3
5082*c0909341SAndroid Build Coastguard Worker    psubw                m1, m8, m6
5083*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m1
5084*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m1
5085*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
5086*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m7, m3
5087*c0909341SAndroid Build Coastguard Worker    mova                 m3, m9
5088*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m5, m6
5089*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 4}, m0, m2, m1, m3
5090*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m2
5091*c0909341SAndroid Build Coastguard Worker    packusdw             m1, m3
5092*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m0, m10
5093*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m1, m10
5094*c0909341SAndroid Build Coastguard Worker    ret
5095*c0909341SAndroid Build Coastguard Worker
; w_mask_420_16bpc (AVX-512): blends the intermediate buffers tmp1 and tmp2 into
; dst using a per-pixel weight m derived from |tmp1 - tmp2| (computed in .main),
; and writes a reduced mask to maskq.
;
; Stack arguments read here: r7m = sign (indexes w_mask_round) and
; r8m = pixel_max (pixel_max >> 11 indexes mask_round and bidir_shift).
;
; Constants loaded once and read by .main / the width paths:
;   m10 = pw_27615, m11 = pw_64, m12 = mask_round[], m13 = bidir_shift[],
;   m14 = w_mask_round[sign], ym15 = w_mask_end42x (byte-permute indices).
;
; Every .main call yields 128 bytes of output pixels in m0/m1 and the matching
; per-pixel weights (as words) in m2/m3. For w >= 32 the mask is visibly built
; from 2x2 groups: the weights of two rows are added with paddw and vpdpwssd
; with pw_64 then sums horizontally adjacent word pairs. The narrower widths
; reach the same dword accumulation through shuffle tables (w_mask_shuf4/8/16)
; that are defined elsewhere -- presumably the same 2x2 grouping; TODO confirm.
;
; NOTE(review): .main always loads 128 bytes from each of tmp1/tmp2 and the
; w4/w8/w16 paths always store 16 mask bytes, even for the smallest h --
; presumably the buffers are sized for that; confirm against the callers.
5096*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
5097*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_420_avx512icl_table
    ; r7 = jump-table address; "base" lets the constants below be addressed relative to r7
5098*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_420_avx512icl_table]
    ; wd = trailing-zero count of w, used as the jump-table index
5099*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5100*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
5101*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; r6 = pixel_max >> 11, the index into mask_round / bidir_shift
5102*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
    ; wq = sign-extended 32-bit table entry; r7 is added further down to form the jump target
5103*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
5104*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
5105*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pw_64]
5106*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+mask_round+r6*4]
5107*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+bidir_shift+r6*4]
5108*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
5109*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [base+w_mask_round+r6*4]
5110*c0909341SAndroid Build Coastguard Worker    mova               ym15, [w_mask_end42x]
5111*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5112*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
    ; the first batch is computed before dispatch, so every .wN label is entered
    ; with m0-m3 already holding results
5113*c0909341SAndroid Build Coastguard Worker    call .main
5114*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5115*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: m0/m1 hold up to 16 rows of 8 bytes each; a single .main call
    ; covers every supported h on this path (4, 8 or 16 rows are stored)
5116*c0909341SAndroid Build Coastguard Worker.w4:
5117*c0909341SAndroid Build Coastguard Worker    mova                 m4, [w_mask_shuf4]
    ; gather weight bytes from m2/m3 in the order given by w_mask_shuf4
5118*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m4, m3
5119*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
    ; per dword: m3 = w_mask_round[sign] + 64 * (sum of the four gathered bytes)
5120*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m2, [pb_64] {1to16}
    ; pick the output mask bytes using the w_mask_end42x indices
5121*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m15, m3
5122*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
5123*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5124*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
5125*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5126*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5127*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm3
    ; h < 8: only four rows are stored
5128*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5129*c0909341SAndroid Build Coastguard Worker    jl .w4_end
5130*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
5131*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5132*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5133*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5134*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 3
5135*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
5136*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
    ; still testing the flags of "cmp hd, 8" above: none of the lea / vector
    ; extract / store instructions in between modify EFLAGS. h == 8 stops here.
5137*c0909341SAndroid Build Coastguard Worker    je .w4_end
    ; h > 8: rows 8-15 come from m1
5138*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5139*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
5140*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
5141*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym1, 1
5142*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5143*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5144*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m1, 2
5145*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5146*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5147*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5148*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m1, 3
5149*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5150*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5151*c0909341SAndroid Build Coastguard Worker.w4_end:
5152*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 8: each .main call yields 8 rows of 16 bytes (m0 = rows 0-3,
    ; m1 = rows 4-7) and 16 mask bytes
5153*c0909341SAndroid Build Coastguard Worker.w8:
5154*c0909341SAndroid Build Coastguard Worker    mova                 m8, [w_mask_shuf8]
5155*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pb_64]
    ; skip the loop head: the first batch was already computed before dispatch
5156*c0909341SAndroid Build Coastguard Worker    jmp .w8_start
5157*c0909341SAndroid Build Coastguard Worker.w8_loop:
5158*c0909341SAndroid Build Coastguard Worker    call .main
5159*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5160*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
5161*c0909341SAndroid Build Coastguard Worker.w8_start:
    ; same mask reduction as in .w4, with the shuffle table and pb_64 held in m8/m9
5162*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m8, m3
5163*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
5164*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m2, m9
5165*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m15, m3
5166*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
5167*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
5168*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
5169*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
5170*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm3
    ; h -= 8; a negative result means only four rows were wanted in this batch
5171*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
5172*c0909341SAndroid Build Coastguard Worker    jl .w8_end
5173*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5174*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm1
5175*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym1, 1
5176*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m1, 2
5177*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m1, 3
    ; flags are still those of "sub hd, 8" (lea and the vector stores leave them
    ; untouched): loop again while rows remain
5178*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5179*c0909341SAndroid Build Coastguard Worker.w8_end:
5180*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 16: each .main call yields 4 rows of 32 bytes and 16 mask bytes
5181*c0909341SAndroid Build Coastguard Worker.w16:
5182*c0909341SAndroid Build Coastguard Worker    mova                 m8, [w_mask_shuf16]
5183*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pb_64]
5184*c0909341SAndroid Build Coastguard Worker    jmp .w16_start
5185*c0909341SAndroid Build Coastguard Worker.w16_loop:
5186*c0909341SAndroid Build Coastguard Worker    call .main
5187*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5188*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
5189*c0909341SAndroid Build Coastguard Worker.w16_start:
5190*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m8, m3
5191*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
5192*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m2, m9
5193*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m15, m3
    ; rows 0/1 are the low/high 256 bits of m0, rows 2/3 those of m1
5194*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
5195*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
5196*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
5197*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
5198*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm3
5199*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5200*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5201*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 32: one .main call = 2 rows of 64 bytes (m0 = row, m1 = next row,
    ; with m2/m3 the matching weights); two calls per iteration give 4 rows and
    ; 32 mask bytes
5202*c0909341SAndroid Build Coastguard Worker.w32_loop:
5203*c0909341SAndroid Build Coastguard Worker    call .main
5204*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5205*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5206*c0909341SAndroid Build Coastguard Worker.w32:
    ; vertical sum of the weights of rows 0 and 1
5207*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
5208*c0909341SAndroid Build Coastguard Worker    mova                 m8, m14
    ; per dword: m8 = w_mask_round[sign] + 64 * (sum of two horizontally adjacent words)
5209*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m11, m2
5210*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5211*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
    ; rows 2 and 3
5212*c0909341SAndroid Build Coastguard Worker    call .main
5213*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
5214*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
5215*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m11, m2
    ; merge the two accumulators into mask bytes using the w_mask_end42x indices
5216*c0909341SAndroid Build Coastguard Worker    vpermt2b             m8, m15, m3
5217*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
5218*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m1
5219*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym8
5220*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5221*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5222*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 64: one .main call = one 128-byte row; two calls per iteration
    ; give 2 rows and 32 mask bytes
5223*c0909341SAndroid Build Coastguard Worker.w64_loop:
5224*c0909341SAndroid Build Coastguard Worker    call .main
5225*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5226*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5227*c0909341SAndroid Build Coastguard Worker.w64:
    ; keep row 0's weights across the next .main call (which overwrites m2/m3)
5228*c0909341SAndroid Build Coastguard Worker    mova                 m8, m2
5229*c0909341SAndroid Build Coastguard Worker    mova                 m9, m3
5230*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*0], m0
5231*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*1], m1
5232*c0909341SAndroid Build Coastguard Worker    call .main
    ; row 0 + row 1 weights, then the horizontal pair sums as in .w32
5233*c0909341SAndroid Build Coastguard Worker    paddw                m8, m2
5234*c0909341SAndroid Build Coastguard Worker    paddw                m9, m3
5235*c0909341SAndroid Build Coastguard Worker    mova                 m2, m14
5236*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m11, m8
5237*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
5238*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m11, m9
5239*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m15, m3
5240*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*0], m0
5241*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*1], m1
5242*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym2
5243*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5244*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5245*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 128: one .main call = half of a 256-byte row; four calls per
    ; iteration give 2 rows and 64 mask bytes
5246*c0909341SAndroid Build Coastguard Worker.w128_loop:
5247*c0909341SAndroid Build Coastguard Worker    call .main
5248*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5249*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
5250*c0909341SAndroid Build Coastguard Worker.w128:
    ; m16/m8 = row-0 weights for the left half of the row
5251*c0909341SAndroid Build Coastguard Worker    mova               m16, m2
5252*c0909341SAndroid Build Coastguard Worker    mova                m8, m3
5253*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*0], m0
5254*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*1], m1
5255*c0909341SAndroid Build Coastguard Worker    call .main
    ; m17/m9 = row-0 weights for the right half of the row
5256*c0909341SAndroid Build Coastguard Worker    mova                m17, m2
5257*c0909341SAndroid Build Coastguard Worker    mova                 m9, m3
5258*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*2], m0
5259*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*3], m1
5260*c0909341SAndroid Build Coastguard Worker    call .main
    ; row 1, left half: add row 0's weights, then sum horizontal pairs
5261*c0909341SAndroid Build Coastguard Worker    paddw                m2, m16
5262*c0909341SAndroid Build Coastguard Worker    paddw                m3, m8
5263*c0909341SAndroid Build Coastguard Worker    mova                m16, m14
5264*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m11, m2
5265*c0909341SAndroid Build Coastguard Worker    mova                 m8, m14
5266*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m11, m3
5267*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*0], m0
5268*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*1], m1
5269*c0909341SAndroid Build Coastguard Worker    call .main
    ; row 1, right half
5270*c0909341SAndroid Build Coastguard Worker    paddw                m2, m17
5271*c0909341SAndroid Build Coastguard Worker    paddw                m3, m9
5272*c0909341SAndroid Build Coastguard Worker    mova                m17, m14
5273*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m11, m2
5274*c0909341SAndroid Build Coastguard Worker    mova                 m9, m14
5275*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m11, m3
5276*c0909341SAndroid Build Coastguard Worker    vpermt2b            m16, m15, m8
5277*c0909341SAndroid Build Coastguard Worker    vpermt2b            m17, m15, m9
5278*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*2], m0
5279*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*3], m1
5280*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], ym16
5281*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], ym17
5282*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5283*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
    ; NOTE(review): this is the only path that touches m16/m17, i.e. more vector
    ; registers than the 16 declared on the cglobal line; presumably RET does not
    ; emit vzeroupper on its own for this function, hence the explicit one here --
    ; confirm against x86inc's AVX-512 register mapping.
5284*c0909341SAndroid Build Coastguard Worker    vzeroupper
5285*c0909341SAndroid Build Coastguard Worker    RET
; .main (w_mask_420_16bpc): processes 64 words (128 bytes) from each of tmp1q and
; tmp2q and advances both pointers by 128.
;   In:  m10 = pw_27615, m11 = pw_64, m12 = mask_round[], m13 = bidir_shift[]
;   Out: m0 / m1 = output pixels for the first / second 64-byte half
;        m2 / m3 = per-pixel weight m (one word per pixel) for the same halves
;   Scratch: m4-m7. maskq and dstq are not touched here.
5286*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5287*c0909341SAndroid Build Coastguard Worker.main:
5288*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+64*0]
5289*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp2q+64*0]
5290*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+64*1]
5291*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp2q+64*1]
5292*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 64*2
5293*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 64*2
    ; first half: m6 = tmp1 - tmp2 (signed saturating)
5294*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m3
    ; m5/m3 = words interleaved as (tmp2, tmp1) pairs, the operands of the dot product below
5295*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m3, m1
5296*c0909341SAndroid Build Coastguard Worker    pabsw                m6, m6
5297*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m1
    ; m6 = pw_27615 - |tmp1 - tmp2|, clamped at 0 by the unsigned saturation
5298*c0909341SAndroid Build Coastguard Worker    psubusw              m6, m10, m6
5299*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 10      ; 64-m
5300*c0909341SAndroid Build Coastguard Worker    psubw                m2, m11, m6 ; m
    ; m1/m6 = words interleaved as (64-m, m) pairs, matching the (tmp2, tmp1) order above
5301*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m6, m2
5302*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m2
5303*c0909341SAndroid Build Coastguard Worker    mova                 m0, m12
    ; per dword: mask_round + tmp2 * (64-m) + tmp1 * m
5304*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m5, m1
5305*c0909341SAndroid Build Coastguard Worker    mova                 m1, m12
5306*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m3, m6
    ; second half: same sequence on m4 (tmp1) / m7 (tmp2), interleaved with the
    ; tail of the first half; the weight m for this half ends up in m3
5307*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m4, m7
5308*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m4
5309*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m5
5310*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4
5311*c0909341SAndroid Build Coastguard Worker    psubusw              m5, m10, m5
5312*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 10
5313*c0909341SAndroid Build Coastguard Worker    psubw                m3, m11, m5
5314*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m3
5315*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
5316*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3
5317*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
    ; pack the two dword accumulators of the first half back to words (unsigned saturation)
5318*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1
5319*c0909341SAndroid Build Coastguard Worker    mova                 m1, m12
5320*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m6, m4
5321*c0909341SAndroid Build Coastguard Worker    mova                 m4, m12
5322*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m7, m5
5323*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
5324*c0909341SAndroid Build Coastguard Worker    psrad                m4, 4
5325*c0909341SAndroid Build Coastguard Worker    packusdw             m1, m4
    ; final per-word right shift by the bidir_shift amounts selected from pixel_max
5326*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m0, m13
5327*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m1, m13
5328*c0909341SAndroid Build Coastguard Worker    ret
5329*c0909341SAndroid Build Coastguard Worker
; w_mask_422_16bpc (AVX-512): blends the intermediate buffers tmp1 and tmp2 into
; dst using a per-pixel weight m derived from |tmp1 - tmp2|, and writes a mask
; in which each byte is built from two horizontally adjacent weights.
;
; Unlike the 420 variant, the mask is produced and stored inside .main (32 bytes
; per call, maskq advanced there), so the width paths below only store pixels.
;
; Stack arguments read here: r7m = sign (indexes w_mask_round) and
; r8m = pixel_max (pixel_max >> 11 indexes mask_round and bidir_shift).
;
; Constants loaded once and read by .main:
;   m8 = pw_27615, m9 = pw_64, m10 = mask_round[], m11 = bidir_shift[],
;   m12 = w_mask_round[sign], ym13 = w_mask_end42x, m14 = pw_128 (m9 + m9).
;
; NOTE(review): .main always consumes 128 bytes of tmp1/tmp2 and stores 32 mask
; bytes, even when the w4 path stores only four rows -- presumably the buffers
; are sized for that; confirm against the callers.
5330*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
5331*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx512icl_table
    ; r7 = jump-table address; "base" lets the constants below be addressed relative to r7
5332*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_422_avx512icl_table]
    ; wd = trailing-zero count of w, used as the jump-table index
5333*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5334*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
5335*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; r6 = pixel_max >> 11, the index into mask_round / bidir_shift
5336*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
    ; wq = sign-extended 32-bit table entry; r7 is added further down to form the jump target
5337*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
5338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
5339*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pw_64]
5340*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+mask_round+r6*4]
5341*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+bidir_shift+r6*4]
5342*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
5343*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+w_mask_round+r6*4]
5344*c0909341SAndroid Build Coastguard Worker    mova               ym13, [w_mask_end42x]
5345*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5346*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5347*c0909341SAndroid Build Coastguard Worker    paddw               m14, m9, m9 ; pw_128
    ; the first batch is computed before dispatch, so every .wN label is entered
    ; with m0/m1 already holding pixels
5348*c0909341SAndroid Build Coastguard Worker    call .main
5349*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5350*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: m0/m1 hold up to 16 rows of 8 bytes each; a single .main call
    ; covers every supported h on this path (4, 8 or 16 rows are stored)
5351*c0909341SAndroid Build Coastguard Worker.w4:
5352*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
5353*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5354*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
5355*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5356*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
    ; h < 8: only four rows are stored
5357*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5358*c0909341SAndroid Build Coastguard Worker    jl .w4_end
5359*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
5360*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5361*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5362*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5363*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 3
5364*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
5365*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
    ; still testing the flags of "cmp hd, 8" above: none of the lea / vector
    ; extract / store instructions in between modify EFLAGS. h == 8 stops here.
5366*c0909341SAndroid Build Coastguard Worker    je .w4_end
    ; h > 8: rows 8-15 come from m1
5367*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5368*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
5369*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
5370*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym1, 1
5371*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5372*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5373*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m1, 2
5374*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5375*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5376*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5377*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m1, 3
5378*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5379*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5380*c0909341SAndroid Build Coastguard Worker.w4_end:
5381*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 8: each .main call yields 8 rows of 16 bytes (m0 = rows 0-3, m1 = rows 4-7)
5382*c0909341SAndroid Build Coastguard Worker.w8_loop:
5383*c0909341SAndroid Build Coastguard Worker    call .main
5384*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5385*c0909341SAndroid Build Coastguard Worker.w8:
5386*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
5387*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
5388*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
5389*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
    ; h -= 8; a negative result means only four rows were wanted in this batch
5390*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
5391*c0909341SAndroid Build Coastguard Worker    jl .w8_end
5392*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5393*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm1
5394*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym1, 1
5395*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m1, 2
5396*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m1, 3
    ; flags are still those of "sub hd, 8" (lea and the vector stores leave them
    ; untouched): loop again while rows remain
5397*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5398*c0909341SAndroid Build Coastguard Worker.w8_end:
5399*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 16: each .main call yields 4 rows of 32 bytes
5400*c0909341SAndroid Build Coastguard Worker.w16_loop:
5401*c0909341SAndroid Build Coastguard Worker    call .main
5402*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5403*c0909341SAndroid Build Coastguard Worker.w16:
    ; rows 0/1 are the low/high 256 bits of m0, rows 2/3 those of m1
5404*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
5405*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
5406*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
5407*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
5408*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5409*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5410*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 32: each .main call yields 2 rows of 64 bytes
5411*c0909341SAndroid Build Coastguard Worker.w32_loop:
5412*c0909341SAndroid Build Coastguard Worker    call .main
5413*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5414*c0909341SAndroid Build Coastguard Worker.w32:
5415*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5416*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
5417*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5418*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5419*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 64: each .main call yields one 128-byte row
5420*c0909341SAndroid Build Coastguard Worker.w64_loop:
5421*c0909341SAndroid Build Coastguard Worker    call .main
5422*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5423*c0909341SAndroid Build Coastguard Worker.w64:
5424*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5425*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
5426*c0909341SAndroid Build Coastguard Worker    dec                  hd
5427*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5428*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 128: two .main calls per 256-byte row
5429*c0909341SAndroid Build Coastguard Worker.w128_loop:
5430*c0909341SAndroid Build Coastguard Worker    call .main
5431*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5432*c0909341SAndroid Build Coastguard Worker.w128:
5433*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5434*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
    ; right half of the same row
5435*c0909341SAndroid Build Coastguard Worker    call .main
5436*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*2], m0
5437*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*3], m1
5438*c0909341SAndroid Build Coastguard Worker    dec                  hd
5439*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
5440*c0909341SAndroid Build Coastguard Worker    RET
; .main (w_mask_422_16bpc): processes 64 words (128 bytes) from each of tmp1q and
; tmp2q, advances both pointers by 128, stores 32 mask bytes at maskq and
; advances maskq by 32.
;   In:  m8 = pw_27615, m9 = pw_64, m10 = mask_round[], m11 = bidir_shift[],
;        m12 = w_mask_round[sign], ym13 = w_mask_end42x, m14 = pw_128
;   Out: m0 / m1 = output pixels for the first / second 64-byte half
;   Scratch: m2-m7 (m2/m3 do not hold usable weights on return).
5441*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5442*c0909341SAndroid Build Coastguard Worker.main:
5443*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+64*0]
5444*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp2q+64*0]
5445*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+64*1]
5446*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp2q+64*1]
5447*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 64*2
5448*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 64*2
    ; first half: m6 = tmp1 - tmp2 (signed saturating)
5449*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m3
    ; m5/m3 = words interleaved as (tmp2, tmp1) pairs, the operands of the dot product below
5450*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m3, m1
5451*c0909341SAndroid Build Coastguard Worker    pabsw                m6, m6
5452*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m1
    ; m6 = pw_27615 - |tmp1 - tmp2|, clamped at 0 by the unsigned saturation
5453*c0909341SAndroid Build Coastguard Worker    psubusw              m6, m8, m6
    ; m6 = 64-m, then m2 = m (first-half weights)
5454*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 10
5455*c0909341SAndroid Build Coastguard Worker    psubw                m2, m9, m6
    ; m1/m6 = words interleaved as (64-m, m) pairs, matching the (tmp2, tmp1) order above
5456*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m6, m2
5457*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m2
5458*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10
    ; per dword: mask_round + tmp2 * (64-m) + tmp1 * m
5459*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m5, m1
5460*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
5461*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m3, m6
    ; second half: same sequence on m4 (tmp1) / m7 (tmp2), interleaved with the
    ; tail of the first half; the weight m for this half ends up in m3
5462*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m4, m7
5463*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m4
5464*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m5
5465*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4
5466*c0909341SAndroid Build Coastguard Worker    psubusw              m5, m8, m5
5467*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 10
5468*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m5
5469*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m3
5470*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
5471*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3
5472*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
    ; pack the two dword accumulators of the first half back to words (unsigned saturation)
5473*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1
5474*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
5475*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m6, m4
5476*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
5477*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m7, m5
    ; mask: per dword, w_mask_round[sign] + 128 * (sum of two horizontally
    ; adjacent weights); m5 covers the first half, m2 the second half
5478*c0909341SAndroid Build Coastguard Worker    mova                 m5, m12
5479*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m14, m2
5480*c0909341SAndroid Build Coastguard Worker    mova                 m2, m12
5481*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m14, m3
5482*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
5483*c0909341SAndroid Build Coastguard Worker    psrad                m4, 4
5484*c0909341SAndroid Build Coastguard Worker    packusdw             m1, m4
    ; merge both accumulators into mask bytes using the w_mask_end42x indices
5485*c0909341SAndroid Build Coastguard Worker    vpermt2b             m5, m13, m2
    ; final per-word right shift by the bidir_shift amounts selected from pixel_max
5486*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m0, m11
5487*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m1, m11
5488*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym5
5489*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5490*c0909341SAndroid Build Coastguard Worker    ret
5491*c0909341SAndroid Build Coastguard Worker
5492*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
5493*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx512icl_table
5494*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_444_avx512icl_table]
5495*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5496*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
5497*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5498*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
5499*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
5500*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
5501*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pw_64]
5502*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+mask_round+r6*4]
5503*c0909341SAndroid Build Coastguard Worker    mova                m11, [w_mask_end444]
5504*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+bidir_shift+r6*4]
5505*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5506*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5507*c0909341SAndroid Build Coastguard Worker    call .main
5508*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5509*c0909341SAndroid Build Coastguard Worker    jmp                  wq
5510*c0909341SAndroid Build Coastguard Worker.w4:
5511*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
5512*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5513*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
5514*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5515*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5516*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5517*c0909341SAndroid Build Coastguard Worker    jl .w4_end
5518*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
5519*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5520*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5521*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5522*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 3
5523*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
5524*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
5525*c0909341SAndroid Build Coastguard Worker    je .w4_end
5526*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5527*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
5528*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
5529*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym1, 1
5530*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5531*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5532*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m1, 2
5533*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5534*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5535*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5536*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m1, 3
5537*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5538*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5539*c0909341SAndroid Build Coastguard Worker.w4_end:
5540*c0909341SAndroid Build Coastguard Worker    RET
5541*c0909341SAndroid Build Coastguard Worker.w8_loop:
5542*c0909341SAndroid Build Coastguard Worker    call .main
5543*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5544*c0909341SAndroid Build Coastguard Worker.w8:
5545*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
5546*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
5547*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
5548*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
5549*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
5550*c0909341SAndroid Build Coastguard Worker    jl .w8_end
5551*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5552*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm1
5553*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym1, 1
5554*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m1, 2
5555*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m1, 3
5556*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5557*c0909341SAndroid Build Coastguard Worker.w8_end:
5558*c0909341SAndroid Build Coastguard Worker    RET
5559*c0909341SAndroid Build Coastguard Worker.w16_loop:
5560*c0909341SAndroid Build Coastguard Worker    call .main
5561*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5562*c0909341SAndroid Build Coastguard Worker.w16:
5563*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
5564*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
5565*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
5566*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
5567*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5568*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5569*c0909341SAndroid Build Coastguard Worker    RET
5570*c0909341SAndroid Build Coastguard Worker.w32_loop:
5571*c0909341SAndroid Build Coastguard Worker    call .main
5572*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5573*c0909341SAndroid Build Coastguard Worker.w32:
5574*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5575*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
5576*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5577*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5578*c0909341SAndroid Build Coastguard Worker    RET
5579*c0909341SAndroid Build Coastguard Worker.w64_loop:
5580*c0909341SAndroid Build Coastguard Worker    call .main
5581*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5582*c0909341SAndroid Build Coastguard Worker.w64:
5583*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5584*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
5585*c0909341SAndroid Build Coastguard Worker    dec                  hd
5586*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5587*c0909341SAndroid Build Coastguard Worker    RET
5588*c0909341SAndroid Build Coastguard Worker.w128_loop:
5589*c0909341SAndroid Build Coastguard Worker    call .main
5590*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5591*c0909341SAndroid Build Coastguard Worker.w128:
5592*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5593*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
5594*c0909341SAndroid Build Coastguard Worker    call .main
5595*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*2], m0
5596*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*3], m1
5597*c0909341SAndroid Build Coastguard Worker    dec                  hd
5598*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
5599*c0909341SAndroid Build Coastguard Worker    RET
5600*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: shared worker reached via `call .main` from the width-specific loops.
; Consumes the next 64 16-bit words from each of tmp1 and tmp2 (two 64-byte
; vectors per buffer), derives a per-word weight from their absolute
; difference, blends the two inputs with that weight, and stores 64 mask bytes.
;
; In:   tmp1q, tmp2q = input pointers, maskq = mask output pointer
;       m8  = broadcast pw_27615   (caller's note: ((64 - 38) << 10) + 1023 - 32)
;       m9  = broadcast pw_64
;       m10 = broadcast mask_round[r6]   (r6 = pixel_max >> 11 in the caller)
;       m11 = w_mask_end444              (byte-permute indices)
;       m12 = broadcast bidir_shift[r6]
; Out:  m0, m1 = blended 16-bit words for the first / second input vector
;       tmp1q and tmp2q advanced by 128 bytes, maskq advanced by 64 bytes
; Clobbers: m2-m7
;
; Per word, with d = |tmp1 - tmp2|:
;     w2 = (m8 -sat d) >> 10         weight applied to tmp2
;     w1 = 64 - w2                   weight applied to tmp1 (also the mask source)
;     out = sat_u16((tmp1*w1 + tmp2*w2 + m10) >> 4) >> m12
; The two halves are interleaved: work on the second vector starts before the
; first vector's results are shifted and packed.
5601*c0909341SAndroid Build Coastguard Worker.main:
5602*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+64*0]
5603*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp2q+64*0]
5604*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+64*1]
5605*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp2q+64*1]
5606*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 64*2
5607*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 64*2
5608*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m3  ; tmp1 - tmp2, signed saturating
5609*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m3, m1  ; (tmp2, tmp1) word pairs for vpdpwssd
5610*c0909341SAndroid Build Coastguard Worker    pabsw                m6, m6      ; d = |tmp1 - tmp2|
5611*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m1
5612*c0909341SAndroid Build Coastguard Worker    psubusw              m6, m8, m6  ; m8 - d, clamped at 0 (unsigned saturating)
5613*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 10      ; w2: weight for tmp2
5614*c0909341SAndroid Build Coastguard Worker    psubw                m2, m9, m6  ; w1 = 64 - w2: weight for tmp1, kept for the mask
5615*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m6, m2  ; (w2, w1) pairs matching the (tmp2, tmp1) pairs
5616*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m2
5617*c0909341SAndroid Build Coastguard Worker    mova                 m0, m10     ; seed the accumulator with the rounding constant
5618*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m5, m1  ; m0 += tmp2*w2 + tmp1*w1 (per dword)
5619*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
5620*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m3, m6
5621*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m4, m7  ; second vector: same weight derivation
5622*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m4
5623*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m5
5624*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4
5625*c0909341SAndroid Build Coastguard Worker    psubusw              m5, m8, m5
5626*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 10
5627*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m5  ; w1 for the second vector
5628*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m3
5629*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
5630*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3
5631*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
5632*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m1      ; dwords -> words, unsigned saturation
5633*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
5634*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m6, m4
5635*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10
5636*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m7, m5
5637*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m11, m3 ; gather 64 mask bytes from the w1 words in m2/m3 (indices in m11; presumably the low byte of each word - table not shown here)
5638*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
5639*c0909341SAndroid Build Coastguard Worker    psrad                m4, 4
5640*c0909341SAndroid Build Coastguard Worker    packusdw             m1, m4
5641*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m0, m12     ; final per-word right shift by m12
5642*c0909341SAndroid Build Coastguard Worker    vpsrlvw              m1, m12
5643*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m2
5644*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
5645*c0909341SAndroid Build Coastguard Worker    ret
5646*c0909341SAndroid Build Coastguard Worker
; blend_16bpc(dst, ds, tmp, w, h, mask)
; Blends tmp into dst in place using a per-pixel byte mask. For every 16-bit
; word the code computes
;     dst += pmulhrsw(dst - tmp, mask * m6)
; where mask is a byte zero-extended to a word and m6 is broadcast from pw_m512.
; NOTE(review): pw_m512 is defined outside this chunk; if it holds -512 as the
; name suggests, the expression equals dst + (((tmp - dst) * mask + 32) >> 6).
;
; dst is addressed with the byte stride ds; tmp and mask are read as packed
; buffers and advanced by fixed byte counts per iteration.
; Dispatch: tzcnt(w) indexes blend_avx512icl_table (32-bit offsets relative to
; the table itself) and control jumps to the matching .wN loop.
; `base` is expressed through r6, so it is only valid until r6 is overwritten
; with ds*3 just before the jump.
5647*c0909341SAndroid Build Coastguard Workercglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
5648*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx512icl_table
5649*c0909341SAndroid Build Coastguard Worker    lea                  r6, [blend_avx512icl_table]
5650*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm      ; log2(w) for a power-of-two width
5651*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5652*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4] ; signed table-relative offset of the .wN handler
5653*c0909341SAndroid Build Coastguard Worker    movifnidn         maskq, maskmp
5654*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_m512] ; last use of `base` while r6 still holds the table
5655*c0909341SAndroid Build Coastguard Worker    add                  wq, r6      ; absolute handler address
5656*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dsq*3] ; r6 is now 3*stride for the 4-row loops
5657*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .w4: 4 rows x 4 pixels per iteration (16 mask bytes, 32 tmp bytes).
; NOTE(review): this path uses ym16-ym19 together with the VEX-only
; vpblendd/vextracti128 and is the only one ending in vzeroupper; presumably
; this relies on x86inc's AVX-512 register renaming - confirm in x86inc.asm.
5658*c0909341SAndroid Build Coastguard Worker.w4:
5659*c0909341SAndroid Build Coastguard Worker    pmovzxbw           ym19, [maskq]
5660*c0909341SAndroid Build Coastguard Worker    movq               xm16, [dstq+dsq*0]  ; row 0 -> qword 0
5661*c0909341SAndroid Build Coastguard Worker    movhps             xm16, [dstq+dsq*1]  ; row 1 -> qword 1
5662*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ym17, [dstq+dsq*2]
5663*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ym18, [dstq+r6   ]
5664*c0909341SAndroid Build Coastguard Worker    pmullw             ym19, ym6           ; mask * m6
5665*c0909341SAndroid Build Coastguard Worker    vpblendd           ym16, ym17, 0x30    ; row 2 -> dwords 4-5
5666*c0909341SAndroid Build Coastguard Worker    vpblendd           ym16, ym18, 0xc0    ; row 3 -> dwords 6-7
5667*c0909341SAndroid Build Coastguard Worker    psubw              ym17, ym16, [tmpq]  ; dst - tmp
5668*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
5669*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
5670*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ym17, ym19
5671*c0909341SAndroid Build Coastguard Worker    paddw              ym16, ym17
5672*c0909341SAndroid Build Coastguard Worker    vextracti128       xm17, ym16, 1       ; rows 2-3
5673*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm16
5674*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm16
5675*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*2], xm17
5676*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+r6   ], xm17
5677*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5678*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5679*c0909341SAndroid Build Coastguard Worker    jg .w4
5680*c0909341SAndroid Build Coastguard Worker    vzeroupper
5681*c0909341SAndroid Build Coastguard Worker    RET
; .w8: 4 rows x 8 pixels per iteration, one 128-bit lane per row
; (32 mask bytes, 64 tmp bytes).
5682*c0909341SAndroid Build Coastguard Worker.w8:
5683*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [maskq]
5684*c0909341SAndroid Build Coastguard Worker    mova                xm0, [dstq+dsq*0]
5685*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [dstq+dsq*1], 1
5686*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [dstq+dsq*2], 2
5687*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [dstq+r6   ], 3
5688*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m6
5689*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq]
5690*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5691*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
5692*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
5693*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
5694*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
5695*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
5696*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*2], m0, 2
5697*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r6   ], m0, 3
5698*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5699*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5700*c0909341SAndroid Build Coastguard Worker    jg .w8
5701*c0909341SAndroid Build Coastguard Worker    RET
; .w16: 4 rows x 16 pixels per iteration, two rows per 512-bit register
; (64 mask bytes, 128 tmp bytes).
5702*c0909341SAndroid Build Coastguard Worker.w16:
5703*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m4, [maskq+32*0]
5704*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, [maskq+32*1]
5705*c0909341SAndroid Build Coastguard Worker    mova                ym0, [dstq+dsq*0]
5706*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [dstq+dsq*1], 1
5707*c0909341SAndroid Build Coastguard Worker    mova                ym1, [dstq+dsq*2]
5708*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [dstq+r6   ], 1
5709*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m6
5710*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m6
5711*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+64*0]
5712*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+64*1]
5713*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*2
5714*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
5715*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
5716*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
5717*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
5718*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5719*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
5720*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
5721*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*2], ym1
5722*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r6   ], m1, 1
5723*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5724*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5725*c0909341SAndroid Build Coastguard Worker    jg .w16
5726*c0909341SAndroid Build Coastguard Worker    RET
; .w32: 2 rows x 32 pixels per iteration, one full register per row
; (64 mask bytes, 128 tmp bytes).
5727*c0909341SAndroid Build Coastguard Worker.w32:
5728*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m4, [maskq+32*0]
5729*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, [maskq+32*1]
5730*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+dsq*0]
5731*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+dsq*1]
5732*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m6
5733*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m6
5734*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+ 64*0]
5735*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+ 64*1]
5736*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*2
5737*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
5738*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
5739*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
5740*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
5741*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5742*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
5743*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
5744*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5745*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5746*c0909341SAndroid Build Coastguard Worker    jg .w32
5747*c0909341SAndroid Build Coastguard Worker    RET
5748*c0909341SAndroid Build Coastguard Worker
; blend_v_16bpc(dst, ds, tmp, w, h)
; Blends tmp into dst in place with a weight that depends only on the column.
; There is no mask argument: each width handler loads its weights once from
; obmc_masks_avx2 at a byte offset of w*2 (w2: +2*2, w4: +4*2, ... w32: +32*2)
; and reuses them for every row. For every 16-bit word:
;     dst += pmulhrsw(dst - tmp, weight)
; NOTE(review): the weights go straight into pmulhrsw with no scaling step, so
; the table is presumably stored pre-scaled; its definition is outside this
; chunk - confirm.
;
; Every loop handles 2 rows per iteration. dst is addressed with the byte
; stride ds; tmp is read as a packed buffer.
; Dispatch: tzcnt(w) indexes blend_v_avx512icl_table (32-bit offsets relative
; to the table) and control jumps to the matching .wN handler.
5749*c0909341SAndroid Build Coastguard Workercglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
5750*c0909341SAndroid Build Coastguard Worker    lea                  r5, [blend_v_avx512icl_table]
5751*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm      ; log2(w) for a power-of-two width
5752*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5753*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; signed table-relative offset of the handler
5754*c0909341SAndroid Build Coastguard Worker    add                  wq, r5      ; absolute handler address
5755*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .w2: 2 rows x 2 pixels; each row is one dword, so the 2-word weight pair is
; broadcast per dword. The .w2/.w4 paths use raw xmm0-xmm2 register names
; rather than the xm/ym/m aliases used by the wider paths.
5756*c0909341SAndroid Build Coastguard Worker.w2:
5757*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm2, [obmc_masks_avx2+2*2]
5758*c0909341SAndroid Build Coastguard Worker.w2_loop:
5759*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]     ; row 0 -> dword 0
5760*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [dstq+dsq*1], 1  ; row 1 -> dword 1
5761*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tmpq]
5762*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
5763*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm0, xmm1       ; dst - tmp
5764*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xmm2
5765*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
5766*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
5767*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
5768*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5769*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5770*c0909341SAndroid Build Coastguard Worker    jg .w2_loop
5771*c0909341SAndroid Build Coastguard Worker    RET
; .w4: 2 rows x 4 pixels; one qword per row, 4-word weight set broadcast per qword.
5772*c0909341SAndroid Build Coastguard Worker.w4:
5773*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       xmm2, [obmc_masks_avx2+4*2]
5774*c0909341SAndroid Build Coastguard Worker.w4_loop:
5775*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [dstq+dsq*0]
5776*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [dstq+dsq*1]
5777*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm0, [tmpq]
5778*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
5779*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xmm2
5780*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
5781*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
5782*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
5783*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5784*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5785*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
5786*c0909341SAndroid Build Coastguard Worker    RET
; .w8: 2 rows x 8 pixels; one 128-bit lane per row, 8-word weight set in both lanes.
5787*c0909341SAndroid Build Coastguard Worker.w8:
5788*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym2, [obmc_masks_avx2+8*2]
5789*c0909341SAndroid Build Coastguard Worker.w8_loop:
5790*c0909341SAndroid Build Coastguard Worker    mova                xm0, [dstq+dsq*0]
5791*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [dstq+dsq*1], 1
5792*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym0, [tmpq]
5793*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
5794*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym2
5795*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
5796*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
5797*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
5798*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5799*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5800*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5801*c0909341SAndroid Build Coastguard Worker    RET
; .w16: 2 rows x 16 pixels; one 256-bit half per row, 16-word weight set in both halves.
5802*c0909341SAndroid Build Coastguard Worker.w16:
5803*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m2, [obmc_masks_avx2+16*2]
5804*c0909341SAndroid Build Coastguard Worker.w16_loop:
5805*c0909341SAndroid Build Coastguard Worker    mova                ym0, [dstq+dsq*0]
5806*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [dstq+dsq*1], 1
5807*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq]
5808*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
5809*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
5810*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
5811*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
5812*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
5813*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5814*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5815*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5816*c0909341SAndroid Build Coastguard Worker    RET
; .w32: 2 rows x 32 pixels; one full register per row, both rows share the
; 32-word weight vector in m4.
5817*c0909341SAndroid Build Coastguard Worker.w32:
5818*c0909341SAndroid Build Coastguard Worker    mova                 m4, [obmc_masks_avx2+32*2]
5819*c0909341SAndroid Build Coastguard Worker.w32_loop:
5820*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+dsq*0]
5821*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+ 64*0]
5822*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+dsq*1]
5823*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+ 64*1]
5824*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
5825*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
5826*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
5827*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
5828*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5829*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
5830*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
5831*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5832*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5833*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5834*c0909341SAndroid Build Coastguard Worker    RET
5835*c0909341SAndroid Build Coastguard Worker
5836*c0909341SAndroid Build Coastguard Workercglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
5837*c0909341SAndroid Build Coastguard Worker%define base r6-$$
5838*c0909341SAndroid Build Coastguard Worker    lea                  r6, [$$]
5839*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5840*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
5841*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+blend_h_avx512icl_table+wq*4]
5842*c0909341SAndroid Build Coastguard Worker    lea               maskq, [base+obmc_masks_avx2+hq*2]
5843*c0909341SAndroid Build Coastguard Worker    lea                  hd, [hq*3]
5844*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+blend_h_avx512icl_table+wq]
5845*c0909341SAndroid Build Coastguard Worker    shr                  hd, 2 ; h * 3/4
5846*c0909341SAndroid Build Coastguard Worker    lea               maskq, [maskq+hq*2]
5847*c0909341SAndroid Build Coastguard Worker    neg                  hq
5848*c0909341SAndroid Build Coastguard Worker    jmp                  wq
5849*c0909341SAndroid Build Coastguard Worker.w2:
5850*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]
5851*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [dstq+dsq*1], 1
5852*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [maskq+hq*2]
5853*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tmpq]
5854*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
5855*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm2, xmm2
5856*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm0, xmm1
5857*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xmm2
5858*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
5859*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
5860*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
5861*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5862*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5863*c0909341SAndroid Build Coastguard Worker    jl .w2
5864*c0909341SAndroid Build Coastguard Worker    RET
5865*c0909341SAndroid Build Coastguard Worker.w4:
5866*c0909341SAndroid Build Coastguard Worker    mova               xmm3, [blend_shuf]
5867*c0909341SAndroid Build Coastguard Worker.w4_loop:
5868*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [dstq+dsq*0]
5869*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [dstq+dsq*1]
5870*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [maskq+hq*2]
5871*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm0, [tmpq]
5872*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
5873*c0909341SAndroid Build Coastguard Worker    pshufb             xmm2, xmm3
5874*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xmm2
5875*c0909341SAndroid Build Coastguard Worker    paddw              xmm0, xmm1
5876*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
5877*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
5878*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5879*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5880*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
5881*c0909341SAndroid Build Coastguard Worker    RET
5882*c0909341SAndroid Build Coastguard Worker.w8:
5883*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [blend_shuf]
5884*c0909341SAndroid Build Coastguard Worker    shufpd              ym3, ym3, 0x0c
5885*c0909341SAndroid Build Coastguard Worker.w8_loop:
5886*c0909341SAndroid Build Coastguard Worker    mova                xm0, [dstq+dsq*0]
5887*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [dstq+dsq*1], 1
5888*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [maskq+hq*2]
5889*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym0, [tmpq]
5890*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
5891*c0909341SAndroid Build Coastguard Worker    pshufb              ym2, ym3
5892*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym2
5893*c0909341SAndroid Build Coastguard Worker    paddw               ym0, ym1
5894*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
5895*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
5896*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5897*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5898*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
5899*c0909341SAndroid Build Coastguard Worker    RET
5900*c0909341SAndroid Build Coastguard Worker.w16:
5901*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [blend_shuf]
5902*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m3, 0xf0
5903*c0909341SAndroid Build Coastguard Worker.w16_loop:
5904*c0909341SAndroid Build Coastguard Worker    mova                ym0, [dstq+dsq*0]
5905*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [dstq+dsq*1], 1
5906*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [maskq+hq*2]
5907*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq]
5908*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
5909*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
5910*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
5911*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
5912*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
5913*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
5914*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5915*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5916*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
5917*c0909341SAndroid Build Coastguard Worker    RET
5918*c0909341SAndroid Build Coastguard Worker.w32:
5919*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [maskq+hq*2]
5920*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, [maskq+hq*2+2]
5921*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+dsq*0]
5922*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+ 64*0]
5923*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+dsq*1]
5924*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+ 64*1]
5925*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
5926*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
5927*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
5928*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
5929*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5930*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
5931*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
5932*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5933*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5934*c0909341SAndroid Build Coastguard Worker    jl .w32
5935*c0909341SAndroid Build Coastguard Worker    RET
5936*c0909341SAndroid Build Coastguard Worker.w64:
5937*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [maskq+hq*2]
5938*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+64*0]
5939*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+64*0]
5940*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+64*1]
5941*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+64*1]
5942*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
5943*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
5944*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
5945*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
5946*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5947*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5948*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
5949*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5950*c0909341SAndroid Build Coastguard Worker    inc                  hq
5951*c0909341SAndroid Build Coastguard Worker    jl .w64
5952*c0909341SAndroid Build Coastguard Worker    RET
5953*c0909341SAndroid Build Coastguard Worker.w128:
5954*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [maskq+hq*2]
5955*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+64*0]
5956*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0, [tmpq+64*0]
5957*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+64*1]
5958*c0909341SAndroid Build Coastguard Worker    psubw                m5, m1, [tmpq+64*1]
5959*c0909341SAndroid Build Coastguard Worker    mova                 m2,     [dstq+64*2]
5960*c0909341SAndroid Build Coastguard Worker    psubw                m6, m2, [tmpq+64*2]
5961*c0909341SAndroid Build Coastguard Worker    mova                 m3,     [dstq+64*3]
5962*c0909341SAndroid Build Coastguard Worker    psubw                m7, m3, [tmpq+64*3]
5963*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
5964*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m4, m5, m6, m7
5965*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
5966*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
5967*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
5968*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7
5969*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5970*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
5971*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*2], m2
5972*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*3], m3
5973*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5974*c0909341SAndroid Build Coastguard Worker    inc                  hq
5975*c0909341SAndroid Build Coastguard Worker    jl .w128
5976*c0909341SAndroid Build Coastguard Worker    RET
5977*c0909341SAndroid Build Coastguard Worker
; resize_16bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0, pxmax)
;
; Horizontal resampler for 16-bit pixels. Every .loop_x iteration produces 16
; output pixels of the current row. Source positions are kept as dwords with
; 14 fractional bits: lane i starts at mx0 - (4<<14) + dx*i and advances by
; dx*16 per iteration. For each lane:
;   - the position is clipped to [0, (src_w-8)<<14] and its integer part
;     selects 8 consecutive source pixels;
;   - (position >> 8) & pd_63 selects an 8-byte entry of resize_filter;
;   - the 8 products are accumulated with vpdpwssd, subtracted from pd_16384,
;     shifted right by 15, packed with unsigned saturation and limited to pxmax.
; If any lane had to be clipped, the pixels are fetched as qwords and passed
; through a pshufb pattern taken from resize_shuf (edge emulation); otherwise
; they are fetched directly with four dword gathers.
;
; cglobal operands 6, 12, 32: presumably the usual x86inc meaning (6 arguments
; loaded into registers, 12 GPRs and 32 vector registers used) - confirm
; against x86inc.asm.
;
; Long-lived registers:
;   m2       zero
;   m3       pd_16384 broadcast
;   m4       positions for the current group of 16 pixels
;   m5       dx*16
;   m6       (src_w-8) << 14
;   m7       pd_63 broadcast (filter index mask)
;   m8       positions of pixels 0-15 at the start of every row
;   m24-m27  resize_permA..D, m28/m29 resize_shufA/B, m30 resize_permE
;   ym31     pxmax broadcast to words
;   k6       all-ones mask; copied into k1-k4 before each gather because
;            AVX-512 gathers clear their mask register as elements complete
5978*c0909341SAndroid Build Coastguard Workercglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
5979*c0909341SAndroid Build Coastguard Worker                                 dst_w, h, src_w, dx, mx0, pxmax
5980*c0909341SAndroid Build Coastguard Worker    sub          dword mx0m, 4<<14 ; mx0 -= 4<<14, updated in place before the broadcast below
5981*c0909341SAndroid Build Coastguard Worker    sub        dword src_wm, 8 ; src_w -= 8, updated in place before the broadcast below
5982*c0909341SAndroid Build Coastguard Worker    mov                  r6, ~0 ; all-ones pattern for k6
5983*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, dxm ; m5 = dx in every dword lane
5984*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, mx0m ; m8 = adjusted mx0 in every dword lane
5985*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, src_wm ; m6 = src_w-8 in every dword lane
5986*c0909341SAndroid Build Coastguard Worker    kmovq                k6, r6 ; k6 = all-ones gather mask template
; From here on r6 is named x; the names src_w, dx and mx0 are no longer needed
; because their values now live in m6, m5 and m8.
5987*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
; r7 holds the address of $$ (start of the current section), so [base+sym]
; addresses constants relative to that register.
5988*c0909341SAndroid Build Coastguard Worker    LEA                  r7, $$
5989*c0909341SAndroid Build Coastguard Worker%define base r7-$$
5990*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+pd_16384]
5991*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pd_63]
5992*c0909341SAndroid Build Coastguard Worker    mova                m24, [base+resize_permA]
5993*c0909341SAndroid Build Coastguard Worker    mova                m25, [base+resize_permB]
5994*c0909341SAndroid Build Coastguard Worker    mova                m26, [base+resize_permC]
5995*c0909341SAndroid Build Coastguard Worker    mova                m27, [base+resize_permD]
5996*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m28, [base+resize_shufA]
5997*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m29, [base+resize_shufB]
5998*c0909341SAndroid Build Coastguard Worker    mova                m30, [base+resize_permE]
5999*c0909341SAndroid Build Coastguard Worker    vpbroadcastw       ym31, pxmaxm
6000*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
6001*c0909341SAndroid Build Coastguard Worker    pslld                m5, 4                      ; dx*16
6002*c0909341SAndroid Build Coastguard Worker    pslld                m6, 14 ; upper clip bound, same 14-bit fixed point as mx
6003*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2 ; m2 = 0 (lower clip bound and accumulator seed)
6004*c0909341SAndroid Build Coastguard Worker.loop_y:
6005*c0909341SAndroid Build Coastguard Worker    xor                  xd, xd ; x = 0
6006*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8     ; per-line working version of mx
6007*c0909341SAndroid Build Coastguard Worker.loop_x:
6008*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m4, m2
6009*c0909341SAndroid Build Coastguard Worker    psrad                m9, m4, 8  ; filter offset (unmasked)
6010*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
6011*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m0 ; pshufb offset
6012*c0909341SAndroid Build Coastguard Worker    psrad                m0, 14     ; clipped src_x offset
6013*c0909341SAndroid Build Coastguard Worker    psrad                m1, 14     ; pshufb edge_emu offset
6014*c0909341SAndroid Build Coastguard Worker    vptestmd             k5, m1, m1 ; k5 = lanes whose edge offset is nonzero
6015*c0909341SAndroid Build Coastguard Worker    pand                 m9, m7     ; filter offset (masked)
6016*c0909341SAndroid Build Coastguard Worker    ktestw               k5, k5 ; ZF = 1 when no lane has an edge offset
6017*c0909341SAndroid Build Coastguard Worker    jz .load ; nothing was clipped: take the direct gather path
; Edge path. Each clipped position (m0) and each edge offset (m1) is duplicated
; and pd_0_4 is added (presumably dwords 0 and 4 - confirm against the table),
; giving two dword indices per output pixel. With a scale of 2 these address
; two adjacent qwords, i.e. 8 consecutive 16-bit values per pixel. The low
; 256 bits of m10/m11/m20/m21 index pixels 0-3/4-7; the upper halves,
; extracted into ym12/ym13/ym22/ym23, index pixels 8-B/C-F.
6018*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m14, [base+pd_0_4]
6019*c0909341SAndroid Build Coastguard Worker    vpermq              m10, m0, q1100
6020*c0909341SAndroid Build Coastguard Worker    vpermq              m11, m0, q3322
6021*c0909341SAndroid Build Coastguard Worker    vpermq              m20, m1, q1100
6022*c0909341SAndroid Build Coastguard Worker    vpermq              m21, m1, q3322
6023*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m10
6024*c0909341SAndroid Build Coastguard Worker    punpckldq           m11, m11
6025*c0909341SAndroid Build Coastguard Worker    punpckldq           m20, m20
6026*c0909341SAndroid Build Coastguard Worker    punpckldq           m21, m21
6027*c0909341SAndroid Build Coastguard Worker    paddd               m10, m14
6028*c0909341SAndroid Build Coastguard Worker    paddd               m11, m14
6029*c0909341SAndroid Build Coastguard Worker    paddd               m20, m14
6030*c0909341SAndroid Build Coastguard Worker    paddd               m21, m14
6031*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym12, m10, 1
6032*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym13, m11, 1
6033*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym22, m20, 1
6034*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym23, m21, 1
; Fresh all-ones masks for the four source gathers.
6035*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k6
6036*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k6
6037*c0909341SAndroid Build Coastguard Worker    kmovq                k3, k6
6038*c0909341SAndroid Build Coastguard Worker    kmovq                k4, k6
6039*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m16{k1}, [srcq+ym10*2] ; 0 1 2 3
6040*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m17{k2}, [srcq+ym11*2] ; 4 5 6 7
6041*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m18{k3}, [srcq+ym12*2] ; 8 9 A B
6042*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m19{k4}, [srcq+ym13*2] ; C D E F
; The previous gathers consumed k1-k4; reload them for the shuffle-pattern
; gathers. m0 and m1 are free to be overwritten here because their contents
; were already copied into the index registers above.
6043*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k6
6044*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k6
6045*c0909341SAndroid Build Coastguard Worker    kmovq                k3, k6
6046*c0909341SAndroid Build Coastguard Worker    kmovq                k4, k6
6047*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m0{k1}, [base+resize_shuf+8+ym20*2]
6048*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m1{k2}, [base+resize_shuf+8+ym21*2]
6049*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m14{k3}, [base+resize_shuf+8+ym22*2]
6050*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m15{k4}, [base+resize_shuf+8+ym23*2]
; Apply the per-pixel byte shuffles to the gathered source data (edge emulation).
6051*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m0
6052*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m1
6053*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m14
6054*c0909341SAndroid Build Coastguard Worker    pshufb              m19, m15
; vpermi2d/vpermi2q overwrite their index operand, so the permutation tables
; are copied into scratch registers first. The two permute stages regroup the
; data so that m15-m18 have the layout .filter expects (the same registers the
; .load path fills with its four dword gathers).
6055*c0909341SAndroid Build Coastguard Worker    mova                m20, m24
6056*c0909341SAndroid Build Coastguard Worker    mova                m22, m24
6057*c0909341SAndroid Build Coastguard Worker    mova                m21, m25
6058*c0909341SAndroid Build Coastguard Worker    mova                m23, m25
6059*c0909341SAndroid Build Coastguard Worker    vpermi2d            m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
6060*c0909341SAndroid Build Coastguard Worker    vpermi2d            m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
6061*c0909341SAndroid Build Coastguard Worker    vpermi2d            m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
6062*c0909341SAndroid Build Coastguard Worker    vpermi2d            m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
6063*c0909341SAndroid Build Coastguard Worker    mova                m15, m26
6064*c0909341SAndroid Build Coastguard Worker    mova                m17, m26
6065*c0909341SAndroid Build Coastguard Worker    mova                m16, m27
6066*c0909341SAndroid Build Coastguard Worker    mova                m18, m27
6067*c0909341SAndroid Build Coastguard Worker    vpermi2q            m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
6068*c0909341SAndroid Build Coastguard Worker    vpermi2q            m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
6069*c0909341SAndroid Build Coastguard Worker    vpermi2q            m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
6070*c0909341SAndroid Build Coastguard Worker    vpermi2q            m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
; Fetch the 8-byte filter entry of every lane as two dwords and expand them
; with resize_shufA/B into the four operands m10-m13 used by .filter.
6071*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k6
6072*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k6
6073*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0]
6074*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
6075*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m11, m28
6076*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m11, m29
6077*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m13, m28
6078*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m13, m29
6079*c0909341SAndroid Build Coastguard Worker    jmp .filter
; Direct path: no lane was clipped, so the filter entries and the 8 source
; pixels per lane (four dwords at byte offsets 0/4/8/12) are gathered as-is.
6080*c0909341SAndroid Build Coastguard Worker.load:
6081*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k6
6082*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k6
6083*c0909341SAndroid Build Coastguard Worker    kmovq                k3, k6
6084*c0909341SAndroid Build Coastguard Worker    kmovq                k4, k6
6085*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0]
6086*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
6087*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m11, m28
6088*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m11, m29
6089*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m13, m28
6090*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m13, m29
6091*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m15{k3}, [srcq+m0*2+ 0]
6092*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m16{k4}, [srcq+m0*2+ 4]
6093*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k6 ; k1/k2 were consumed by the filter gathers above
6094*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k6
6095*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m17{k1}, [srcq+m0*2+ 8]
6096*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m18{k2}, [srcq+m0*2+12]
; Common tail: m15-m18 hold the source pixel pairs and m10-m13 the matching
; coefficient pairs for all 16 lanes.
6097*c0909341SAndroid Build Coastguard Worker.filter:
6098*c0909341SAndroid Build Coastguard Worker    mova                m14, m2 ; accumulator = 0
6099*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m14, m15, m10 ; each step adds two word products per dword lane
6100*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m14, m16, m11
6101*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m14, m17, m12
6102*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m14, m18, m13
6103*c0909341SAndroid Build Coastguard Worker    psubd               m14, m3, m14 ; pd_16384 - sum
6104*c0909341SAndroid Build Coastguard Worker    psrad               m14, 15
6105*c0909341SAndroid Build Coastguard Worker    packusdw            m14, m14 ; dwords -> words, unsigned saturation (negatives become 0)
6106*c0909341SAndroid Build Coastguard Worker    vpermq              m14, m30, m14 ; reorder qwords via resize_permE; only the low 256 bits are used below
6107*c0909341SAndroid Build Coastguard Worker    pminsw             ym14, ym31 ; limit to pxmax
; A full 16-pixel (32-byte) store is issued even when fewer than 16 pixels
; remain in the row. NOTE(review): this relies on the dst rows having room for
; the overshoot - confirm with the caller.
6108*c0909341SAndroid Build Coastguard Worker    mova        [dstq+xq*2], ym14
6109*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5 ; advance all positions by dx*16
6110*c0909341SAndroid Build Coastguard Worker    add                  xd, 16
6111*c0909341SAndroid Build Coastguard Worker    cmp                  xd, dst_wd
6112*c0909341SAndroid Build Coastguard Worker    jl .loop_x
6113*c0909341SAndroid Build Coastguard Worker    add                dstq, dst_strideq ; next destination row
6114*c0909341SAndroid Build Coastguard Worker    add                srcq, src_strideq ; next source row
6115*c0909341SAndroid Build Coastguard Worker    dec                  hd
6116*c0909341SAndroid Build Coastguard Worker    jg .loop_y
6117*c0909341SAndroid Build Coastguard Worker    RET
6118*c0909341SAndroid Build Coastguard Worker
6119*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
6120