xref: /aosp_15_r20/external/libdav1d/src/x86/mc_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
32*c0909341SAndroid Build Coastguard Worker
; obmc_masks: byte table split into groups, each introduced by a "; N"
; comment and holding N byte pairs (2*N bytes). Every pair sums to 64.
; pw_512 labels the same address as obmc_masks, so the two words of 512 are
; the first four bytes of the table and the "; 2" group starts at offset 4.
; NOTE(review): presumably N is the block dimension the group's weights are
; used for - confirm against the code that indexes obmc_masks.
33*c0909341SAndroid Build Coastguard Workerobmc_masks:
34*c0909341SAndroid Build Coastguard Workerpw_512:         times 2 dw 512
35*c0909341SAndroid Build Coastguard Worker                ; 2
36*c0909341SAndroid Build Coastguard Worker                db 45, 19, 64,  0
37*c0909341SAndroid Build Coastguard Worker                ; 4
38*c0909341SAndroid Build Coastguard Worker                db 39, 25, 50, 14, 59,  5, 64,  0
39*c0909341SAndroid Build Coastguard Worker                ; 8
40*c0909341SAndroid Build Coastguard Worker                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
41*c0909341SAndroid Build Coastguard Worker                ; 16
42*c0909341SAndroid Build Coastguard Worker                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
43*c0909341SAndroid Build Coastguard Worker                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
44*c0909341SAndroid Build Coastguard Worker                ; 32
45*c0909341SAndroid Build Coastguard Worker                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
46*c0909341SAndroid Build Coastguard Worker                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
47*c0909341SAndroid Build Coastguard Worker                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
48*c0909341SAndroid Build Coastguard Worker                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
49*c0909341SAndroid Build Coastguard Worker
50*c0909341SAndroid Build Coastguard Workerwarp_8x8_permA: db  4,  5,  6,  7, 16, 17, 18, 19,  5,  6,  7,  8, 17, 18, 19, 20
51*c0909341SAndroid Build Coastguard Worker                db  6,  7,  8,  9, 18, 19, 20, 21,  7,  8,  9, 10, 19, 20, 21, 22
52*c0909341SAndroid Build Coastguard Worker                db  8,  9, 10, 11, 20, 21, 22, 23,  9, 10, 11, 12, 21, 22, 23, 24
53*c0909341SAndroid Build Coastguard Worker                db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
54*c0909341SAndroid Build Coastguard Workerwarp_8x8_permB: db  0,  1,  2,  3, 20, 21, 22, 23,  1,  2,  3,  4, 21, 22, 23, 24
55*c0909341SAndroid Build Coastguard Worker                db  2,  3,  4,  5, 22, 23, 24, 25,  3,  4,  5,  6, 23, 24, 25, 26
56*c0909341SAndroid Build Coastguard Worker                db  4,  5,  6,  7, 24, 25, 26, 27,  5,  6,  7,  8, 25, 26, 27, 28
57*c0909341SAndroid Build Coastguard Worker                db  6,  7,  8,  9, 26, 27, 28, 29,  7,  8,  9, 10, 27, 28, 29, 30
58*c0909341SAndroid Build Coastguard Workerwarp_8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
59*c0909341SAndroid Build Coastguard Workerwarp_8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
60*c0909341SAndroid Build Coastguard Workerpd_0to7:        dd  0,  1,  2,  3,  4,  5,  6,  7
61*c0909341SAndroid Build Coastguard Workerwarp_8x8_hpack: db  3, 11,  3, 11, 35, 43, 35, 43
62*c0909341SAndroid Build Coastguard Workerpd_16384:       dd 16384
63*c0909341SAndroid Build Coastguard Workerpd_262144:      dd 262144
64*c0909341SAndroid Build Coastguard Workerwarp_8x8_end:   db  0,  4, 16, 20, 32, 36, 48, 52,  2,  6, 18, 22, 34, 38, 50, 54
65*c0909341SAndroid Build Coastguard Workerwarp_8x8t_end:  db  2,  3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
66*c0909341SAndroid Build Coastguard Worker                db  6,  7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
67*c0909341SAndroid Build Coastguard Workerbidir_sctr_w4:  dd  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
68*c0909341SAndroid Build Coastguard Workerwm_420_perm4:   db  1,  3,  9, 11,  5,  7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
69*c0909341SAndroid Build Coastguard Worker                db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
70*c0909341SAndroid Build Coastguard Worker                db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
71*c0909341SAndroid Build Coastguard Worker                db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
72*c0909341SAndroid Build Coastguard Workerwm_420_perm8:   db  1,  3, 17, 19,  5,  7, 21, 23,  9, 11, 25, 27, 13, 15, 29, 31
73*c0909341SAndroid Build Coastguard Worker                db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
74*c0909341SAndroid Build Coastguard Worker                db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
75*c0909341SAndroid Build Coastguard Worker                db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
76*c0909341SAndroid Build Coastguard Workerwm_420_perm16:  db  1,  3, 33, 35,  5,  7, 37, 39,  9, 11, 41, 43, 13, 15, 45, 47
77*c0909341SAndroid Build Coastguard Worker                db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
78*c0909341SAndroid Build Coastguard Worker                db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
79*c0909341SAndroid Build Coastguard Worker                db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
80*c0909341SAndroid Build Coastguard Workerwm_420_mask:    db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
81*c0909341SAndroid Build Coastguard Worker                db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
82*c0909341SAndroid Build Coastguard Worker                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
83*c0909341SAndroid Build Coastguard Worker                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
84*c0909341SAndroid Build Coastguard Workerwm_422_mask:    db  2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
85*c0909341SAndroid Build Coastguard Worker                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
86*c0909341SAndroid Build Coastguard Worker                db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
87*c0909341SAndroid Build Coastguard Worker                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
88*c0909341SAndroid Build Coastguard Workerwm_444_mask:    db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
89*c0909341SAndroid Build Coastguard Worker                db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
90*c0909341SAndroid Build Coastguard Worker                db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
91*c0909341SAndroid Build Coastguard Worker                db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
92*c0909341SAndroid Build Coastguard Workerbilin_h_perm16: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
93*c0909341SAndroid Build Coastguard Worker                db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
94*c0909341SAndroid Build Coastguard Worker                db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
95*c0909341SAndroid Build Coastguard Worker                db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
96*c0909341SAndroid Build Coastguard Workerbilin_h_perm32: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
97*c0909341SAndroid Build Coastguard Worker                db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
98*c0909341SAndroid Build Coastguard Worker                db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
99*c0909341SAndroid Build Coastguard Worker                db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
100*c0909341SAndroid Build Coastguard Workerbilin_v_perm8:  db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
101*c0909341SAndroid Build Coastguard Worker                db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
102*c0909341SAndroid Build Coastguard Worker                db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
103*c0909341SAndroid Build Coastguard Worker                db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
104*c0909341SAndroid Build Coastguard Workerbilin_v_perm16: db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
105*c0909341SAndroid Build Coastguard Worker                db  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
106*c0909341SAndroid Build Coastguard Worker                db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
107*c0909341SAndroid Build Coastguard Worker                db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
108*c0909341SAndroid Build Coastguard Workerbilin_v_perm32: db  0, 64,  1, 65,  2, 66,  3, 67,  4, 68,  5, 69,  6, 70,  7, 71
109*c0909341SAndroid Build Coastguard Worker                db  8, 72,  9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
110*c0909341SAndroid Build Coastguard Worker                db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
111*c0909341SAndroid Build Coastguard Worker                db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
112*c0909341SAndroid Build Coastguard Workerbilin_v_perm64: dd  0,  0,  4,  8,  1,  1,  5,  9,  2,  2,  6, 10,  3,  3,  7, 11
113*c0909341SAndroid Build Coastguard Workerspel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
114*c0909341SAndroid Build Coastguard Worker                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
115*c0909341SAndroid Build Coastguard Worker                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
116*c0909341SAndroid Build Coastguard Worker                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
117*c0909341SAndroid Build Coastguard Workerspel_h_perm32:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
118*c0909341SAndroid Build Coastguard Worker                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
119*c0909341SAndroid Build Coastguard Worker                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
120*c0909341SAndroid Build Coastguard Worker                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
121*c0909341SAndroid Build Coastguard Workerspel_v_perm8:   db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
122*c0909341SAndroid Build Coastguard Worker                db  8, 16,  9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
123*c0909341SAndroid Build Coastguard Worker                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
124*c0909341SAndroid Build Coastguard Worker                db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
125*c0909341SAndroid Build Coastguard Workerspel_v_perm16a: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
126*c0909341SAndroid Build Coastguard Worker                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
127*c0909341SAndroid Build Coastguard Worker                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
128*c0909341SAndroid Build Coastguard Worker                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
129*c0909341SAndroid Build Coastguard Workerspel_v_perm16b: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
130*c0909341SAndroid Build Coastguard Worker                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
131*c0909341SAndroid Build Coastguard Worker                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
132*c0909341SAndroid Build Coastguard Worker                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
133*c0909341SAndroid Build Coastguard Workerspel_v_perm32:  db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
134*c0909341SAndroid Build Coastguard Worker                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
135*c0909341SAndroid Build Coastguard Worker                db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
136*c0909341SAndroid Build Coastguard Worker                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
137*c0909341SAndroid Build Coastguard Workerspel_hv_perm4a: db  8,  9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
138*c0909341SAndroid Build Coastguard Worker                db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
139*c0909341SAndroid Build Coastguard Workerspel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
140*c0909341SAndroid Build Coastguard Worker                db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
141*c0909341SAndroid Build Coastguard Workerspel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
142*c0909341SAndroid Build Coastguard Worker                db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
143*c0909341SAndroid Build Coastguard Workerspel_hv_perm4d: db 18, 19,  0,  1, 22, 23,  4,  5, 26, 27,  8,  9, 30, 31, 12, 13
144*c0909341SAndroid Build Coastguard Worker                db  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29
145*c0909341SAndroid Build Coastguard Workerspel_hv_perm8a: db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
146*c0909341SAndroid Build Coastguard Worker                db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
147*c0909341SAndroid Build Coastguard Worker                db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
148*c0909341SAndroid Build Coastguard Worker                db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
149*c0909341SAndroid Build Coastguard Workerspel_hv_perm8b: db 34, 35,  0,  1, 38, 39,  4,  5, 42, 43,  8,  9, 46, 47, 12, 13
150*c0909341SAndroid Build Coastguard Worker                db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
151*c0909341SAndroid Build Coastguard Worker                db  0,  1, 32, 33,  4,  5, 36, 37,  8,  9, 40, 41, 12, 13, 44, 45
152*c0909341SAndroid Build Coastguard Worker                db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
153*c0909341SAndroid Build Coastguard Workerspel_hv_perm16a:db  0,  1,  2,  3, 32, 33, 34, 35,  1,  2,  3,  4, 33, 34, 35, 36
154*c0909341SAndroid Build Coastguard Worker                db  2,  3,  4,  5, 34, 35, 36, 37,  3,  4,  5,  6, 35, 36, 37, 38
155*c0909341SAndroid Build Coastguard Worker                db  8,  9, 10, 11, 40, 41, 42, 43,  9, 10, 11, 12, 41, 42, 43, 44
156*c0909341SAndroid Build Coastguard Worker                db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
157*c0909341SAndroid Build Coastguard Workerspel_hv_perm16b:db  0,  1,  2,  3,  1,  2,  3,  4,  4,  5,  6,  7,  5,  6,  7,  8
158*c0909341SAndroid Build Coastguard Worker                db  2,  3,  4,  5,  3,  4,  5,  6,  6,  7,  8,  9,  7,  8,  9, 10
159*c0909341SAndroid Build Coastguard Worker                db  8,  9, 10, 11,  9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
160*c0909341SAndroid Build Coastguard Worker                db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
161*c0909341SAndroid Build Coastguard Workerspel_hv_end16:  db  1,  3, 17, 19,  5,  7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
162*c0909341SAndroid Build Coastguard Worker                db  9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
163*c0909341SAndroid Build Coastguard Workerspel_hv_end:    db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
164*c0909341SAndroid Build Coastguard Workerdeint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
165*c0909341SAndroid Build Coastguard Workersubpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
166*c0909341SAndroid Build Coastguard Worker                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
167*c0909341SAndroid Build Coastguard Workersubpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
168*c0909341SAndroid Build Coastguard Workersubpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
169*c0909341SAndroid Build Coastguard Workersubpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
170*c0909341SAndroid Build Coastguard Workerbilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
171*c0909341SAndroid Build Coastguard Workerbilin_v_shuf4:  db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
172*c0909341SAndroid Build Coastguard Workerblend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
173*c0909341SAndroid Build Coastguard Workerrescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
174*c0909341SAndroid Build Coastguard Workerresize_permA:   dd  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
175*c0909341SAndroid Build Coastguard Workerresize_permB:   dd  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
176*c0909341SAndroid Build Coastguard Workerresize_permC:   dd  0,  4,  8, 12
177*c0909341SAndroid Build Coastguard Workerresize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
178*c0909341SAndroid Build Coastguard Workerpb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7
179*c0909341SAndroid Build Coastguard Worker
; wm_420_perm64: a single qword whose nibbles, from least to most
; significant, are 0x0 through 0xf.
180*c0909341SAndroid Build Coastguard Workerwm_420_perm64:  dq 0xfedcba9876543210
; wm_sign+4 and wm_sign+8 are also addressed as pb_m64 and pb_64 through the
; %defines that follow this block, so the order of these three dwords must
; not change.
181*c0909341SAndroid Build Coastguard Workerwm_sign:        dd 0x40804080, 0xc0c0c0c0, 0x40404040
182*c0909341SAndroid Build Coastguard Worker
; Replicated byte/word/dword constants. pb_8x0_8x8 is 16 bytes (eight 0s
; followed by eight 8s); every other entry below is exactly one dword wide.
; All labels carry a trailing colon, including pw_m128, so the whole run
; uses one label style.
183*c0909341SAndroid Build Coastguard Workerpb_8x0_8x8: times 8 db 0
184*c0909341SAndroid Build Coastguard Worker            times 8 db 8
185*c0909341SAndroid Build Coastguard Workerpb_4:       times 4 db 4
186*c0909341SAndroid Build Coastguard Workerpb_32:      times 4 db 32
187*c0909341SAndroid Build Coastguard Workerpb_127:     times 4 db 127
188*c0909341SAndroid Build Coastguard Workerpw_m128:    times 2 dw -128
189*c0909341SAndroid Build Coastguard Workerpw_m256:    times 2 dw -256
190*c0909341SAndroid Build Coastguard Workerpw_1024:    times 2 dw 1024
191*c0909341SAndroid Build Coastguard Workerpw_2048:    times 2 dw 2048
192*c0909341SAndroid Build Coastguard Workerpw_6903:    times 2 dw 6903
193*c0909341SAndroid Build Coastguard Workerpw_8192:    times 2 dw 8192
194*c0909341SAndroid Build Coastguard Workerpd_32:              dd 32
195*c0909341SAndroid Build Coastguard Workerpd_34:              dd 34
196*c0909341SAndroid Build Coastguard Workerpd_63:              dd 63
197*c0909341SAndroid Build Coastguard Workerpd_512:             dd 512
198*c0909341SAndroid Build Coastguard Worker
; Aliases into tables defined earlier instead of separate storage:
;   pb_m64 -> second dword of wm_sign (0xc0c0c0c0, i.e. four bytes of -64)
;   pb_64  -> third dword of wm_sign  (0x40404040, i.e. four bytes of 64)
;   pd_2   -> third dword of pd_0to7  (the value 2)
; Reordering wm_sign or pd_0to7 silently changes these constants.
199*c0909341SAndroid Build Coastguard Worker%define pb_m64 (wm_sign+4)
200*c0909341SAndroid Build Coastguard Worker%define pb_64  (wm_sign+8)
201*c0909341SAndroid Build Coastguard Worker%define pd_2   (pd_0to7+8)
202*c0909341SAndroid Build Coastguard Worker
; Tables defined outside this file and imported with cextern.
; subpel_filters resolves to 8 bytes *before* the mangled mc_subpel_filters
; symbol.
; NOTE(review): presumably the -8 bias lets users index with a 1-based
; filter position and 8-byte entries - confirm against the code that
; addresses subpel_filters.
203*c0909341SAndroid Build Coastguard Workercextern mc_subpel_filters
204*c0909341SAndroid Build Coastguard Worker%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
205*c0909341SAndroid Build Coastguard Workercextern mc_warp_filter
206*c0909341SAndroid Build Coastguard Workercextern resize_filter
207*c0909341SAndroid Build Coastguard Worker
; BASE_JMP_TABLE fn, isa, w0 [, w1, ...]
;   %1 = fn, %2 = isa, %3.. = list of widths (at least one).
; Emits one 16-bit entry per width, in the order given. Each entry is the
; distance from <fn>_<isa> to <fn>_<isa>_w<width>. %xdefine expands %1_%2
; at this point, so if <fn>_<isa> is itself a single-line macro, %%base is
; that macro's expansion.
; Also defines <fn>_<isa>_table as (start of table - w0), i.e. the table
; address biased downwards by the numeric value of the first width.
; put_bilin_8bpc reads its entry at [base + tzcnt(w)*2 + table_offset(put,)];
; NOTE(review): that only lands on entry 0 for the first width if the bias
; equals 2*log2(w0), which holds for w0 = 2 or 4 - confirm before adding a
; table whose first width is larger.
208*c0909341SAndroid Build Coastguard Worker%macro BASE_JMP_TABLE 3-*
209*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - %3)
210*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%2
211*c0909341SAndroid Build Coastguard Worker    %%table:
    ; one iteration per width argument; %rotate brings the next width into %3
212*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
213*c0909341SAndroid Build Coastguard Worker        dw %%base %+ _w%3 - %%base
214*c0909341SAndroid Build Coastguard Worker        %rotate 1
215*c0909341SAndroid Build Coastguard Worker    %endrep
216*c0909341SAndroid Build Coastguard Worker%endmacro
217*c0909341SAndroid Build Coastguard Worker
; HV_JMP_TABLE fn, filter, isa, types, w0 [, w1, ...]
;   %1 = fn, %2 = filter, %3 = isa, %4 = types bitmask, %5.. = widths.
; types selects which tables are emitted:
;   bit 0 (1) -> <fn>_<filter>_h_<isa>_table,  entries for .h_w<width>
;   bit 1 (2) -> <fn>_<filter>_v_<isa>_table,  entries for .v_w<width>
;   bit 2 (4) -> <fn>_<filter>_hv_<isa>_table, entries for .hv_w<width>
; Every entry is 16 bits: the distance from <fn>_<isa> (%%base) to the local
; label of the mangled function <private_prefix>_<fn>_<filter>_8bpc_<isa>.
; Each table symbol is biased downwards by the numeric value of the first
; width, the same scheme BASE_JMP_TABLE uses.
218*c0909341SAndroid Build Coastguard Worker%macro HV_JMP_TABLE 5-*
219*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
220*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%3
221*c0909341SAndroid Build Coastguard Worker    %assign %%types %4
222*c0909341SAndroid Build Coastguard Worker    %if %%types & 1
223*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_h_%3_table  (%%h  - %5)
224*c0909341SAndroid Build Coastguard Worker        %%h:
225*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
226*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .h_w%5 - %%base
227*c0909341SAndroid Build Coastguard Worker            %rotate 1
228*c0909341SAndroid Build Coastguard Worker        %endrep
        ; (%0 - 4) single rotations plus these 4 make one full cycle, so %1..%5
        ; are back in their original positions for the next section
229*c0909341SAndroid Build Coastguard Worker        %rotate 4
230*c0909341SAndroid Build Coastguard Worker    %endif
231*c0909341SAndroid Build Coastguard Worker    %if %%types & 2
232*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_v_%3_table  (%%v  - %5)
233*c0909341SAndroid Build Coastguard Worker        %%v:
234*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
235*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .v_w%5 - %%base
236*c0909341SAndroid Build Coastguard Worker            %rotate 1
237*c0909341SAndroid Build Coastguard Worker        %endrep
        ; restore the argument order again before the hv section
238*c0909341SAndroid Build Coastguard Worker        %rotate 4
239*c0909341SAndroid Build Coastguard Worker    %endif
240*c0909341SAndroid Build Coastguard Worker    %if %%types & 4
241*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_hv_%3_table (%%hv - %5)
242*c0909341SAndroid Build Coastguard Worker        %%hv:
        ; last section: no trailing %rotate because nothing reads the
        ; arguments after this loop
243*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
244*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .hv_w%5 - %%base
245*c0909341SAndroid Build Coastguard Worker            %rotate 1
246*c0909341SAndroid Build Coastguard Worker        %endrep
247*c0909341SAndroid Build Coastguard Worker    %endif
248*c0909341SAndroid Build Coastguard Worker%endmacro
249*c0909341SAndroid Build Coastguard Worker
; BIDIR_JMP_TABLE fn, isa, w0 [, w1, ...]
;   %1 = fn, %2 = isa, %3.. = list of widths (at least one).
; Emits one 32-bit entry per width, in the order given. Each entry is the
; distance from the biased table symbol <fn>_<isa>_table to the .w<width>
; local label of the mangled function <private_prefix>_<fn>_8bpc_<isa>.
; <fn>_<isa>_table is defined as (start of table - 2*w0), i.e. biased
; downwards by twice the numeric value of the first width; because %%base is
; that same biased symbol, the stored offsets are relative to it, not to the
; first entry.
; The minimum arity is 3 because the body unconditionally uses %3: with only
; two arguments the table symbol would expand to "(%%table - 2*)". Requiring
; three makes a short invocation fail at the macro call itself, and matches
; BASE_JMP_TABLE's 3-* declaration.
250*c0909341SAndroid Build Coastguard Worker%macro BIDIR_JMP_TABLE 3-*
251*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*%3)
252*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%2_table
253*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
254*c0909341SAndroid Build Coastguard Worker    %%table:
    ; one iteration per width argument; %rotate brings the next width into %3
255*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
256*c0909341SAndroid Build Coastguard Worker        dd %%prefix %+ .w%3 - %%base
257*c0909341SAndroid Build Coastguard Worker        %rotate 1
258*c0909341SAndroid Build Coastguard Worker    %endrep
259*c0909341SAndroid Build Coastguard Worker%endmacro
260*c0909341SAndroid Build Coastguard Worker
; Anchor symbols for the base jump tables: put_avx512icl / prep_avx512icl
; expand to the mangled names of the .put / .prep local labels of the
; put_bilin_8bpc / prep_bilin_8bpc avx512icl functions. BASE_JMP_TABLE and
; HV_JMP_TABLE store their entries relative to these anchors.
261*c0909341SAndroid Build Coastguard Worker%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
262*c0909341SAndroid Build Coastguard Worker%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
263*c0909341SAndroid Build Coastguard Worker
; table_offset(type, fn) expands to
;   <type><fn><SUFFIX>_table - <type><SUFFIX>
; i.e. the displacement from an anchor symbol to one of the tables above.
; fn may be empty, as in table_offset(put,).
; NOTE(review): SUFFIX is not defined in the visible part of this file;
; presumably x86inc.asm's INIT_* macros set it to "_avx512icl" here - confirm.
264*c0909341SAndroid Build Coastguard Worker%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
265*c0909341SAndroid Build Coastguard Worker
; Jump-table instantiations. The trailing numbers are the widths each table
; has an entry for; the put-style tables start at width 2, the prep and most
; bidirectional tables at width 4.
; For HV_JMP_TABLE the fourth argument is the types bitmask:
;   7 = h, v and hv tables;  3 = h and v tables;  2 = v table only.
266*c0909341SAndroid Build Coastguard WorkerBASE_JMP_TABLE put,  avx512icl,         2, 4, 8, 16, 32, 64, 128
267*c0909341SAndroid Build Coastguard WorkerBASE_JMP_TABLE prep, avx512icl,            4, 8, 16, 32, 64, 128
268*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
269*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
270*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
271*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE put,  8tap,  avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
272*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
273*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE prep, 8tap,  avx512icl, 3,    4, 8, 16, 32, 64, 128
274*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE avg, avx512icl,            4, 8, 16, 32, 64, 128
275*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_avg, avx512icl,          4, 8, 16, 32, 64, 128
276*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE mask, avx512icl,           4, 8, 16, 32, 64, 128
277*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_mask_420, avx512icl,     4, 8, 16, 32, 64, 128
278*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_mask_422, avx512icl,     4, 8, 16, 32, 64, 128
279*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE w_mask_444, avx512icl,     4, 8, 16, 32, 64, 128
280*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE blend, avx512icl,          4, 8, 16, 32
281*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE blend_v, avx512icl,     2, 4, 8, 16, 32
282*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE blend_h, avx512icl,     2, 4, 8, 16, 32, 64, 128
283*c0909341SAndroid Build Coastguard Worker
284*c0909341SAndroid Build Coastguard WorkerSECTION .text
285*c0909341SAndroid Build Coastguard Worker
; WRAP_YMM <statement>
; Emits the given statement (the 1+ parameter swallows the whole rest of the
; line, commas included) between "INIT_YMM cpuname" and "INIT_ZMM cpuname":
; the statement is assembled under INIT_YMM and INIT_ZMM is re-selected
; immediately afterwards.
; Note that it always switches back to INIT_ZMM, whatever was active before
; the invocation.
286*c0909341SAndroid Build Coastguard Worker%macro WRAP_YMM 1+
287*c0909341SAndroid Build Coastguard WorkerINIT_YMM cpuname
288*c0909341SAndroid Build Coastguard Worker    %1
289*c0909341SAndroid Build Coastguard WorkerINIT_ZMM cpuname
290*c0909341SAndroid Build Coastguard Worker%endmacro
291*c0909341SAndroid Build Coastguard Worker
292*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
293*c0909341SAndroid Build Coastguard Workercglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
294*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r6m ; mx
295*c0909341SAndroid Build Coastguard Worker    lea                  r7, [put_avx512icl]
296*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
297*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
298*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
299*c0909341SAndroid Build Coastguard Worker    jnz .h
300*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
301*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
302*c0909341SAndroid Build Coastguard Worker    jnz .v
303*c0909341SAndroid Build Coastguard Worker.put:
304*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put,)]
305*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
306*c0909341SAndroid Build Coastguard Worker    jmp                  wq
307*c0909341SAndroid Build Coastguard Worker.put_w2:
308*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [srcq+ssq*0]
309*c0909341SAndroid Build Coastguard Worker    movzx               r7d, word [srcq+ssq*1]
310*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
311*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
312*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7w
313*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
314*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
315*c0909341SAndroid Build Coastguard Worker    jg .put_w2
316*c0909341SAndroid Build Coastguard Worker    RET
317*c0909341SAndroid Build Coastguard Worker.put_w4:
318*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [srcq+ssq*0]
319*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [srcq+ssq*1]
320*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
321*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6d
322*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7d
323*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
324*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
325*c0909341SAndroid Build Coastguard Worker    jg .put_w4
326*c0909341SAndroid Build Coastguard Worker    RET
327*c0909341SAndroid Build Coastguard Worker.put_w8: ; plain copy path: 8 bytes per row, two rows per iteration, no filtering
328*c0909341SAndroid Build Coastguard Worker    mov                  r6, [srcq+ssq*0] ; row 0: 8 bytes through a 64-bit GPR
329*c0909341SAndroid Build Coastguard Worker    mov                  r7, [srcq+ssq*1] ; row 1
330*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
331*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6
332*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7
333*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
334*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
335*c0909341SAndroid Build Coastguard Worker    jg .put_w8 ; repeat while h > 0
336*c0909341SAndroid Build Coastguard Worker    RET
337*c0909341SAndroid Build Coastguard Worker.put_w16: ; plain copy path: 16 bytes per row, two rows per iteration, no filtering
338*c0909341SAndroid Build Coastguard Worker    movu               xmm0, [srcq+ssq*0] ; row 0, unaligned load
339*c0909341SAndroid Build Coastguard Worker    movu               xmm1, [srcq+ssq*1] ; row 1
340*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
341*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], xmm0 ; aligned-store form: assumes dst rows are 16-byte aligned - TODO confirm against caller
342*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], xmm1
343*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
344*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
345*c0909341SAndroid Build Coastguard Worker    jg .put_w16 ; repeat while h > 0
346*c0909341SAndroid Build Coastguard Worker    RET
347*c0909341SAndroid Build Coastguard Worker.put_w32: ; plain copy path: 32 bytes per row, two rows per iteration, no filtering
348*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0] ; row 0, unaligned 256-bit load
349*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*1] ; row 1
350*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
351*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], ym0 ; aligned-store form: assumes dst rows are 32-byte aligned - TODO confirm against caller
352*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], ym1
353*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
354*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
355*c0909341SAndroid Build Coastguard Worker    jg .put_w32 ; repeat while h > 0
356*c0909341SAndroid Build Coastguard Worker    RET
357*c0909341SAndroid Build Coastguard Worker.put_w64: ; plain copy path: one full-width (64-byte) vector per row, two rows per iteration
358*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row 0, unaligned load
359*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1] ; row 1
360*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
361*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0 ; aligned-store form: assumes dst rows are 64-byte aligned - TODO confirm against caller
362*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
363*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
364*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
365*c0909341SAndroid Build Coastguard Worker    jg .put_w64 ; repeat while h > 0
366*c0909341SAndroid Build Coastguard Worker    RET
367*c0909341SAndroid Build Coastguard Worker.put_w128: ; plain copy path: 128 bytes per row as two 64-byte vectors, two rows per iteration
368*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+64*0] ; row 0, bytes 0-63
369*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+64*1] ; row 0, bytes 64-127
370*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+64*0] ; row 1, bytes 0-63
371*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+64*1] ; row 1, bytes 64-127
372*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
373*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*0], m0 ; aligned-store form: assumes dst rows are 64-byte aligned - TODO confirm against caller
374*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+64*1], m1
375*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*0], m2
376*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+64*1], m3
377*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
378*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
379*c0909341SAndroid Build Coastguard Worker    jg .put_w128 ; repeat while h > 0
380*c0909341SAndroid Build Coastguard Worker    RET
381*c0909341SAndroid Build Coastguard Worker.h: ; horizontal setup: build shuffle pattern and coefficients, then go to .hv (my != 0) or a width-specific .h_w* loop
382*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
383*c0909341SAndroid Build Coastguard Worker    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
384*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255 ; first half of packing both weights into one word (completed by the add below)
385*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [bilin_h_perm16] ; first 16 table bytes replicated per 128-bit lane; pshufb pattern for the .h_w2/8/32/64/128 and .hv_w2/8 paths (table defined outside this excerpt)
386*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16 ; 255*mx + 16 = mx*256 + (16 - mx): low byte 16 - mx, high byte mx (assuming mx in 1..15)
387*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, mxyd ; m5 = that byte pair in every word: the pmaddubsw coefficients
388*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
389*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
390*c0909341SAndroid Build Coastguard Worker    jnz .hv ; my != 0: a vertical pass is needed as well
391*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)] ; 16-bit jump-table entry selected by wq (wq is set before this excerpt)
392*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_2048] ; pmulhrsw multiplier; with 2048 (per the constant's name) it computes (x*2048 + 0x4000) >> 15 = (x + 8) >> 4
393*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; table entries are offsets from r7
394*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; dispatch to .h_w*
395*c0909341SAndroid Build Coastguard Worker.h_w2: ; horizontal filter, 2 output pixels per row, two rows per iteration
396*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+ssq*0] ; 4 bytes of row 0 into dword 0
397*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [srcq+ssq*1], 1 ; 4 bytes of row 1 into dword 1
398*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
399*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xm4 ; rearrange with the pattern loaded in .h (presumably pairs src[x], src[x+1])
400*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xm5 ; per word: byte0 * coef0 + byte1 * coef1 (coefficients from .h)
401*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm3 ; rounding scale by the pw_2048 vector from .h
402*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0 ; words -> bytes with unsigned saturation
403*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm0, 0 ; bytes 0-1 -> dst row 0
404*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm0, 2 ; bytes 4-5 -> dst row 1
405*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
406*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
407*c0909341SAndroid Build Coastguard Worker    jg .h_w2 ; repeat while h > 0
408*c0909341SAndroid Build Coastguard Worker    RET
409*c0909341SAndroid Build Coastguard Worker.h_w4: ; horizontal filter, 4 output pixels per row, two rows per iteration
410*c0909341SAndroid Build Coastguard Worker    mova               xmm4, [bilin_h_shuf4] ; width-4 shuffle pattern, loaded once outside the loop (table defined outside this excerpt)
411*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
412*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0] ; 8 bytes of row 0 into the low qword
413*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [srcq+ssq*1] ; 8 bytes of row 1 into the high qword
414*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
415*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm4 ; rearrange bytes for pmaddubsw (presumably pairs src[x], src[x+1] for both rows)
416*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xm5 ; per word: byte0 * coef0 + byte1 * coef1 (coefficients from .h)
417*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm3 ; rounding scale by the pw_2048 vector from .h
418*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0 ; words -> bytes with unsigned saturation
419*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0 ; bytes 0-3 -> dst row 0
420*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1 ; bytes 4-7 -> dst row 1
421*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
422*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
423*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop ; repeat while h > 0
424*c0909341SAndroid Build Coastguard Worker    RET
425*c0909341SAndroid Build Coastguard Worker.h_w8: ; horizontal filter, 8 output pixels per row, two rows per iteration
426*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0] ; 16 bytes of row 0 into lane 0
427*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+ssq*1], 1 ; 16 bytes of row 1 into lane 1
428*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
429*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, ym4 ; per-lane shuffle with the pattern loaded in .h
430*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym5 ; per word: byte0 * coef0 + byte1 * coef1 (coefficients from .h)
431*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym3 ; rounding scale by the pw_2048 vector from .h
432*c0909341SAndroid Build Coastguard Worker    vpmovuswb           xm0, ym0 ; 16 words -> 16 bytes with unsigned saturation, order preserved
433*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0 ; low 8 bytes -> dst row 0
434*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0 ; high 8 bytes -> dst row 1
435*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
436*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
437*c0909341SAndroid Build Coastguard Worker    jg .h_w8 ; repeat while h > 0
438*c0909341SAndroid Build Coastguard Worker    RET
439*c0909341SAndroid Build Coastguard Worker.h_w16: ; horizontal filter, 16 output pixels per row, two rows per iteration
440*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm16] ; full-width byte-permute table for vpermb; replaces the broadcast pattern set in .h
441*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
442*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0] ; 32 bytes of row 0 into the low half
443*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1 ; 32 bytes of row 1 into the high half
444*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
445*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m0 ; cross-lane byte permute using the indices in m4 (presumably pairs src[x], src[x+1])
446*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per word: byte0 * coef0 + byte1 * coef1 (coefficients from .h)
447*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by the pw_2048 vector from .h
448*c0909341SAndroid Build Coastguard Worker    vpmovuswb           ym0, m0 ; 32 words -> 32 bytes with unsigned saturation, order preserved
449*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0 ; low 16 bytes -> dst row 0
450*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], ym0, 1 ; high 16 bytes -> dst row 1
451*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
452*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
453*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop ; repeat while h > 0
454*c0909341SAndroid Build Coastguard Worker    RET
455*c0909341SAndroid Build Coastguard Worker.h_w32: ; horizontal filter, 32 output pixels per row, two rows per iteration
456*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0+8*0] ; row 0 starting at x = 0, low half of m0
457*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1+8*0], 1 ; row 1 starting at x = 0, high half of m0
458*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*0+8*1] ; row 0 starting at x = 8 (same data shifted by 8 bytes)
459*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+ssq*1+8*1], 1 ; row 1 starting at x = 8
460*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss
461*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; per-lane shuffle with the pattern loaded in .h
462*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
463*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per word: byte0 * coef0 + byte1 * coef1 (coefficients from .h)
464*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
465*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by the pw_2048 vector from .h
466*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
467*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; per lane: 8 bytes from m0 followed by 8 bytes from m1
468*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0 ; low 32 bytes -> dst row 0
469*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1 ; high 32 bytes -> dst row 1
470*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
471*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
472*c0909341SAndroid Build Coastguard Worker    jg .h_w32 ; repeat while h > 0
473*c0909341SAndroid Build Coastguard Worker    RET
474*c0909341SAndroid Build Coastguard Worker.h_w64: ; horizontal filter, 64 output pixels per row, one row per iteration
475*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; 64 bytes starting at x = 0
476*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1] ; 64 bytes starting at x = 8 (same row shifted by 8 bytes)
477*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; per-lane shuffle with the pattern loaded in .h
478*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
479*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per word: byte0 * coef0 + byte1 * coef1 (coefficients from .h)
480*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
481*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by the pw_2048 vector from .h
482*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
483*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; per lane: 8 bytes from m0 followed by 8 bytes from m1
484*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next source row
485*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0 ; aligned 64-byte store
486*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; next destination row
487*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row done
488*c0909341SAndroid Build Coastguard Worker    jg .h_w64 ; repeat while h > 0
489*c0909341SAndroid Build Coastguard Worker    RET
490*c0909341SAndroid Build Coastguard Worker.h_w128: ; horizontal filter, 128 output pixels per row, one row per iteration
491*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; first half of the row: bytes from x = 0
492*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*1] ; first half, shifted by 8 bytes
493*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*8] ; second half of the row: bytes from x = 64
494*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+8*9] ; second half, shifted by 8 bytes (from x = 72)
495*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next source row
496*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m4}, m0, m2, m1, m6 ; same three-step filter as the narrower .h paths, applied to all four vectors
497*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m0, m2, m1, m6
498*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw  x, m3}, m0, m2, m1, m6
499*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2 ; output bytes 0-63
500*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6 ; output bytes 64-127
501*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
502*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
503*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; next destination row
504*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row done
505*c0909341SAndroid Build Coastguard Worker    jg .h_w128 ; repeat while h > 0
506*c0909341SAndroid Build Coastguard Worker    RET
507*c0909341SAndroid Build Coastguard Worker.v: ; vertical-only setup: build coefficients in m4 and the rounding multiplier in m5, then dispatch on width
508*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)] ; 16-bit jump-table entry selected by wq (wq is set before this excerpt)
509*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255 ; mxyd is presumably my here (loaded before the jump to .v, outside this excerpt) - confirm
510*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_2048] ; pmulhrsw multiplier; with 2048 (per the constant's name) it computes (x + 8) >> 4
511*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16 ; 255*f + 16 = f*256 + (16 - f): low byte 16 - f, high byte f (assuming f in 1..15)
512*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; table entries are offsets from r7
513*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, mxyd ; m4 = that byte pair in every word: the pmaddubsw coefficients (note: .h keeps them in m5 instead)
514*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; dispatch to .v_w*
515*c0909341SAndroid Build Coastguard Worker.v_w2: ; vertical filter, 2 pixels per row, two output rows per iteration; word 0 of xmm0 carries the last row across iterations
516*c0909341SAndroid Build Coastguard Worker    movd               xmm0,       [srcq+ssq*0] ; prime with row 0
517*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
518*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
519*c0909341SAndroid Build Coastguard Worker    lea                srcq,       [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
520*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
521*c0909341SAndroid Build Coastguard Worker    pshuflw            xmm1, xmm1, q2301           ; 1 0
522*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm0 ; interleave bytes: words 0-1 pair rows (1,2), words 2-3 pair rows (0,1)
523*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xm4 ; per word: upper row * coef0 + lower row * coef1 (coefficients from .v)
524*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm5 ; rounding scale by the pw_2048 vector from .v
525*c0909341SAndroid Build Coastguard Worker    packuswb           xmm1, xmm1 ; words -> bytes with unsigned saturation
526*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm1, 1 ; bytes 2-3 (rows 0,1 blend) -> dst row 0
527*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm1, 0 ; bytes 0-1 (rows 1,2 blend) -> dst row 1
528*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
529*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
530*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop ; repeat while h > 0
531*c0909341SAndroid Build Coastguard Worker    RET
532*c0909341SAndroid Build Coastguard Worker.v_w4: ; vertical filter, 4 pixels per row, two output rows per iteration; dword 0 of xmm0 carries the last row across iterations
533*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+ssq*0] ; prime with row 0
534*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
535*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm2, [srcq+ssq*1] ; row 1 in every dword
536*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
537*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm1, xmm2, xmm0, 0x01 ; 0 1
538*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0] ; row 2 in every dword; also next iteration's row 0
539*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm2, xmm0, 0x02       ; 1 2
540*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm2 ; interleave bytes: words 0-3 pair rows (0,1), words 4-7 pair rows (1,2)
541*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xm4 ; per word: upper row * coef0 + lower row * coef1 (coefficients from .v)
542*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm5 ; rounding scale by the pw_2048 vector from .v
543*c0909341SAndroid Build Coastguard Worker    packuswb           xmm1, xmm1 ; words -> bytes with unsigned saturation
544*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm1 ; bytes 0-3 -> dst row 0
545*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm1, 1 ; bytes 4-7 -> dst row 1
546*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
547*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
548*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop ; repeat while h > 0
549*c0909341SAndroid Build Coastguard Worker    RET
550*c0909341SAndroid Build Coastguard Worker.v_w8: ; vertical filter, 8 pixels per row, two output rows per iteration; xmm0 carries the last row across iterations
551*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0] ; prime with row 0
552*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
553*c0909341SAndroid Build Coastguard Worker    movq               xmm2, [srcq+ssq*1] ; row 1
554*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
555*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm0, xmm2 ; byte pairs (row 0, row 1)
556*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0] ; row 2; also next iteration's row 0
557*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm2, xmm0 ; byte pairs (row 1, row 2)
558*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xm4 ; per word: upper row * coef0 + lower row * coef1 (coefficients from .v)
559*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm2, xm4
560*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm5 ; rounding scale by the pw_2048 vector from .v
561*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm2, xm5
562*c0909341SAndroid Build Coastguard Worker    packuswb           xmm1, xmm2 ; low qword = output row 0, high qword = output row 1
563*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm1
564*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm1
565*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
566*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
567*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop ; repeat while h > 0
568*c0909341SAndroid Build Coastguard Worker    RET
569*c0909341SAndroid Build Coastguard Worker.v_w16: ; vertical filter, 16 pixels per row, two output rows per iteration; low lane of ymm0 carries the last row across iterations
570*c0909341SAndroid Build Coastguard Worker    movu               xmm0, [srcq+ssq*0] ; prime with row 0
571*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
572*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm3, [srcq+ssq*1] ; row 1 in both 128-bit lanes
573*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
574*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm3, ymm0, 0x0f ; 0 1
575*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm0, [srcq+ssq*0] ; row 2 in both lanes; also next iteration's row 0
576*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, 0xf0       ; 1 2
577*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm1, ymm2, ymm3 ; per lane: byte pairs for pixels 0-7 (lane 0: rows 0,1; lane 1: rows 1,2)
578*c0909341SAndroid Build Coastguard Worker    punpckhbw          ymm2, ymm3 ; per lane: byte pairs for pixels 8-15
579*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm1, ym4 ; per word: upper row * coef0 + lower row * coef1 (coefficients from .v)
580*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm2, ym4
581*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm1, ym5 ; rounding scale by the pw_2048 vector from .v
582*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm2, ym5
583*c0909341SAndroid Build Coastguard Worker    packuswb           ymm1, ymm2 ; lane 0 = output row 0, lane 1 = output row 1
584*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xmm1
585*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], ymm1, 1
586*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
587*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
588*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop ; repeat while h > 0
589*c0909341SAndroid Build Coastguard Worker    vzeroupper ; this loop wrote the upper halves of raw ymm0-ymm3, so clear them explicitly before returning
590*c0909341SAndroid Build Coastguard Worker    RET
591*c0909341SAndroid Build Coastguard Worker.v_w32: ; vertical filter, 32 pixels per row, two output rows per iteration; low half of m0 carries the last row across iterations
592*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0] ; prime with row 0
593*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1 ; k1 = 0xff: selects the low 8 dwords (low 256 bits) in the blends below
594*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
595*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m3, [srcq+ssq*1] ; row 1 in both 256-bit halves
596*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
597*c0909341SAndroid Build Coastguard Worker    vpblendmd        m2{k1}, m3, m0 ; 0 1
598*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+ssq*0] ; row 2 in both halves; also next iteration's row 0
599*c0909341SAndroid Build Coastguard Worker    vpblendmd        m3{k1}, m0, m3 ; 1 2
600*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3 ; per lane: byte pairs for the low 8 pixels of the lane
601*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3 ; per lane: byte pairs for the high 8 pixels of the lane
602*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; per word: upper row * coef0 + lower row * coef1 (coefficients from .v)
603*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
604*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; rounding scale by the pw_2048 vector from .v
605*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
606*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2 ; low 256 bits = output row 0, high 256 bits = output row 1
607*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym1
608*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m1, 1
609*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
610*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
611*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop ; repeat while h > 0
612*c0909341SAndroid Build Coastguard Worker    RET
613*c0909341SAndroid Build Coastguard Worker.v_w64: ; vertical filter, 64 pixels per row, two output rows per iteration; m0 carries the last row across iterations
614*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; prime with row 0
615*c0909341SAndroid Build Coastguard Worker.v_w64_loop:
616*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1] ; row 1
617*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
618*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0, m3 ; rows (0,1): byte pairs, low 8 pixels of each lane
619*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0, m3 ; rows (0,1): byte pairs, high 8 pixels of each lane
620*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row 2; also next iteration's row 0
621*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; per word: upper row * coef0 + lower row * coef1 (coefficients from .v)
622*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m4
623*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0 ; rows (1,2): byte pairs, low 8 pixels of each lane
624*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0 ; rows (1,2): byte pairs, high 8 pixels of each lane
625*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
626*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4
627*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m1, m6, m2, m3 ; rounding scale by the pw_2048 vector from .v, all four vectors
628*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6 ; output row 0
629*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3 ; output row 1
630*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
631*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
632*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
633*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
634*c0909341SAndroid Build Coastguard Worker    jg .v_w64_loop ; repeat while h > 0
635*c0909341SAndroid Build Coastguard Worker    RET
636*c0909341SAndroid Build Coastguard Worker.v_w128: ; vertical filter, 128 pixels per row (two 64-byte vectors), one output row per iteration; m0/m1 carry the previous row
637*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+64*0] ; prime with row 0, bytes 0-63
638*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+64*1] ; row 0, bytes 64-127
639*c0909341SAndroid Build Coastguard Worker.v_w128_loop:
640*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; advance to the next source row
641*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+64*0] ; next row, bytes 0-63
642*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+64*1] ; next row, bytes 64-127
643*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0, m2 ; byte pairs (previous row, next row), low 8 pixels of each lane
644*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m4 ; per word: previous row * coef0 + next row * coef1 (coefficients from .v)
645*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m2 ; same for the high 8 pixels of each lane
646*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
647*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m1, m3 ; second 64 bytes: low 8 pixels of each lane
648*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m4
649*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3 ; second 64 bytes: high 8 pixels of each lane
650*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
651*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m6, m0, m7, m1 ; rounding scale by the pw_2048 vector from .v, all four vectors
652*c0909341SAndroid Build Coastguard Worker    packuswb             m6, m0 ; output bytes 0-63
653*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; the row just read becomes the previous row for the next iteration
654*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m1 ; output bytes 64-127
655*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; likewise for the second 64 bytes
656*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m6
657*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m7
658*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; next destination row
659*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row done
660*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop ; repeat while h > 0
661*c0909341SAndroid Build Coastguard Worker    RET
662*c0909341SAndroid Build Coastguard Worker.hv: ; 2-D setup; reached from .h with m4 = shuffle pattern, m5 = horizontal coefficients, mxyd = my
663*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
664*c0909341SAndroid Build Coastguard Worker    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
665*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)] ; 16-bit jump-table entry selected by wq (wq is set before this excerpt)
666*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8 ; NOTE(review): x86inc macro, presumably preserves callee-saved vector registers on Win64 now that 8 are in use - confirm
667*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
668*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_2048] ; pmulhrsw multiplier; with 2048 (per the constant's name) it computes (x + 8) >> 4
669*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; table entries are offsets from r7
670*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, mxyd ; m6 = my * 2048 per word; the loops double the row difference first, so pmulhw gives (my * diff) >> 4
671*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; dispatch to .hv_w*
672*c0909341SAndroid Build Coastguard Worker.hv_w2: ; 2-D filter, 2 pixels per row, two output rows per iteration; xmm0 carries the last horizontally filtered row
673*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0] ; row 0 in every dword
674*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xm4 ; horizontal pass on row 0: shuffle with the pattern from .h ...
675*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xm5 ; ... then weight with the horizontal coefficients (result kept unrounded, at 16x scale per the formula in .hv)
676*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
677*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [srcq+ssq*1] ; row 1 into dword 0
678*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
679*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm1, [srcq+ssq*0], 1 ; row 2 into dword 1
680*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, xm4 ; horizontal pass on rows 1 and 2
681*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xm5               ; 1 _ 2 _
682*c0909341SAndroid Build Coastguard Worker    shufps             xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
683*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xmm1 ; keep the filtered rows for the next iteration
684*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm2 ; diff = lower row - upper row
685*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm1 ; 2 * diff, compensating for the shift by 11 instead of 12 in .hv
686*c0909341SAndroid Build Coastguard Worker    pmulhw             xmm1, xm6 ; (2 * diff * my * 2048) >> 16 = (my * diff) >> 4
687*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2 ; add the upper row back
688*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm7 ; final rounding scale by the pw_2048 vector from .hv
689*c0909341SAndroid Build Coastguard Worker    packuswb           xmm1, xmm1 ; words -> bytes with unsigned saturation
690*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm1, 0 ; bytes 0-1 -> dst row 0
691*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm1, 2 ; bytes 4-5 -> dst row 1
692*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
693*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
694*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop ; repeat while h > 0
695*c0909341SAndroid Build Coastguard Worker    RET
696*c0909341SAndroid Build Coastguard Worker.hv_w4: ; 2-D filter, 4 pixels per row, two output rows per iteration; xmm0 carries the last horizontally filtered row
697*c0909341SAndroid Build Coastguard Worker    mova               xmm4, [bilin_h_shuf4] ; width-4 shuffle pattern (table defined outside this excerpt)
698*c0909341SAndroid Build Coastguard Worker    movddup            xmm0, [srcq+ssq*0] ; 8 bytes of row 0 duplicated into both qwords
699*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm4 ; horizontal pass on row 0: shuffle ...
700*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xm5 ; ... then weight with the horizontal coefficients from .h (result kept unrounded)
701*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
702*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*1] ; row 1 into the low qword
703*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
704*c0909341SAndroid Build Coastguard Worker    movhps             xmm1, [srcq+ssq*0] ; row 2 into the high qword
705*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, xmm4 ; horizontal pass on rows 1 and 2
706*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xm5               ; 1 2
707*c0909341SAndroid Build Coastguard Worker    shufps             xmm2, xmm0, xmm1, q1032 ; 0 1
708*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xmm1 ; keep the filtered rows for the next iteration
709*c0909341SAndroid Build Coastguard Worker    psubw              xmm1, xmm2 ; diff = lower row - upper row
710*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm1 ; 2 * diff, compensating for the shift by 11 instead of 12 in .hv
711*c0909341SAndroid Build Coastguard Worker    pmulhw             xmm1, xm6 ; (2 * diff * my * 2048) >> 16 = (my * diff) >> 4
712*c0909341SAndroid Build Coastguard Worker    paddw              xmm1, xmm2 ; add the upper row back
713*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm7 ; final rounding scale by the pw_2048 vector from .hv
714*c0909341SAndroid Build Coastguard Worker    packuswb           xmm1, xmm1 ; words -> bytes with unsigned saturation
715*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm1 ; bytes 0-3 -> dst row 0
716*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm1, 1 ; bytes 4-7 -> dst row 1
717*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
718*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
719*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop ; repeat while h > 0
720*c0909341SAndroid Build Coastguard Worker    RET
721*c0909341SAndroid Build Coastguard Worker.hv_w8: ; 2-D filter, 8 pixels per row, two output rows per iteration; ym0 carries the last horizontally filtered row
722*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      ym0, [srcq+ssq*0] ; 16 bytes of row 0 in both 128-bit lanes
723*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, ym4 ; horizontal pass on row 0: shuffle with the pattern from .h ...
724*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym5 ; ... then weight with the horizontal coefficients (result kept unrounded)
725*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
726*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1] ; row 1 into lane 0
727*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
728*c0909341SAndroid Build Coastguard Worker    vinserti128         ym1, [srcq+ssq*0], 1 ; row 2 into lane 1
729*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym4 ; horizontal pass on rows 1 and 2
730*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym5            ; 1 2
731*c0909341SAndroid Build Coastguard Worker    valignq             ym2, ym1, ym0, 2 ; 0 1: high lane of the previous rows followed by low lane of the new rows
732*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1 ; keep the filtered rows for the next iteration
733*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym2 ; diff = lower row - upper row
734*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym1 ; 2 * diff, compensating for the shift by 11 instead of 12 in .hv
735*c0909341SAndroid Build Coastguard Worker    pmulhw              ym1, ym6 ; (2 * diff * my * 2048) >> 16 = (my * diff) >> 4
736*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2 ; add the upper row back
737*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym7 ; final rounding scale by the pw_2048 vector from .hv
738*c0909341SAndroid Build Coastguard Worker    vpmovuswb           xm1, ym1 ; 16 words -> 16 bytes with unsigned saturation, order preserved
739*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm1 ; low 8 bytes -> dst row 0
740*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm1 ; high 8 bytes -> dst row 1
741*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
742*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
743*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop ; repeat while h > 0
744*c0909341SAndroid Build Coastguard Worker    RET
745*c0909341SAndroid Build Coastguard Worker.hv_w16: ; 2-D filter, 16 pixels per row, two output rows per iteration; m0 carries the last horizontally filtered row
746*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+ssq*0] ; 32 bytes of row 0 in both 256-bit halves
747*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm16] ; full-width byte-permute table for vpermb; replaces the broadcast pattern set in .h
748*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m0 ; horizontal pass on row 0: permute ...
749*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; ... then weight with the horizontal coefficients from .h (result kept unrounded)
750*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
751*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*1] ; row 1 into the low half
752*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 * ss, so [srcq+ssq*0] below is row 2
753*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+ssq*0], 1 ; row 2 into the high half
754*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, m1 ; horizontal pass on rows 1 and 2
755*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5        ; 1 2
756*c0909341SAndroid Build Coastguard Worker    valignq              m2, m1, m0, 4 ; 0 1
757*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1 ; keep the filtered rows for the next iteration
758*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2 ; diff = lower row - upper row
759*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1 ; 2 * diff, compensating for the shift by 11 instead of 12 in .hv
760*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6 ; (2 * diff * my * 2048) >> 16 = (my * diff) >> 4
761*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; add the upper row back
762*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7 ; final rounding scale by the pw_2048 vector from .hv
763*c0909341SAndroid Build Coastguard Worker    vpmovuswb           ym1, m1 ; 32 words -> 32 bytes with unsigned saturation, order preserved
764*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm1 ; low 16 bytes -> dst row 0
765*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym1, 1 ; high 16 bytes -> dst row 1
766*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 * ds
767*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
768*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop ; repeat while h > 0
769*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w32: combined horizontal+vertical path, 32 output pixels per row, two
; output rows per loop iteration.
; m0 carries the horizontally filtered previous source row across iterations.
; m5, m6 and m7 are only read here; they are set up before this label (outside
; this excerpt) - presumably horizontal weights, vertical weight and rounding
; multiplier; TODO confirm.
770*c0909341SAndroid Build Coastguard Worker.hv_w32:
771*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32] ; byte-permute control for vpermb
772*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+ssq*0] ; 64-byte read from row 0
773*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m8, [pb_02461357] ; 8 bytes -> 8 qword indices for vpermq
774*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
775*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
776*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+ssq*1]
777*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance two source rows
778*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, [srcq+ssq*0]
779*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
    ; first output row: m1 = m0 + pmulhw(2 * (m2 - m0), m6)
780*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2, m0
781*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
782*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
783*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
784*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3, m5 ; newest row, also kept for the next iteration
    ; second output row: m3 = m2 + pmulhw(2 * (m0 - m2), m6)
785*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0, m2
786*c0909341SAndroid Build Coastguard Worker    paddw                m3, m3
787*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m6
788*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
789*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
790*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7
791*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3 ; per 128-bit lane: m1 bytes low, m3 bytes high
792*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m8, m1 ; regroup qwords so each row is contiguous (see stores)
793*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym1 ; low 32 bytes  -> first output row
794*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m1, 1 ; high 32 bytes -> second output row
795*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
796*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
797*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
798*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w64: combined horizontal+vertical path, 64 output bytes per row, one
; output row per loop iteration.
; m0/m1 carry the horizontally filtered previous source row across iterations.
; NOTE(review): m4 is not loaded in this section; it must already hold a pshufb
; control set up before the jump here (not visible in this excerpt). Likewise
; m5, m6, m7 are inputs - presumably horizontal weights, vertical weight and
; rounding multiplier; TODO confirm.
799*c0909341SAndroid Build Coastguard Worker.hv_w64:
800*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; 64 bytes at byte offset 0
801*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1] ; 64 bytes at byte offset 8 (overlapping)
802*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
803*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
804*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
805*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
806*c0909341SAndroid Build Coastguard Worker.hv_w64_loop:
807*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next source row
808*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*0]
809*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+8*1]
810*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
811*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
812*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
813*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
    ; m8 = m0 + pmulhw(2 * (m2 - m0), m6); m9 likewise from m3/m1
814*c0909341SAndroid Build Coastguard Worker    psubw                m8, m2, m0
815*c0909341SAndroid Build Coastguard Worker    psubw                m9, m3, m1
816*c0909341SAndroid Build Coastguard Worker    paddw                m8, m8
817*c0909341SAndroid Build Coastguard Worker    pmulhw               m8, m6
818*c0909341SAndroid Build Coastguard Worker    paddw                m9, m9
819*c0909341SAndroid Build Coastguard Worker    pmulhw               m9, m6
820*c0909341SAndroid Build Coastguard Worker    paddw                m8, m0
821*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m7
822*c0909341SAndroid Build Coastguard Worker    paddw                m9, m1
823*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m7
824*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; current row becomes the previous row
825*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
826*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m9 ; per 128-bit lane: m8 bytes low, m9 bytes high
827*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m8 ; one 64-byte output row
828*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
829*c0909341SAndroid Build Coastguard Worker    dec                  hd
830*c0909341SAndroid Build Coastguard Worker    jg .hv_w64_loop
831*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w128: combined horizontal+vertical path, 128 output bytes per row, one
; output row per loop iteration. Same scheme as the 64-wide section, applied to
; four vectors: m0..m3 hold the filtered previous row, m8..m11 the current row,
; m12..m15 the blended result.
; NOTE(review): m4 (pshufb control) and m5/m6/m7 are inputs set up before the
; jump here (not visible in this excerpt); TODO confirm their contents.
; REPX is a helper macro defined elsewhere; from its use here it applies the
; braced instruction to each listed register.
832*c0909341SAndroid Build Coastguard Worker.hv_w128:
833*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; byte offset 0
834*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1] ; byte offset 8
835*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*8] ; byte offset 64
836*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+8*9] ; byte offset 72
837*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m4}, m0, m1, m2, m3
838*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
839*c0909341SAndroid Build Coastguard Worker.hv_w128_loop:
840*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next source row
841*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+8*0]
842*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+8*1]
843*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+8*8]
844*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+8*9]
845*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m4}, m8, m9, m10, m11
846*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m8, m9, m10, m11
    ; m12..m15 = prev + pmulhw(2 * (cur - prev), m6), then pmulhrsw by m7
847*c0909341SAndroid Build Coastguard Worker    psubw               m12, m8, m0
848*c0909341SAndroid Build Coastguard Worker    psubw               m13, m9, m1
849*c0909341SAndroid Build Coastguard Worker    psubw               m14, m10, m2
850*c0909341SAndroid Build Coastguard Worker    psubw               m15, m11, m3
851*c0909341SAndroid Build Coastguard Worker    paddw               m12, m12
852*c0909341SAndroid Build Coastguard Worker    pmulhw              m12, m6
853*c0909341SAndroid Build Coastguard Worker    paddw               m13, m13
854*c0909341SAndroid Build Coastguard Worker    pmulhw              m13, m6
855*c0909341SAndroid Build Coastguard Worker    paddw               m14, m14
856*c0909341SAndroid Build Coastguard Worker    pmulhw              m14, m6
857*c0909341SAndroid Build Coastguard Worker    paddw               m15, m15
858*c0909341SAndroid Build Coastguard Worker    pmulhw              m15, m6
859*c0909341SAndroid Build Coastguard Worker    paddw               m12, m0
860*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m7
861*c0909341SAndroid Build Coastguard Worker    paddw               m13, m1
862*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m7
863*c0909341SAndroid Build Coastguard Worker    paddw               m14, m2
864*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7
865*c0909341SAndroid Build Coastguard Worker    paddw               m15, m3
866*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m7
    ; current row becomes the previous row for the next iteration
867*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
868*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
869*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
870*c0909341SAndroid Build Coastguard Worker    mova                 m3, m11
871*c0909341SAndroid Build Coastguard Worker    packuswb            m12, m13 ; words -> bytes, first 64 output bytes
872*c0909341SAndroid Build Coastguard Worker    packuswb            m14, m15 ; words -> bytes, second 64 output bytes
873*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m12
874*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m14
875*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
876*c0909341SAndroid Build Coastguard Worker    dec                  hd
877*c0909341SAndroid Build Coastguard Worker    jg .hv_w128_loop
878*c0909341SAndroid Build Coastguard Worker    RET
879*c0909341SAndroid Build Coastguard Worker
880*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 3, 5, 6
881*c0909341SAndroid Build Coastguard Worker
; prep_bilin_8bpc entry: selects one of the code paths below from the two
; fractional offsets read from r5m (mx) and r6m (my):
;   mx != 0            -> .h  (which itself branches to .hv when my != 0)
;   mx == 0, my != 0   -> .v
;   mx == 0, my == 0   -> .prep (falls through)
; Named arguments per the cglobal line: tmp, src, stride, w, h, mxy, stride3.
; NOTE(review): DECLARE_REG_TMP 3, 5, 6 presumably makes t2 the same register as
; stride3 (r6); that would explain why stride3q is only written after t2 has
; been added into wq - confirm against the macro definition.
882*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
883*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r5m ; mx
884*c0909341SAndroid Build Coastguard Worker    lea                  t2, [prep_avx512icl] ; base address for the jump tables
885*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of w (= log2(w) for powers of two)
886*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
887*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
888*c0909341SAndroid Build Coastguard Worker    jnz .h
889*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
890*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
891*c0909341SAndroid Build Coastguard Worker    jnz .v
; .prep: no fractional offset in either direction; dispatch on width.
892*c0909341SAndroid Build Coastguard Worker.prep:
893*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t2+wq*2+table_offset(prep,)] ; 16-bit table entry indexed by wq
894*c0909341SAndroid Build Coastguard Worker    add                  wq, t2 ; entry is an offset relative to t2
895*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
896*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .prep_w4: unfiltered path, 4 pixels per row, four rows per iteration.
; Each byte is zero-extended to a word and shifted left by 4 (x16);
; 16 words (32 bytes) are written to tmp per iteration.
897*c0909341SAndroid Build Coastguard Worker.prep_w4:
898*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+strideq*0] ; 4 bytes of row 0
899*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [srcq+strideq*1], 1 ; row 1
900*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [srcq+strideq*2], 2 ; row 2
901*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [srcq+stride3q ], 3 ; row 3
902*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
903*c0909341SAndroid Build Coastguard Worker    pmovzxbw            ym0, xmm0 ; 16 bytes -> 16 words
904*c0909341SAndroid Build Coastguard Worker    psllw               ym0, 4 ; x16
905*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym0
906*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
907*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
908*c0909341SAndroid Build Coastguard Worker    jg .prep_w4
909*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w8: unfiltered path, 8 pixels per row, four rows per iteration.
; Each byte is zero-extended to a word and shifted left by 4 (x16);
; 32 words (64 bytes) are written to tmp per iteration.
910*c0909341SAndroid Build Coastguard Worker.prep_w8:
911*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+strideq*0] ; 8 bytes of row 0
912*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+strideq*1] ; 8 bytes of row 1
913*c0909341SAndroid Build Coastguard Worker    vinserti128         ym0, ymm0, [srcq+strideq*2], 1 ; 16-byte read of row 2 -> high lane
914*c0909341SAndroid Build Coastguard Worker    vinserti128         ym1, ymm1, [srcq+stride3q ], 1 ; 16-byte read of row 3 -> high lane
915*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
916*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym0, ym1 ; keep low 8 bytes per lane: rows 0,1 | rows 2,3
917*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, ym0 ; 32 bytes -> 32 words
918*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4 ; x16
919*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
920*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
921*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
922*c0909341SAndroid Build Coastguard Worker    jg .prep_w8
923*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w16: unfiltered path, 16 pixels per row, four rows per iteration.
; Each byte is zero-extended to a word and shifted left by 4 (x16);
; 128 bytes are written to tmp per iteration.
924*c0909341SAndroid Build Coastguard Worker.prep_w16:
925*c0909341SAndroid Build Coastguard Worker    movu               xmm0, [srcq+strideq*0] ; row 0
926*c0909341SAndroid Build Coastguard Worker    vinserti128         ym0, ymm0, [srcq+strideq*1], 1 ; row 1 -> high lane
927*c0909341SAndroid Build Coastguard Worker    movu               xmm1, [srcq+strideq*2] ; row 2
928*c0909341SAndroid Build Coastguard Worker    vinserti128         ym1, ymm1, [srcq+stride3q ], 1 ; row 3 -> high lane
929*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
930*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, ym0 ; bytes -> words
931*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, ym1
932*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4 ; x16
933*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
934*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0 ; rows 0-1
935*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1 ; rows 2-3
936*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
937*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
938*c0909341SAndroid Build Coastguard Worker    jg .prep_w16
939*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w32: unfiltered path, 32 pixels per row, four rows per iteration.
; pmovzxbw with a memory source reads 32 bytes and widens them to 32 words;
; every word is then shifted left by 4 (x16). 256 bytes go to tmp per iteration.
940*c0909341SAndroid Build Coastguard Worker.prep_w32:
941*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+strideq*0]
942*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+strideq*1]
943*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+strideq*2]
944*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+stride3q ]
945*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
946*c0909341SAndroid Build Coastguard Worker    REPX       {psllw x, 4}, m0, m1, m2, m3
947*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
948*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
949*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
950*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
951*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
952*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
953*c0909341SAndroid Build Coastguard Worker    jg .prep_w32
954*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w64: unfiltered path, 64 pixels per row, two rows per iteration.
; Each row is handled as two 32-byte halves, widened to words and shifted
; left by 4 (x16). 256 bytes go to tmp per iteration.
955*c0909341SAndroid Build Coastguard Worker.prep_w64:
956*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+strideq*0+32*0] ; row 0, bytes 0-31
957*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+strideq*0+32*1] ; row 0, bytes 32-63
958*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+strideq*1+32*0] ; row 1, bytes 0-31
959*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+strideq*1+32*1] ; row 1, bytes 32-63
960*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
961*c0909341SAndroid Build Coastguard Worker    REPX       {psllw x, 4}, m0, m1, m2, m3
962*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
963*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
964*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
965*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
966*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
967*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
968*c0909341SAndroid Build Coastguard Worker    jg .prep_w64
969*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w128: unfiltered path, 128 pixels per row, one row per iteration.
; The row is handled as four 32-byte chunks, widened to words and shifted
; left by 4 (x16). 256 bytes go to tmp per iteration.
970*c0909341SAndroid Build Coastguard Worker.prep_w128:
971*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+32*0]
972*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+32*1]
973*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+32*2]
974*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+32*3]
975*c0909341SAndroid Build Coastguard Worker    REPX       {psllw x, 4}, m0, m1, m2, m3
976*c0909341SAndroid Build Coastguard Worker    mova    [tmpq+64*0], m0
977*c0909341SAndroid Build Coastguard Worker    mova    [tmpq+64*1], m1
978*c0909341SAndroid Build Coastguard Worker    mova    [tmpq+64*2], m2
979*c0909341SAndroid Build Coastguard Worker    mova    [tmpq+64*3], m3
980*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
981*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq ; next source row
982*c0909341SAndroid Build Coastguard Worker    dec                  hd
983*c0909341SAndroid Build Coastguard Worker    jg .prep_w128
984*c0909341SAndroid Build Coastguard Worker    RET
; .h: reached when mx != 0. Builds the horizontal weight pair in m5, then
; either continues to .hv (my != 0) or dispatches on width to an .h_w* section.
985*c0909341SAndroid Build Coastguard Worker.h:
986*c0909341SAndroid Build Coastguard Worker    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
987*c0909341SAndroid Build Coastguard Worker    ; = (16 - mx) * src[x] + mx * src[x + 1]
    ; mx*255 + 16 == (mx << 8) + (16 - mx): low byte 16 - mx, high byte mx
    ; (assuming 0 <= mx <= 16; the range is not visible here - TODO confirm)
988*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255
989*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16
990*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, mxyd ; replicate the 16-bit weight pair to every word
991*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
992*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
993*c0909341SAndroid Build Coastguard Worker    jnz .hv
994*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)] ; 16-bit table entry indexed by wq
995*c0909341SAndroid Build Coastguard Worker    add                  wq, t2 ; entry is an offset relative to t2
    ; NOTE(review): stride3q is written only after the last use of t2 -
    ; presumably because both name the same register; confirm.
996*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
997*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w4: horizontal-only path, 4 output words per row, four rows per iteration.
; ym5 holds the weight pair broadcast in .h; 32 bytes go to tmp per iteration.
998*c0909341SAndroid Build Coastguard Worker.h_w4:
999*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [bilin_h_shuf4] ; 16-byte pshufb control in both lanes
1000*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
1001*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+strideq*0] ; 8 bytes of row 0
1002*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+strideq*1] ; 8 bytes of row 1
1003*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, ymm0, [srcq+strideq*2], 1 ; 16-byte read of row 2 -> high lane
1004*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, ymm1, [srcq+stride3q ], 1 ; 16-byte read of row 3 -> high lane
1005*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1006*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym0, ym1 ; low 8 bytes per lane: rows 0,1 | rows 2,3
1007*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, ym4 ; arrange bytes for pmaddubsw (presumably x / x+1 pairs)
1008*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym5 ; weighted sum of each byte pair -> word
1009*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym0
1010*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1011*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
1012*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
1013*c0909341SAndroid Build Coastguard Worker    RET
; .h_w8: horizontal-only path, 8 output words per row, four rows per iteration
; (one source row per 128-bit lane). m5 holds the weight pair broadcast in .h;
; 64 bytes go to tmp per iteration.
1014*c0909341SAndroid Build Coastguard Worker.h_w8:
1015*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [bilin_h_perm16] ; first 16 bytes of the table, copied to all lanes
1016*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
1017*c0909341SAndroid Build Coastguard Worker    movu               xmm0, [srcq+strideq*0] ; row 0 -> lane 0
1018*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, ymm0, [srcq+strideq*1], 1 ; row 1 -> lane 1
1019*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+strideq*2], 2 ; row 2 -> lane 2
1020*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+stride3q ], 3 ; row 3 -> lane 3
1021*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1022*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; per-lane byte shuffle (presumably x / x+1 pairs)
1023*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; weighted sum of each byte pair -> word
1024*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
1025*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1026*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1027*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
1028*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16: horizontal-only path, 16 output words per row, four rows per
; iteration (two rows per vector). m5 holds the weight pair broadcast in .h;
; 128 bytes go to tmp per iteration.
1029*c0909341SAndroid Build Coastguard Worker.h_w16:
1030*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm16] ; full 64-byte vpermb control
1031*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
1032*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0] ; 32 bytes of row 0 -> low half
1033*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*1], 1 ; 32 bytes of row 1 -> high half
1034*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+strideq*2] ; row 2 -> low half
1035*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+stride3q ], 1 ; row 3 -> high half
1036*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1037*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m0 ; byte permute (presumably x / x+1 pairs)
1038*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, m1
1039*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; weighted sum of each byte pair -> word
1040*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1041*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1042*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1043*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1044*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1045*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
1046*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal-only path, 32 output words per row, four rows per
; iteration (one row per vector). Each vpermb reads 64 bytes starting at the
; row pointer. m5 holds the weight pair broadcast in .h; 256 bytes go to tmp
; per iteration.
1047*c0909341SAndroid Build Coastguard Worker.h_w32:
1048*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32] ; 64-byte vpermb control
1049*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
1050*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+strideq*0]
1051*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, [srcq+strideq*1]
1052*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+strideq*2]
1053*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, [srcq+stride3q ]
1054*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1055*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; weighted sum of each byte pair -> word
1056*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1057*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1058*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1059*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1060*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1061*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
1062*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
1063*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1064*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
1065*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop
1066*c0909341SAndroid Build Coastguard Worker    RET
; .h_w64: horizontal-only path, 64 output words per row, two rows per
; iteration (two vectors per row, source offsets 0 and 32). Each vpermb reads
; 64 bytes starting at its offset. m5 holds the weight pair broadcast in .h;
; 256 bytes go to tmp per iteration.
1067*c0909341SAndroid Build Coastguard Worker.h_w64:
1068*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32] ; 64-byte vpermb control
1069*c0909341SAndroid Build Coastguard Worker.h_w64_loop:
1070*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+strideq*0+32*0] ; row 0, first half
1071*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, [srcq+strideq*0+32*1] ; row 0, second half
1072*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+strideq*1+32*0] ; row 1, first half
1073*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, [srcq+strideq*1+32*1] ; row 1, second half
1074*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1075*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; weighted sum of each byte pair -> word
1076*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1077*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1078*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1079*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1080*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1081*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
1082*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
1083*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1084*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
1085*c0909341SAndroid Build Coastguard Worker    jg .h_w64_loop
1086*c0909341SAndroid Build Coastguard Worker    RET
; .h_w128: horizontal-only path, 128 output words per row, one row per
; iteration (four vectors, source offsets 0/32/64/96). Each vpermb reads
; 64 bytes starting at its offset. m5 holds the weight pair broadcast in .h;
; 256 bytes go to tmp per iteration.
1087*c0909341SAndroid Build Coastguard Worker.h_w128:
1088*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32] ; 64-byte vpermb control
1089*c0909341SAndroid Build Coastguard Worker.h_w128_loop:
1090*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+32*0]
1091*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, [srcq+32*1]
1092*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+32*2]
1093*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, [srcq+32*3]
1094*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; weighted sum of each byte pair -> word
1095*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1096*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1097*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1098*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
1099*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
1100*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m2
1101*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m3
1102*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1103*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq ; next source row
1104*c0909341SAndroid Build Coastguard Worker    dec                  hd
1105*c0909341SAndroid Build Coastguard Worker    jg .h_w128_loop
1106*c0909341SAndroid Build Coastguard Worker    RET
; .v: reached from the entry code when mx == 0 and my != 0 (mxyd holds my).
; Builds the vertical weight pair in m6 and dispatches on width to a .v_w*
; section.
1107*c0909341SAndroid Build Coastguard Worker.v:
    ; presumably declares 7 vector registers in use so Win64 builds preserve
    ; callee-saved xmm registers - macro defined elsewhere, TODO confirm
1108*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
1109*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)] ; 16-bit table entry indexed by wq
    ; my*255 + 16 == (my << 8) + (16 - my): low byte 16 - my, high byte my
    ; (assuming 0 <= my <= 16; the range is not visible here - TODO confirm)
1110*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255
1111*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16
1112*c0909341SAndroid Build Coastguard Worker    add                  wq, t2 ; entry is an offset relative to t2
    ; NOTE(review): stride3q is written only after the last use of t2 -
    ; presumably because both name the same register; confirm.
1113*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1114*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, mxyd ; replicate the 16-bit weight pair to every word
1115*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .v_w4: vertical-only path, 4 pixels per row, four output rows per iteration.
; Five source rows (0..4) are gathered into ym2 with masked broadcasts; row 4
; stays in ym0 and serves as row 0 of the next iteration.
; k1 = 0x29 = 0b101001 is reused at two granularities: as a dword mask
; (vpblendmd / vpbroadcastd) it selects elements 0, 3 and 5; as a qword mask
; (punpckhqdq) it selects elements 0 and 3.
; The digits in the existing lane comments are source-row numbers, listed from
; the lowest dword upwards; '_' marks a dword whose value is not relevant yet.
; ym6 holds the weight pair broadcast in .v; 32 bytes go to tmp per iteration.
1116*c0909341SAndroid Build Coastguard Worker.v_w4:
1117*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+strideq*0] ; row 0 in every dword
1118*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x29 ; r3d used as scratch for the mask constant
1119*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [bilin_v_shuf4] ; 16-byte pshufb control in both lanes
1120*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
1121*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
1122*c0909341SAndroid Build Coastguard Worker    vpblendmd       xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
1123*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [srcq+strideq*2]
1124*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    ym2{k1}, [srcq+stride3q ]             ; __2_ 23__
1125*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1126*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [srcq+strideq*0] ; row 4 (row 0 of the next iteration)
1127*c0909341SAndroid Build Coastguard Worker    punpckhqdq      ym2{k1}, ym1, ym0                     ; 012_ 234_
1128*c0909341SAndroid Build Coastguard Worker    pshufb              ym2, ym3 ; arrange bytes for pmaddubsw (presumably vertical pairs)
1129*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym2, ym6 ; weighted sum of each byte pair -> word
1130*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym2
1131*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1132*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
1133*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1134*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only path, 8 pixels per row, four output rows per iteration.
; Five source rows are spread over m1 (rows 0, 1, 3) and m0 (rows 4, 2) and
; combined by a two-source byte permute. Lane 0 of ym0 carries the last row
; into the next iteration. m6 holds the weight pair broadcast in .v; 64 bytes
; go to tmp per iteration.
1135*c0909341SAndroid Build Coastguard Worker.v_w8:
1136*c0909341SAndroid Build Coastguard Worker    mova                 m5, [bilin_v_perm8] ; byte indices for vpermt2b
1137*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym0, [srcq+strideq*0] ; 16-byte read of row 0 in both lanes
1138*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1139*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1 ; lane 0 = row 0, lane 1 = row 1
1140*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym0, [srcq+strideq*2] ; 8 bytes of row 2 in every qword
1141*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+stride3q ], 2 ; lane 2 = row 3
1142*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1143*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+strideq*0], 0 ; lane 0 = row 4, lane 1 keeps row 2
1144*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m5, m0 ; pick bytes from m1/m0 per m5, result in m1
1145*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6 ; weighted sum of each byte pair -> word
1146*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1147*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1148*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
1149*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1150*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical-only path, 16 pixels per row, four output rows per
; iteration. xm0 carries the last loaded row into the next iteration.
; m1 is built from rows 0, 1 (m1) and 2 (m2); m2 from rows 2, 3 (m2) and 4 (m0).
; m6 holds the weight pair broadcast in .v; 128 bytes go to tmp per iteration.
1151*c0909341SAndroid Build Coastguard Worker.v_w16:
1152*c0909341SAndroid Build Coastguard Worker    mova                 m5, [bilin_v_perm16] ; byte indices for vpermt2b
1153*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0] ; row 0
1154*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
1155*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*2] ; row 2
1156*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1 ; lane 0 = row 0, lane 1 = row 1
1157*c0909341SAndroid Build Coastguard Worker    vpermt2b             m1, m5, m2 ; pick bytes from m1/m2 per m5, result in m1
1158*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+stride3q ], 1 ; lane 0 = row 2, lane 1 = row 3
1159*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1160*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0] ; row 4 (row 0 of the next iteration)
1161*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m5, m0 ; pick bytes from m2/m0 per m5, result in m2
1162*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6 ; weighted sum of each byte pair -> word
1163*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1164*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m1
1165*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1166*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1167*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows done
1168*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1169*c0909341SAndroid Build Coastguard Worker    RET
1170*c0909341SAndroid Build Coastguard Worker.v_w32:
1171*c0909341SAndroid Build Coastguard Worker    mova                 m5, [bilin_v_perm32]
1172*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
1173*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
1174*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+strideq*1]
1175*c0909341SAndroid Build Coastguard Worker    movu                ym3, [srcq+strideq*2]
1176*c0909341SAndroid Build Coastguard Worker    movu                ym4, [srcq+stride3q ]
1177*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1178*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m5, m2
1179*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m5, m3
1180*c0909341SAndroid Build Coastguard Worker    vpermt2b             m3, m5, m4
1181*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m0, m6
1182*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
1183*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m5, m0
1184*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1185*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1186*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1187*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m1
1188*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1189*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m3
1190*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m4
1191*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1192*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1193*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
1194*c0909341SAndroid Build Coastguard Worker    RET
1195*c0909341SAndroid Build Coastguard Worker.v_w64:
1196*c0909341SAndroid Build Coastguard Worker    mova                 m5, [bilin_v_perm64]
1197*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, [srcq+strideq*0]
1198*c0909341SAndroid Build Coastguard Worker.v_w64_loop:
1199*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m5, [srcq+strideq*1]
1200*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1201*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m1
1202*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0, m1
1203*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, [srcq+strideq*0]
1204*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m1, m0
1205*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m0
1206*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1207*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1208*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1209*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
1210*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m4
1211*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1212*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m3
1213*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m1
1214*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1215*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1216*c0909341SAndroid Build Coastguard Worker    jg .v_w64_loop
1217*c0909341SAndroid Build Coastguard Worker    RET
1218*c0909341SAndroid Build Coastguard Worker.v_w128:
1219*c0909341SAndroid Build Coastguard Worker    mova                 m5, [bilin_v_perm64]
1220*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, [srcq+strideq*0+ 0]
1221*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m5, [srcq+strideq*0+64]
1222*c0909341SAndroid Build Coastguard Worker.v_w128_loop:
1223*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m5, [srcq+strideq*1+ 0]
1224*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m5, [srcq+strideq*1+64]
1225*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1226*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m2
1227*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m2
1228*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1229*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m6
1230*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m4
1231*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m0
1232*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m1, m3
1233*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3
1234*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1235*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
1236*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m4
1237*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m1
1238*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, [srcq+strideq*0+ 0]
1239*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m5, [srcq+strideq*0+64]
1240*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m2, m0
1241*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0
1242*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1243*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1244*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*4], m4
1245*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*5], m2
1246*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m3, m1
1247*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
1248*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1249*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1250*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*6], m4
1251*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*7], m3
1252*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*8
1253*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1254*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop
1255*c0909341SAndroid Build Coastguard Worker    RET
1256*c0909341SAndroid Build Coastguard Worker.hv:
1257*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
1258*c0909341SAndroid Build Coastguard Worker    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
1259*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
1260*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
1261*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
1262*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, mxyd
1263*c0909341SAndroid Build Coastguard Worker    add                  wq, t2
1264*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1265*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1266*c0909341SAndroid Build Coastguard Worker.hv_w4:
1267*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [bilin_h_shuf4]
1268*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym0, [srcq+strideq*0]
1269*c0909341SAndroid Build Coastguard Worker    pshufb              ym0, ym4
1270*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym5
1271*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1272*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+strideq*1]
1273*c0909341SAndroid Build Coastguard Worker    movq               xmm2, [srcq+strideq*2]
1274*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, ymm1, [srcq+stride3q ], 1
1275*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1276*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, ymm2, [srcq+strideq*0], 1
1277*c0909341SAndroid Build Coastguard Worker    punpcklqdq          ym1, ym2
1278*c0909341SAndroid Build Coastguard Worker    pshufb              ym1, ym4
1279*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym5         ; 1 2 3 4
1280*c0909341SAndroid Build Coastguard Worker    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
1281*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1
1282*c0909341SAndroid Build Coastguard Worker    psubw               ym1, ym2
1283*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym6
1284*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
1285*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym1
1286*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1287*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1288*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1289*c0909341SAndroid Build Coastguard Worker    RET
1290*c0909341SAndroid Build Coastguard Worker.hv_w8:
1291*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [bilin_h_perm16]
1292*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+strideq*0]
1293*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1294*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1295*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1296*c0909341SAndroid Build Coastguard Worker    movu               xmm1, [srcq+strideq*1]
1297*c0909341SAndroid Build Coastguard Worker    vinserti128         ym1, ymm1, [srcq+strideq*2], 1
1298*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+stride3q ], 2
1299*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1300*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*0], 3
1301*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1302*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5        ; 1 2 3 4
1303*c0909341SAndroid Build Coastguard Worker    valignq              m2, m1, m0, 6 ; 0 1 2 3
1304*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1305*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1306*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1307*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1308*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1309*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
1310*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1311*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1312*c0909341SAndroid Build Coastguard Worker    RET
1313*c0909341SAndroid Build Coastguard Worker.hv_w16:
1314*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm16]
1315*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [srcq+strideq*0]
1316*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m0
1317*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1318*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
1319*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+strideq*1]
1320*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+strideq*2], 1
1321*c0909341SAndroid Build Coastguard Worker    movu                ym2, [srcq+stride3q ]
1322*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1323*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [srcq+strideq*0], 1
1324*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, m1
1325*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, m2
1326*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5            ; 1 2
1327*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m0, m1, q1032 ; 0 1
1328*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5        ; 3 4
1329*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m1, m0, q1032 ; 2 3
1330*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
1331*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1332*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1333*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0, m2
1334*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1335*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1336*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m1
1337*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m3
1338*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1339*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1340*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
1341*c0909341SAndroid Build Coastguard Worker    RET
1342*c0909341SAndroid Build Coastguard Worker.hv_w32:
1343*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32]
1344*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+strideq*0]
1345*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1346*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
1347*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, [srcq+strideq*1]
1348*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1349*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+strideq*0]
1350*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1351*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, m0
1352*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1353*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1354*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5
1355*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
1356*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1357*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1358*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m3
1359*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m2
1360*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1361*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1362*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
1363*c0909341SAndroid Build Coastguard Worker    RET
1364*c0909341SAndroid Build Coastguard Worker.hv_w64:
1365*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32]
1366*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+32*0]
1367*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, [srcq+32*1]
1368*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1369*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1370*c0909341SAndroid Build Coastguard Worker.hv_w64_loop:
1371*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1372*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+32*0]
1373*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, [srcq+32*1]
1374*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1375*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1376*c0909341SAndroid Build Coastguard Worker    psubw                m7, m2, m0
1377*c0909341SAndroid Build Coastguard Worker    psubw                m8, m3, m1
1378*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m6
1379*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m6
1380*c0909341SAndroid Build Coastguard Worker    paddw                m7, m0
1381*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
1382*c0909341SAndroid Build Coastguard Worker    paddw                m8, m1
1383*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1384*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m7
1385*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m8
1386*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
1387*c0909341SAndroid Build Coastguard Worker    dec                  hd
1388*c0909341SAndroid Build Coastguard Worker    jg .hv_w64_loop
1389*c0909341SAndroid Build Coastguard Worker    RET
1390*c0909341SAndroid Build Coastguard Worker.hv_w128:
1391*c0909341SAndroid Build Coastguard Worker    mova                 m4, [bilin_h_perm32]
1392*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, [srcq+32*0]
1393*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, [srcq+32*1]
1394*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, [srcq+32*2]
1395*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m4, [srcq+32*3]
1396*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
1397*c0909341SAndroid Build Coastguard Worker.hv_w128_loop:
1398*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1399*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m4, [srcq+32*0]
1400*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m4, [srcq+32*1]
1401*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m4, [srcq+32*2]
1402*c0909341SAndroid Build Coastguard Worker    vpermb              m10, m4, [srcq+32*3]
1403*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m7, m8, m9, m10
1404*c0909341SAndroid Build Coastguard Worker    psubw               m11, m7, m0
1405*c0909341SAndroid Build Coastguard Worker    psubw               m12, m8, m1
1406*c0909341SAndroid Build Coastguard Worker    psubw               m13, m9, m2
1407*c0909341SAndroid Build Coastguard Worker    psubw               m14, m10, m3
1408*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw  x, m6}, m11, m12, m13, m14
1409*c0909341SAndroid Build Coastguard Worker    paddw               m11, m0
1410*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
1411*c0909341SAndroid Build Coastguard Worker    paddw               m12, m1
1412*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
1413*c0909341SAndroid Build Coastguard Worker    paddw               m13, m2
1414*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9
1415*c0909341SAndroid Build Coastguard Worker    paddw               m14, m3
1416*c0909341SAndroid Build Coastguard Worker    mova                 m3, m10
1417*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m11
1418*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m12
1419*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*2], m13
1420*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*3], m14
1421*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*4
1422*c0909341SAndroid Build Coastguard Worker    dec                  hd
1423*c0909341SAndroid Build Coastguard Worker    jg .hv_w128_loop
1424*c0909341SAndroid Build Coastguard Worker    RET
1425*c0909341SAndroid Build Coastguard Worker
1426*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8]
; Each FILTER_* value packs two row offsets into that table (15 rows per
; filter set): bits 16 and up select the set used for the full-length
; filter, the low byte selects the set used for the 4-tap filter.
; put_6tap_8bpc adds mx * 0x010101 (resp. my * 0x010101) to the value, so
; both fields become "set * 15 + subpel position" row indices, with the
; plain position left in the middle byte.
;   REGULAR: full-length set 0, 4-tap set 3
;   SMOOTH:  full-length set 1, 4-tap set 4
;   SHARP:   full-length set 2, 4-tap set 3 (same 4-tap set as REGULAR)
1427*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15
1428*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1429*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP   (2*15 << 16) | 3*15
1430*c0909341SAndroid Build Coastguard Worker
; FN fn, type, type_h, type_v[, jmp_to]
; Emits the entry point <fn>_<type>_8bpc. The stub contains no filtering
; code: it only records which filter family to use, FILTER_<type_h> in t0d
; and FILTER_<type_v> in t1d, and then hands over to the shared
; implementation named by jmp_to. When jmp_to is omitted no jump is emitted
; and execution falls through into whatever code follows the invocation,
; so such a stub must be placed directly in front of its implementation.
1431*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
1432*c0909341SAndroid Build Coastguard Workercglobal %1_%2_8bpc
1433*c0909341SAndroid Build Coastguard Worker    mov                 t0d, FILTER_%3
1434*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4
    ; same family in both directions: copy the register instead of loading
    ; the identical immediate a second time
1435*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d
1436*c0909341SAndroid Build Coastguard Worker%else
1437*c0909341SAndroid Build Coastguard Worker    mov                 t1d, FILTER_%4
1438*c0909341SAndroid Build Coastguard Worker%endif
1439*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; skip the jump in the last filter
    ; the target is the fully decorated symbol of the shared implementation
    ; (project prefix + name + instruction-set suffix)
1440*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1441*c0909341SAndroid Build Coastguard Worker%endif
1442*c0909341SAndroid Build Coastguard Worker%endmacro
1443*c0909341SAndroid Build Coastguard Worker
; PUT_8TAP_H dst/src, tmp1, tmp2, tmp3[, vpermb]
; One horizontal filter step built on vpdpbusd. All arguments except the
; last are vector register numbers. On entry the dst/src register holds the
; source bytes; on exit it holds the filtered 16-bit results. tmp1-tmp3 are
; clobbered. A non-zero fifth argument selects vpermb (byte permute across
; the whole register) instead of pshufb (byte shuffle within 128-bit lanes)
; to rearrange the source.
;
; Registers the caller must have set up (none are written here):
;   m5       start value of every 32-bit accumulator
;   m6-m8    shuffle/permute controls giving three rearrangements A, B, C
;            of the source
;   m9, m10  signed-byte multiplicands for vpdpbusd
;            NOTE(review): presumably the filter taps split into two groups
;            of four - confirm at the call sites
;
; Result:  packusdw(m5 + A.m9 + B.m10, m5 + B.m9 + C.m10) >> 6
; where X.m is vpdpbusd's per-dword sum of four unsigned*signed byte
; products.
1444*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
1445*c0909341SAndroid Build Coastguard Worker%if %5
1446*c0909341SAndroid Build Coastguard Worker    vpermb              m%2, m6, m%1
1447*c0909341SAndroid Build Coastguard Worker    vpermb              m%3, m7, m%1
1448*c0909341SAndroid Build Coastguard Worker    vpermb              m%4, m8, m%1
1449*c0909341SAndroid Build Coastguard Worker%else
    ; When tmp1's register number is not below tmp3's, the first shuffle is
    ; omitted and tmp1 is used as it stands, so the caller must have left
    ; the A rearrangement in it beforehand.
1450*c0909341SAndroid Build Coastguard Worker%if %2 < %4 ; reuse a previous value if possible
1451*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m%1, m6
1452*c0909341SAndroid Build Coastguard Worker%endif
1453*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m%1, m7
1454*c0909341SAndroid Build Coastguard Worker    pshufb              m%4, m%1, m8
1455*c0909341SAndroid Build Coastguard Worker%endif
    ; the source is no longer needed from here on, so its register becomes
    ; the first accumulator and tmp1 (once consumed) the second
1456*c0909341SAndroid Build Coastguard Worker    mova                m%1, m5
1457*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m%1, m%2, m9
1458*c0909341SAndroid Build Coastguard Worker    mova                m%2, m5
1459*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m%2, m%3, m9
1460*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m%1, m%3, m10
1461*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m%2, m%4, m10
    ; narrow both dword accumulators to words with unsigned saturation,
    ; then drop the low 6 bits of each word
1462*c0909341SAndroid Build Coastguard Worker    packusdw            m%1, m%2
1463*c0909341SAndroid Build Coastguard Worker    psrlw               m%1, 6
1464*c0909341SAndroid Build Coastguard Worker%endmacro
1465*c0909341SAndroid Build Coastguard Worker
; Scratch registers t0/t1 (x86inc DECLARE_REG_TMP): r4/r5 on WIN64, r7/r8
; elsewhere. The FN entry stubs use them to pass the packed horizontal
; (t0d) and vertical (t1d) FILTER_* values to put_6tap_8bpc.
; Both choices alias registers that put_6tap_8bpc uses for other purposes:
; r4/r5 are its w/h registers and r8 is its table base. That function
; therefore folds t0d/t1d into mxd/myd before it loads r8, wq or hd.
; NOTE(review): the per-ABI choice presumably avoids registers that carry
; register-passed arguments on each platform - confirm against x86inc.
1466*c0909341SAndroid Build Coastguard Worker%if WIN64
1467*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5
1468*c0909341SAndroid Build Coastguard Worker%else
1469*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8
1470*c0909341SAndroid Build Coastguard Worker%endif
1471*c0909341SAndroid Build Coastguard Worker
1472*c0909341SAndroid Build Coastguard Worker; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
1473*c0909341SAndroid Build Coastguard Worker; the horizontal filter, 6-tap is only used for the vertical filter.
; (vpdpbusd sums four byte products per 32-bit lane, so PUT_8TAP_H spends
; two of them per accumulator regardless; presumably a 6-tap horizontal
; filter would need the same two and save nothing.)
;
; Entry stubs put_8tap_<type>_8bpc for every combination whose vertical
; type is SMOOTH or REGULAR; the horizontal type may be any of the three.
; Columns: type suffix, horizontal family, vertical family, shared
; implementation to jump to.
1474*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap,
1475*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_6tap_8bpc
1476*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_6tap_8bpc
1477*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
1478*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
1479*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
; The last stub names no jump target: it falls through into put_6tap_8bpc,
; which must stay immediately below.
1480*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular,        REGULAR, REGULAR
1481*c0909341SAndroid Build Coastguard Worker
1482*c0909341SAndroid Build Coastguard Workercglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
1483*c0909341SAndroid Build Coastguard Worker%define base r8-put_avx512icl
1484*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1485*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1486*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1487*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
1488*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx512icl]
1489*c0909341SAndroid Build Coastguard Worker    movsxd               wq, wm
1490*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1491*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1492*c0909341SAndroid Build Coastguard Worker    jnz .h
1493*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1494*c0909341SAndroid Build Coastguard Worker    jnz .v
1495*c0909341SAndroid Build Coastguard Worker.put:
1496*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1497*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put,)]
1498*c0909341SAndroid Build Coastguard Worker    add                  wq, r8
1499*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
1500*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dsq*3]
1501*c0909341SAndroid Build Coastguard Worker%if WIN64
1502*c0909341SAndroid Build Coastguard Worker    pop                  r8
1503*c0909341SAndroid Build Coastguard Worker%endif
1504*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1505*c0909341SAndroid Build Coastguard Worker.v:
1506*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1507*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1508*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1509*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1510*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, wd
1511*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
1512*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [pw_512]
1513*c0909341SAndroid Build Coastguard Worker    lea                 myq, [base+subpel_filters+1+myq*8]
1514*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [myq+0]
1515*c0909341SAndroid Build Coastguard Worker    add                  r6, r8
1516*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+2]
1517*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1518*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [myq+4]
1519*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1520*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; Vertical-only 6-tap path for 2-pixel-wide blocks, two output rows (a, b)
; per loop iteration. m7/m8/m9 hold the three broadcast tap pairs and m6
; the pw_512 rounding multiplier, all loaded in .v; nsq is -ssq, so the
; first loads reach two rows above srcq. Row numbers in the comments count
; from that topmost row and describe the first iteration.
1521*c0909341SAndroid Build Coastguard Worker.v_w2:
1522*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+nsq*2]
1523*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm2, [srcq+nsq*1], 2
1524*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm2, [srcq+ssq*0], 4
1525*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm2, [srcq+ssq*1], 6  ; 0 1 2 3
1526*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1527*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
1528*c0909341SAndroid Build Coastguard Worker    palignr            xmm3, xmm0, xmm2, 4    ; 1 2 3 4
1529*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm2, xmm3       ; 01 12
1530*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm2, xmm3             ; 23 34
1531*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
    ; loop state: xmm1 = first row pairs, xmm2 = second row pairs,
    ; xmm0 = newest row loaded so far (broadcast)
1532*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm4, [srcq+ssq*1]
1533*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1534*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm3, xmm1, xm7        ; a0 b0
1535*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2             ; becomes the first pairs of the next iteration
1536*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm2, xm8              ; a1 b1
1537*c0909341SAndroid Build Coastguard Worker    paddw              xmm3, xmm2
1538*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm2, xmm0, xmm4, 0x02 ; 4 5
1539*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
1540*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm4, xmm0, 0x02       ; 5 6
1541*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm2, xmm4             ; 45 56
1542*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm4, xmm2, xm9        ; a2 b2
1543*c0909341SAndroid Build Coastguard Worker    paddw              xmm3, xmm4
1544*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm3, xm6              ; rounding shift: (x + 32) >> 6, assuming pw_512 holds 512
1545*c0909341SAndroid Build Coastguard Worker    packuswb           xmm3, xmm3
1546*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm3, 0
1547*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm3, 2
1548*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1549*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1550*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1551*c0909341SAndroid Build Coastguard Worker    RET
; .v_w4: vertical-only 6-tap path for 4-pixel-wide blocks; every loop pass
; produces two output rows (a and b).
; xm7/xm8/xm9 multiply the 01/23/45 row pairs and xm6 is the pmulhrsw factor;
; all four are initialised before this label, outside this view.
; NOTE(review): nsq is presumably -ssq here (that is how .hv sets it) -- confirm.
1552*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Gather 4 bytes from each of the rows at -2, -1, 0 and +1 strides into
    ; the four dwords of xmm2.
1553*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+nsq*2]
1554*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+nsq*1], 1
1555*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*0], 2
1556*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*1], 3  ; 0 1 2 3
1557*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
    ; Row 4 is broadcast into xmm0; xmm0 keeps the newest even row across
    ; loop iterations.
1558*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
    ; Shift the row window down by one dword, then interleave bytes of
    ; vertically adjacent rows so pmaddubsw can apply two taps at once.
1559*c0909341SAndroid Build Coastguard Worker    palignr            xmm3, xmm0, xmm2, 4    ; 1 2 3 4
1560*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm2, xmm3       ; 01 12
1561*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm2, xmm3             ; 23 34
1562*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
    ; Load row 5, advance srcq by two rows, and start the tap sums while the
    ; old 23/34 pairs are moved into xmm1 for the next pass.
1563*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm4, [srcq+ssq*1]
1564*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1565*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm3, xmm1, xm7        ; a0 b0
1566*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2
1567*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm2, xm8              ; a1 b1
1568*c0909341SAndroid Build Coastguard Worker    paddw              xmm3, xmm2
    ; Blend mask 0x02 takes dword 1 from the second source: dword 0 = older
    ; row, dword 1 = newer row. xmm0 is then reloaded with row 6.
1569*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm2, xmm0, xmm4, 0x02 ; 4 5
1570*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
1571*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm4, xmm0, 0x02       ; 5 6
1572*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm2, xmm4             ; 45 56
1573*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm4, xmm2, xm9        ; a2 b2
1574*c0909341SAndroid Build Coastguard Worker    paddw              xmm3, xmm4
    ; Scale the 16-bit sums with xm6, saturate to unsigned bytes, and store
    ; 4 bytes to each of the two destination rows.
1575*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm3, xm6
1576*c0909341SAndroid Build Coastguard Worker    packuswb           xmm3, xmm3
1577*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm3
1578*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm3, 1
1579*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
    ; Two rows consumed per pass; loop while rows remain.
1580*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1581*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1582*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only 6-tap path for 8-pixel-wide blocks; two output rows
; per loop pass, one per 128-bit lane of the ymm registers.
; ym7/ym8/ym9 multiply the 01/23/45 row pairs and ym6 is the pmulhrsw factor;
; they are initialised before this label, outside this view.
; NOTE(review): nsq is presumably -ssq here (that is how .hv sets it) -- confirm.
1583*c0909341SAndroid Build Coastguard Worker.v_w8:
    ; Load 8 bytes from each of rows 0..4. Row 0 goes into the low qword of
    ; xmm1; the others are broadcast to every qword.
1584*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+nsq*2]
1585*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+nsq*1]
1586*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm2, [srcq+ssq*0]
1587*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*1]
1588*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1589*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
    ; Blend mask 0x30 replaces dwords 4-5 (low qword of the upper lane), so
    ; after each blend the low lane holds row n and the upper lane row n+1.
    ; punpcklbw then interleaves the two blended registers per lane.
1590*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm3, 0x30
1591*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm2, 0x30
1592*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm1, ymm3      ; 01 12
1593*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm4, 0x30
1594*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm4, ymm0, 0x30
1595*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm2, ymm4      ; 23 34
1596*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
    ; Load the next odd row, advance srcq by two rows, and accumulate the
    ; first two tap pairs; the old 23/34 pairs become next pass's 01/12.
1597*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*1]
1598*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1599*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm3, ymm1, ym7 ; a0 b0
1600*c0909341SAndroid Build Coastguard Worker    mova               ymm1, ymm2
1601*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm2, ym8       ; a1 b1
1602*c0909341SAndroid Build Coastguard Worker    paddw              ymm3, ymm2
    ; ymm0 carries the newest even row between passes; it is combined with
    ; the freshly loaded row and then reloaded with the following even row.
1603*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm0, ymm4, 0x30
1604*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
1605*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm4, ymm0, 0x30
1606*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm2, ymm4      ; 45 56
1607*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm4, ymm2, ym9 ; a2 b2
1608*c0909341SAndroid Build Coastguard Worker    paddw              ymm3, ymm4
1609*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm3, ym6
    ; Pack the low lane (row a) and the extracted upper lane (row b) into one
    ; xmm register: low 8 bytes = row a, high 8 bytes = row b.
1610*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm4, ymm3, 1
1611*c0909341SAndroid Build Coastguard Worker    packuswb           xmm3, xmm4
1612*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm3
1613*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm3
1614*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1615*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1616*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
    ; This path wrote the literal ymm0-ymm4 registers, so their upper halves
    ; are cleared before returning.
1617*c0909341SAndroid Build Coastguard Worker    vzeroupper
1618*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical-only 6-tap path for 16-pixel-wide blocks; two output rows
; per loop pass.
; m5 holds the spel_v_perm16a byte-permutation table, k1 = 0x0f restricts the
; vshufpd merges to the low four qwords (low 256 bits).
; m7/m8/m9 multiply the 01/23/45 row pairs and m6 is the pmulhrsw factor;
; they are initialised before this label, outside this view.
; NOTE(review): nsq is presumably -ssq here (that is how .hv sets it) -- confirm.
1619*c0909341SAndroid Build Coastguard Worker.v_w16:
1620*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_v_perm16a]
    ; Rows 0, 2 and 4 are broadcast to all four 128-bit lanes (m1, m2, m0);
    ; rows 1 and 3 only to the two lanes of ym3 / ym4.
1621*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [srcq+nsq*2]
1622*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [srcq+nsq*1]
1623*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x0f
1624*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ssq*0]
1625*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r6d
1626*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+ssq*1]
1627*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1628*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*0]
    ; Under k1 the low 256 bits receive qwords taken from the two following
    ; rows (imm 0xcc picks low qwords in lane 0 and high qwords in lane 1);
    ; the upper 256 bits keep the row already in the destination. vpermb with
    ; m5 then arranges the bytes into the row pairs named in the comments.
1629*c0909341SAndroid Build Coastguard Worker    vshufpd          m1{k1}, m3, m2, 0xcc
1630*c0909341SAndroid Build Coastguard Worker    vshufpd          m2{k1}, m4, m0, 0xcc
1631*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m5, m1 ; 01 12
1632*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m5, m2 ; 23 34
1633*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
    ; Load the next odd row, advance srcq by two rows, and accumulate the
    ; first two tap pairs; the old 23/34 pairs become next pass's 01/12.
1634*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+ssq*1]
1635*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1636*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m7 ; a0 b0
1637*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1638*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8     ; a1 b1
1639*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
    ; m0 carries the newest even row between passes: copy it to m2, reload m0
    ; with the following even row, then merge/permute exactly as in the setup.
1640*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
1641*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*0]
1642*c0909341SAndroid Build Coastguard Worker    vshufpd          m2{k1}, m4, m0, 0xcc
1643*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m5, m2 ; 45 56
1644*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m9 ; a2 b2
1645*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1646*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
    ; Pack the low and high 256-bit halves of the word sums into bytes, then
    ; store 16 bytes to each of the two destination rows.
1647*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym4, m3, 1
1648*c0909341SAndroid Build Coastguard Worker    packuswb            ym3, ym4
1649*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm3
1650*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym3, 1
1651*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1652*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1653*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1654*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32: vertical-only 6-tap path for 32-pixel-wide blocks; two output rows
; per loop pass, accumulated separately in m12 (row a) and m13 (row b).
; m0 holds two 32-byte source rows at a time, one per 256-bit half; the halves
; are overwritten alternately, so the newer row is sometimes in the low half.
; m10 is the spel_v_perm32 byte permutation; m11 is m10 with the two bytes of
; every word swapped (funnel-shifting a register with itself by 8 is a 16-bit
; rotate) and is used whenever the newer row sits in the low half.
; m7/m8/m9 are the tap-pair multipliers and m6 the pmulhrsw factor, set up
; before this label (outside this view).
; NOTE(review): nsq is presumably -ssq here (that is how .hv sets it) -- confirm.
1655*c0909341SAndroid Build Coastguard Worker.v_w32:
1656*c0909341SAndroid Build Coastguard Worker    mova                m10, [spel_v_perm32]
    ; m5 = the 8 bytes at pb_02461357 zero-extended to qwords; used below as
    ; the vpermq index vector.
1657*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m5, [pb_02461357]
1658*c0909341SAndroid Build Coastguard Worker    vpshrdw             m11, m10, m10, 8
    ; Build the 01, 12, 23 and 34 row pairs by sliding rows through m0.
1659*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+nsq*2]
1660*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+nsq*1], 1
1661*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m0 ; 01
1662*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 0
1663*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m11, m0 ; 12
1664*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1
1665*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1666*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m10, m0 ; 23
1667*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 0
1668*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m11, m0 ; 34
1669*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
    ; Insert the next odd row into the upper half and advance srcq.
1670*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1
1671*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
    ; First and second tap pairs for rows a (m12/m14) and b (m13/m15); the
    ; pair registers are rotated (m1<-m3, m2<-m4) for the next pass while
    ; the new 45 and 56 pairs are produced into m3 and m4.
1672*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m1, m7
1673*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1674*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m2, m7
1675*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1676*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m3, m8
1677*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m10, m0 ; 45
1678*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 0
1679*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m4, m8
1680*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m11, m0 ; 56
1681*c0909341SAndroid Build Coastguard Worker    paddw               m12, m14
1682*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m3, m9
1683*c0909341SAndroid Build Coastguard Worker    paddw               m13, m15
1684*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m4, m9
1685*c0909341SAndroid Build Coastguard Worker    paddw               m12, m14
1686*c0909341SAndroid Build Coastguard Worker    paddw               m13, m15
1687*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m6
1688*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m6
    ; packuswb works per 128-bit lane, so row a and row b qwords end up
    ; alternating; vpermq with m5 regroups them so the low 256 bits hold one
    ; row and the high 256 bits the other.
    ; NOTE(review): assumes pb_02461357 holds 0,2,4,6,1,3,5,7 as its name suggests.
1689*c0909341SAndroid Build Coastguard Worker    packuswb            m12, m13
1690*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m5, m12
1691*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym12
1692*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m12, 1
1693*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1694*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1695*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
1696*c0909341SAndroid Build Coastguard Worker    RET
; .v_w64 / .v_w128: vertical-only 6-tap path for 64- and 128-pixel-wide
; blocks, processed as 64-byte-wide columns (.v_loop0) of two rows per inner
; pass (.v_loop).
; r6d = h + w*4 - 256: the low byte keeps the row count so hd can be restored
; for each column (this relies on h fitting in 8 bits), and the bits above it
; count the columns still to do after the current one.
; r4 / r7 are the per-column source / destination cursors; srcq / dstq only
; move by 64 bytes per column.
; m7/m8/m9 are the tap-pair multipliers and m6 the pmulhrsw factor, set up
; before this label (outside this view).
; NOTE(review): nsq is presumably -ssq here (that is how .hv sets it) -- confirm.
1697*c0909341SAndroid Build Coastguard Worker.v_w64:
1698*c0909341SAndroid Build Coastguard Worker.v_w128:
1699*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq*4-256]
1700*c0909341SAndroid Build Coastguard Worker.v_loop0:
    ; Load rows 0..4 of this column (64 bytes each) and interleave adjacent
    ; rows into low-half (l) and high-half (h) byte pairs.
1701*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+nsq*2]
1702*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+nsq*1]
1703*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2]
1704*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ssq*0]
1705*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ssq*1]
1706*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
1707*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r4  +ssq*0]
1708*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m4   ; 01l
1709*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m4       ; 01h
1710*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m11  ; 12l
1711*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m11      ; 12h
1712*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m11, m13 ; 23l
1713*c0909341SAndroid Build Coastguard Worker    punpckhbw           m11, m13      ; 23h
1714*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m13, m0  ; 34l
1715*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m0       ; 34h
1716*c0909341SAndroid Build Coastguard Worker.v_loop:
    ; --- output row a: pairs 01 (m1/m2), 23 (m10/m11), 45 (new) ---
    ; m5 = next odd row. The 23 pairs are copied into m1/m2 (they are the 01
    ; pairs of the next pass) before being multiplied in place.
1717*c0909341SAndroid Build Coastguard Worker    movu                 m5, [r4+ssq*1]
1718*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m1, m7   ; a0l
1719*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
1720*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m8       ; a1l
1721*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
1722*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m2, m7   ; a0h
1723*c0909341SAndroid Build Coastguard Worker    mova                 m2, m11
1724*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m8       ; a1h
1725*c0909341SAndroid Build Coastguard Worker    paddw               m14, m10
1726*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m0, m5   ; 45l
1727*c0909341SAndroid Build Coastguard Worker    paddw               m15, m11
1728*c0909341SAndroid Build Coastguard Worker    punpckhbw           m11, m0, m5   ; 45h
    ; m0 is reused as a temporary for the third tap pair and then reloaded
    ; with the next even row.
1729*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m10, m9  ; a2l
1730*c0909341SAndroid Build Coastguard Worker    paddw               m14, m0
1731*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11, m9  ; a2h
1732*c0909341SAndroid Build Coastguard Worker    paddw               m15, m0
1733*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r4+ssq*0]
1734*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m6
1735*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m6
1736*c0909341SAndroid Build Coastguard Worker    packuswb            m14, m15
    ; --- output row b: pairs 12 (m3/m4), 34 (m12/m13), 56 (new) ---
    ; The store of row a is interleaved with the start of row b; m14 is free
    ; for reuse once it has been written out.
1737*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m3, m7   ; b0l
1738*c0909341SAndroid Build Coastguard Worker    mova                 m3, m12
1739*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m8       ; b1l
1740*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*0], m14
1741*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m4, m7   ; b0h
1742*c0909341SAndroid Build Coastguard Worker    mova                 m4, m13
1743*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m8       ; b1h
1744*c0909341SAndroid Build Coastguard Worker    paddw               m15, m12
1745*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m5, m0   ; 56l
1746*c0909341SAndroid Build Coastguard Worker    paddw               m14, m13
1747*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m5, m0   ; 56h
1748*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m12, m9  ; b2l
1749*c0909341SAndroid Build Coastguard Worker    paddw               m15, m5
1750*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m13, m9  ; b2h
1751*c0909341SAndroid Build Coastguard Worker    paddw               m14, m5
1752*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m6
1753*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m6
1754*c0909341SAndroid Build Coastguard Worker    packuswb            m15, m14
1755*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*1], m15
1756*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
1757*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1758*c0909341SAndroid Build Coastguard Worker    jg .v_loop
    ; Column finished: step 64 bytes right, restore the row count from the
    ; low byte of r6d, and continue while a column remains (r6d stays
    ; positive after removing one column unit of 256).
1759*c0909341SAndroid Build Coastguard Worker    add                srcq, 64
1760*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
1761*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1762*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 256
1763*c0909341SAndroid Build Coastguard Worker    jg .v_loop0
1764*c0909341SAndroid Build Coastguard Worker    RET
; .h: if bits 8-11 of myd are all zero, jump to the .h2 label inside the
; 8-tap put function (put_8tap_8bpc_avx512icl); otherwise fall through to .hv.
; NOTE(review): bits 8-11 of myd presumably encode the vertical subpel
; position, making the jump the horizontal-only case -- confirm against the
; function prologue.
1765*c0909341SAndroid Build Coastguard Worker.h:
1766*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1767*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
; .hv: combined horizontal + vertical filtering; also reached by fall-through
; from .h. Loads the shared constants (m9 = pd_34 broadcast, used as the
; initial value of every horizontal accumulator; xm10 = spel_hv_end; xm0 = 0),
; sends w > 4 to .hv_w8, then sets up the filters for w <= 4 and either jumps
; to .hv_w4 or continues with the 2-pixel-wide path below.
; Filters for w <= 4: m7 = 4 horizontal taps (bytes 2..5 of the 8-byte
; subpel_filters entry); ym11/ym12/ym13 = vertical tap pairs as 16-bit words.
1768*c0909341SAndroid Build Coastguard Worker.hv:
1769*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_34]
1770*c0909341SAndroid Build Coastguard Worker    mova               xm10, [spel_hv_end]
1771*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
1772*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1773*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
    ; Horizontal filter index = low byte of mx; srcq moves one pixel left so
    ; the 4-tap window starts at x-1.
1774*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1775*c0909341SAndroid Build Coastguard Worker    dec                srcq
1776*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+subpel_filters+mxq*8+2]
    ; Vertical filter index: bits 16+ of my, or the low byte of my when
    ; h < 6 (cmovs after cmp hd, 6). mxd is reused as a temporary here.
1777*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1778*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1779*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1780*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
    ; Load 8 bytes starting at byte 1 of the filter entry. Interleaving below
    ; zero bytes puts each coefficient in the high byte of a word (x256);
    ; the arithmetic shift by 2 leaves a sign-extended coefficient x64.
    ; pshufd then broadcasts tap pairs (0,1), (2,3), (4,5); the remaining
    ; two loaded bytes are not used.
1781*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym1, [base+subpel_filters+1+myq*8]
1782*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1783*c0909341SAndroid Build Coastguard Worker    punpcklbw           ym0, ym1
1784*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1785*c0909341SAndroid Build Coastguard Worker    psraw               ym0, 2 ; << 6
1786*c0909341SAndroid Build Coastguard Worker    pshufd             ym11, ym0, q0000
1787*c0909341SAndroid Build Coastguard Worker    pshufd             ym12, ym0, q1111
1788*c0909341SAndroid Build Coastguard Worker    pshufd             ym13, ym0, q2222
1789*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1790*c0909341SAndroid Build Coastguard Worker    je .hv_w4
    ; ---- 2-pixel-wide path ----
    ; Load 8 bytes from each of rows 0..4: rows 0,1 in xmm0, rows 2,3 in
    ; xmm2, row 4 blended into the upper lane of ymm0 (mask 0x30).
1791*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      ym5, [subpel_h_shuf4]
1792*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+nsq*2]
1793*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [srcq+nsq*1]
1794*c0909341SAndroid Build Coastguard Worker    movq               xmm2, [srcq+ssq*0]
1795*c0909341SAndroid Build Coastguard Worker    movhps             xmm2, [srcq+ssq*1]
1796*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1797*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm1, [srcq+ssq*0]
1798*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm0, ymm1, 0x30
    ; Horizontal pass: pshufb arranges the source bytes per ym5, vpdpbusd
    ; adds each 4-byte dot product (unsigned pixels x signed taps in m7) to
    ; an accumulator preloaded from m9.
1799*c0909341SAndroid Build Coastguard Worker    pshufb             xmm2, xm5        ; 2 3
1800*c0909341SAndroid Build Coastguard Worker    pshufb             ymm0, ym5        ; 0 1   4
1801*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xm9
1802*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm1, xmm2, xm7
1803*c0909341SAndroid Build Coastguard Worker    mova               ymm2, ym9
1804*c0909341SAndroid Build Coastguard Worker    vpdpbusd           ymm2, ymm0, ym7
    ; Narrow to words and shift right by 2: the low lane then holds rows
    ; 0 1 2 3, the upper lane row 4, which is extracted into xmm0.
1805*c0909341SAndroid Build Coastguard Worker    packssdw           ymm2, ymm1
1806*c0909341SAndroid Build Coastguard Worker    psraw              ymm2, 2
1807*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm0, ymm2, 1
    ; Only 128-bit operations follow on this path, so the upper halves of
    ; the literal ymm registers are cleared here.
1808*c0909341SAndroid Build Coastguard Worker    vzeroupper
    ; xmm0 = rows 1 2 3 4; interleave words of adjacent rows for pmaddwd.
1809*c0909341SAndroid Build Coastguard Worker    palignr            xmm0, xmm2, 4
1810*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm1, xmm2, xmm0 ; 01 12
1811*c0909341SAndroid Build Coastguard Worker    punpckhwd          xmm2, xmm0       ; 23 34
1812*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
    ; Load the next two rows (odd row in the low qword, even row in the high
    ; qword) and advance srcq by two rows.
1813*c0909341SAndroid Build Coastguard Worker    movq               xmm3, [srcq+ssq*1]
1814*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1815*c0909341SAndroid Build Coastguard Worker    movhps             xmm3, [srcq+ssq*0]
    ; Vertical taps on the already-filtered rows; the old 23/34 pairs become
    ; next pass's 01/12.
1816*c0909341SAndroid Build Coastguard Worker    pmaddwd            xmm4, xmm1, xm11 ; a0 b0
1817*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2
1818*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm4, xmm2, xm12 ; a1 b1
    ; Horizontal pass for the two new rows.
1819*c0909341SAndroid Build Coastguard Worker    pshufb             xmm3, xm5
1820*c0909341SAndroid Build Coastguard Worker    mova               xmm2, xm9
1821*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm2, xmm3, xm7
1822*c0909341SAndroid Build Coastguard Worker    packssdw           xmm3, xmm2, xmm2
1823*c0909341SAndroid Build Coastguard Worker    psraw              xmm3, 2
    ; palignr by 12 puts the top dword of xmm0 (the newest row of the
    ; previous pass) in front of the new rows; xmm0 is then updated to carry
    ; this pass's rows forward.
1824*c0909341SAndroid Build Coastguard Worker    palignr            xmm2, xmm3, xmm0, 12
1825*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xmm3
1826*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm2, xmm3       ; 45 56
1827*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm4, xmm2, xm13 ; a2 b2
    ; Reduce the dword sums to output bytes: packuswb followed by a byte
    ; shuffle with xm10 (spel_hv_end, contents outside this view), then
    ; store 2 bytes to each destination row.
1828*c0909341SAndroid Build Coastguard Worker    packuswb           xmm4, xmm4
1829*c0909341SAndroid Build Coastguard Worker    pshufb             xmm4, xm10
1830*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm4, 0
1831*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm4, 1
1832*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1833*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1834*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
1835*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: horizontal + vertical filtering for 4-pixel-wide blocks; two output
; rows per loop pass. Entered from .hv, which has already set m7 (4 horizontal
; taps), m9 (accumulator seed from pd_34), xm10 (spel_hv_end),
; ym11/ym12/ym13 (vertical tap pairs) and nsq = -ssq.
; Inside the loop: ym5 = subpel_h_shufA byte shuffle, ym6 = spel_hv_perm4d,
; k1 = 0x5555 (selects the even words for the masked shift).
1836*c0909341SAndroid Build Coastguard Worker.hv_w4:
    ; Spread rows 0..4 over two registers, one row per 128-bit lane:
    ; m1 gets the odd rows, m2 the even rows (see the lane maps in the
    ; trailing comments).
1837*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+nsq*2]
1838*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym1, [srcq+nsq*1]
1839*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym2, [srcq+ssq*0], 1
1840*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+ssq*1], 2 ; _ 1 3
1841*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1842*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [subpel_h_shufA]
1843*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+ssq*0], 2 ; 0 2 4
    ; Horizontal pass on both registers: shuffle bytes per m5, then vpdpbusd
    ; into accumulators seeded from m9. The permutation tables and the k1
    ; mask are loaded in between.
1844*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
1845*c0909341SAndroid Build Coastguard Worker    mova                 m0, m9
1846*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
1847*c0909341SAndroid Build Coastguard Worker    mova                 m3, m9
1848*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m1, m7
1849*c0909341SAndroid Build Coastguard Worker    mova                ym1, [spel_hv_perm4a]
1850*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m2, m7
1851*c0909341SAndroid Build Coastguard Worker    mova                ym2, [spel_hv_perm4b]
1852*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5555
1853*c0909341SAndroid Build Coastguard Worker    mova                ym6, [spel_hv_perm4d]
    ; Narrow to words, shift right by 2, then permute into the vertical row
    ; pairs using the tables just loaded.
1854*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3
1855*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r6d
1856*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2 ; _ 0   1 2   3 4   5 6
1857*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym1, ym0 ; 01 12
1858*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m2, m0   ; 23 34
1859*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
    ; Load the next two rows (odd row in lane 0, even row in lane 1) and
    ; advance srcq by two rows.
1860*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+ssq*1]
1861*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1862*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, [srcq+ssq*0], 1
    ; First vertical tap pair; the old 23/34 pairs become next pass's 01/12.
1863*c0909341SAndroid Build Coastguard Worker    pmaddwd             ym4, ym1, ym11 ; a0 b0
1864*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym2
    ; Horizontal pass for the two new rows.
1865*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym5
1866*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym9
1867*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym3, ym7
1868*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym4, ym2, ym12 ; a1 b1
    ; Masked word shift: under k1 only the even words of ym2 are replaced
    ; (with the shifted low words of the new sums); the odd words keep the
    ; previous rows. vpermb with ym6 then forms the 45/56 pairs.
1869*c0909341SAndroid Build Coastguard Worker    vpsraw          ym2{k1}, ym0, 2    ; 5 6
1870*c0909341SAndroid Build Coastguard Worker    vpermb              ym2, ym6, ym2  ; 45 56
1871*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym4, ym2, ym13 ; a2 b2
    ; Reduce the dword sums to output bytes (packuswb plus a byte permute
    ; with ym10 = spel_hv_end, contents outside this view) and store 4 bytes
    ; to each destination row.
1872*c0909341SAndroid Build Coastguard Worker    packuswb            ym4, ym4
1873*c0909341SAndroid Build Coastguard Worker    vpermb              ym4, ym10, ym4
1874*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm4
1875*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm4, 1
1876*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1877*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1878*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1879*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: entered from .hv for every w > 4. The first part loads the filters
; shared with .hv_w16 and then branches there unless w == 8; the remainder is
; the 8-pixel-wide horizontal + vertical path, two output rows per loop pass.
; On entry m9 = pd_34 broadcast (accumulator seed), xm10 = spel_hv_end and
; xm0 = 0 (all set at .hv).
; Shared setup: m11/m12 = horizontal taps 0-3 / 4-7 of the 8-byte filter
; entry, m13/m14/m15 = vertical tap pairs as 16-bit words, nsq = -ssq.
1880*c0909341SAndroid Build Coastguard Worker.hv_w8:
    ; Horizontal filter index = bits 16+ of mx; srcq moves three pixels left
    ; so the 8-tap window starts at x-3.
1881*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1882*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
1883*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+subpel_filters+mxq*8+0]
1884*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+subpel_filters+mxq*8+4]
    ; Vertical filter index: bits 16+ of my, or the low byte of my when
    ; h < 6. mxd is reused as a temporary here.
1885*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1886*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1887*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1888*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
    ; Load 8 bytes starting at byte 1 of the filter entry; interleaving below
    ; the zero bytes of m0 and shifting right by 2 yields sign-extended
    ; coefficient x64 words. pshufd broadcasts tap pairs (0,1), (2,3), (4,5).
1889*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
1890*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1891*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1
1892*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1893*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2 ; << 6
1894*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q0000
1895*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q1111
1896*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q2222
1897*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
1898*c0909341SAndroid Build Coastguard Worker    jne .hv_w16
    ; ---- 8-pixel-wide path ----
    ; Rows 0..3 go into the four 128-bit lanes of m0 (16 bytes each); row 4
    ; is broadcast to both lanes of ym5. m1/m4/m7 receive the
    ; subpel_h_shufA/B/C byte shuffles.
1899*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+nsq*2]
1900*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+nsq*1], 1
1901*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [subpel_h_shufA]
1902*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*0], 2
1903*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [subpel_h_shufB]
1904*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [srcq+ssq*1], 3
1905*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1906*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [subpel_h_shufC]
1907*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym5, [srcq+ssq*0]
    ; m6 = the 32 bytes starting at subpel_h_shufA, broadcast to both halves.
    ; NOTE(review): this presumably spans shufA followed by shufB (and the
    ; later 32-byte load at subpel_h_shufB spans shufB+shufC) -- confirm the
    ; tables are contiguous in the data section.
1908*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [subpel_h_shufA]
    ; Horizontal 8-tap as two 4-tap dot products per output pixel:
    ;   m2  = seed + shufA.m11 + shufB.m12
    ;   m1  = seed + shufB.m11 + shufC.m12
    ;   ym3 = the same for row 4, using the combined 32-byte shuffles.
    ; Table loads are interleaved with the arithmetic.
1909*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m1   ; 0 1 2 3    0123
1910*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9
1911*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m1, m11
1912*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0, m4   ; 0 1 2 3    4567
1913*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
1914*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m4, m11
1915*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7       ; 0 1 2 3    89ab
1916*c0909341SAndroid Build Coastguard Worker    pshufb              ym7, ym5, ym6 ; 4     0123 4567
1917*c0909341SAndroid Build Coastguard Worker    mova                ym3, ym9
1918*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym3, ym7, ym11
1919*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m7, [subpel_h_shufB]
1920*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m4, m12
1921*c0909341SAndroid Build Coastguard Worker    mova                 m4, [spel_hv_perm8a]
1922*c0909341SAndroid Build Coastguard Worker    pshufb              ym5, ym7      ; 4     4567 89ab
1923*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m0, m12
    ; m0 = spel_hv_perm8a plus the broadcast dword at pb_32; it is the index
    ; vector of the two-source vpermt2b below.
    ; NOTE(review): presumably adds 32 to every byte index -- confirm pb_32.
1924*c0909341SAndroid Build Coastguard Worker    vpaddd               m0, m4, [pb_32] {1to16}
1925*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym3, ym5, ym12
1926*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_hv_perm8b]
    ; One constant feeds two masks: as a dword mask, the low 16 bits of k1
    ; (0xff00) select the upper 256 bits; k2 = k1 >> 16 = 0x55555555 selects
    ; the even words.
1927*c0909341SAndroid Build Coastguard Worker    mov                  r6, 0x55555555ff00
1928*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1
1929*c0909341SAndroid Build Coastguard Worker    vpmovsdw            xm3, ym3
1930*c0909341SAndroid Build Coastguard Worker    kmovq                k1, r6
1931*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2        ; 0 1 2 3
1932*c0909341SAndroid Build Coastguard Worker    psraw               xm3, 2        ; 4
    ; Permute the filtered rows into vertical pairs; 23/34 needs row 4 from
    ; m3, hence the two-source permute.
1933*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m4, m2   ; 01 12
1934*c0909341SAndroid Build Coastguard Worker    kshiftrq             k2, k1, 16
1935*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m0, m3   ; 23 34
1936*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
    ; Load the next two rows: the odd row fills the low 256 bits of m3, the
    ; even row is merged into the upper 256 bits under k1. srcq advances by
    ; two rows.
1937*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [srcq+ssq*1]
1938*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1939*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4  m3{k1}, [srcq+ssq*0]
    ; Vertical taps on the stored pairs, interleaved with the horizontal
    ; pass for the new rows; the old 23/34 pairs become next pass's 01/12.
1940*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, m13  ; a0 b0
1941*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m6   ; 5 6   0123 4567
1942*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
1943*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m1, m11
1944*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7       ; 5 6   4567 89ab
1945*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m2, m14  ; a1 b1
1946*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1947*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m3, m12
    ; Masked word shift: under k2 only the even words of m2 are replaced
    ; with the shifted new rows, the odd words keep the previous rows;
    ; vpermb with m5 (spel_hv_perm8b) then forms the 45/56 pairs.
1948*c0909341SAndroid Build Coastguard Worker    psraw            m2{k2}, m4, 2    ; 53 64
1949*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m5, m2   ; 45 56
1950*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m2, m15  ; a2 b2
    ; Reduce the dword sums to output bytes (packuswb plus a byte permute
    ; with m10 = spel_hv_end, contents outside this view) and store 8 bytes
    ; to each destination row.
1951*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
1952*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m10, m0
1953*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
1954*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
1955*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1956*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1957*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1958*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w16: combined horizontal+vertical filtering, processed in 16-pixel-wide
; columns (srcq/dstq advance by 16 in the outer loop).
;   r6d         = h in the low byte, (w/16 - 1) << 8 above it (column countdown)
;   m19/m20/m21 = byte permutes selecting pixel windows 0123|89ab, 4567|cdef
;                 and 89ab|ghij (m20 = m19 + pb_4, m21 = m20 + pb_4)
;   m6/m7       = permutes used for the single row 4 (m7 = m6 + pb_4)
;   m9          = value every horizontal accumulator is seeded with
;   m11/m12     = byte coefficients fed to vpdpbusd (horizontal taps)
;   m13-m15     = word coefficients for the vertical taps (a0/a1/a2)
; m9 and m11-m15 are set up before this label, outside this view.
; NOTE(review): nsq is presumably the negated source stride -- confirm.
1959*c0909341SAndroid Build Coastguard Worker.hv_w16:
1960*c0909341SAndroid Build Coastguard Worker    movu                m19, [spel_hv_perm16a]
1961*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pb_4]
1962*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*2-32]
1963*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_hv_perm16b]
1964*c0909341SAndroid Build Coastguard Worker    paddb               m20, m7, m19
1965*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8] ; h + (w/16 - 1)*256
1966*c0909341SAndroid Build Coastguard Worker    paddb               m21, m7, m20
1967*c0909341SAndroid Build Coastguard Worker    mova               ym10, [spel_hv_end16]
1968*c0909341SAndroid Build Coastguard Worker    paddb                m7, m6
; Per-column setup: horizontally filter the first five rows (0..4) and build
; the row pairs 01, 12, 23, 34 that the inner loop starts from.
1969*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
1970*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+nsq*2]
1971*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, [srcq+nsq*1], 1
1972*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2] ; r4 = source row pointer for this column
1973*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+ssq*0]
1974*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, [srcq+ssq*1], 1
1975*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq ; r7 = destination row pointer for this column
1976*c0909341SAndroid Build Coastguard Worker    movu               ym18, [r4  +ssq*0]
1977*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m19, m16    ; 0 1   0123   89ab
1978*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
1979*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m21, m16    ; 0 1   89ab   ghij
1980*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m11
1981*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9
1982*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m19, m17    ; 2 3   0123   89ab
1983*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m3, m12
1984*c0909341SAndroid Build Coastguard Worker    mova                 m3, m9
1985*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m21, m17    ; 2 3   89ab   ghij
1986*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m4, m11
1987*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
1988*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m6, m18     ; 4     0145   2367   89cd   abef
1989*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m5, m12
1990*c0909341SAndroid Build Coastguard Worker    mova                 m5, m9
1991*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m20, m16    ; 0 1   4567   cdef
1992*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m0, m11
1993*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m20, m17    ; 2 3   4567   cdef
1994*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m16, m12
1995*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m7, m18     ; 4     4589   67ab   cdgh   efij
1996*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m16, m11
1997*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m17, m12
1998*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m17, m11
1999*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m18, m12
2000*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2          ; 01
2001*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4          ; 23
2002*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m1, m3, m5
2003*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16  ; 12
2004*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m3, m5, 16  ; 34
; Inner loop: load and horizontally filter two more rows (5, 6), then emit two
; output rows: a = rows 01/23/45, b = rows 12/34/56. m1-m4 are rotated so the
; next iteration starts two rows further down.
2005*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
2006*c0909341SAndroid Build Coastguard Worker    movu               ym18, [r4+ssq*1]
2007*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
2008*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [r4+ssq*0], 1
2009*c0909341SAndroid Build Coastguard Worker    pmaddwd             m16, m1, m13     ; a0
2010*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m19, m18    ; 5 6   0123   89ab
2011*c0909341SAndroid Build Coastguard Worker    pmaddwd             m17, m2, m13     ; b0
2012*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m20, m18    ; 5 6   4567   cdef
2013*c0909341SAndroid Build Coastguard Worker    mova                 m0, m9
2014*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m1, m11
2015*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m21, m18    ; 5 6   89ab   ghij
2016*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
2017*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m11
2018*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m3, m14    ; a1
2019*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m4, m14    ; b1
2020*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m12
2021*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4         ; 34 becomes next iteration's b0 input
2022*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m18, m12
2023*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
2024*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3         ; 23 becomes next iteration's a0 input
2025*c0909341SAndroid Build Coastguard Worker    psraw                m4, m0, 2      ; 5 6
2026*c0909341SAndroid Build Coastguard Worker    vpshrdd              m3, m2, m4, 16 ; 4 5
2027*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m4, m15    ; b2
2028*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m3, m15    ; a2
2029*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
2030*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m10, m16
2031*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*0], xm16
2032*c0909341SAndroid Build Coastguard Worker    vextracti128 [r7+dsq*1], ym16, 1
2033*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2034*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2          ; two output rows per iteration
2035*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
2036*c0909341SAndroid Build Coastguard Worker    add                srcq, 16         ; advance to the next 16-pixel column
2037*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2038*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b        ; reload h from the low byte of r6
2039*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8       ; one column consumed
2040*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
; NOTE(review): explicit vzeroupper here (unlike .hv_w8 above) is presumably
; because this path uses m16-m21, which x86inc maps onto zmm0-15 in AVX-512
; functions -- confirm against x86inc.asm.
2041*c0909341SAndroid Build Coastguard Worker    vzeroupper
2042*c0909341SAndroid Build Coastguard Worker    RET
2043*c0909341SAndroid Build Coastguard Worker
; Entry points for the filter combinations served by put_8tap_8bpc below.
; NOTE(review): PUT_8TAP_FN is defined outside this view. Its arguments appear
; to be (name suffix, horizontal type, vertical type, jump target); the last
; invocation omits the target, presumably so that it falls through into the
; cglobal that follows -- confirm against the macro definition.
2044*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
2045*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
2046*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp,          SHARP,   SHARP
2047*c0909341SAndroid Build Coastguard Worker
; put_8tap_8bpc: entry point and dispatch for the vertical-only (.v) paths.
; mx and my are each replicated into three byte fields and combined with
; t0d/t1d; per the comments on the adds the fields are, from high to low,
; (8-tap filter index, fractional position, 4-tap filter index).
; NOTE(review): t0d/t1d are presumably set by the PUT_8TAP_FN entry stubs --
; confirm against the macro definition.
2048*c0909341SAndroid Build Coastguard Workercglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
2049*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2050*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2051*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2052*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
2053*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx512icl] ; base address for the jump-table offsets
2054*c0909341SAndroid Build Coastguard Worker    movsxd               wq, wm
2055*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2056*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00 ; low 4 bits of the middle (mx) byte
2057*c0909341SAndroid Build Coastguard Worker    jnz .h
2058*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; low 4 bits of the middle (my) byte
; Neither test found a fractional part: hand over to the .put label of
; put_6tap_8bpc (defined elsewhere in the file; presumably the plain copy path).
2059*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
; .v: vertical-only filtering. Chooses the filter table entry, broadcasts its
; four coefficient pairs into m8-m11, moves srcq up by three rows and jumps
; through a table of 16-bit offsets indexed by tzcnt(w).
2060*c0909341SAndroid Build Coastguard Worker.v:
2061*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; low byte: the 4tap_v field
2062*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; bits 16 and up: the 8tap_v field
2063*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2064*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the 4tap_v index instead
2065*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, wd ; table index (log2 of w when w is a power of two)
2066*c0909341SAndroid Build Coastguard Worker    lea                 myq, [base+subpel_filters+myq*8] ; 8 bytes per filter entry
2067*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
2068*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_512] ; multiplier for the final pmulhrsw
2069*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+0] ; coefficient bytes 0-1
2070*c0909341SAndroid Build Coastguard Worker    add                  r6, r8 ; table offset -> absolute jump target
2071*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [myq+2] ; coefficient bytes 2-3
2072*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2073*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [myq+4] ; coefficient bytes 4-5
2074*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q ; start three rows above the output row
2075*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [myq+6] ; coefficient bytes 6-7
2076*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; .v_w2: vertical filter for 2-pixel-wide blocks.
; Every source row is kept in its own dword slot (2 bytes of it are stored at
; the end). Seven rows are loaded before the loop; each iteration loads two
; more and writes two output rows (a to dst+ds*0, b to dst+ds*1).
; m8-m11 are the coefficient pairs and m7 the pmulhrsw multiplier, all
; broadcast in .v before the jump here.
2077*c0909341SAndroid Build Coastguard Worker.v_w2:
2078*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+ssq*0]
2079*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm2, [srcq+ssq*1], 2
2080*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm2, [srcq+ssq*2], 4
2081*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2082*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm2, [srcq+ssq*0], 6  ; 0 1 2 3
2083*c0909341SAndroid Build Coastguard Worker    movd               xmm3, [srcq+ssq*1]
2084*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm1, [srcq+ssq*2]
2085*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2086*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
2087*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
2088*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
2089*c0909341SAndroid Build Coastguard Worker    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
2090*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm3, xmm1             ; 45 56
2091*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
2092*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm2, xmm4             ; 23 34
; Loop invariant on entry: xmm1 = 01 12, xmm2 = 23 34, xmm3 = 45 56,
; xmm0 = last loaded row (6 on the first pass) broadcast to every dword.
2093*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
2094*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
2095*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2             ; keep 23 34 for the next iteration
2096*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm2, xm9              ; a1 b1
2097*c0909341SAndroid Build Coastguard Worker    paddw              xmm5, xmm2
2098*c0909341SAndroid Build Coastguard Worker    mova               xmm2, xmm3             ; keep 45 56 for the next iteration
2099*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm3, xm10             ; a2 b2
2100*c0909341SAndroid Build Coastguard Worker    paddw              xmm5, xmm3
2101*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm4, [srcq+ssq*1]
2102*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2103*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
2104*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
2105*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm4, xmm4, xmm0, 0x02 ; 7 8
2106*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm3, xmm4             ; 67 78
2107*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
2108*c0909341SAndroid Build Coastguard Worker    paddw              xmm5, xmm4
; Rounding step; assuming pw_512 holds 512 this is (x + 32) >> 6 -- TODO confirm.
2109*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm5, xm7
2110*c0909341SAndroid Build Coastguard Worker    packuswb           xmm5, xmm5             ; saturate to bytes: a in bytes 0-3, b in 4-7
2111*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm5, 0          ; row a, 2 pixels
2112*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm5, 2          ; row b, 2 pixels
2113*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2114*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2                ; two output rows per iteration
2115*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
2116*c0909341SAndroid Build Coastguard Worker    RET
; .v_w4: vertical filter for 4-pixel-wide blocks.
; Same structure as .v_w2, but each row fills its whole dword slot (pinsrd
; instead of pinsrw) and 4 bytes are stored per output row.
; m8-m11 are the coefficient pairs and m7 the pmulhrsw multiplier, all
; broadcast in .v before the jump here.
2117*c0909341SAndroid Build Coastguard Worker.v_w4:
2118*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+ssq*0]
2119*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*1], 1
2120*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*2], 2
2121*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2122*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
2123*c0909341SAndroid Build Coastguard Worker    movd               xmm3, [srcq+ssq*1]
2124*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm1, [srcq+ssq*2]
2125*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2126*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
2127*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
2128*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
2129*c0909341SAndroid Build Coastguard Worker    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
2130*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm3, xmm1             ; 45 56
2131*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
2132*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm2, xmm4             ; 23 34
; Loop invariant on entry: xmm1 = 01 12, xmm2 = 23 34, xmm3 = 45 56,
; xmm0 = last loaded row (6 on the first pass) broadcast to every dword.
2133*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2134*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm4, [srcq+ssq*1]
2135*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2136*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
2137*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2             ; keep 23 34 for the next iteration
2138*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm2, xm9              ; a1 b1
2139*c0909341SAndroid Build Coastguard Worker    paddw              xmm5, xmm2
2140*c0909341SAndroid Build Coastguard Worker    mova               xmm2, xmm3             ; keep 45 56 for the next iteration
2141*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm3, xm10             ; a2 b2
2142*c0909341SAndroid Build Coastguard Worker    paddw              xmm5, xmm3
2143*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
2144*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm0, [srcq+ssq*0]
2145*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm4, xmm4, xmm0, 0x02 ; 7 8
2146*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm3, xmm4             ; 67 78
2147*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
2148*c0909341SAndroid Build Coastguard Worker    paddw              xmm5, xmm4
; Rounding step; assuming pw_512 holds 512 this is (x + 32) >> 6 -- TODO confirm.
2149*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm5, xm7
2150*c0909341SAndroid Build Coastguard Worker    packuswb           xmm5, xmm5             ; saturate to bytes: a in bytes 0-3, b in 4-7
2151*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm5             ; row a, 4 pixels
2152*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm5, 1          ; row b, 4 pixels
2153*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2154*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2                ; two output rows per iteration
2155*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2156*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical filter for 8-pixel-wide blocks, using 256-bit registers.
; Each 8-byte row is broadcast to all qwords; vpblendd 0x30 then replaces the
; low qword of the upper 128-bit lane, so a register holds row n in the low
; lane and row n+1 in the high lane. punpcklbw interleaves two such registers
; into the row pairs "n,n+1 | n+1,n+2": the low lane feeds output row a and the
; high lane output row b.
; m8-m11 are the coefficient pairs and m7 the pmulhrsw multiplier, all
; broadcast in .v before the jump here.
2157*c0909341SAndroid Build Coastguard Worker.v_w8:
2158*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*0]
2159*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*1]
2160*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm2, [srcq+ssq*2]
2161*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2162*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm5, [srcq+ssq*0]
2163*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+ssq*1]
2164*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*2]
2165*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2166*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm0, 0x30
2167*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm0, ymm2, 0x30
2168*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm1, ymm0 ; 01 12
2169*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
2170*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm5, 0x30
2171*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm5, ymm3, 0x30
2172*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm2, ymm5 ; 23 34
2173*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm4, 0x30
2174*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm4, ymm0, 0x30
2175*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm3, ymm4 ; 45 56
; Loop invariant on entry: ymm1 = 01 12, ymm2 = 23 34, ymm3 = 45 56,
; ymm0 = last loaded row (6 on the first pass) broadcast to every qword.
2176*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
2177*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*1]
2178*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2179*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm5, ymm1, ym8  ; a0 b0
2180*c0909341SAndroid Build Coastguard Worker    mova               ymm1, ymm2       ; keep 23 34 for the next iteration
2181*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm2, ym9        ; a1 b1
2182*c0909341SAndroid Build Coastguard Worker    paddw              ymm5, ymm2
2183*c0909341SAndroid Build Coastguard Worker    mova               ymm2, ymm3       ; keep 45 56 for the next iteration
2184*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm3, ym10       ; a2 b2
2185*c0909341SAndroid Build Coastguard Worker    paddw              ymm5, ymm3
2186*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, ymm4, 0x30
2187*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm0, [srcq+ssq*0]
2188*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm4, ymm4, ymm0, 0x30
2189*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm3, ymm4       ; 67 78
2190*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm4, ymm3, ym11 ; a3 b3
2191*c0909341SAndroid Build Coastguard Worker    paddw              ymm5, ymm4
; Rounding step; assuming pw_512 holds 512 this is (x + 32) >> 6 -- TODO confirm.
2192*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm5, ym7
2193*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm4, ymm5, 1    ; high lane = row b
2194*c0909341SAndroid Build Coastguard Worker    packuswb           xmm5, xmm4       ; bytes 0-7 = row a, bytes 8-15 = row b
2195*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm5
2196*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm5
2197*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2198*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2          ; two output rows per iteration
2199*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
; The upper halves of ymm0-ymm5 were written above, hence the explicit
; vzeroupper before returning.
2200*c0909341SAndroid Build Coastguard Worker    vzeroupper
2201*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical filter for 16-pixel-wide blocks, using 512-bit registers.
; Each 16-byte row is broadcast to all 128-bit lanes. The masked vshufpd
; (k1 = 0x0f, i.e. only the low four qwords are written) mixes the two
; following rows into the low 256 bits while the high 256 bits keep the
; original row; vpermb with spel_v_perm16a then turns that into the interleaved
; row pairs noted in the comments (01 12, 23 34, ...).
; m8-m11 are the coefficient pairs and m7 the pmulhrsw multiplier, all
; broadcast in .v before the jump here.
2202*c0909341SAndroid Build Coastguard Worker.v_w16:
2203*c0909341SAndroid Build Coastguard Worker    mova                m12, [spel_v_perm16a]
2204*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [srcq+ssq*0]
2205*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+ssq*1]
2206*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x0f
2207*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ssq*2]
2208*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2209*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym5, [srcq+ssq*0]
2210*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r6d ; k1 = 0x0f: write mask for the low four qwords
2211*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [srcq+ssq*1]
2212*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym6, [srcq+ssq*2]
2213*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2214*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*0]
2215*c0909341SAndroid Build Coastguard Worker    vshufpd          m1{k1}, m4, m2, 0xcc
2216*c0909341SAndroid Build Coastguard Worker    vshufpd          m2{k1}, m5, m3, 0xcc
2217*c0909341SAndroid Build Coastguard Worker    vshufpd          m3{k1}, m6, m0, 0xcc
2218*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m12, m1 ; 01 12
2219*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m12, m2 ; 23 34
2220*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m12, m3 ; 45 56
; Loop invariant on entry: m1 = 01 12, m2 = 23 34, m3 = 45 56,
; m0 = last loaded row (6 on the first pass) broadcast to every 128-bit lane.
2221*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
2222*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m8  ; a0 b0
2223*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2224*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m2, m9  ; a1 b1
2225*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2226*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m3, m10 ; a2 b2
2227*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0      ; start the next pair from the last loaded row
2228*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2229*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym5, [srcq+ssq*1]
2230*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2231*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*0]
2232*c0909341SAndroid Build Coastguard Worker    vshufpd          m3{k1}, m5, m0, 0xcc
2233*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m12, m3 ; 67 78
2234*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m11 ; a3 b3
2235*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
2236*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
; Rounding step; assuming pw_512 holds 512 this is (x + 32) >> 6 -- TODO confirm.
2237*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
2238*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym5, m4, 1
2239*c0909341SAndroid Build Coastguard Worker    packuswb            ym4, ym5    ; saturate the word sums to bytes
2240*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm4
2241*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym4, 1
2242*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2243*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2      ; two output rows per iteration
2244*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
; NOTE(review): no vzeroupper here although 512-bit registers are used. This is
; presumably intentional: only m0-m12 are touched, and x86inc maps m0-m15 onto
; zmm16-31 in AVX-512 functions -- confirm against x86inc.asm.
2245*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32: vertical filter for 32-pixel-wide blocks.
; m0 always holds two consecutive 32-byte rows, one per 256-bit half; each new
; row overwrites the older half (insert index alternates 1, 0, 1, ...).
; Depending on which half holds the newer row, the pair is interleaved with
; m12 (spel_v_perm32) or m13 (m12 with the two bytes of every word swapped).
; m14 = pb_02461357 zero-extended to qwords, used as the vpermq index vector
; that reorders the packed result before the two 32-byte stores.
; m8-m11 are the coefficient pairs and m7 the pmulhrsw multiplier, all
; broadcast in .v before the jump here.
2246*c0909341SAndroid Build Coastguard Worker.v_w32:
2247*c0909341SAndroid Build Coastguard Worker    mova                m12, [spel_v_perm32]
2248*c0909341SAndroid Build Coastguard Worker    pmovzxbq            m14, [pb_02461357]
2249*c0909341SAndroid Build Coastguard Worker    vpshrdw             m13, m12, m12, 8 ; rotate every word by 8 bits = byte swap
2250*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0]
2251*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1
2252*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m12, m0 ; 01
2253*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*2], 0
2254*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2255*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m13, m0 ; 12
2256*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 1
2257*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m12, m0 ; 23
2258*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 0
2259*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m13, m0 ; 34
2260*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*2], 1
2261*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2262*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m12, m0 ; 45
2263*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 0
2264*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m13, m0 ; 56
; Loop invariant on entry: m1..m6 = row pairs 01, 12, 23, 34, 45, 56.
; m15 accumulates the output row built from 01/23/45/67 and m16 the one built
; from 12/34/56/78; the pairs then slide down by two rows.
2265*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
2266*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1
2267*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2268*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m1, m8
2269*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2270*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m2, m8
2271*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2272*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m3, m9
2273*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2274*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m18, m4, m9
2275*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2276*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m19, m5, m10
2277*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m12, m0 ; 67
2278*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*0], 0
2279*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m20, m6, m10
2280*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m13, m0 ; 78
2281*c0909341SAndroid Build Coastguard Worker    paddw               m15, m17
2282*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m5, m11
2283*c0909341SAndroid Build Coastguard Worker    paddw               m16, m18
2284*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m18, m6, m11
2285*c0909341SAndroid Build Coastguard Worker    paddw               m15, m19
2286*c0909341SAndroid Build Coastguard Worker    paddw               m16, m20
2287*c0909341SAndroid Build Coastguard Worker    paddw               m15, m17
2288*c0909341SAndroid Build Coastguard Worker    paddw               m16, m18
; Rounding step; assuming pw_512 holds 512 this is (x + 32) >> 6 -- TODO confirm.
2289*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m7
2290*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m7
2291*c0909341SAndroid Build Coastguard Worker    packuswb            m15, m16
2292*c0909341SAndroid Build Coastguard Worker    vpermq              m15, m14, m15 ; reorder qwords so each 256-bit half is one row
2293*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym15
2294*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m15, 1
2295*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2296*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two output rows per iteration
2297*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
; NOTE(review): vzeroupper is presumably required here because m16-m20 are
; used, which x86inc maps onto zmm0-15 in AVX-512 functions -- confirm.
2298*c0909341SAndroid Build Coastguard Worker    vzeroupper
2299*c0909341SAndroid Build Coastguard Worker    RET
2300*c0909341SAndroid Build Coastguard Worker.v_w64:
2301*c0909341SAndroid Build Coastguard Worker.v_w128:
2302*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq*4-256]
2303*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
2304*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2305*c0909341SAndroid Build Coastguard Worker.v_loop0:
2306*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
2307*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]
2308*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
2309*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2310*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ssq*0]
2311*c0909341SAndroid Build Coastguard Worker    movu                m15, [srcq+ssq*1]
2312*c0909341SAndroid Build Coastguard Worker    movu                m17, [srcq+ssq*2]
2313*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2314*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
2315*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m4    ; 01l
2316*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m4        ; 01h
2317*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m6    ; 12l
2318*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m6        ; 12h
2319*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m13   ; 23l
2320*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m13       ; 23h
2321*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m13, m15  ; 34l
2322*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m15       ; 34h
2323*c0909341SAndroid Build Coastguard Worker    punpcklbw           m14, m15, m17  ; 45l
2324*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m17       ; 45h
2325*c0909341SAndroid Build Coastguard Worker    punpcklbw           m16, m17, m0   ; 56l
2326*c0909341SAndroid Build Coastguard Worker    punpckhbw           m17, m0        ; 56h
2327*c0909341SAndroid Build Coastguard Worker.v_loop:
2328*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m18, m1, m8    ; a0l
2329*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5
2330*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m19, m2, m8    ; a0h
2331*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
2332*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m20, m3, m8    ; b0l
2333*c0909341SAndroid Build Coastguard Worker    mova                 m3, m12
2334*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m21, m4, m8    ; b0h
2335*c0909341SAndroid Build Coastguard Worker    mova                 m4, m13
2336*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m9        ; a1l
2337*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m9        ; a1h
2338*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m9        ; b1l
2339*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9        ; b1h
2340*c0909341SAndroid Build Coastguard Worker    paddw               m18, m5
2341*c0909341SAndroid Build Coastguard Worker    mova                 m5, m14
2342*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m10       ; a2l
2343*c0909341SAndroid Build Coastguard Worker    paddw               m19, m6
2344*c0909341SAndroid Build Coastguard Worker    mova                 m6, m15
2345*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m10       ; a2h
2346*c0909341SAndroid Build Coastguard Worker    paddw               m20, m12
2347*c0909341SAndroid Build Coastguard Worker    mova                m12, m16
2348*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m10       ; b2l
2349*c0909341SAndroid Build Coastguard Worker    paddw               m21, m13
2350*c0909341SAndroid Build Coastguard Worker    mova                m13, m17
2351*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m10       ; b2h
2352*c0909341SAndroid Build Coastguard Worker    paddw               m18, m14
2353*c0909341SAndroid Build Coastguard Worker    paddw               m19, m15
2354*c0909341SAndroid Build Coastguard Worker    paddw               m20, m16
2355*c0909341SAndroid Build Coastguard Worker    paddw               m21, m17
2356*c0909341SAndroid Build Coastguard Worker    movu                m17, [srcq+ssq*1]
2357*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2358*c0909341SAndroid Build Coastguard Worker    punpcklbw           m14, m0, m17  ; 67l
2359*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m0, m17  ; 67h
2360*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m14, m11 ; a3l
2361*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15, m11 ; a3h
2362*c0909341SAndroid Build Coastguard Worker    paddw               m18, m16
2363*c0909341SAndroid Build Coastguard Worker    paddw               m19, m0
2364*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
2365*c0909341SAndroid Build Coastguard Worker    punpcklbw           m16, m17, m0  ; 78l
2366*c0909341SAndroid Build Coastguard Worker    punpckhbw           m17, m0       ; 78h
2367*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m7
2368*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m7
2369*c0909341SAndroid Build Coastguard Worker    packuswb            m18, m19
2370*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m18
2371*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m18, m16, m11 ; b3l
2372*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m19, m17, m11 ; b3h
2373*c0909341SAndroid Build Coastguard Worker    paddw               m18, m20
2374*c0909341SAndroid Build Coastguard Worker    paddw               m19, m21
2375*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m7
2376*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m7
2377*c0909341SAndroid Build Coastguard Worker    packuswb            m18, m19
2378*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m18
2379*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2380*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2381*c0909341SAndroid Build Coastguard Worker    jg .v_loop
2382*c0909341SAndroid Build Coastguard Worker    add                  r4, 64
2383*c0909341SAndroid Build Coastguard Worker    add                  r7, 64
2384*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
2385*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
2386*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
2387*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 256
2388*c0909341SAndroid Build Coastguard Worker    jg .v_loop0
2389*c0909341SAndroid Build Coastguard Worker    vzeroupper
2390*c0909341SAndroid Build Coastguard Worker    RET
2391*c0909341SAndroid Build Coastguard Worker.h: ; entry: branch to .hv when any of bits 8-11 of myd is set, otherwise fall through into .h2
2392*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2393*c0909341SAndroid Build Coastguard Worker    jnz .hv
2394*c0909341SAndroid Build Coastguard Worker.h2: ; common setup, then dispatch on the width w (presumably the horizontal-only filter; confirm against the caller)
2395*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pd_34] ; 2 + (8 << 2); copied into the vpdpbusd accumulators in .h_w2/.h_w4
2396*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2397*c0909341SAndroid Build Coastguard Worker    jl .h_w2 ; w < 4
2398*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufA] ; vector load, the flags from the cmp above are still valid
2399*c0909341SAndroid Build Coastguard Worker    je .h_w4 ; w == 4
2400*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; w > 4: wd = index of the lowest set bit of w, used as jump-table index
2401*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufB]
2402*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufC]
2403*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; filter index for these widths = mxd >> 16
2404*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3 ; start reading 3 bytes to the left of src
2405*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)] ; 16-bit jump-table entry for this width
2406*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+mxq*8+subpel_filters+0] ; bytes 0-3 of the 8-byte filter entry
2407*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+mxq*8+subpel_filters+4] ; bytes 4-7 of the 8-byte filter entry
2408*c0909341SAndroid Build Coastguard Worker    add                  wq, r8 ; table entry is an offset added to r8
2409*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; presumably lands on one of .h_w8 ... .h_w128 (table contents not visible here)
2410*c0909341SAndroid Build Coastguard Worker.h_w2: ; w < 4 case of .h2: two rows per iteration, 2 bytes stored per row
2411*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; filter index = low byte of mxd
2412*c0909341SAndroid Build Coastguard Worker    dec                srcq ; start reading 1 byte to the left of src
2413*c0909341SAndroid Build Coastguard Worker    mova               xmm4, [subpel_h_shuf4] ; byte shuffle applied to the loaded rows
2414*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2] ; bytes 2-5 of the 8-byte filter entry
2415*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
2416*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0] ; 8 bytes of row 0 into the low half
2417*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [srcq+ssq*1] ; 8 bytes of row 1 into the high half
2418*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2419*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm4
2420*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xm5 ; accumulator starts at the pd_34 constant
2421*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm1, xmm0, xmm3 ; per dword: += sum of 4 (unsigned pixel byte * signed coefficient byte)
2422*c0909341SAndroid Build Coastguard Worker    packssdw           xmm0, xmm1, xmm1 ; dwords -> words, signed saturation
2423*c0909341SAndroid Build Coastguard Worker    psraw              xmm0, 6
2424*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xm0 ; words -> bytes, unsigned saturation
2425*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm0, 0 ; word 0 (2 bytes) -> dst row 0
2426*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm0, 1 ; word 1 (2 bytes) -> dst row 1
2427*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2428*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2429*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop ; loop while rows remain
2430*c0909341SAndroid Build Coastguard Worker    RET
2431*c0909341SAndroid Build Coastguard Worker.h_w4: ; w == 4 case of .h2: two rows per iteration, 4 bytes stored per row; m6 = subpel_h_shufA was loaded in .h2
2432*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; filter index = low byte of mxd
2433*c0909341SAndroid Build Coastguard Worker    dec                srcq ; start reading 1 byte to the left of src
2434*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2] ; bytes 2-5 of the 8-byte filter entry
2435*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
2436*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [srcq+ssq*0] ; 8 bytes of row 0
2437*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [srcq+ssq*1] ; 8 bytes of row 1
2438*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2439*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xm6
2440*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, xm6
2441*c0909341SAndroid Build Coastguard Worker    mova               xmm2, xm5 ; row 0 accumulator starts at the pd_34 constant
2442*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm2, xmm0, xmm3 ; per dword: += sum of 4 (unsigned pixel byte * signed coefficient byte)
2443*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xm5 ; row 1 accumulator starts at the pd_34 constant
2444*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm0, xmm1, xmm3
2445*c0909341SAndroid Build Coastguard Worker    packssdw           xmm0, xmm2, xmm0 ; words 0-3 = row 0, words 4-7 = row 1
2446*c0909341SAndroid Build Coastguard Worker    psraw              xmm0, 6
2447*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0 ; bytes 0-3 = row 0, bytes 4-7 = row 1
2448*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0 ; dword 0 -> dst row 0
2449*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1 ; dword 1 -> dst row 1
2450*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2451*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2452*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop ; loop while rows remain
2453*c0909341SAndroid Build Coastguard Worker    RET
2454*c0909341SAndroid Build Coastguard Worker.h_w8: ; two rows per iteration, 8 bytes stored per row (presumably reached through the width jump table in .h2)
2455*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0] ; 16 bytes of row 0 in the low 128-bit lane
2456*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [srcq+ssq*1], 1 ; 16 bytes of row 1 in the high 128-bit lane
2457*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2458*c0909341SAndroid Build Coastguard Worker    WRAP_YMM PUT_8TAP_H   0, 1, 2, 3 ; macro defined outside this chunk; the code below expects 16-bit results in ym0
2459*c0909341SAndroid Build Coastguard Worker    vpmovuswb           xm0, ym0 ; 16 words -> 16 bytes, unsigned saturation
2460*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0 ; low 8 bytes -> dst row 0
2461*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0 ; high 8 bytes -> dst row 1
2462*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2463*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2464*c0909341SAndroid Build Coastguard Worker    jg .h_w8 ; loop while rows remain
2465*c0909341SAndroid Build Coastguard Worker    RET
2466*c0909341SAndroid Build Coastguard Worker.h_w16: ; two rows per iteration, 16 bytes stored per row; m6-m8 are rebuilt here from spel_h_perm16 before the loop
2467*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_h_perm16]
2468*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pb_4]
2469*c0909341SAndroid Build Coastguard Worker    paddb                m7, m8, m6 ; m7 = m6 + pb_4 (per byte)
2470*c0909341SAndroid Build Coastguard Worker    paddb                m8, m7 ; m8 = m7 + pb_4 = m6 + 2*pb_4 (per byte)
2471*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
2472*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0] ; 32 bytes of row 0 in the low 256 bits
2473*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1], 1 ; 32 bytes of row 1 in the high 256 bits
2474*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2475*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 1, 2, 3, 1 ; macro defined outside this chunk; meaning of the 5th argument is not visible here
2476*c0909341SAndroid Build Coastguard Worker    vpmovuswb           ym0, m0 ; 32 words -> 32 bytes, unsigned saturation
2477*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0 ; low 16 bytes -> dst row 0
2478*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], ym0, 1 ; high 16 bytes -> dst row 1
2479*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2480*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2481*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop ; loop while rows remain
2482*c0909341SAndroid Build Coastguard Worker    RET
2483*c0909341SAndroid Build Coastguard Worker.h_w32: ; two rows per iteration, 32 bytes stored per row
2484*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+ssq*0+8*0] ; row 0, 32 bytes at offset 0
2485*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+ssq*1+8*0], 1 ; row 1, 32 bytes at offset 0, into the high 256 bits
2486*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+ssq*0+8*1] ; row 0, 32 bytes at offset 8
2487*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+ssq*1+8*1], 1 ; row 1, 32 bytes at offset 8, into the high 256 bits
2488*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2489*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 2, 3, 4 ; macro defined outside this chunk; result expected in m0 as words
2490*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            1, 4, 3, 2 ; same for m1, with the last three arguments in reverse order
2491*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; words -> bytes, unsigned saturation, interleaved per 128-bit lane
2492*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0 ; low 32 bytes -> dst row 0
2493*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1 ; high 32 bytes -> dst row 1
2494*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2495*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2496*c0909341SAndroid Build Coastguard Worker    jg .h_w32 ; loop while rows remain
2497*c0909341SAndroid Build Coastguard Worker    RET
2498*c0909341SAndroid Build Coastguard Worker.h_w64: ; one row per iteration, 64 bytes stored per row
2499*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; 64 bytes at offset 0
2500*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1] ; 64 bytes at offset 8
2501*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; advance the source by one row
2502*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 2, 3, 4 ; macro defined outside this chunk; result expected in m0 as words
2503*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            1, 4, 3, 2 ; same for m1, with the last three arguments in reverse order
2504*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; words -> bytes, unsigned saturation, interleaved per 128-bit lane
2505*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0 ; store the 64-byte row
2506*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; advance the destination by one row
2507*c0909341SAndroid Build Coastguard Worker    dec                  hd
2508*c0909341SAndroid Build Coastguard Worker    jg .h_w64 ; loop while rows remain
2509*c0909341SAndroid Build Coastguard Worker    RET
2510*c0909341SAndroid Build Coastguard Worker.h_w128: ; one row per iteration, 128 bytes stored per row as two 64-byte halves
2511*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; first half: 64 bytes at offset 0
2512*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*1] ; first half: 64 bytes at offset 8
2513*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*8] ; second half: 64 bytes at offset 64
2514*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+8*9] ; second half: 64 bytes at offset 72
2515*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; advance the source by one row
2516*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0,  4, 11, 12 ; macro defined outside this chunk; m4, m11, m12 are passed as its remaining arguments
2517*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            2, 12, 11,  4
2518*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            1,  4, 11, 12
2519*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            3, 12, 11,  4
2520*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2 ; first half: words -> bytes, unsigned saturation
2521*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3 ; second half: words -> bytes, unsigned saturation
2522*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0 ; dst bytes 0-63
2523*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1 ; dst bytes 64-127
2524*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; advance the destination by one row
2525*c0909341SAndroid Build Coastguard Worker    dec                  hd
2526*c0909341SAndroid Build Coastguard Worker    jg .h_w128 ; loop while rows remain
2527*c0909341SAndroid Build Coastguard Worker    RET
2528*c0909341SAndroid Build Coastguard Worker.hv: ; reached from .h when myd & 0xf00 != 0: shared setup for w <= 4, then the w < 4 path; w > 4 goes to .hv_w8
2529*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_34] ; initial value of the vpdpbusd accumulators below
2530*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0 ; zero, interleaved with the second-filter coefficient bytes by punpcklbw below
2531*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2532*c0909341SAndroid Build Coastguard Worker    jg .hv_w8 ; w > 4
2533*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; first filter index = low byte of mxd
2534*c0909341SAndroid Build Coastguard Worker    dec                srcq ; start reading 1 byte to the left of src
2535*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+subpel_filters+mxq*8+2] ; bytes 2-5 of the 8-byte filter entry
2536*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd = low byte of myd
2537*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; myd = upper 16 bits of myd
2538*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2539*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; sign flag set (h - 6 negative): use the low-byte index instead
2540*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym1, [base+subpel_filters+myq*8] ; all 8 bytes of the selected filter entry
2541*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3] ; ss3q = 3 * source stride
2542*c0909341SAndroid Build Coastguard Worker    mov                  r6, srcq
2543*c0909341SAndroid Build Coastguard Worker    punpcklbw           ym0, ym1 ; each word = coefficient byte in the high byte, zero in the low byte
2544*c0909341SAndroid Build Coastguard Worker    sub                  r6, ss3q ; r6 = src - 3 rows
2545*c0909341SAndroid Build Coastguard Worker    psraw               ym0, 2 ; << 6
2546*c0909341SAndroid Build Coastguard Worker    mova               xm14, [spel_hv_end] ; byte permutation applied to the packed result before it is stored
2547*c0909341SAndroid Build Coastguard Worker    pshufd             ym10, ym0, q0000 ; broadcast dword 0 (coefficient words 0,1)
2548*c0909341SAndroid Build Coastguard Worker    pshufd             ym11, ym0, q1111 ; broadcast dword 1 (coefficient words 2,3)
2549*c0909341SAndroid Build Coastguard Worker    pshufd             ym12, ym0, q2222 ; broadcast dword 2 (coefficient words 4,5)
2550*c0909341SAndroid Build Coastguard Worker    pshufd             ym13, ym0, q3333 ; broadcast dword 3 (coefficient words 6,7)
2551*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2552*c0909341SAndroid Build Coastguard Worker    je .hv_w4 ; w == 4; w < 4 continues below
2553*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      ym6, [subpel_h_shuf4]
2554*c0909341SAndroid Build Coastguard Worker    movq               xmm2, [r6+ssq*0] ; row 0 (src - 3 rows); NOTE(review): xmmN/ymmN are literal names, distinct from the mN/xmN/ymN macro names; mapping is defined by x86inc, confirm
2555*c0909341SAndroid Build Coastguard Worker    movhps             xmm2, [r6+ssq*1] ; row 1
2556*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [r6+ssq*2] ; row 2
2557*c0909341SAndroid Build Coastguard Worker    movhps             xmm0, [srcq+ssq*0] ; row 3 (the row at src)
2558*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [srcq+ssq*1] ; row 4
2559*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm4, [srcq+ssq*2] ; row 5
2560*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; srcq now points at row 6
2561*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm1, [srcq+ssq*0] ; row 6
2562*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm3, 0x30 ; dwords 4-5 = row 4
2563*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm0, ymm1, 0x30 ; 2 3   6 _
2564*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm4, 0xc0 ; 0 1   4 5
2565*c0909341SAndroid Build Coastguard Worker    pshufb             ymm2, ym6
2566*c0909341SAndroid Build Coastguard Worker    pshufb             ymm0, ym6
2567*c0909341SAndroid Build Coastguard Worker    mova               ymm1, ym9 ; accumulator for rows 0 1 4 5
2568*c0909341SAndroid Build Coastguard Worker    vpdpbusd           ymm1, ymm2, ym7 ; per dword: += sum of 4 (unsigned pixel byte * signed coefficient byte)
2569*c0909341SAndroid Build Coastguard Worker    mova               ymm2, ym9 ; accumulator for rows 2 3 6
2570*c0909341SAndroid Build Coastguard Worker    vpdpbusd           ymm2, ymm0, ym7
2571*c0909341SAndroid Build Coastguard Worker    packssdw           ymm2, ymm1, ymm2 ; dwords -> words, signed saturation, per 128-bit lane
2572*c0909341SAndroid Build Coastguard Worker    psraw              ymm2, 2
2573*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm3, ymm2, 1 ; high 128-bit lane
2574*c0909341SAndroid Build Coastguard Worker    palignr            xmm4, xmm3, xmm2, 4 ; xmm3:xmm2 shifted right by 4 bytes
2575*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm1, xmm2, xmm4 ; 01 12
2576*c0909341SAndroid Build Coastguard Worker    punpckhwd          xmm2, xmm4       ; 23 34
2577*c0909341SAndroid Build Coastguard Worker    pshufd             xmm0, xmm3, q2121
2578*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm3, xmm0       ; 45 56
2579*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: ; two output rows per iteration; xmm1/xmm2/xmm3 carry the interleaved row pairs across iterations
2580*c0909341SAndroid Build Coastguard Worker    movq               xmm4, [srcq+ssq*1] ; next row into the low half
2581*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2582*c0909341SAndroid Build Coastguard Worker    movhps             xmm4, [srcq+ssq*0] ; the row after it into the high half
2583*c0909341SAndroid Build Coastguard Worker    pmaddwd            xmm5, xmm1, xm10 ; a0 b0
2584*c0909341SAndroid Build Coastguard Worker    mova               xmm1, xmm2 ; shift the row-pair history down by one pair
2585*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xmm2, xm11 ; a1 b1
2586*c0909341SAndroid Build Coastguard Worker    pshufb             xmm4, xm6
2587*c0909341SAndroid Build Coastguard Worker    mova               xmm2, xmm3
2588*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xmm3, xm12 ; a2 b2
2589*c0909341SAndroid Build Coastguard Worker    mova               xmm3, xm9 ; accumulator for the two new rows
2590*c0909341SAndroid Build Coastguard Worker    vpdpbusd           xmm3, xmm4, xm7
2591*c0909341SAndroid Build Coastguard Worker    packssdw           xmm4, xmm3, xmm3 ; dwords -> words, signed saturation
2592*c0909341SAndroid Build Coastguard Worker    psraw              xmm4, 2
2593*c0909341SAndroid Build Coastguard Worker    palignr            xmm3, xmm4, xmm0, 12 ; dword 3 of the previous xmm0 followed by dwords 0-2 of xmm4
2594*c0909341SAndroid Build Coastguard Worker    mova               xmm0, xmm4 ; keep the new rows for the next iteration
2595*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm3, xmm4       ; 67 78
2596*c0909341SAndroid Build Coastguard Worker    vpdpwssd           xmm5, xmm3, xm13 ; a3 b3
2597*c0909341SAndroid Build Coastguard Worker    packuswb           xmm5, xmm5 ; words -> bytes, unsigned saturation
2598*c0909341SAndroid Build Coastguard Worker    pshufb             xmm5, xm14 ; reorder bytes with spel_hv_end
2599*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm5, 0 ; word 0 (2 bytes) -> dst row 0
2600*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm5, 1 ; word 1 (2 bytes) -> dst row 1
2601*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2602*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2603*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop ; loop while rows remain
2604*c0909341SAndroid Build Coastguard Worker    vzeroupper
2605*c0909341SAndroid Build Coastguard Worker    RET
2606*c0909341SAndroid Build Coastguard Worker.hv_w4: ; w == 4 case of .hv: uses r6 = src - 3 rows, m7, m9, ym10-ym13 and xm14 prepared in .hv
2607*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [r6+ssq*0] ; row 0 (src - 3 rows)
2608*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym2, [r6+ssq*1] ; row 1
2609*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, ymm1, [r6+ssq*2], 1 ; 128-bit lane 1 = row 2
2610*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+ssq*0], 2 ; 128-bit lane 2 = row 3
2611*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+ssq*1], 2 ; 128-bit lane 2 = row 4
2612*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 1 3 5
2613*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [subpel_h_shufA]
2614*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; srcq now points at row 6
2615*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [srcq+ssq*0], 3 ; 0 2 4 6
2616*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
2617*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
2618*c0909341SAndroid Build Coastguard Worker    mova                 m0, m9 ; accumulator for the odd rows, starting at the pd_34 constant
2619*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m7 ; per dword: += sum of 4 (unsigned pixel byte * signed coefficient byte)
2620*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9 ; accumulator for the even rows
2621*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m1, m7
2622*c0909341SAndroid Build Coastguard Worker    mova                ym1, [spel_hv_perm4a] ; byte-permute indices used by the vpermb instructions below
2623*c0909341SAndroid Build Coastguard Worker    mova                ym2, [spel_hv_perm4b]
2624*c0909341SAndroid Build Coastguard Worker    mova                ym3, [spel_hv_perm4c]
2625*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4 ; dwords -> words, signed saturation, per 128-bit lane
2626*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2 ; _ 0   1 2   3 4   5 6
2627*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5555 ; mask with every even bit set (r6 is no longer needed as a pointer)
2628*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym1, ym0 ; 01 12
2629*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m2, m0   ; 23 34
2630*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m3, m0   ; 45 56
2631*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r6d ; k1 selects the even word lanes in the masked shift inside the loop
2632*c0909341SAndroid Build Coastguard Worker    mova               ym15, [spel_hv_perm4d]
2633*c0909341SAndroid Build Coastguard Worker.hv_w4_loop: ; two output rows per iteration; ym1/ym2/ym3 carry the interleaved row pairs across iterations
2634*c0909341SAndroid Build Coastguard Worker    movq               xmm4, [srcq+ssq*1] ; next row
2635*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2636*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, ymm4, [srcq+ssq*0], 1 ; the row after it into 128-bit lane 1
2637*c0909341SAndroid Build Coastguard Worker    pmaddwd             ym5, ym1, ym10 ; a0 b0
2638*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym2 ; shift the row-pair history down by one pair
2639*c0909341SAndroid Build Coastguard Worker    pshufb              ym4, ym6
2640*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym9 ; accumulator for the two new rows
2641*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym4, ym7
2642*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym5, ym2, ym11 ; a1 b1
2643*c0909341SAndroid Build Coastguard Worker    mova                ym2, ym3
2644*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym5, ym3, ym12 ; a2 b2
2645*c0909341SAndroid Build Coastguard Worker    vpsraw          ym3{k1}, ym0, 2    ; 7 8
2646*c0909341SAndroid Build Coastguard Worker    vpermb              ym3, ym15, ym3 ; 67 78
2647*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym5, ym3, ym13 ; a3 b3
2648*c0909341SAndroid Build Coastguard Worker    packuswb            ym5, ym5 ; words -> bytes, unsigned saturation
2649*c0909341SAndroid Build Coastguard Worker    vpermb              ym5, ym14, ym5 ; gather the output bytes using the spel_hv_end indices
2650*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm5 ; dword 0 (4 bytes) -> dst row 0
2651*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm5, 1 ; dword 1 (4 bytes) -> dst row 1
2652*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2653*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2654*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop ; loop while rows remain
2655*c0909341SAndroid Build Coastguard Worker    RET
2656*c0909341SAndroid Build Coastguard Worker.hv_w8: ; w > 4 case of .hv: shared setup, then the w == 8 path; other widths go to .hv_w16
2657*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; first filter index for these widths = mxd >> 16
2658*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3 ; start reading 3 bytes to the left of src
2659*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0] ; bytes 0-3 of the 8-byte filter entry
2660*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4] ; bytes 4-7 of the 8-byte filter entry
2661*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd = low byte of myd
2662*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; myd = upper 16 bits of myd
2663*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2664*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; sign flag set (h - 6 negative): use the low-byte index instead
2665*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+subpel_filters+myq*8] ; all 8 bytes of the selected filter entry
2666*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1 ; coefficient byte in the high byte of each word; m0 supplies the low bytes (zeroed via pxor xm0 at .hv)
2667*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3] ; ss3q = 3 * source stride
2668*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2 ; << 6
2669*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000 ; broadcast dword 0 (coefficient words 0,1)
2670*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111 ; broadcast dword 1 (coefficient words 2,3)
2671*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222 ; broadcast dword 2 (coefficient words 4,5)
2672*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q3333 ; broadcast dword 3 (coefficient words 6,7)
2673*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2674*c0909341SAndroid Build Coastguard Worker    jne .hv_w16 ; w != 8
2675*c0909341SAndroid Build Coastguard Worker    mov                  r6, srcq
2676*c0909341SAndroid Build Coastguard Worker    sub                  r6, ss3q ; r6 = src - 3 rows
2677*c0909341SAndroid Build Coastguard Worker    movu               xmm1, [r6+ssq*0] ; row 0; NOTE(review): xmmN/ymmN/zmmN are literal names, distinct from the mN macro names; mapping is defined by x86inc, confirm
2678*c0909341SAndroid Build Coastguard Worker    vinserti128        ymm1, [r6+ssq*1], 1 ; 128-bit lane 1 = row 1
2679*c0909341SAndroid Build Coastguard Worker    movu               xmm2, [srcq+ssq*1] ; row 4
2680*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, zmm1, [r6+ssq*2], 2 ; rows 0,1 plus row 2 in 128-bit lane 2
2681*c0909341SAndroid Build Coastguard Worker    vinserti128        ymm2, [srcq+ssq*2], 1 ; 128-bit lane 1 = row 5
2682*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [srcq+ssq*0], 3 ; 0 1 2 3
2683*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; srcq now points at row 6
2684*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [subpel_h_shufA]
2685*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
2686*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [subpel_h_shufB]
2687*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [subpel_h_shufC]
2688*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6, m4  ; 0 1 2 3   0123
2689*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9 ; accumulators start at the pd_34 constant
2690*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m1, m10 ; per dword: += sum of 4 (unsigned pixel byte * signed coefficient byte)
2691*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m6, m7  ; 0 1 2 3   4567
2692*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
2693*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m5, m10
2694*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0, m4  ; 4 5 6 _   0123
2695*c0909341SAndroid Build Coastguard Worker    mova                 m3, m9
2696*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m4, m10
2697*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m0, m7  ; 4 5 6 _   4567
2698*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
2699*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m7, m10
2700*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m8 ; rows 0-3 shuffled with subpel_h_shufC
2701*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m5, m11 ; second half of the taps (m11) added onto the same accumulators
2702*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m6, m11
2703*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0, m8 ; rows 4-6 shuffled with subpel_h_shufC
2704*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m7, m11
2705*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m6, m11
2706*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_hv_perm8a]
2707*c0909341SAndroid Build Coastguard Worker    vpaddd               m0, m5, [pb_32] {1to16} ; m0 = spel_hv_perm8a indices plus the broadcast pb_32 dword
2708*c0909341SAndroid Build Coastguard Worker    mov                  r6, 0x55555555ff00 ; bits 0-15 = 0xff00 (mask k1), bits 16-47 = 0x55555555 (mask k2 after the shift below)
2709*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1 ; dwords -> words, signed saturation
2710*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4
2711*c0909341SAndroid Build Coastguard Worker    mova                 m8, [spel_hv_perm8b]
2712*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2 ; 0 1 2 3
2713*c0909341SAndroid Build Coastguard Worker    psraw                m3, 2 ; 4 5 6 _
2714*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m5, m2 ; 01 12
2715*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m6, [subpel_h_shufA] ; 32 bytes starting at subpel_h_shufA (presumably shufA followed by shufB; confirm table layout)
2716*c0909341SAndroid Build Coastguard Worker    kmovq                k1, r6
2717*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m0, m3 ; 23 34
2718*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m7, [subpel_h_shufB] ; 32 bytes starting at subpel_h_shufB (presumably shufB followed by shufC; confirm table layout)
2719*c0909341SAndroid Build Coastguard Worker    kshiftrq             k2, k1, 16 ; k2 = 0x55555555: every even word lane
2720*c0909341SAndroid Build Coastguard Worker    mova               xm16, [spel_hv_end]
2721*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m5, m3 ; 45 56
2722*c0909341SAndroid Build Coastguard Worker.hv_w8_loop: ; two output rows per iteration; m1/m2/m3 carry the interleaved row pairs across iterations
2723*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+ssq*1] ; next row broadcast into both lanes of the low 256 bits
2724*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance the source by two rows
2725*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4  m4{k1}, [srcq+ssq*0] ; dword mask 0xff00: the row after it goes into the high 256 bits only
2726*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, m12 ; a0 b0
2727*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m6  ; 7 8   0123 4567
2728*c0909341SAndroid Build Coastguard Worker    mova                 m5, m9 ; accumulator for the two new rows
2729*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m1, m10
2730*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7      ; 7 8   4567 89ab
2731*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m2, m13 ; a1 b1
2732*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2 ; shift the row-pair history down by one pair
2733*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m4, m11
2734*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2735*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m3, m14 ; a2 b2
2736*c0909341SAndroid Build Coastguard Worker    psraw            m3{k2}, m5, 2   ; 75 86
2737*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, m3  ; 67 78
2738*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m3, m15 ; a3 b3
2739*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0 ; words -> bytes, unsigned saturation
2740*c0909341SAndroid Build Coastguard Worker    vpermb             zmm1, m16, m0 ; gather the output bytes using the spel_hv_end indices
2741*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm1 ; low 8 bytes -> dst row 0
2742*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm1 ; high 8 bytes -> dst row 1
2743*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance the destination by two rows
2744*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2745*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop ; loop while rows remain
2746*c0909341SAndroid Build Coastguard Worker    vzeroupper
2747*c0909341SAndroid Build Coastguard Worker    RET
2748*c0909341SAndroid Build Coastguard Worker.hv_w16:
2749*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      23
2750*c0909341SAndroid Build Coastguard Worker    movu                m22, [spel_hv_perm16a]
2751*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
2752*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pb_4]
2753*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*2-32]
2754*c0909341SAndroid Build Coastguard Worker    mova                 m7, [spel_hv_perm16b]
2755*c0909341SAndroid Build Coastguard Worker    paddb               m20, m8, m22
2756*c0909341SAndroid Build Coastguard Worker    mova               ym16, [spel_hv_end16]
2757*c0909341SAndroid Build Coastguard Worker    paddb               m21, m8, m20
2758*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
2759*c0909341SAndroid Build Coastguard Worker    paddb                m8, m7
2760*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
2761*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+ssq*0]
2762*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, [srcq+ssq*1], 1 ; 0 1
2763*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ss3q]
2764*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*2]
2765*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, [r4  +ssq*0], 1 ; 2 3
2766*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2767*c0909341SAndroid Build Coastguard Worker    movu               ym19, [r4  +ssq*1]
2768*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [r4  +ssq*2], 1 ; 4 5
2769*c0909341SAndroid Build Coastguard Worker    add                  r4, ss3q
2770*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m22, m17    ; 0 1   0123   89ab
2771*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
2772*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m21, m17    ; 0 1   89ab   ghij
2773*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m10
2774*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9
2775*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m22, m18    ; 2 3   0123   89ab
2776*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m3, m11
2777*c0909341SAndroid Build Coastguard Worker    mova                 m3, m9
2778*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m21, m18    ; 2 3   89ab   ghij
2779*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m4, m10
2780*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
2781*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m22, m19    ; 4 5   0123   89ab
2782*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m5, m11
2783*c0909341SAndroid Build Coastguard Worker    mova                 m5, m9
2784*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m20, m17    ; 0 1   4567   cdef
2785*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m6, m10
2786*c0909341SAndroid Build Coastguard Worker    mova                 m6, m9
2787*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m21, m19    ; 4 5   89ab   ghij
2788*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m17, m11
2789*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m17, m10
2790*c0909341SAndroid Build Coastguard Worker    movu               ym17, [r4+ssq*0]  ; 6
2791*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m20, m18    ; 2 3   4567   cdef
2792*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m0, m11
2793*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m7, m17     ; 6     0145   2367   89cd   abef
2794*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m18, m11
2795*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m20, m19    ; 4 5   4567   cdef
2796*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m18, m10
2797*c0909341SAndroid Build Coastguard Worker    mova                m18, m9
2798*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m8, m17     ; 6     4589   67ab   cdgh   efij
2799*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m18, m0, m10
2800*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
2801*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m19, m11
2802*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m19, m10
2803*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4
2804*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m18, m17, m11
2805*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2           ; 01
2806*c0909341SAndroid Build Coastguard Worker    psraw                m3, 2           ; 23
2807*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
2808*c0909341SAndroid Build Coastguard Worker    vpshrdd              m2, m1, m3, 16  ; 12
2809*c0909341SAndroid Build Coastguard Worker    psraw                m5, 2           ; 45
2810*c0909341SAndroid Build Coastguard Worker    vpshrdd              m4, m3, m5, 16  ; 34
2811*c0909341SAndroid Build Coastguard Worker    psraw               m18, 2
2812*c0909341SAndroid Build Coastguard Worker    vpshrdd              m6, m5, m18, 16 ; 56
2813*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
2814*c0909341SAndroid Build Coastguard Worker    movu               ym19, [r4+ssq*1]
2815*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
2816*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [r4+ssq*0], 1
2817*c0909341SAndroid Build Coastguard Worker    pmaddwd             m17, m1, m12     ; a0
2818*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m22, m19    ; 7 8   0123   89ab
2819*c0909341SAndroid Build Coastguard Worker    pmaddwd             m18, m2, m12     ; b0
2820*c0909341SAndroid Build Coastguard Worker    mova                 m0, m9
2821*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m21, m19    ; 7 8   89ab   ghij
2822*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m1, m10
2823*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
2824*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m20, m19    ; 7 8   4567   cdef
2825*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m11
2826*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2827*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m3, m13     ; a1
2828*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m4, m13     ; b1
2829*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2830*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m19, m11
2831*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m19, m10
2832*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m5, m14     ; a2
2833*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m6, m14     ; b2
2834*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
2835*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2836*c0909341SAndroid Build Coastguard Worker    psraw                m6, m0, 2       ; 78
2837*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2838*c0909341SAndroid Build Coastguard Worker    vpshrdd              m5, m4, m6, 16  ; 67
2839*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m18, m6, m15     ; b3
2840*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m5, m15     ; a3
2841*c0909341SAndroid Build Coastguard Worker    packuswb            m17, m18
2842*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m16, m17
2843*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*0], xm17
2844*c0909341SAndroid Build Coastguard Worker    vextracti128 [r7+dsq*1], ym17, 1
2845*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2846*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2847*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
2848*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
2849*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2850*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
2851*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
2852*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
2853*c0909341SAndroid Build Coastguard Worker    RET
2854*c0909341SAndroid Build Coastguard Worker
; Scratch registers t0/t1 for the prep entry points. The prep_6tap_8bpc entry
; code below adds t0d to mxd and t1d to myd. Only the second register number
; differs between WIN64 and the other ABI.
2855*c0909341SAndroid Build Coastguard Worker%if WIN64
2856*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4
2857*c0909341SAndroid Build Coastguard Worker%else
2858*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
2859*c0909341SAndroid Build Coastguard Worker%endif
2860*c0909341SAndroid Build Coastguard Worker
; One FN invocation per filter-type pair. Every pair except "regular" names
; prep_6tap_8bpc (defined right below) as an extra argument.
; NOTE(review): FN is defined outside this chunk; presumably it emits a small
; entry stub that loads the filter-type constants into t0d/t1d and then jumps
; to the named function, or falls through when no target is given - confirm.
2861*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap,
2862*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_6tap_8bpc
2863*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_6tap_8bpc
2864*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
2865*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
2866*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
2867*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular,        REGULAR, REGULAR
2868*c0909341SAndroid Build Coastguard Worker
; prep_6tap_8bpc(tmp, src, ss, w, h, mx, my): entry point and dispatch.
; Packs the subpel positions together with the filter-type words in t0d/t1d,
; then branches to .h (horizontal field set), .v (only vertical field set), or
; falls into .prep, which jumps through the plain "prep" table by log2(w).
; r7 holds the address of prep_avx512icl and is the base for all table and
; constant addressing ("base" below).
2869*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3
2870*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx512icl
    ; Multiplying by 0x010101 copies mx/my into the three low bytes (for values
    ; below 256); adding t0d/t1d then produces the fields named in the comments.
2871*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2872*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
2873*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2874*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
2875*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep_avx512icl]
2876*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; Bits 8-11 belong to the middle ("mx"/"my") field of the packed value.
    ; NOTE(review): this assumes t0d/t1d contribute nothing to that field - confirm.
2877*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2878*c0909341SAndroid Build Coastguard Worker    jnz .h
2879*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2880*c0909341SAndroid Build Coastguard Worker    jnz .v
    ; Neither field set: dispatch through the "prep" jump table, which stores
    ; 16-bit offsets relative to r7, indexed by the trailing-zero count of w.
2881*c0909341SAndroid Build Coastguard Worker.prep:
2882*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2883*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep,)]
2884*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
2885*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3] ; r6 = 3*stride for the jump target (target code not visible here)
    ; NOTE(review): presumably undoes the r7 push done by the cglobal prologue on
    ; WIN64, since this path leaves via jmp instead of RET - confirm.
2886*c0909341SAndroid Build Coastguard Worker%if WIN64
2887*c0909341SAndroid Build Coastguard Worker    pop                  r7
2888*c0909341SAndroid Build Coastguard Worker%endif
2889*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2890*c0909341SAndroid Build Coastguard Worker.v:
    ; Vertical-only setup shared by all widths: selects the filter row, loads
    ; the three coefficient pairs into m8/m9/m10 and the pmulhrsw multiplier
    ; into m7, moves src up by two rows, sets ss3q = 3*stride, and jumps
    ; through the _6tap_v table indexed by the trailing-zero count of w.
2891*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb              ; low byte of myd (the "4tap_v" field)
2892*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16               ; bits 16 and up (the "6tap_v" field)
2893*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2894*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd              ; h == 4: use the low-byte index instead
2895*c0909341SAndroid Build Coastguard Worker    tzcnt               r5d, wd
2896*c0909341SAndroid Build Coastguard Worker    lea                 myq, [base+subpel_filters+1+myq*8] ; 8-byte entries; +1 skips each entry's first byte
2897*c0909341SAndroid Build Coastguard Worker    movzx               r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] ; 16-bit offset relative to r7
2898*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]        ; multiplier for the pmulhrsw in each width loop
2899*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
2900*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+0]          ; coefficient pair 0 (bytes 0-1 at myq)
2901*c0909341SAndroid Build Coastguard Worker    add                  r5, r7
2902*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [myq+2]          ; coefficient pair 1
2903*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2904*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [myq+4]          ; coefficient pair 2
2905*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq              ; together with the sub above: src -= 2*stride
2906*c0909341SAndroid Build Coastguard Worker    jmp                  r5
2907*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Vertical filter, 4-byte-wide rows. The preamble loads rows 0-4 and
    ; interleaves them into row pairs in ymm1; each loop iteration reads four
    ; more rows, stores 32 bytes to tmpq and decrements h by 4.
    ; ym8/ym9/ym10 are the coefficient pairs and ym7 the pmulhrsw multiplier
    ; set up in .v.
2908*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [srcq+ssq*0]
2909*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm2, [srcq+ssq*1], 1
2910*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm1, [srcq+ssq*2]
2911*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2912*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm3, [srcq+ssq*0]
2913*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm0, [srcq+ssq*1]
2914*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm5, [deint_shuf4]
2915*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm2, 0xeb
2916*c0909341SAndroid Build Coastguard Worker    punpcklqdq         ymm3, ymm0
2917*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm3, 0x60 ; 0 1 2 _   2 3 4 _
2918*c0909341SAndroid Build Coastguard Worker    pshufb             ymm1, ymm5       ; 01 12 23 34
2919*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
    ; On entry xmm0's low dword holds the last row loaded so far (row 4 on the
    ; first pass); the next row is inserted beside it.
2920*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [srcq+ssq*2], 1
2921*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm2, [srcq+ss3q ]
2922*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2923*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm3, [srcq+ssq*0]
2924*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm0, 0xeb
2925*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm0, [srcq+ssq*1]
2926*c0909341SAndroid Build Coastguard Worker    punpcklqdq         ymm3, ymm0
2927*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm3, 0x60 ; 4 5 6 _   6 7 8 _
2928*c0909341SAndroid Build Coastguard Worker    pshufb             ymm2, ymm5       ; 45 56 67 78
2929*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm3, ymm1, ym8  ; a0 b0 c0 d0
2930*c0909341SAndroid Build Coastguard Worker    vperm2i128         ymm1, ymm2, 0x21 ; 23 34 45 56
2931*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm4, ymm2, ym10 ; a2 b2 c2 d2
2932*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm1, ym9        ; a1 b1 c1 d1
2933*c0909341SAndroid Build Coastguard Worker    paddw              ymm3, ymm4
2934*c0909341SAndroid Build Coastguard Worker    paddw              ymm3, ymm1
2935*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm3, ym7
2936*c0909341SAndroid Build Coastguard Worker    mova               ymm1, ymm2       ; newest row pairs become the oldest for the next pass
2937*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ymm3
2938*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2939*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2940*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2941*c0909341SAndroid Build Coastguard Worker    vzeroupper
2942*c0909341SAndroid Build Coastguard Worker    RET
2943*c0909341SAndroid Build Coastguard Worker.v_w8:
    ; Vertical filter, 8-byte-wide rows. The preamble gathers rows 0-4 into m1
    ; as interleaved row pairs; each loop iteration reads four more rows,
    ; stores 64 bytes to tmpq and decrements h by 4.
    ; m8/m9/m10 are coefficient pairs 0/1/2 and m7 the pmulhrsw multiplier
    ; set up in .v. k1 = 0x3e and k2 = k1 << 2 (8-bit) are qword merge masks.
2944*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_v_perm8]
2945*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*0]
2946*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x3e
2947*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+ssq*1]
2948*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r6d
2949*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym3, [srcq+ssq*2]
2950*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2951*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq         ym2, [srcq+ssq*0] {1to4}
2952*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq      m1{k1}, m3, [srcq+ssq*1] {1to8}
2953*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*1]
2954*c0909341SAndroid Build Coastguard Worker    kshiftlb             k2, k1, 2
2955*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m2, 0x18  ; 0 1   2 3   4
2956*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m1    ; 01 12 23 34
2957*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
2958*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym3, [srcq+ss3q ]
2959*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq     ym0{k1}, ym3, [srcq+ssq*2] {1to4}
2960*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2961*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
2962*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq      m0{k2}, m3, [srcq+ssq*0] {1to8}
2963*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m8    ; a0 b0 c0 d0
2964*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m0    ; 45 56 67 78
2965*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm3       ; keep the newest row (broadcast in m3) for the next pass
2966*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m2, q1032 ; 23 34 45 56
2967*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m10   ; a2 b2 c2 d2
2968*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, m9    ; a1 b1 c1 d1
2969*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2970*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
2971*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2972*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
2973*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
2974*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
2975*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2976*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
2977*c0909341SAndroid Build Coastguard Worker    RET
2978*c0909341SAndroid Build Coastguard Worker.v_w16:
    ; Vertical filter, 16-byte-wide rows. The preamble loads rows 0-4 and
    ; builds the row pairs "01 12" in m1 and "23 34" in m2; each loop iteration
    ; reads four more rows, stores 128 bytes to tmpq and decrements h by 4.
    ; m8/m9/m10 are coefficient pairs 0/1/2 and m7 the pmulhrsw multiplier
    ; set up in .v. k1 = 0x0f is the merge mask for the vshufpd row merges.
2979*c0909341SAndroid Build Coastguard Worker    mova                m11, [spel_v_perm16b]
2980*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [srcq+ssq*0]
2981*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x0f
2982*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [srcq+ssq*1]
2983*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ssq*2]
2984*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r6d
2985*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2986*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+ssq*0]
2987*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*1]
2988*c0909341SAndroid Build Coastguard Worker    vshufpd          m1{k1}, m3, m2, 0xcc
2989*c0909341SAndroid Build Coastguard Worker    vshufpd          m2{k1}, m4, m0, 0xcc
2990*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m11, m1 ; 01 12
2991*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m11, m2 ; 23 34
2992*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
    ; On entry m0 holds the broadcast of the last row loaded so far (row 4 on
    ; the first pass); it is merged with the next two rows below.
2993*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m8  ; a0 b0
2994*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m2, m9  ; a1 b1
2995*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym6, [srcq+ssq*2]
2996*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m8  ; c0 d0
2997*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+ss3q ]
2998*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2999*c0909341SAndroid Build Coastguard Worker    vshufpd          m0{k1}, m6, m2, 0xcc
3000*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym6, [srcq+ssq*0]
3001*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m11, m0 ; 45 56
3002*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+ssq*1]
3003*c0909341SAndroid Build Coastguard Worker    vshufpd          m2{k1}, m6, m0, 0xcc
3004*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m1, m9  ; c1 d1
3005*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m11, m2 ; 67 78
3006*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
3007*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, m10 ; a2 b2
3008*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
3009*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m2, m10 ; c2 d2
3010*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
3011*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
3012*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7
3013*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
3014*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], m3
3015*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+64], m4
3016*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3017*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3018*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
3019*c0909341SAndroid Build Coastguard Worker    RET
3020*c0909341SAndroid Build Coastguard Worker.v_w32:
    ; Vertical filter, 32-byte-wide rows. The preamble loads rows 0-4 and
    ; byte-interleaves them into the row pairs 01/12/23/34 (m0-m3); each loop
    ; iteration reads two more rows, stores 128 bytes to tmpq and decrements
    ; h by 2. m8/m9/m10 are coefficient pairs 0/1/2 and m7 the pmulhrsw
    ; multiplier set up in .v.
3021*c0909341SAndroid Build Coastguard Worker    movshdup             m6, [bilin_v_perm64]
3022*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+ssq*0]
3023*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+ssq*1]
3024*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+ssq*2]
3025*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
3026*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*0]
3027*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
3028*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+ssq*0]
3029*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m6, m18   ; 0 2
3030*c0909341SAndroid Build Coastguard Worker    vpermt2q            m17, m6, m19   ; 1 3
3031*c0909341SAndroid Build Coastguard Worker    vpermt2q            m18, m6, m20   ; 2 4
3032*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m16, m17  ; 01
3033*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m17, m18  ; 12
3034*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m16, m17  ; 23
3035*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m17, m18  ; 34
3036*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
3037*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+ssq*1]
3038*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3039*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+ssq*0]
3040*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m0, m8    ; a0
3041*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
3042*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9        ; a1
3043*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m6, m17   ; 5 6
3044*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, m8    ; b0
3045*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3046*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9        ; b1
3047*c0909341SAndroid Build Coastguard Worker    shufpd              m18, m16, 0x55 ; 4 5
3048*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
3049*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m18, m16  ; 45
3050*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
3051*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m18, m16  ; 56
3052*c0909341SAndroid Build Coastguard Worker    mova                m18, m16       ; keep the two newest rows for the next pass's shufpd
3053*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m2, m10   ; a2
3054*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m3, m10   ; b2
3055*c0909341SAndroid Build Coastguard Worker    paddw                m4, m16
3056*c0909341SAndroid Build Coastguard Worker    paddw                m5, m17
3057*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
3058*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7
3059*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], m4
3060*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+64], m5
3061*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3062*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3063*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
3064*c0909341SAndroid Build Coastguard Worker    vzeroupper
3065*c0909341SAndroid Build Coastguard Worker    RET
3066*c0909341SAndroid Build Coastguard Worker.v_w64:
3067*c0909341SAndroid Build Coastguard Worker.v_w128:
    ; Vertical filter for wide blocks, processed in 64-byte-wide column strips.
    ; wd is doubled so that wq is the byte distance between output rows in tmp
    ; (used as [r7+wq*1] below). r6d = h + 2*w is a combined counter: its low
    ; byte is reloaded into hd after each strip and 1<<8 is subtracted per
    ; strip, looping while the result stays positive.
    ; NOTE(review): with w = 128 the low byte of r6d equals h only if h < 256;
    ; with w = 64 the counter drops to <= 0 after one strip if h <= 128 - confirm
    ; the caller's height range.
    ; m8/m9/m10 are coefficient pairs 0/1/2 and m7 the pmulhrsw multiplier
    ; set up in .v.
3068*c0909341SAndroid Build Coastguard Worker    mova                 m6, [bilin_v_perm64]
3069*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
3070*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq]
3071*c0909341SAndroid Build Coastguard Worker.v_loop0:
    ; Per strip: load rows 0-4 (m12-m16) and interleave neighbouring rows.
    ; m0-m3 receive the punpcklbw halves of pairs 01/12/23/34, m12-m15 the
    ; matching punpckhbw halves. r7 is the output pointer for this strip.
3072*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m6, [srcq+ssq*0]
3073*c0909341SAndroid Build Coastguard Worker    vpermq              m13, m6, [srcq+ssq*1]
3074*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
3075*c0909341SAndroid Build Coastguard Worker    vpermq              m14, m6, [r5  +ssq*0]
3076*c0909341SAndroid Build Coastguard Worker    vpermq              m15, m6, [r5  +ssq*1]
3077*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3078*c0909341SAndroid Build Coastguard Worker    vpermq              m16, m6, [r5  +ssq*0]
3079*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3080*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m12, m13 ; 01
3081*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m13
3082*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m13, m14 ; 12
3083*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m14
3084*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m14, m15 ; 23
3085*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m15
3086*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m15, m16 ; 34
3087*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m16
3088*c0909341SAndroid Build Coastguard Worker.v_loop:
    ; Two output rows (a, b) per pass; m16 carries the newest row across passes.
    ; Row-pair labels below refer to the first pass.
3089*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m0, m8   ; a0
3090*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m6, [r5+ssq*1]
3091*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m18, m12, m8
3092*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
3093*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9       ; a1
3094*c0909341SAndroid Build Coastguard Worker    mova                m12, m14
3095*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
3096*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3097*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m19, m1, m8   ; b0
3098*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m20, m13, m8
3099*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3100*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9       ; b1
3101*c0909341SAndroid Build Coastguard Worker    mova                m13, m15
3102*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m9
3103*c0909341SAndroid Build Coastguard Worker    paddw               m17, m2
3104*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m16, m5  ; 45
3105*c0909341SAndroid Build Coastguard Worker    paddw               m18, m14
3106*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m16, m5
3107*c0909341SAndroid Build Coastguard Worker    vpermq              m16, m6, [r5+ssq*0]
3108*c0909341SAndroid Build Coastguard Worker    paddw               m19, m3
3109*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m10  ; a2
3110*c0909341SAndroid Build Coastguard Worker    paddw               m20, m15
3111*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m14, m10
3112*c0909341SAndroid Build Coastguard Worker    paddw               m17, m3
3113*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m5, m16  ; 56
3114*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, m10  ; b2
3115*c0909341SAndroid Build Coastguard Worker    paddw               m18, m15
3116*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m5, m16
3117*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15, m10
3118*c0909341SAndroid Build Coastguard Worker    paddw               m19, m4
3119*c0909341SAndroid Build Coastguard Worker    paddw               m20, m5
3120*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m17, m18, m19, m20
3121*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*0+ 0], m17
3122*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*0+64], m18
3123*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*1+ 0], m19
3124*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*1+64], m20
3125*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*2]
3126*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3127*c0909341SAndroid Build Coastguard Worker    jg .v_loop
3128*c0909341SAndroid Build Coastguard Worker    add                srcq, 64       ; next 64-byte source strip
3129*c0909341SAndroid Build Coastguard Worker    add                tmpq, 128      ; matching 128-byte step in the output row
3130*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b      ; reload the row count from the counter's low byte
3131*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
3132*c0909341SAndroid Build Coastguard Worker    jg .v_loop0
3133*c0909341SAndroid Build Coastguard Worker    vzeroupper
3134*c0909341SAndroid Build Coastguard Worker    RET
; .h / .hv, w <= 4 path: combined horizontal + vertical filtering that writes
; 4 rows of 4 16-bit results (32 bytes) to tmpq per loop iteration.
; NOTE(review): the enclosing cglobal is outside this view; judging by the
; cross-reference in the prep_8tap_8bpc entry code this is prep_6tap_8bpc.
;
; .h: when no bit of 0xf00 is set in myd, control is transferred to the .h2
; label inside prep_8tap_8bpc; otherwise execution falls through to .hv.
3135*c0909341SAndroid Build Coastguard Worker.h:
3136*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3137*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2
3138*c0909341SAndroid Build Coastguard Worker.hv:
    ; m8 is the accumulator seed for the horizontal pass (result >> 2),
    ; m9 the seed for the vertical pass (result >> 6); going by the symbol
    ; names these are dword 2 and dword 32, i.e. round-to-nearest biases.
3139*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pd_2]
3140*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_32]
3141*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3142*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
    ; Horizontal filter: table row selected by the low byte of mxd; only the
    ; four bytes at offset +2 of the 8-byte row are used (m11).
3143*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
3144*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+subpel_filters+mxq*8+2]
    ; Vertical filter: row index is myd >> 16, replaced by the low byte of
    ; myd when h == 4. The 8 bytes starting at offset +1 of the row are loaded.
3145*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3146*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3147*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3148*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3149*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [base+subpel_filters+1+myq*8]
3150*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [subpel_h_shufA]
    ; Rewind the source pointer by two rows and one pixel.
3151*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*2+1]
    ; k1 = 0x30 (qword elements 4-5), k2 = k1 + k1 = 0x60 (qword elements 5-6);
    ; they steer the merging row broadcasts below.
3152*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x30
3153*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
3154*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
3155*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym2, [srcq+ssq*0]
3156*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
3157*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ssq*1]
3158*c0909341SAndroid Build Coastguard Worker    kaddb                k2, k1, k1
3159*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m2{k1}, [srcq+ssq*2]
3160*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
3161*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m1{k2}, [srcq+ssq*0] ; _ _ 1 3
    ; punpcklbw + psraw 8 widens the signed vertical coefficients to words.
3162*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
3163*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m2{k2}, [srcq+ssq*1] ; _ 0 2 4
3164*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8 ; sign-extend
3165*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_hv_perm4a]
    ; k1 becomes 0x0c (qword elements 2-3) for the ymm merges inside the loop.
3166*c0909341SAndroid Build Coastguard Worker    kshiftrb             k1, k1, 2
3167*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_hv_perm4b]
    ; Horizontal pass over the rows loaded so far: seed + dot product with m11.
3168*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
3169*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
3170*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m1, m11
3171*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10
3172*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
3173*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m11
    ; m12/m13/m14 = vertical coefficient word pairs 0, 1 and 2.
3174*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m3, q0000
3175*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m3, q1111
3176*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m3, q2222
3177*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1           ; _ _   _ 0   1 2   3 4
3178*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3179*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m7, m0       ; 01 12 23 34
    ; Main loop: fetch four new rows, filter them horizontally, then apply
    ; the three vertical coefficient pairs to the interleaved row pairs.
3180*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3181*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+ssq*2]
3182*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ss3q ]
3183*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
3184*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym3{k1}, [srcq+ssq*0] ; 5 7
3185*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym4{k1}, [srcq+ssq*1] ; 6 8
3186*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym10
3187*c0909341SAndroid Build Coastguard Worker    mova                ym2, ym8
3188*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym2, ym3, ym11
3189*c0909341SAndroid Build Coastguard Worker    pshufb              ym4, ym10
3190*c0909341SAndroid Build Coastguard Worker    mova                ym3, ym8
3191*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym3, ym4, ym11
3192*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
3193*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m1, m12      ; a0 b0 c0 d0
3194*c0909341SAndroid Build Coastguard Worker    packssdw            ym2, ym3          ; 5 6   7 8
3195*c0909341SAndroid Build Coastguard Worker    psraw               ym2, 2
    ; Slide the row history: drop the two oldest lanes, append the new rows.
3196*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m2, q1032    ; _ 2   3 4   5 6   7 8
3197*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m0       ; 23 34 45 56
3198*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m7, m0       ; 45 56 67 78
3199*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m2, m13      ; a1 b1 c1 d1
3200*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m1, m14      ; a2 b2 c2 d2
3201*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
    ; Narrow the 16 dword sums to words and store 32 bytes.
3202*c0909341SAndroid Build Coastguard Worker    vpmovdw          [tmpq], m4
3203*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3204*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3205*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
    ; This path writes zmm0-zmm14, so clear the upper vector state before
    ; returning, exactly as the .hv_w8 / .hv_w16 exits of this function do.
    vzeroupper
3206*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: shared setup for every w > 4 horizontal+vertical case, followed by
; the w == 8 implementation (w > 8 branches to .hv_w16 after the setup).
; Expects m8/m9 to hold the accumulator seeds loaded at .hv.
; The w == 8 loop produces 4 rows of 8 16-bit results (64 bytes) per iteration.
3207*c0909341SAndroid Build Coastguard Worker.hv_w8:
    ; Horizontal filter row index = mxd >> 16; the 8-byte row is split into
    ; two dwords: m10 = bytes 0-3, m11 = bytes 4-7.
3208*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3209*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
3210*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
    ; Vertical filter row index = myd >> 16, or the low byte of myd if h == 4.
3211*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3212*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3213*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3214*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3215*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
    ; Rewind the source pointer by two rows and three pixels.
3216*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*2+3]
3217*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3218*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
3219*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3220*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
    ; m12/m13/m14 = vertical coefficient word pairs 0, 1 and 2.
3221*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000
3222*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111
3223*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222
3224*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3225*c0909341SAndroid Build Coastguard Worker    jg .hv_w16
    ; Prologue: rows 0-3 go into the four 128-bit lanes of m16, row 4 into
    ; xm17. m19/m20/m21 hold the three horizontal byte-shuffle patterns.
3226*c0909341SAndroid Build Coastguard Worker    movu               xm16, [srcq+ssq*0]
3227*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m19, [subpel_h_shufA]
3228*c0909341SAndroid Build Coastguard Worker    vinserti128        ym16, [srcq+ssq*1], 1
3229*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m21, [subpel_h_shufC]
3230*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, [srcq+ssq*2], 2
3231*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
3232*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, [srcq+ssq*0], 3
3233*c0909341SAndroid Build Coastguard Worker    movu               xm17, [srcq+ssq*1]
3234*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [subpel_h_shufB]
    ; Horizontal pass: each accumulator starts from m8 and receives two
    ; vpdpbusd dot products (one with m10, one with m11).
3235*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m16, m19   ; 0 1 2 3   0123
3236*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
3237*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m16, m21   ; 0 1 2 3   89ab
3238*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m3, m10
3239*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
3240*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm17, xm19 ; 3 4 5 6   0123
3241*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m0, m11
3242*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm8
3243*c0909341SAndroid Build Coastguard Worker    pshufb             xm18, xm17, xm21 ; 3 4 5 6   89ab
3244*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xm1, xm10
3245*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm8
3246*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m20        ; 0 1 2 3   4567
3247*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm1, xm18, xm11
3248*c0909341SAndroid Build Coastguard Worker    pshufb             xm17, xm20       ; 3 4 5 6   4567
3249*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m16, m11
3250*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m16, m10
3251*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm0, xm17, xm11
3252*c0909341SAndroid Build Coastguard Worker    vpdpbusd            xm1, xm17, xm10
3253*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3254*c0909341SAndroid Build Coastguard Worker    packssdw            xm0, xm1
3255*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2          ; 0 1 2 3
3256*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 2          ; 4
    ; Build the interleaved row pairs consumed by the vertical dot products.
3257*c0909341SAndroid Build Coastguard Worker    valignq              m0, m2, 2      ; 1 2 3 4
3258*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0     ; 01 12 23 34
3259*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0
    ; Main loop: load and horizontally filter four new rows while the
    ; vertical accumulation for the current output rows runs in m3/m4.
3260*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3261*c0909341SAndroid Build Coastguard Worker    movu               xm16, [srcq+ssq*2]
3262*c0909341SAndroid Build Coastguard Worker    vinserti128        ym16, [srcq+ss3q ], 1
3263*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
3264*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, [srcq+ssq*0], 2
3265*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, [srcq+ssq*1], 3
3266*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m16, m19   ; 5 6 7 8   0123
3267*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
3268*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m16, m21   ; 5 6 7 8   89ab
3269*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m6, m10
3270*c0909341SAndroid Build Coastguard Worker    mova                 m6, m8
3271*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m20        ; 5 6 7 8   4567
3272*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m3, m11
3273*c0909341SAndroid Build Coastguard Worker    mova                 m3, m9
3274*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m1, m12    ; a0 b0 c0 d0
3275*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
3276*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m2, m12
3277*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m16, m11
3278*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m16, m10
    ; Keep the previous row pairs (m1/m2) in m16/m6 before they are replaced.
3279*c0909341SAndroid Build Coastguard Worker    mova                m16, m1
3280*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
3281*c0909341SAndroid Build Coastguard Worker    mova                 m6, m2
3282*c0909341SAndroid Build Coastguard Worker    psraw                m5, 2          ; 5 6 7 8
3283*c0909341SAndroid Build Coastguard Worker    valignq              m2, m5, m0, 6  ; 4 5 6 7
3284*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
3285*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m5     ; 45 56 67 78
3286*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m5
3287*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m1, m14    ; a2 b2 c2 d2
3288*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m2, m14
3289*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m1, q1032  ; 23 34 45 56
3290*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m2, q1032
3291*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m16, m13   ; a1 b1 c1 d1
3292*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m6, m13
3293*c0909341SAndroid Build Coastguard Worker    psrad                m3, 6
3294*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
3295*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4
3296*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m3
3297*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3298*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3299*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
3300*c0909341SAndroid Build Coastguard Worker    vzeroupper
3301*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w16: horizontal+vertical filtering for w > 8, processed as 16-pixel
; wide columns. The outer loop (.hv_w16_loop0) walks the columns, the inner
; loop (.hv_w16_loop) emits two output rows per iteration.
; Relies on m8-m14 as prepared by .hv / .hv_w8.
3302*c0909341SAndroid Build Coastguard Worker.hv_w16:
    ; m16 = base byte-permute indices; m17 = m16 + 4 and m18 = m16 + 8
    ; (per byte), i.e. the same pattern shifted by 4 and 8 source bytes.
3303*c0909341SAndroid Build Coastguard Worker    mova                m16, [spel_h_perm16]
3304*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [pb_4]
    ; w is doubled so that wq is the byte pitch of one row of 16-bit output.
3305*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
3306*c0909341SAndroid Build Coastguard Worker    paddb               m17, m18, m16
    ; r6d = h + (doubled w)*8 - 256: the low byte keeps h (assuming h < 256)
    ; and the bits above it count the 16-pixel columns still to do after the
    ; current one.
3307*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq*8-256]
3308*c0909341SAndroid Build Coastguard Worker    paddb               m18, m17
3309*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
    ; Column prologue: rows 0-1 -> m19, rows 2-3 -> m20, row 4 -> ym21.
    ; r5 is the row pointer for this column, r7 the output pointer.
3310*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+ssq*0]
3311*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+ssq*1], 1
3312*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
3313*c0909341SAndroid Build Coastguard Worker    movu               ym20, [r5  +ssq*0]
3314*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, [r5  +ssq*1], 1
3315*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5  +ssq*2]
3316*c0909341SAndroid Build Coastguard Worker    movu               ym21, [r5  +ssq*0]
3317*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
    ; Horizontal pass for the five prologue rows: accumulators start from m8
    ; and each receives one dot product with m10 and one with m11.
3318*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m16, m19      ; 0 1   0123   89ab
3319*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
3320*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m18, m19      ; 0 1   89ab   ghij
3321*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m3, m10
3322*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
3323*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m16, m20      ; 2 3   0123   89ab
3324*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m4, m11
3325*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8
3326*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m18, m20      ; 2 3   89ab   ghij
3327*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m5, m10
3328*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
3329*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym16, ym21    ; 4     0123   89ab
3330*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m0, m11
3331*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym8
3332*c0909341SAndroid Build Coastguard Worker    vpermb              ym6, ym18, ym21    ; 4     89ab   ghij
3333*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym10
3334*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym8
3335*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m17, m19      ; 0 1   4567   cdef
3336*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym6, ym11
3337*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m17, m20      ; 2 3   4567   cdef
3338*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m19, m11
3339*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m19, m10
3340*c0909341SAndroid Build Coastguard Worker    vpermb             ym21, ym17, ym21    ; 4     4567   cdef
3341*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m20, m11
3342*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m20, m10
3343*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym21, ym11
3344*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym21, ym10
3345*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3            ; 0 1
3346*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5            ; 2 3
3347*c0909341SAndroid Build Coastguard Worker    packssdw            ym0, ym1           ; 4
3348*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m2, m4, ym0
    ; Interleave neighbouring rows into the word pairs used by vpdpwssd.
3349*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m2, m4, q1032 ; 1 2
3350*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m4, m0, q1032 ; 3 4
3351*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3        ; 01 12
3352*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3
3353*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0        ; 23 34
3354*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
    ; Inner loop: fetch two more rows, filter them horizontally, finish the
    ; vertical sums for two output rows and rotate the row-pair history.
3355*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
3356*c0909341SAndroid Build Coastguard Worker    movu               ym19, [r5+ssq*1]
3357*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3358*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [r5+ssq*0], 1
3359*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m16, m19      ; 5 6   0123   89ab
3360*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
3361*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m18, m19      ; 5 6   89ab   ghij
3362*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m6, m10
3363*c0909341SAndroid Build Coastguard Worker    mova                 m6, m8
3364*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m17, m19      ; 5 6   4567   cdef
3365*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m20, m11
3366*c0909341SAndroid Build Coastguard Worker    mova                m20, m9
3367*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m1, m12       ; a0 b0
3368*c0909341SAndroid Build Coastguard Worker    mova                m21, m9
3369*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m2, m12
3370*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m19, m11
3371*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m19, m10
3372*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m3, m13       ; a1 b1
3373*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m4, m13
3374*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
3375*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3376*c0909341SAndroid Build Coastguard Worker    psraw                m5, 2             ; 5 6
3377*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3378*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m0, m5, q1032 ; 4 5
3379*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
3380*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0        ; 45 56
3381*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
3382*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m3, m14       ; a2 b2
3383*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m4, m14
3384*c0909341SAndroid Build Coastguard Worker    psrad               m20, 6
3385*c0909341SAndroid Build Coastguard Worker    psrad               m21, 6
3386*c0909341SAndroid Build Coastguard Worker    packssdw            m20, m21
    ; Low 256 bits -> first output row, high 256 bits -> the row wq bytes on.
3387*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], ym20
3388*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r7+wq*1], m20, 1
3389*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*2]
3390*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3391*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
    ; Next column: 16 source bytes / 32 output bytes to the right; restore h
    ; from the low byte of r6 and decrement the column counter held above it.
3392*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
3393*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3394*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
3395*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
3396*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
3397*c0909341SAndroid Build Coastguard Worker    vzeroupper
3398*c0909341SAndroid Build Coastguard Worker    RET
3399*c0909341SAndroid Build Coastguard Worker
; PREP_8TAP_H: filter the source bytes held in m0 and m1 and store two
; 64-byte vectors of 16-bit results at [tmpq] and [tmpq+64].
;   In:      m0, m1  source bytes
;            m5-m7   three vpermb index vectors
;            m4      initial accumulator value (presumably a rounding
;                    constant - confirm at the call site)
;            m8, m9  two dwords of filter coefficients
;   Out:     m0, m1  packed results after an arithmetic >> 2 (also stored)
;   Clobber: m2, m3, m10-m15
; tmpq is not advanced by the macro.
3400*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_H 0
    ; Gather the three overlapping byte groups of each source register.
3401*c0909341SAndroid Build Coastguard Worker    vpermb              m10, m5, m0
3402*c0909341SAndroid Build Coastguard Worker    vpermb              m11, m5, m1
3403*c0909341SAndroid Build Coastguard Worker    vpermb              m12, m6, m0
3404*c0909341SAndroid Build Coastguard Worker    vpermb              m13, m6, m1
3405*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m7, m0
3406*c0909341SAndroid Build Coastguard Worker    vpermb              m15, m7, m1
    ; First coefficient dword (m8), accumulated on top of a copy of m4.
3407*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
3408*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m10, m8
3409*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3410*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m12, m8
3411*c0909341SAndroid Build Coastguard Worker    mova                 m1, m4
3412*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m11, m8
3413*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
3414*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m13, m8
    ; Second coefficient dword (m9), applied to the next byte group.
3415*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m12, m9
3416*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m14, m9
3417*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m13, m9
3418*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m15, m9
    ; Saturating pack of the dword sums to words, scale down, store.
3419*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
3420*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
3421*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3422*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
3423*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*0], m0
3424*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+64*1], m1
3425*c0909341SAndroid Build Coastguard Worker%endmacro
3426*c0909341SAndroid Build Coastguard Worker
; Filter-type specific entry points that share the prep_8tap_8bpc body below.
; NOTE(review): PREP_8TAP_FN is defined outside this view. Presumably it emits
; a small entry stub that sets up t0d/t1d (which prep_8tap_8bpc adds to
; mxd/myd) and jumps to the function named by the last argument; the final
; instantiation passes no target and is placed directly in front of the
; cglobal, so it presumably falls through into it - confirm against the macro.
3427*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
3428*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
3429*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp,          SHARP,   SHARP
3430*c0909341SAndroid Build Coastguard Worker
; prep_8tap_8bpc entry and vertical-only (.v) dispatch.
; Named arguments: tmp, src, stride, w, h, mx, my (stride3 is a scratch name).
; mx and my are each replicated into three bytes and combined with t0d/t1d;
; bits 0xf00 of the result decide which filtering direction is needed:
;   mxd & 0xf00 != 0                  -> .h
;   both zero                         -> .prep label inside prep_6tap_8bpc
;   only myd & 0xf00 != 0             -> .v (below)
3431*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
3432*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
3433*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3434*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
3435*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
    ; r7 = address of prep_avx512icl; the jump-table lookup below is
    ; relative to it.
3436*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep_avx512icl]
3437*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3438*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
3439*c0909341SAndroid Build Coastguard Worker    jnz .h
3440*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3441*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
3442*c0909341SAndroid Build Coastguard Worker.v:
3443*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
3444*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16  ; Note that the code is 8-tap only, having
3445*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
3446*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd ; had a negligible effect on performance.
    ; r5d = number of trailing zero bits of w, used as the index into a table
    ; of 16-bit offsets; the selected offset plus r7 is the branch target.
3447*c0909341SAndroid Build Coastguard Worker    tzcnt               r5d, wd
3448*c0909341SAndroid Build Coastguard Worker    lea                 myq, [base+subpel_filters+myq*8]
3449*c0909341SAndroid Build Coastguard Worker    movzx               r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
    ; m7 is the pmulhrsw multiplier used by the .v_w* paths; with a value of
    ; 8192 (per the symbol name) pmulhrsw yields (x + 2) >> 2.
3450*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]
    ; m8-m11 = the four byte pairs of the selected 8-byte filter row.
3451*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+0]
3452*c0909341SAndroid Build Coastguard Worker    add                  r5, r7
3453*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [myq+2]
3454*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3455*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [myq+4]
    ; Start three rows above the current source row.
3456*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q
3457*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [myq+6]
3458*c0909341SAndroid Build Coastguard Worker    jmp                  r5
; .v_w4: vertical-only 8-tap filtering for 4-pixel wide blocks.
; Uses the coefficient pairs in m8-m11 and the pmulhrsw multiplier in m7 set
; up at .v. Each loop iteration stores 32 bytes (4 rows of 4 16-bit results)
; to tmpq and consumes 4 from h.
3459*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Prologue: gather rows 0-6, 4 bytes each, and interleave them into the
    ; row-pair vectors ymm1 and ymm2.
3460*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [srcq+strideq*0]
3461*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm1, [srcq+strideq*2]
3462*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm2, [srcq+strideq*1]
3463*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm3, [srcq+stride3q ]
3464*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3465*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm0, 0x01       ; 0 2 2 _   2 _ _ _
3466*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm2, 0x03       ; 1 1 3 3   3 3 _ _
3467*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm0, [srcq+strideq*0]
3468*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm2, [srcq+strideq*1]
3469*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm0, 0x68       ; 0 2 2 4   2 4 4 _
3470*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm0, [srcq+strideq*2]
3471*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm5, [deint_shuf4]
3472*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm2, 0xc0       ; 1 1 3 3   3 3 5 5
3473*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3   2 3 4 5
3474*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm1, 0xaa       ; 1 2 3 4   3 4 5 _
3475*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm1, ymm2, ymm3       ; 01  12    23  34
3476*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, 0x80       ; 1 2 3 4   3 4 5 6
3477*c0909341SAndroid Build Coastguard Worker    punpckhbw          ymm2, ymm3             ; 23  34    45  56
    ; Main loop: ymm0 enters holding the newest row already loaded; four
    ; further rows are fetched and shuffled into row pairs via ymm5.
3478*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
3479*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [srcq+stride3q ], 1
3480*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3481*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm3, [srcq+strideq*0]
3482*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm4, [srcq+strideq*1]
3483*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm4, 0x20       ; _ _ 8 _   8 9 _ _
3484*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, 0x03       ; 6 7 8 _   8 9 _ _
3485*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm0, [srcq+strideq*2]
3486*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm3, ymm0, 0x40       ; 6 7 8 _   8 9 a _
3487*c0909341SAndroid Build Coastguard Worker    pshufb             ymm3, ymm5             ; 67  78    89  9a
    ; Sum the four coefficient-pair products (ym8..ym11), then scale with
    ; rounding through pmulhrsw by ym7.
3488*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm4, ymm1, ym8
3489*c0909341SAndroid Build Coastguard Worker    vperm2i128         ymm1, ymm2, ymm3, 0x21 ; 45  56    67  78
3490*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm2, ym9
3491*c0909341SAndroid Build Coastguard Worker    paddw              ymm4, ymm2
3492*c0909341SAndroid Build Coastguard Worker    mova               ymm2, ymm3
3493*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm3, ym11
3494*c0909341SAndroid Build Coastguard Worker    paddw              ymm3, ymm4
3495*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm4, ymm1, ym10
3496*c0909341SAndroid Build Coastguard Worker    paddw              ymm3, ymm4
3497*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm3, ym7
3498*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ymm3
3499*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3500*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3501*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
3502*c0909341SAndroid Build Coastguard Worker    vzeroupper
3503*c0909341SAndroid Build Coastguard Worker    RET
3504*c0909341SAndroid Build Coastguard Worker.v_w8:
; Row-filter path for rows of 8 source bytes (movq / qword broadcasts).
; Each loop iteration consumes four more source rows (sub hd, 4) and writes
; one 64-byte vector to tmpq. Every output is the sum of four pmaddubsw
; products (multiplicands m8-m11) of interleaved row pairs, followed by
; pmulhrsw with m7.
; NOTE(review): m7 and m8-m11 are loaded before this label, outside the
; visible chunk; presumably the rounding multiplier and the four pairs of
; vertical filter taps -- confirm against the function prologue.
3505*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_v_perm8] ; vpermb index used for every row interleave below
3506*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*0]
3507*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x3e
3508*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+strideq*1]
3509*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym3, [srcq+strideq*2]
3510*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r6d ; k1 = 0x3e: qword lanes 1-5 are written, lanes 0, 6 and 7 are kept
3511*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym4, [srcq+stride3q ]
3512*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3513*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq      m1{k1}, m3, [srcq+strideq*0] {1to8}
3514*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq      m2{k1}, m4, [srcq+strideq*1] {1to8}
3515*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*2]
3516*c0909341SAndroid Build Coastguard Worker    kshiftlb             k2, k1, 2 ; k2 = (0x3e << 2) & 0xff = 0xf8, used inside the loop
3517*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m2, 0x30      ; 0 1   2 3   4 5
3518*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m1, m0, q0021 ; 2 3   4 5   6 _
3519*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m1        ; 01 12 23 34
3520*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m2        ; 23 34 45 56
3521*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
; On entry: m1 and m2 hold the interleaved row pairs from the previous step,
; and the low qword of m0 holds the most recently loaded row.
3522*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym3, [srcq+strideq*4] ; read before srcq advances; this is [srcq+strideq*0] after the lea
3523*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq     ym0{k1}, ym3, [srcq+stride3q] {1to4}
3524*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3525*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+strideq*2]
3526*c0909341SAndroid Build Coastguard Worker    vpunpcklqdq      m0{k2}, m3, [srcq+strideq*1] {1to8}
3527*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m8        ; a0 b0 c0 d0
3528*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
3529*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m2, m9        ; a1 b1 c1 d1
3530*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m6, m0        ; 67 78 89 9a
3531*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm3 ; carry the row loaded from [srcq+strideq*2] into the next iteration
3532*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
3533*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m11       ; a3 b3 c3 d3
3534*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
3535*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, m10       ; a2 b2 c2 d2
3536*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
3537*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
3538*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
3539*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
3540*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3541*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3542*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
3543*c0909341SAndroid Build Coastguard Worker    RET
3544*c0909341SAndroid Build Coastguard Worker.v_w16:
; Row-filter path for rows of 16 source bytes (each row is read with a
; 128-bit broadcast load). Each loop iteration consumes four more source rows
; (sub hd, 4) and writes two 64-byte vectors to tmpq. Both outputs are sums of
; four pmaddubsw products (multiplicands m8-m11) followed by pmulhrsw with m7.
; NOTE(review): m7 and m8-m11 are loaded before this label, outside the
; visible chunk; presumably the rounding multiplier and the four pairs of
; vertical filter taps -- confirm against the function prologue.
3545*c0909341SAndroid Build Coastguard Worker    mova                m12, [spel_v_perm16b] ; vpermb index used for every row interleave below
3546*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [srcq+strideq*0]
3547*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x0f
3548*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [srcq+strideq*1]
3549*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [srcq+strideq*2]
3550*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r6d ; k1 = 0x0f: only the low four qword lanes are written by the masked vshufpd
3551*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym5, [srcq+stride3q ]
3552*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3553*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [srcq+strideq*0]
3554*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym6, [srcq+strideq*1]
3555*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+strideq*2]
3556*c0909341SAndroid Build Coastguard Worker    vshufpd          m1{k1}, m4, m2, 0xcc
3557*c0909341SAndroid Build Coastguard Worker    vshufpd          m2{k1}, m5, m3, 0xcc
3558*c0909341SAndroid Build Coastguard Worker    vshufpd          m3{k1}, m6, m0, 0xcc
3559*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m12, m1 ; 01 12
3560*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m12, m2 ; 23 34
3561*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m12, m3 ; 45 56
3562*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
; On entry: m1, m2 and m3 hold the interleaved row pairs from the previous
; step, and m0 holds the broadcast of the most recently loaded row.
3563*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m8  ; a0 b0
3564*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3565*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m2, m9  ; a1 b1
3566*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym6, [srcq+stride3q ]
3567*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m2, m8  ; c0 d0
3568*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3569*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m3, m9  ; c1 d1
3570*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [srcq+strideq*0]
3571*c0909341SAndroid Build Coastguard Worker    vshufpd          m0{k1}, m6, m3, 0xcc
3572*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym6, [srcq+strideq*1]
3573*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m12, m0 ; 67 78
3574*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq+strideq*2] ; newest row; stays in m0 for the next iteration
3575*c0909341SAndroid Build Coastguard Worker    vshufpd          m3{k1}, m6, m0, 0xcc
3576*c0909341SAndroid Build Coastguard Worker    paddw                m4, m13
3577*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m1, m10 ; a2 b2
3578*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m12, m3 ; 89 9a
3579*c0909341SAndroid Build Coastguard Worker    paddw                m5, m14
3580*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m2, m10 ; c2 d2
3581*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m2, m11 ; a3 b3
3582*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m3, m11 ; c3 d3
3583*c0909341SAndroid Build Coastguard Worker    paddw                m4, m13
3584*c0909341SAndroid Build Coastguard Worker    paddw                m5, m14
3585*c0909341SAndroid Build Coastguard Worker    paddw                m4, m15
3586*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
3587*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
3588*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7
3589*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], m4
3590*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+64], m5
3591*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3592*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3593*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
3594*c0909341SAndroid Build Coastguard Worker    RET
3595*c0909341SAndroid Build Coastguard Worker.v_w32:
; Row-filter path for rows of 32 source bytes (each row is read with a 256-bit
; movu). Seven rows are loaded up front; each loop iteration then consumes two
; more rows (sub hd, 2) and writes two 64-byte vectors to tmpq. Both outputs
; are sums of four pmaddubsw products (multiplicands m8-m11) followed by
; pmulhrsw with m7.
; NOTE(review): m7 and m8-m11 are loaded before this label, outside the
; visible chunk; presumably the rounding multiplier and the four pairs of
; vertical filter taps -- confirm against the function prologue.
3596*c0909341SAndroid Build Coastguard Worker    movshdup            m21, [bilin_v_perm64] ; odd dwords of the table, duplicated; index operand of every vpermt2q below
3597*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+strideq*0]
3598*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+strideq*1]
3599*c0909341SAndroid Build Coastguard Worker    movu               ym18, [srcq+strideq*2]
3600*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q ; advance three rows
3601*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+strideq*0]
3602*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m21, m19  ; 0 3
3603*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+strideq*1]
3604*c0909341SAndroid Build Coastguard Worker    vpermt2q            m17, m21, m20  ; 1 4
3605*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+strideq*2]
3606*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q ; advance three more rows
3607*c0909341SAndroid Build Coastguard Worker    vpermt2q            m18, m21, m20  ; 2 5
3608*c0909341SAndroid Build Coastguard Worker    movu               ym20, [srcq+strideq*0]
3609*c0909341SAndroid Build Coastguard Worker    vpermt2q            m19, m21, m20  ; 3 6
3610*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m16, m17  ; 01
3611*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m17, m18  ; 12
3612*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m18, m19  ; 23
3613*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m16, m17  ; 34
3614*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m17, m18  ; 45
3615*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m18, m19  ; 56
3616*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
; On entry: m0-m5 hold the six interleaved row pairs and m19 holds the row
; data carried over from the previous step.
3617*c0909341SAndroid Build Coastguard Worker    movu               ym16, [srcq+strideq*1]
3618*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3619*c0909341SAndroid Build Coastguard Worker    movu               ym17, [srcq+strideq*0]
3620*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m0, m8
3621*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
3622*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m1, m8
3623*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3624*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
3625*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m21, m17  ; 7 8
3626*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9
3627*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m4, m10
3628*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m5, m10
3629*c0909341SAndroid Build Coastguard Worker    shufpd              m19, m16, 0x55 ; 6 7
3630*c0909341SAndroid Build Coastguard Worker    paddw               m14, m2
3631*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3632*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m19, m16  ; 67
3633*c0909341SAndroid Build Coastguard Worker    paddw               m15, m3
3634*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3635*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m19, m16  ; 78
3636*c0909341SAndroid Build Coastguard Worker    paddw               m14, m12
3637*c0909341SAndroid Build Coastguard Worker    paddw               m15, m13
3638*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m4, m11
3639*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m5, m11
3640*c0909341SAndroid Build Coastguard Worker    mova                m19, m16 ; carry the two newest rows into the next iteration
3641*c0909341SAndroid Build Coastguard Worker    paddw               m14, m12
3642*c0909341SAndroid Build Coastguard Worker    paddw               m15, m13
3643*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7
3644*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m7
3645*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+ 0], m14
3646*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+64], m15
3647*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3648*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3649*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
3650*c0909341SAndroid Build Coastguard Worker    vzeroupper
3651*c0909341SAndroid Build Coastguard Worker    RET
3652*c0909341SAndroid Build Coastguard Worker.v_w64:
3653*c0909341SAndroid Build Coastguard Worker.v_w128:
; Shared row-filter path for both labels. The block is processed in column
; strips of 64 source bytes: the outer loop (.v_loop0) reloads seven rows for
; the current strip, the inner loop (.v_loop) then consumes two more rows per
; iteration and writes four 64-byte vectors (two per output row). After a
; strip, srcq advances by 64 bytes and tmpq by 128 bytes.
; Register use visible here:
;   wq  - doubled on entry; used as the tmp row offset in the stores
;   r5  - source row pointer within the current strip
;   r7  - tmp write pointer within the current strip
;   r6d - h + 2*w; decremented by 1<<8 per strip, low byte reloaded into hd
; NOTE(review): m7 and m8-m11 are loaded before this label, outside the
; visible chunk; presumably the rounding multiplier and the four pairs of
; vertical filter taps -- confirm against the function prologue.
3654*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      24
3655*c0909341SAndroid Build Coastguard Worker    mova                m23, [bilin_v_perm64] ; qword permute index applied to every row load (vpermq)
3656*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
3657*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq]
3658*c0909341SAndroid Build Coastguard Worker.v_loop0:
3659*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m23, [srcq+strideq*0]
3660*c0909341SAndroid Build Coastguard Worker    vpermq              m13, m23, [srcq+strideq*1]
3661*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+strideq*2]
3662*c0909341SAndroid Build Coastguard Worker    vpermq              m14, m23, [r5  +strideq*0]
3663*c0909341SAndroid Build Coastguard Worker    vpermq              m15, m23, [r5  +strideq*1]
3664*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3665*c0909341SAndroid Build Coastguard Worker    vpermq              m16, m23, [r5  +strideq*0]
3666*c0909341SAndroid Build Coastguard Worker    vpermq              m17, m23, [r5  +strideq*1]
3667*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3668*c0909341SAndroid Build Coastguard Worker    vpermq              m18, m23, [r5  +strideq*0]
3669*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
; Low-half interleaves go to m0-m5; the matching high halves reuse m12-m17.
3670*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m12, m13 ; 01
3671*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m13
3672*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m13, m14 ; 12
3673*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m14
3674*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m14, m15 ; 23
3675*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m15
3676*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m15, m16 ; 34
3677*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m16
3678*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m16, m17 ; 45
3679*c0909341SAndroid Build Coastguard Worker    punpckhbw           m16, m17
3680*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m17, m18 ; 56
3681*c0909341SAndroid Build Coastguard Worker    punpckhbw           m17, m18
3682*c0909341SAndroid Build Coastguard Worker.v_loop:
; Accumulators: m19/m20 = low/high halves of the first output row,
; m21/m22 = low/high halves of the second output row.
3683*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m19, m0, m8   ; a0
3684*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m23, [r5+strideq*1]
3685*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m20, m12, m8
3686*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
3687*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9       ; a1
3688*c0909341SAndroid Build Coastguard Worker    mova                m12, m14
3689*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
3690*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3691*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m21, m1, m8   ; b0
3692*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m22, m13, m8
3693*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3694*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9       ; b1
3695*c0909341SAndroid Build Coastguard Worker    mova                m13, m15
3696*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m9
3697*c0909341SAndroid Build Coastguard Worker    paddw               m19, m2
3698*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3699*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m10      ; a2
3700*c0909341SAndroid Build Coastguard Worker    paddw               m20, m14
3701*c0909341SAndroid Build Coastguard Worker    mova                m14, m16
3702*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m16, m10
3703*c0909341SAndroid Build Coastguard Worker    paddw               m21, m3
3704*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3705*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m10      ; b2
3706*c0909341SAndroid Build Coastguard Worker    paddw               m22, m15
3707*c0909341SAndroid Build Coastguard Worker    mova                m15, m17
3708*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m10
3709*c0909341SAndroid Build Coastguard Worker    paddw               m19, m4
3710*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m18, m6  ; 67
3711*c0909341SAndroid Build Coastguard Worker    paddw               m20, m16
3712*c0909341SAndroid Build Coastguard Worker    punpckhbw           m16, m18, m6
3713*c0909341SAndroid Build Coastguard Worker    vpermq              m18, m23, [r5+strideq*0] ; newest row; stays in m18 for the next iteration
3714*c0909341SAndroid Build Coastguard Worker    paddw               m21, m5
3715*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m4, m11  ; a3
3716*c0909341SAndroid Build Coastguard Worker    paddw               m22, m17
3717*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m17, m16, m11
3718*c0909341SAndroid Build Coastguard Worker    paddw               m19, m5
3719*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m18  ; 78
3720*c0909341SAndroid Build Coastguard Worker    paddw               m20, m17
3721*c0909341SAndroid Build Coastguard Worker    punpckhbw           m17, m6, m18
3722*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m5, m11  ; b3
3723*c0909341SAndroid Build Coastguard Worker    paddw               m21, m6
3724*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m17, m11
3725*c0909341SAndroid Build Coastguard Worker    paddw               m22, m6
3726*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m19, m20, m21, m22
3727*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*0+ 0], m19
3728*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*0+64], m20
3729*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*1+ 0], m21
3730*c0909341SAndroid Build Coastguard Worker    mova       [r7+wq*1+64], m22
3731*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*2]
3732*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3733*c0909341SAndroid Build Coastguard Worker    jg .v_loop
3734*c0909341SAndroid Build Coastguard Worker    add                srcq, 64 ; next 64-byte column strip of the source
3735*c0909341SAndroid Build Coastguard Worker    add                tmpq, 128 ; matching 128-byte offset in tmp
3736*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; reload the row counter from the low byte of r6d
3737*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one strip done; loop while the remainder is still positive
3738*c0909341SAndroid Build Coastguard Worker    jg .v_loop0
3739*c0909341SAndroid Build Coastguard Worker    RET
3740*c0909341SAndroid Build Coastguard Worker.h:
; Entry for the paths that filter along a row. If any of bits 8-11 of myd is
; set, control goes to .hv instead; otherwise it falls through to .h2.
; NOTE(review): presumably those bits carry the vertical subpel fraction, so
; .h2 is the horizontal-only case -- confirm against the function prologue.
3741*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
3742*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3743*c0909341SAndroid Build Coastguard Worker    jnz .hv
3744*c0909341SAndroid Build Coastguard Worker.h2:
; w == 4 is handled directly by .h_w4. Every other width is dispatched through
; a table of 16-bit offsets indexed by tzcnt(w); the offset is added to r7 and
; jumped to. Before the jump, m4 holds pd_2, m8/m9 hold the two dword halves
; of the selected 8-byte subpel_filters entry, and srcq is moved back 3 bytes.
; NOTE(review): r7 and `base` are set up outside the visible chunk; the
; dispatch targets are presumably the .h_w8 ... .h_w128 labels below.
3745*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pd_2]
3746*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3747*c0909341SAndroid Build Coastguard Worker    je .h_w4
3748*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
3749*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; filter table index = mxd >> 16
3750*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
3751*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
3752*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+subpel_filters+mxq*8+0] ; bytes 0-3 of the table entry
3753*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+subpel_filters+mxq*8+4] ; bytes 4-7 of the table entry
3754*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
3755*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3756*c0909341SAndroid Build Coastguard Worker.h_w4:
; Row-direction filter for w == 4, reached from .h2 with m4 = pd_2.
; Uses only a 4-byte slice (bytes 2-5) of the 8-byte subpel_filters entry,
; indexed by the low byte of mxd, and moves srcq back by one byte.
; Each loop iteration reads four rows of 8 bytes and writes 32 bytes to tmpq.
3757*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; filter table index = low byte of mxd
3758*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      ym5, [subpel_h_shufA]
3759*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x4
3760*c0909341SAndroid Build Coastguard Worker    dec                srcq
3761*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym6, [base+subpel_filters+mxq*8+2]
3762*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d ; k1 = 0x4: write only qword lane 2 (low qword of the upper 128 bits)
3763*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3764*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
3765*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+strideq*0]
3766*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+strideq*1]
3767*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym2{k1}, [srcq+strideq*2] ; third row joins the first in ym2
3768*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym3{k1}, [srcq+stride3q ] ; fourth row joins the second in ym3
3769*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3770*c0909341SAndroid Build Coastguard Worker    pshufb              ym2, ym5
3771*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym5
3772*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym4 ; start each accumulator from the pd_2 constant
3773*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym2, ym6
3774*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym4
3775*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym3, ym6
3776*c0909341SAndroid Build Coastguard Worker    packssdw            ym0, ym1
3777*c0909341SAndroid Build Coastguard Worker    psraw               ym0, 2
3778*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], ym0
3779*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3780*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3781*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
3782*c0909341SAndroid Build Coastguard Worker    RET
3783*c0909341SAndroid Build Coastguard Worker.h_w8:
; Row-direction filter using the three byte shuffles subpel_h_shufA/B/C.
; Each loop iteration loads four rows of 16 bytes into the four 128-bit lanes
; of m3 and writes one 64-byte vector to tmpq. Two dword accumulators are
; built, both starting from m4:
;   m0 = m4 + dot(shufA(row), m8) + dot(shufB(row), m9)
;   m1 = m4 + dot(shufB(row), m8) + dot(shufC(row), m9)
; They are then packed to words with signed saturation and shifted right by 2.
; NOTE(review): m4 (pd_2) and m8/m9 (filter dwords) are what .h2 loads before
; its table jump; presumably this label is only reached from there.
3784*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [subpel_h_shufA]
3785*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufB]
3786*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufC]
3787*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3788*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
3789*c0909341SAndroid Build Coastguard Worker    movu               xmm3, [srcq+strideq*0]
3790*c0909341SAndroid Build Coastguard Worker    vinserti128         ym3, ymm3, [srcq+strideq*1], 1
3791*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+strideq*2], 2
3792*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+stride3q ], 3
3793*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3794*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m5
3795*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m6
3796*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
3797*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m1, m8
3798*c0909341SAndroid Build Coastguard Worker    mova                 m1, m4
3799*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m8
3800*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
3801*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m9
3802*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m9
3803*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3804*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3805*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3806*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
3807*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3808*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
3809*c0909341SAndroid Build Coastguard Worker    RET
3810*c0909341SAndroid Build Coastguard Worker.h_w16:
; Row-direction filter, four rows per loop iteration. Rows are loaded as
; 32 bytes each: rows 0 and 1 fill the low/high halves of m0, rows 2 and 3
; fill the low/high halves of m1. tmpq advances by 128 bytes per iteration.
; NOTE(review): PREP_8TAP_H is a macro defined outside the visible chunk;
; presumably it filters m0/m1 using m4-m9 and stores the result at tmpq.
3811*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_perm16]
3812*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pb_4]
3813*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3814*c0909341SAndroid Build Coastguard Worker    paddb                m6, m7, m5 ; m6 = m5 + pb_4 in every byte
3815*c0909341SAndroid Build Coastguard Worker    paddb                m7, m6 ; m7 = m6 + pb_4 in every byte
3816*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
3817*c0909341SAndroid Build Coastguard Worker    movu                ym0, [srcq+strideq*0]
3818*c0909341SAndroid Build Coastguard Worker    movu                ym1, [srcq+strideq*2]
3819*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [srcq+strideq*1], 1
3820*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [srcq+stride3q ], 1
3821*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3822*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3823*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3824*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3825*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
3826*c0909341SAndroid Build Coastguard Worker    RET
3827*c0909341SAndroid Build Coastguard Worker.h_w32:
; Row-direction filter, two rows per loop iteration. Each row is loaded as a
; full 64-byte vector (m0 = first row, m1 = second row). tmpq advances by
; 128 bytes per iteration.
; NOTE(review): PREP_8TAP_H is a macro defined outside the visible chunk;
; presumably it filters m0/m1 using m4-m9 and stores the result at tmpq.
3828*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_perm32]
3829*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pb_4]
3830*c0909341SAndroid Build Coastguard Worker    paddb                m6, m7, m5 ; m6 = m5 + pb_4 in every byte
3831*c0909341SAndroid Build Coastguard Worker    paddb                m7, m6 ; m7 = m6 + pb_4 in every byte
3832*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
3833*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
3834*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
3835*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3836*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3837*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3838*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3839*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop
3840*c0909341SAndroid Build Coastguard Worker    RET
3841*c0909341SAndroid Build Coastguard Worker.h_w64:
; Shared row-direction path for the two widest cases, one row per outer
; iteration. r6 is the starting column offset: 0 when entered via .h_w64 and
; -64 when entered via .h_w128. srcq is biased by -r6 so that the inner loop
; can count r5 upwards from r6 in steps of 64 and stop once it exceeds zero,
; giving one inner iteration for r6 = 0 and two for r6 = -64.
; Each inner iteration loads two overlapping 64-byte vectors, 32 bytes apart,
; and advances tmpq by 128 bytes.
; NOTE(review): PREP_8TAP_H is a macro defined outside the visible chunk;
; presumably it filters m0/m1 using m4-m9 and stores the result at tmpq.
3842*c0909341SAndroid Build Coastguard Worker    xor                 r6d, r6d
3843*c0909341SAndroid Build Coastguard Worker    jmp .h_start
3844*c0909341SAndroid Build Coastguard Worker.h_w128:
3845*c0909341SAndroid Build Coastguard Worker    mov                  r6, -64*1
3846*c0909341SAndroid Build Coastguard Worker.h_start:
3847*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_perm32]
3848*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pb_4]
3849*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; bias srcq so that [srcq+r6] is the start of the row
3850*c0909341SAndroid Build Coastguard Worker    paddb                m6, m7, m5 ; m6 = m5 + pb_4 in every byte
3851*c0909341SAndroid Build Coastguard Worker    paddb                m7, m6 ; m7 = m6 + pb_4 in every byte
3852*c0909341SAndroid Build Coastguard Worker.h_loop0:
3853*c0909341SAndroid Build Coastguard Worker    mov                  r5, r6 ; restart the column offset for this row
3854*c0909341SAndroid Build Coastguard Worker.h_loop:
3855*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r5+32*0]
3856*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r5+32*1]
3857*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3858*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
3859*c0909341SAndroid Build Coastguard Worker    add                  r5, 64
3860*c0909341SAndroid Build Coastguard Worker    jle .h_loop
3861*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
3862*c0909341SAndroid Build Coastguard Worker    dec                  hd
3863*c0909341SAndroid Build Coastguard Worker    jg .h_loop0
3864*c0909341SAndroid Build Coastguard Worker    RET
3865*c0909341SAndroid Build Coastguard Worker.hv:
3866*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
3867*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pd_2]
3868*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_32]
3869*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3870*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
3871*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
3872*c0909341SAndroid Build Coastguard Worker    dec                srcq
3873*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+subpel_filters+mxq*8+2]
3874*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3875*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3876*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3877*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3878*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+myq*8]
3879*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3880*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q
3881*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x04
3882*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
3883*c0909341SAndroid Build Coastguard Worker    kshiftlb             k2, k1, 2
3884*c0909341SAndroid Build Coastguard Worker    kshiftlb             k3, k1, 4
3885*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [subpel_h_shufA]
3886*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3887*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3888*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000
3889*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111
3890*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222
3891*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q3333
3892*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+strideq*0]
3893*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        ym2, [srcq+strideq*1]
3894*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym3{k1}, [srcq+strideq*2]
3895*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m2{k2}, [srcq+stride3q ]
3896*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3897*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m3{k2}, [srcq+strideq*0]
3898*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m2{k3}, [srcq+strideq*1]
3899*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m3{k3}, [srcq+strideq*2]
3900*c0909341SAndroid Build Coastguard Worker    mova                 m6, [spel_hv_perm4a]
3901*c0909341SAndroid Build Coastguard Worker    movu                 m7, [spel_hv_perm4b]
3902*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
3903*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
3904*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10
3905*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m10
3906*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m2, m11
3907*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m3, m11
3908*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1        ; _ 0  1 2  3 4  5 6
3909*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3910*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m0    ; 01 12 23 34
3911*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m7, m0    ; 23 34 45 56
3912*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3913*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+stride3q ]
3914*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3915*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+strideq*0]
3916*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym3{k1}, [srcq+strideq*1]
3917*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    ym4{k1}, [srcq+strideq*2]
3918*c0909341SAndroid Build Coastguard Worker    mova                 m5, m9
3919*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym10
3920*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m1, m12   ; a0 b0 c0 d0
3921*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym8
3922*c0909341SAndroid Build Coastguard Worker    pshufb              ym4, ym10
3923*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym1, ym3, ym11
3924*c0909341SAndroid Build Coastguard Worker    mova                ym3, ym8
3925*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym3, ym4, ym11
3926*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m2, m13   ; a1 b1 c1 d1
3927*c0909341SAndroid Build Coastguard Worker    packssdw            ym1, ym3       ; 7 8  9 a
3928*c0909341SAndroid Build Coastguard Worker    psraw               ym1, 2
3929*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m1, q1032 ; _ 4  5 6  7 8  9 a
3930*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m0    ; 45 56 67 78
3931*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m7, m0    ; 67 78 89 9a
3932*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m1, m14   ; a2 b2 c2 d2
3933*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m2, m15   ; a3 b3 c3 d3
3934*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3935*c0909341SAndroid Build Coastguard Worker    vpmovdw          [tmpq], m5
3936*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3937*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3938*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
3939*c0909341SAndroid Build Coastguard Worker    RET
3940*c0909341SAndroid Build Coastguard Worker.hv_w8:
3941*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3942*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
3943*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
3944*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
3945*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3946*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3947*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3948*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3949*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+myq*8]
3950*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3951*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q
3952*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3953*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3954*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000
3955*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111
3956*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222
3957*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q3333
3958*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3959*c0909341SAndroid Build Coastguard Worker    jg .hv_w16
3960*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m17, [srcq+stride3q ]
3961*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, m17, [srcq+strideq*0], 0
3962*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m19, [subpel_h_shufA]
3963*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, [srcq+strideq*1], 1
3964*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m21, [subpel_h_shufC]
3965*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m16, [srcq+strideq*2], 2
3966*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3967*c0909341SAndroid Build Coastguard Worker    vinserti128        ym17, [srcq+strideq*0], 1
3968*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [subpel_h_shufB]
3969*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m17, [srcq+strideq*1], 2
3970*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m17, [srcq+strideq*2], 3
3971*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m16, m19      ; 0 1 2 3   0123
3972*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
3973*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m16, m21      ; 0 1 2 3   89ab
3974*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m3, m10
3975*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
3976*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m17, m19      ; 3 4 5 6   0123
3977*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m0, m11
3978*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
3979*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m17, m21      ; 3 4 5 6   89ab
3980*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m1, m10
3981*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
3982*c0909341SAndroid Build Coastguard Worker    pshufb              m16, m20           ; 0 1 2 3   4567
3983*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m4, m11
3984*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m20           ; 3 4 5 6   4567
3985*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m16, m11
3986*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m16, m10
3987*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m17, m11
3988*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m17, m10
3989*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3990*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3991*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2             ; 0 1 2 3
3992*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2             ; 3 4 5 6
3993*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m2, m0, q2132 ; 2 3 4 5
3994*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m2, m0, q1021 ; 1 2 3 4
3995*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0        ; 23 34 45 56
3996*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
3997*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m5        ; 01 12 23 34
3998*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m5
3999*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
4000*c0909341SAndroid Build Coastguard Worker    movu               xm18, [srcq+stride3q ]
4001*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
4002*c0909341SAndroid Build Coastguard Worker    vinserti128        ym18, [srcq+strideq*0], 1
4003*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m18, [srcq+strideq*1], 2
4004*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m18, [srcq+strideq*2], 3
4005*c0909341SAndroid Build Coastguard Worker    pshufb              m17, m18, m19      ; 7 8 9 a   0123
4006*c0909341SAndroid Build Coastguard Worker    mova                m16, m8
4007*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m18, m21      ; 7 8 9 a   89ab
4008*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m16, m17, m10
4009*c0909341SAndroid Build Coastguard Worker    mova                m17, m8
4010*c0909341SAndroid Build Coastguard Worker    pshufb              m18, m20           ; 7 8 9 a   4567
4011*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m17, m5, m11
4012*c0909341SAndroid Build Coastguard Worker    mova                 m5, m9
4013*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m3, m13       ; a1 b1 c1 d1
4014*c0909341SAndroid Build Coastguard Worker    mova                 m6, m9
4015*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m4, m13
4016*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m16, m18, m11
4017*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m17, m18, m10
4018*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m1, m12       ; a0 b0 c0 d0
4019*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4020*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m2, m12
4021*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4022*c0909341SAndroid Build Coastguard Worker    packssdw            m16, m17
4023*c0909341SAndroid Build Coastguard Worker    psraw               m16, 2             ; 7 8 9 a
4024*c0909341SAndroid Build Coastguard Worker    valignq              m4, m16, m0, 6    ; 6 7 8 9
4025*c0909341SAndroid Build Coastguard Worker    mova                 m0, m16
4026*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m16       ; 67 78 89 9a
4027*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m16
4028*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m3, m15       ; a3 b3 c3 d3
4029*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m4, m15
4030*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m3, q1032     ; 45 56 67 78
4031*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m4, q1032
4032*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m1, m14       ; a2 b2 c2 d2
4033*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m2, m14
4034*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
4035*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
4036*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
4037*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m5
4038*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64
4039*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4040*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
4041*c0909341SAndroid Build Coastguard Worker    vzeroupper
4042*c0909341SAndroid Build Coastguard Worker    RET
4043*c0909341SAndroid Build Coastguard Worker.hv_w16:
4044*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      23
4045*c0909341SAndroid Build Coastguard Worker    mova                m16, [spel_h_perm16]
4046*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [pb_4]
4047*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
4048*c0909341SAndroid Build Coastguard Worker    paddb               m17, m18, m16
4049*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq*8-256]
4050*c0909341SAndroid Build Coastguard Worker    paddb               m18, m17
4051*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
4052*c0909341SAndroid Build Coastguard Worker    movu               ym19, [srcq+strideq*0]
4053*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [srcq+strideq*1], 1
4054*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+strideq*2]
4055*c0909341SAndroid Build Coastguard Worker    movu               ym20, [r5  +strideq*0]
4056*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, [r5  +strideq*1], 1
4057*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5  +strideq*2]
4058*c0909341SAndroid Build Coastguard Worker    movu               ym21, [r5  +strideq*0]
4059*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m21, [r5  +strideq*1], 1
4060*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5  +strideq*2]
4061*c0909341SAndroid Build Coastguard Worker    movu               ym22, [r5  +strideq*0]
4062*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
4063*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m16, m19      ; 0 1   0123   89ab
4064*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
4065*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m18, m19      ; 0 1   89ab   ghij
4066*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m3, m10
4067*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
4068*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m16, m20      ; 2 3   0123   89ab
4069*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m4, m11
4070*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8
4071*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m18, m20      ; 2 3   89ab   ghij
4072*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m5, m10
4073*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
4074*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m16, m21      ; 4 5   0123   89ab
4075*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m6, m11
4076*c0909341SAndroid Build Coastguard Worker    mova                 m6, m8
4077*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m18, m21      ; 4 5   89ab   ghij
4078*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m7, m10
4079*c0909341SAndroid Build Coastguard Worker    mova                 m7, m8
4080*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym16, ym22    ; 6     0123   89ab
4081*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m7, m0, m11
4082*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym8
4083*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m17, m19      ; 0 1   4567   cdef
4084*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym1, ym10
4085*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym18, ym22    ; 6     89ab   ghij
4086*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m19, m11
4087*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m19, m10
4088*c0909341SAndroid Build Coastguard Worker    mova               ym19, ym8
4089*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m17, m20      ; 2 3   4567   cdef
4090*c0909341SAndroid Build Coastguard Worker    vpdpbusd           ym19, ym1, ym11
4091*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m17, m21      ; 4 5   4567   cdef
4092*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m20, m11
4093*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m20, m10
4094*c0909341SAndroid Build Coastguard Worker    vpermb             ym22, ym17, ym22    ; 6     4567   cdef
4095*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m6, m21, m11
4096*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m7, m21, m10
4097*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3            ; 0 1
4098*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym0, ym22, ym11
4099*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5            ; 2 3
4100*c0909341SAndroid Build Coastguard Worker    vpdpbusd           ym19, ym22, ym10
4101*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7            ; 4 5
4102*c0909341SAndroid Build Coastguard Worker    packssdw            ym0, ym19          ; 6
4103*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 2}, m2, m4, m6, ym0
4104*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m2, m4, q1032 ; 1 2
4105*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m4, m6, q1032 ; 3 4
4106*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m6, m0, q1032 ; 5 6
4107*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3  ; 01 12
4108*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3
4109*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m5  ; 23 34
4110*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5
4111*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m0  ; 45 56
4112*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0
4113*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
4114*c0909341SAndroid Build Coastguard Worker    movu               ym19, [r5+strideq*1]
4115*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
4116*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [r5+strideq*0], 1
4117*c0909341SAndroid Build Coastguard Worker    mova                m20, m9
4118*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m1, m12 ; a0
4119*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m16, m19
4120*c0909341SAndroid Build Coastguard Worker    mova                m21, m9
4121*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m2, m12 ; b0
4122*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m17, m19
4123*c0909341SAndroid Build Coastguard Worker    mova                m22, m8
4124*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m22, m1, m10
4125*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4126*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m18, m19
4127*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m2, m10
4128*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m3, m13 ; a1
4129*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m4, m13 ; b1
4130*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m22, m2, m11
4131*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4132*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m19, m11
4133*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
4134*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m5, m14 ; a2
4135*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m6, m14 ; b2
4136*c0909341SAndroid Build Coastguard Worker    packssdw            m22, m1
4137*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4138*c0909341SAndroid Build Coastguard Worker    psraw               m22, 2              ; 7 8
4139*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
4140*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m0, m22, q1032 ; 6 7
4141*c0909341SAndroid Build Coastguard Worker    mova                 m0, m22
4142*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m0  ; 67 78
4143*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0
4144*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m5, m15 ; a3
4145*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m21, m6, m15 ; b3
4146*c0909341SAndroid Build Coastguard Worker    psrad               m20, 6
4147*c0909341SAndroid Build Coastguard Worker    psrad               m21, 6
4148*c0909341SAndroid Build Coastguard Worker    packssdw            m20, m21
4149*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], ym20
4150*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r7+wq*1], m20, 1
4151*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*2]
4152*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4153*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
4154*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
4155*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
4156*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
4157*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
4158*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
4159*c0909341SAndroid Build Coastguard Worker    RET
4160*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8t_8bpc(tmp, ts, ...)
;
; Variant of the 8x8 affine warp that writes 16 bytes per row to tmp without
; narrowing the results to bytes. All filtering is delegated to the .main and
; .main2 routines of warp_affine_8x8_8bpc_avx512icl; each call returns one
; batch of dword sums in m16 covering two output rows.
;
; Register contract with .main/.main2 (as visible in that function):
;   m9   - value .main2 copies into m16 before accumulating (rounding bias)
;   m15  - byte permutation applied to m16 right before the store
;   r6d  - set to 0x5555 by .main; used here as the loop counter
;
; Loop count: 0x5555 - n*0x1800 stays positive for n = 1..3 and goes negative
; at n = 4, so .start executes four times (two rows each, eight rows total).
;
; NOTE(review): only tmp/ts are named here; the remaining arguments are picked
; up inside .main by register number / stack slot. Confirm against the C
; prototype before changing the argument list.
4161*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
4162*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_16384]
4163*c0909341SAndroid Build Coastguard Worker    mova               ym15, [warp_8x8t_end]
    ; First batch: .main performs the setup and then falls through into .main2.
4164*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
4165*c0909341SAndroid Build Coastguard Worker    jmp .start
4166*c0909341SAndroid Build Coastguard Worker.loop:
    ; Subsequent batches only need the per-row-pair part of the pipeline.
4167*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
    ; Step past the two rows stored by the previous pass (2 * tsq*2 bytes).
4168*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+tsq*4]
4169*c0909341SAndroid Build Coastguard Worker.start:
    ; Double every dword sum, then let the warp_8x8t_end permutation pick the
    ; bytes that are stored.
    ; NOTE(review): presumably this selects the upper 16 bits of each doubled
    ; dword (i.e. a rounded >> 15 given the pd_16384 bias) - the table contents
    ; are not visible in this part of the file, confirm there.
4170*c0909341SAndroid Build Coastguard Worker    paddd               m16, m16
4171*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m15, m16
    ; Low 128 bits -> first row, next 128 bits -> second row. The row pitch is
    ; tsq*2 bytes, so ts appears to be counted in 2-byte elements (TODO confirm).
4172*c0909341SAndroid Build Coastguard Worker    mova         [tmpq+tsq*0], xm16
4173*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmpq+tsq*2], ym16, 1
    ; r6d doubles as the iteration counter; see the loop-count note above.
4174*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 0x1800
4175*c0909341SAndroid Build Coastguard Worker    jg .loop
4176*c0909341SAndroid Build Coastguard Worker    RET
4177*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8_8bpc(dst, ds, src, ss, abcd, filter)
;
; 8x8 affine warp producing 8-bit pixels. The entry code is a thin store loop;
; the actual work lives in three local routines that are also called by
; warp_affine_8x8t_8bpc:
;   .main  - one-time setup, horizontal filtering of the first seven source
;            rows, then falls through into .main2
;   .main2 - horizontal filtering of two more source rows plus the vertical
;            filter; returns the dword sums for two output rows in m16
;   .h     - horizontal filter for two source rows; returns them in m0
;
; Long-lived registers after .main:
;   m8         pd_4 (initial value of the horizontal accumulator in .h)
;   m9         initial value of the vertical accumulator (set by the caller)
;   m10-m14    permutation / packing constants loaded from tables
;   m15        output permutation (set by the caller)
;   ym18/ym19  running horizontal filter position / per-row step (beta)
;   ym20/ym21  running vertical filter position / per-row step (delta)
;   m1-m3      sliding window of interleaved row pairs for the vertical filter
;   k1         0x5555 lane mask; k2/k3 gather masks (see .h)
;   r6d        0x5555, reused by the callers as loop counter: subtracting
;              0x1800 per pass gives four passes, i.e. eight output rows
;
; NOTE(review): the sixth name, "filter", is only meaningful after .main points
; filterq at the coefficient table; on non-WIN64 builds the same register is
; read as mx before that (add r5d, 512).
4178*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
    ; Going by its name pd_262144 is 1<<18, half of the 1<<19 divisor applied
    ; by the psrad in .start, i.e. a round-to-nearest bias.
4179*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_262144]
4180*c0909341SAndroid Build Coastguard Worker    mova               xm15, [warp_8x8_end]
4181*c0909341SAndroid Build Coastguard Worker    call .main
4182*c0909341SAndroid Build Coastguard Worker    jmp .start
4183*c0909341SAndroid Build Coastguard Worker.loop:
4184*c0909341SAndroid Build Coastguard Worker    call .main2
    ; Advance past the two rows written by the previous pass.
4185*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4186*c0909341SAndroid Build Coastguard Worker.start:
    ; Scale the dword sums down, saturate to unsigned bytes, and let the
    ; warp_8x8_end permutation gather the 16 result bytes into xm16.
4187*c0909341SAndroid Build Coastguard Worker    psrad               m16, 19
4188*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m16
4189*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m15, m16
    ; Low 8 bytes -> first row, high 8 bytes -> second row.
4190*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm16
4191*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm16
    ; r6d was set to 0x5555 in .main; it stays positive for three more passes.
4192*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 0x1800
4193*c0909341SAndroid Build Coastguard Worker    jg .loop
4194*c0909341SAndroid Build Coastguard Worker    RET
4195*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: setup + first seven source rows.
; In:   srcq/ssq, abcd/mx/my (register or stack, see below), m9
; Out:  m16 = dword sums for the first two output rows (via .main2)
; NOTE(review): the rNm stack references below use an index one higher than
; the argument's position in the prototype - presumably to compensate for the
; return address pushed by the call that enters .main. Confirm against x86inc
; before reusing this code from a non-called context.
4196*c0909341SAndroid Build Coastguard Worker.main:
4197*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [pd_512]
4198*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; abcd and mx live on the stack here: load the pointer, and fold the
    ; +512 bias into the broadcast of mx.
4199*c0909341SAndroid Build Coastguard Worker    mov               abcdq, r5mp
4200*c0909341SAndroid Build Coastguard Worker    vpaddd             ym18, ym1, r6m {1to8} ; mx
4201*c0909341SAndroid Build Coastguard Worker%else
    ; mx is still in r5d at this point (before r5 becomes filterq).
4202*c0909341SAndroid Build Coastguard Worker    add                 r5d, 512
4203*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym18, r5d
4204*c0909341SAndroid Build Coastguard Worker%endif
4205*c0909341SAndroid Build Coastguard Worker    vpaddd             ym20, ym1, r7m {1to8} ; my
    ; ym16 = per-column index (assumes pd_0to7 holds dwords 0..7 - confirm).
    ; ym19/ym21 each hold one dword of abcd, i.e. a pair of 16-bit values.
4206*c0909341SAndroid Build Coastguard Worker    mova               ym16, [pd_0to7]
4207*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym19, [abcdq+4*0]
4208*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym21, [abcdq+4*1]
4209*c0909341SAndroid Build Coastguard Worker    lea                  r4, [ssq*3+3]
4210*c0909341SAndroid Build Coastguard Worker    mova                m10, [warp_8x8_permA]
    ; 0x5555 serves two purposes: lane mask for k1 (every other dword) and
    ; the callers' loop counter.
4211*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5555
4212*c0909341SAndroid Build Coastguard Worker    mova                m11, [warp_8x8_permB]
    ; Gathers index 8-byte entries relative to this biased table pointer.
4213*c0909341SAndroid Build Coastguard Worker    lea             filterq, [mc_warp_filter+64*8]
4214*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m12, [warp_8x8_hpack]
4215*c0909341SAndroid Build Coastguard Worker    sub                srcq, r4               ; src -= src_stride*3 + 3
4216*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m13, [warp_8x8_permC]
    ; k2 = 0xff: enable all eight gather lanes.
4217*c0909341SAndroid Build Coastguard Worker    kxnorb               k2, k2, k2
4218*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m14, [warp_8x8_permD]
    ; Word-pair dot product against (index, 0): adds low_word * column to the
    ; biased mx / my, giving the per-column start positions.
4219*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym18, ym19, ym16       ; alpha
4220*c0909341SAndroid Build Coastguard Worker    vpdpwssd           ym20, ym21, ym16       ; gamma
    ; First source row, replicated into all four 128-bit lanes.
4221*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [srcq]
    ; Keep only the sign-extended high word of each pair: the per-row steps.
4222*c0909341SAndroid Build Coastguard Worker    psrad              ym19, 16               ; beta
4223*c0909341SAndroid Build Coastguard Worker    psrad              ym21, 16               ; delta
4224*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r6d
    ; Filter table index for row 0 = position >> 10.
4225*c0909341SAndroid Build Coastguard Worker    psrad              ym16, ym18, 10
    ; A gather clears its mask register as it completes, so keep a spare
    ; all-ones copy in k3 for the next one.
4226*c0909341SAndroid Build Coastguard Worker    kmovb                k3, k2
    ; Step the horizontal position to the next row.
4227*c0909341SAndroid Build Coastguard Worker    paddd              ym18, ym19
4228*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m2{k2}, [filterq+ym16*8] ; filter_x0
4229*c0909341SAndroid Build Coastguard Worker    psrld                m1, 8                ; pd_2
4230*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11
4231*c0909341SAndroid Build Coastguard Worker    paddd                m8, m1, m1           ; pd_4
    ; Row 0 horizontal filter: unsigned pixel bytes times signed coefficient
    ; bytes, four products per dword, accumulated on top of the pd_2 bias.
4232*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m0, m2
    ; Rows 1 and 2.
4233*c0909341SAndroid Build Coastguard Worker    call .h
    ; Reposition the row-0 sums inside each qword so that the funnel shift
    ; with m0 below produces the interleaved "01 12" row pairs.
    ; NOTE(review): exact bit layout depends on the .h output format
    ; (warp_8x8_hpack), which is not visible here.
4234*c0909341SAndroid Build Coastguard Worker    psllq                m2, m1, 45
4235*c0909341SAndroid Build Coastguard Worker    pslld                m1, 13
4236*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
4237*c0909341SAndroid Build Coastguard Worker    vpshrdq              m1, m0, 48           ; 01 12
    ; Rows 3 and 4.
4238*c0909341SAndroid Build Coastguard Worker    call .h
4239*c0909341SAndroid Build Coastguard Worker    vpshrdq              m2, m1, m0, 48       ; 23 34
    ; Rows 5 and 6.
4240*c0909341SAndroid Build Coastguard Worker    call .h
4241*c0909341SAndroid Build Coastguard Worker    vpshrdq              m3, m2, m0, 48       ; 45 56
; .main2: per-output-row-pair step (fallen into from .main for the first pair).
; In:   m1-m3 = row-pair window, ym20/ym21 = vertical position/step, m9 = bias,
;       k3 = all-ones gather mask
; Out:  m16 = dword sums for two output rows; m1-m3 advanced by two rows;
;       ym20 advanced by two steps
; Clobbers: m0, m4-m7, m17, k2 (plus everything .h clobbers)
4242*c0909341SAndroid Build Coastguard Worker.main2:
    ; Horizontally filter the next two source rows into m0.
4243*c0909341SAndroid Build Coastguard Worker    call .h
    ; Two consecutive vertical filter indices (position >> 10), stepping the
    ; position by delta after each. k2/k3 swap roles around every gather
    ; because the gather zeroes the mask it consumes.
4244*c0909341SAndroid Build Coastguard Worker    psrad              ym17, ym20, 10
4245*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k3
4246*c0909341SAndroid Build Coastguard Worker    paddd              ym20, ym21
4247*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m7{k3}, [filterq+ym17*8] ; filter_y0
4248*c0909341SAndroid Build Coastguard Worker    psrad              ym16, ym20, 10
4249*c0909341SAndroid Build Coastguard Worker    kmovb                k3, k2
4250*c0909341SAndroid Build Coastguard Worker    paddd              ym20, ym21
4251*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m17{k2}, [filterq+ym16*8] ; filter_y1
4252*c0909341SAndroid Build Coastguard Worker    shufps               m5, m7, m17, q2020   ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
    ; Start the vertical accumulator from the caller-provided bias.
4253*c0909341SAndroid Build Coastguard Worker    mova                m16, m9
4254*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5, m13          ;    a0    a1    A0    A1    b0    b1    B0    B1
4255*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m1, m4
4256*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14              ;    a2    a3    A2    A3    b2    b3    B2    B3
    ; Slide the row-pair window while the old values are still being consumed:
    ; m1 <- m2 here, m2 <- m3 a few lines below, m3 <- newest pair last.
4257*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4258*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m2, m5
4259*c0909341SAndroid Build Coastguard Worker    shufps               m5, m7, m17, q3131   ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
4260*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4261*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5, m13          ;    a4    a5    A4    A5    b4    b5    B4    B5
4262*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m3, m4
    ; Funnel-shift the freshly filtered rows from m0 into the window.
4263*c0909341SAndroid Build Coastguard Worker    vpshrdq              m3, m0, 48           ; 67 78
4264*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14              ;    a6    a7    A6    A7    b6    b7    B6    B7
4265*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m3, m5
4266*c0909341SAndroid Build Coastguard Worker    ret
4267*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .h: horizontal filter for the next two source rows.
; In:   srcq (last row already consumed), ssq, filterq, ym18/ym19 = horizontal
;       position/step, m8 = pd_4, m10/m11/m12 = permutation/packing constants,
;       k1 = 0x5555, k3 = all-ones gather mask
; Out:  m0 = packed results for both rows; srcq advanced by two rows;
;       ym18 advanced by two steps; k3 = all-ones again
; Clobbers: m4, m5, m6, m16, m17, k2
4268*c0909341SAndroid Build Coastguard Worker.h:
    ; Row n+1 into the low 128 bits, row n+2 into the next 128 bits.
4269*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*1]
4270*c0909341SAndroid Build Coastguard Worker    psrad              ym16, ym18, 10
4271*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4272*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym5, [srcq+ssq*0], 1
    ; Gather masks ping-pong: copy k3 -> k2, gather with k3 (which clears it),
    ; restore k3 from k2, gather with k2. k3 is therefore valid again on exit.
4273*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k3
4274*c0909341SAndroid Build Coastguard Worker    paddd              ym18, ym19
4275*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m6{k3}, [filterq+ym16*8] ; filter_x1
4276*c0909341SAndroid Build Coastguard Worker    psrad              ym17, ym18, 10
4277*c0909341SAndroid Build Coastguard Worker    kmovb                k3, k2
4278*c0909341SAndroid Build Coastguard Worker    paddd              ym18, ym19
4279*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m16{k2}, [filterq+ym17*8] ; filter_x2
    ; Accumulator starts at the pd_4 bias.
4280*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
4281*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m10, m5          ; a4 b0 a5 b1   a6 b2 a7 b3   a8 b4 a9 b5   aa b6 ab b7
4282*c0909341SAndroid Build Coastguard Worker    vpshldq             m17, m16, m6, 32      ; a4 a5 a6 a7   b0 b1 b2 b3
4283*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m4, m17
4284*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m11, m5          ; a0 b4 a1 b5   a2 b6 a3 b7   a4 b8 a5 b9   a6 ba a7 bb
    ; Merge the even dwords of filter_x1 into filter_x2 (k1 = 0x5555).
4285*c0909341SAndroid Build Coastguard Worker    vmovdqa32       m16{k1}, m6               ; a0 a1 a2 a3   b4 b5 b6 b7
4286*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m0, m5, m16
    ; Per-byte bit-field extraction controlled by warp_8x8_hpack (m12).
4287*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m0, m12, m0          ; 1 1 2 2 (>> 3)
4288*c0909341SAndroid Build Coastguard Worker    ret
4289*c0909341SAndroid Build Coastguard Worker
; BIDIR_FN op
; Shared dispatch-and-store tail used by the compound functions below
; (avg, w_avg, mask). It jumps to the width-specific label held in wq and
; then stores the packed 8-bit pixels produced by 'op' to dst.
;   op            name of a macro taking a source offset (in vectors) that
;                 leaves one vector of packed bytes in m0; a matching
;                 op_INC_PTR macro must exist to advance the input pointers.
;   expects       wq = address of one of the .wN labels, hd = block height,
;                 dstq/strideq = destination; stride3q is computed here.
;   register use  m6 and m7 are used by the store code and have to survive an
;                 expansion of op; k1 is overwritten on the w4/h16 path.
; Each path ends in RET, so the macro also terminates the calling function.
4290*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 1 ; op
4291*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4292*c0909341SAndroid Build Coastguard Worker    jmp                  wq                   ; dispatch on block width
4293*c0909341SAndroid Build Coastguard Worker.w4:
4294*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8                ; flags are consumed twice: jg here, jl further down
4295*c0909341SAndroid Build Coastguard Worker    jg .w4_h16                                ; h > 8: use the scatter path
4296*c0909341SAndroid Build Coastguard Worker    WRAP_YMM %1           0                   ; 32 pixels (8 rows of 4) are enough for h <= 8
4297*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4298*c0909341SAndroid Build Coastguard Worker    movd   [dstq          ], xm0
4299*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
4300*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
4301*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
4302*c0909341SAndroid Build Coastguard Worker    jl .w4_ret                                ; still the 'cmp hd, 8' flags (op must not touch EFLAGS): h < 8 -> done
4303*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4304*c0909341SAndroid Build Coastguard Worker    pextrd [dstq          ], xm0, 2
4305*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
4306*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
4307*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
4308*c0909341SAndroid Build Coastguard Worker.w4_ret:
4309*c0909341SAndroid Build Coastguard Worker    RET
4310*c0909341SAndroid Build Coastguard Worker.w4_h16:
4311*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, strided
4312*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [bidir_sctr_w4]  ; per-dword byte offsets = stride * table entry (table defined elsewhere)
4313*c0909341SAndroid Build Coastguard Worker    %1                    0                   ; full-width op: 64 pixels = 16 dwords
4314*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1           ; all 16 mask bits set
4315*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+m7]{k1}, m0             ; write all 16 dwords with one scatter
4316*c0909341SAndroid Build Coastguard Worker    RET
4317*c0909341SAndroid Build Coastguard Worker.w8:
4318*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4319*c0909341SAndroid Build Coastguard Worker    jne .w8_h8                                ; h != 4: process 8 rows per iteration
4320*c0909341SAndroid Build Coastguard Worker    WRAP_YMM %1           0                   ; h == 4: 32 pixels = 4 rows of 8
4321*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4322*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm0
4323*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4324*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
4325*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
4326*c0909341SAndroid Build Coastguard Worker    RET
4327*c0909341SAndroid Build Coastguard Worker.w8_loop:
4328*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2                   ; step past the two input vectors consumed by op
4329*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4330*c0909341SAndroid Build Coastguard Worker.w8_h8:
4331*c0909341SAndroid Build Coastguard Worker    %1                    0
4332*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4333*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
4334*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m0, 3
4335*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm0              ; low qwords of the four lanes -> rows 0-3
4336*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4337*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
4338*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm3
4339*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4340*c0909341SAndroid Build Coastguard Worker    movhps [dstq          ], xm0              ; high qwords of the four lanes -> rows 4-7
4341*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
4342*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
4343*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
4344*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
4345*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4346*c0909341SAndroid Build Coastguard Worker    RET
4347*c0909341SAndroid Build Coastguard Worker.w16_loop:
4348*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
4349*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4350*c0909341SAndroid Build Coastguard Worker.w16:
4351*c0909341SAndroid Build Coastguard Worker    %1                    0                   ; 64 pixels = 4 rows of 16
4352*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120        ; swap the middle qwords of each 256-bit half
4353*c0909341SAndroid Build Coastguard Worker    mova          [dstq          ], xm0       ; lane 0 -> row 0
4354*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2     ; lane 2 -> row 1
4355*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1    ; lane 1 -> row 2
4356*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3     ; lane 3 -> row 3
4357*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4358*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4359*c0909341SAndroid Build Coastguard Worker    RET
4360*c0909341SAndroid Build Coastguard Worker.w32:
4361*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m7, [pb_02461357]    ; qword permutation indices, zero-extended from bytes
4362*c0909341SAndroid Build Coastguard Worker.w32_loop:
4363*c0909341SAndroid Build Coastguard Worker    %1                    0                   ; 64 pixels = 2 rows of 32
4364*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
4365*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0           ; reorder qwords across the whole register
4366*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
4367*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
4368*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4369*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4370*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
4371*c0909341SAndroid Build Coastguard Worker    RET
4372*c0909341SAndroid Build Coastguard Worker.w64:
4373*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m7, [pb_02461357]
4374*c0909341SAndroid Build Coastguard Worker.w64_loop:
4375*c0909341SAndroid Build Coastguard Worker    %1                    0                   ; 64 pixels = one full row
4376*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
4377*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0
4378*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
4379*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4380*c0909341SAndroid Build Coastguard Worker    dec                  hd
4381*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
4382*c0909341SAndroid Build Coastguard Worker    RET
4383*c0909341SAndroid Build Coastguard Worker.w128:
4384*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m7, [pb_02461357]
4385*c0909341SAndroid Build Coastguard Worker.w128_loop:
4386*c0909341SAndroid Build Coastguard Worker    %1                    0                   ; left 64 pixels of the row
4387*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m7, m0           ; park the result in m6 while op runs again
4388*c0909341SAndroid Build Coastguard Worker    %1                    2                   ; right 64 pixels (inputs two vectors further on)
4389*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m6
4390*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            4                   ; one row consumed four input vectors
4391*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m7, m0
4392*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m6
4393*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4394*c0909341SAndroid Build Coastguard Worker    dec                  hd
4395*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
4396*c0909341SAndroid Build Coastguard Worker    RET
4397*c0909341SAndroid Build Coastguard Worker%endmacro
4398*c0909341SAndroid Build Coastguard Worker
; AVG src_offset
; Plain average of the two intermediate buffers, used as the 'op' of BIDIR_FN.
; Reads two consecutive vectors of 16-bit values from tmp1q and from tmp2q
; (starting src_offset vectors in), adds them, scales the sums with pmulhrsw
; by m4 and packs them to unsigned bytes.
;   in:       tmp1q, tmp2q, m4 = word multiplier for the rounding shift
;   out:      m0 = packed 8-bit pixels (one vector)
;   clobbers: m1
4399*c0909341SAndroid Build Coastguard Worker%macro AVG 1 ; src_offset
4400*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+(%1+0)*mmsize]
4401*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tmp2q+(%1+0)*mmsize]  ; a + b, first vector
4402*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+(%1+1)*mmsize]
4403*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tmp2q+(%1+1)*mmsize]  ; a + b, second vector
4404*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4                     ; rounding shift: (x * m4 + 0x4000) >> 15
4405*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
4406*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1                     ; saturate to u8; interleaves per 128-bit lane
4407*c0909341SAndroid Build Coastguard Worker%endmacro
4408*c0909341SAndroid Build Coastguard Worker
; AVG_INC_PTR n
; Advances both intermediate-buffer pointers by n vectors (n * mmsize bytes).
4409*c0909341SAndroid Build Coastguard Worker%macro AVG_INC_PTR 1
4410*c0909341SAndroid Build Coastguard Worker    add               tmp1q, %1*mmsize
4411*c0909341SAndroid Build Coastguard Worker    add               tmp2q, %1*mmsize
4412*c0909341SAndroid Build Coastguard Worker%endmacro
4413*c0909341SAndroid Build Coastguard Worker
; avg_8bpc(dst, stride, tmp1, tmp2, w, h)
; 8-bit compound average: dst = rounded average of the 16-bit intermediates
; in tmp1 and tmp2 (see AVG), stored by the shared BIDIR_FN tail.
; x86inc cglobal: 4 arguments loaded into registers, 7 GPRs, 5 vector
; registers declared; w and h are fetched below, stride3 is a scratch name.
; NOTE(review): BIDIR_FN additionally touches m6/m7, which is more than the
; declared vector-register count - presumably fine with x86inc's AVX-512
; register handling; confirm.
4414*c0909341SAndroid Build Coastguard Workercglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
4415*c0909341SAndroid Build Coastguard Worker%define base r6-avg_avx512icl_table
4416*c0909341SAndroid Build Coastguard Worker    lea                  r6, [avg_avx512icl_table]
4417*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm                ; table index = trailing-zero count of w
4418*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4419*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4]   ; signed 32-bit offset of the .wN label
4420*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pw_1024]    ; pmulhrsw by 1024 == (x + 16) >> 5 (assuming pw_1024 holds words of 1024)
4421*c0909341SAndroid Build Coastguard Worker    add                  wq, r6                ; offset -> absolute jump target
4422*c0909341SAndroid Build Coastguard Worker    BIDIR_FN            AVG
4423*c0909341SAndroid Build Coastguard Worker
; W_AVG src_offset
; Weighted average of the two intermediate buffers, used as the 'op' of
; BIDIR_FN. With a = value from tmp1q and b = value from tmp2q it computes
; a + (((a - b) * m4) >> 16), applies the rounding shift in m5 and packs to
; unsigned bytes.
;   in:       tmp1q, tmp2q, m4 = 16-bit weight factor, m5 = rounding multiplier
;   out:      m0 = packed 8-bit pixels (one vector)
;   clobbers: m1, m2, m3
4424*c0909341SAndroid Build Coastguard Worker%macro W_AVG 1 ; src_offset
4425*c0909341SAndroid Build Coastguard Worker    ; (a * weight + b * (16 - weight) + 128) >> 8
4426*c0909341SAndroid Build Coastguard Worker    ; = ((a - b) * weight + (b << 4) + 128) >> 8
4427*c0909341SAndroid Build Coastguard Worker    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
4428*c0909341SAndroid Build Coastguard Worker    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
4429*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [tmp1q+(%1+0)*mmsize]
4430*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]  ; a - b
4431*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [tmp1q+(%1+1)*mmsize]
4432*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
4433*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m4                         ; ((a - b) * factor) >> 16
4434*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m4
4435*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2                         ; + a
4436*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
4437*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5                         ; final rounding shift
4438*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
4439*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
4440*c0909341SAndroid Build Coastguard Worker%endmacro
4441*c0909341SAndroid Build Coastguard Worker
; W_AVG reads its inputs exactly like AVG, so the pointer-advance macro that
; BIDIR_FN looks up as W_AVG_INC_PTR is simply AVG_INC_PTR.
4442*c0909341SAndroid Build Coastguard Worker%define W_AVG_INC_PTR AVG_INC_PTR
4443*c0909341SAndroid Build Coastguard Worker
; w_avg_8bpc(dst, stride, tmp1, tmp2, w, h, weight)
; 8-bit weighted compound average (see the formula in W_AVG); the result is
; stored by the shared BIDIR_FN tail. The weight is the 7th argument (r6m).
; The 16-bit factor weight << 12 only reads as (weight - 16) << 12 when its
; sign bit is set, i.e. for weight > 7. For smaller weights the two input
; pointers are swapped and the factor is negated instead, which matches the
; last line of the W_AVG derivation.
4444*c0909341SAndroid Build Coastguard Workercglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
4445*c0909341SAndroid Build Coastguard Worker%define base r6-w_avg_avx512icl_table
4446*c0909341SAndroid Build Coastguard Worker    lea                  r6, [w_avg_avx512icl_table]
4447*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm                ; table index = trailing-zero count of w
4448*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4449*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, r6m ; weight
4450*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4]   ; signed 32-bit offset of the .wN label
4451*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_2048]    ; pmulhrsw by 2048 == (x + 8) >> 4 (assuming words of 2048)
4452*c0909341SAndroid Build Coastguard Worker    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
4453*c0909341SAndroid Build Coastguard Worker    add                  wq, r6                ; offset -> absolute jump target; r6 is free from here on
4454*c0909341SAndroid Build Coastguard Worker    cmp           dword r6m, 7
4455*c0909341SAndroid Build Coastguard Worker    jg .weight_gt7                             ; weight > 7: factor already has the right sign
4456*c0909341SAndroid Build Coastguard Worker    mov                  r6, tmp1q             ; weight <= 7: swap tmp1/tmp2 using r6 as temporary ...
4457*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4458*c0909341SAndroid Build Coastguard Worker    mov               tmp1q, tmp2q
4459*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0, m4 ; -weight
4460*c0909341SAndroid Build Coastguard Worker    mov               tmp2q, r6                ; ... so W_AVG effectively computes with a and b exchanged
4461*c0909341SAndroid Build Coastguard Worker.weight_gt7:
4462*c0909341SAndroid Build Coastguard Worker    BIDIR_FN          W_AVG
4463*c0909341SAndroid Build Coastguard Worker
; MASK src_offset
; Per-pixel masked blend of the two intermediate buffers, used as the 'op'
; of BIDIR_FN. In the formula below a is read from tmp1q, b from tmp2q and
; m is the mask byte read from maskq.
;   in:       tmp1q, tmp2q, maskq
;             m4 = all zero (used both as the 0 in 0 - m and as the low byte
;                  when widening the mask bytes to words)
;             m5 = rounding multiplier for the final pmulhrsw
;             m8 = qword permutation applied to the mask bytes (64-byte vectors only)
;   out:      m0 = packed 8-bit pixels (one vector)
;   clobbers: m1, m2, m3
4464*c0909341SAndroid Build Coastguard Worker%macro MASK 1 ; src_offset
4465*c0909341SAndroid Build Coastguard Worker    ; (a * m + b * (64 - m) + 512) >> 10
4466*c0909341SAndroid Build Coastguard Worker    ; = ((a - b) * m + (b << 6) + 512) >> 10
4467*c0909341SAndroid Build Coastguard Worker    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
4468*c0909341SAndroid Build Coastguard Worker%if mmsize == 64
4469*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m8, [maskq+%1*32]          ; mask bytes, qwords pre-arranged for the unpacks below
4470*c0909341SAndroid Build Coastguard Worker%else
4471*c0909341SAndroid Build Coastguard Worker    vpermq               m3,     [maskq+%1*16], q3120
4472*c0909341SAndroid Build Coastguard Worker%endif
4473*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [tmp2q+(%1+0)*mmsize]  ; b
4474*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]  ; b - a
4475*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m3                     ; -m (bytes)
4476*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1     ; (b - a) << 1
4477*c0909341SAndroid Build Coastguard Worker    paddb                m3, m3                         ; -m << 1 (bytes)
4478*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4, m3 ; -m << 9
4479*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m2                         ; ((b - a) * (-m << 10)) >> 16
4480*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1                         ; + b
4481*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [tmp2q+(%1+1)*mmsize]  ; same again for the second vector
4482*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
4483*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
4484*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4, m3                     ; high halves of the -m << 1 bytes -> words
4485*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m3
4486*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
4487*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5                         ; final rounding shift
4488*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
4489*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
4490*c0909341SAndroid Build Coastguard Worker%endmacro
4491*c0909341SAndroid Build Coastguard Worker
; MASK_INC_PTR n
; Advances the MASK inputs by n vectors: 32 mask bytes and 64 bytes of each
; intermediate buffer per step. The step sizes are fixed constants rather
; than multiples of mmsize, i.e. they correspond to 64-byte vectors; BIDIR_FN
; only advances pointers on its full-width (non-WRAP_YMM) paths.
4492*c0909341SAndroid Build Coastguard Worker%macro MASK_INC_PTR 1
4493*c0909341SAndroid Build Coastguard Worker    add               maskq, %1*32
4494*c0909341SAndroid Build Coastguard Worker    add               tmp2q, %1*64
4495*c0909341SAndroid Build Coastguard Worker    add               tmp1q, %1*64
4496*c0909341SAndroid Build Coastguard Worker%endmacro
4497*c0909341SAndroid Build Coastguard Worker
; mask_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
; 8-bit masked compound blend: every output pixel mixes tmp1 and tmp2
; according to the corresponding mask byte (see the formula in MASK); the
; result is stored by the shared BIDIR_FN tail.
; NOTE(review): m8 (and m6/m7 inside BIDIR_FN) are used although only 6
; vector registers are declared - presumably fine with x86inc's AVX-512
; register handling; confirm.
4498*c0909341SAndroid Build Coastguard Workercglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
4499*c0909341SAndroid Build Coastguard Worker%define base r7-mask_avx512icl_table
4500*c0909341SAndroid Build Coastguard Worker    lea                  r7, [mask_avx512icl_table]
4501*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm                ; table index = trailing-zero count of w
4502*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4503*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp            ; mask pointer argument
4504*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r7+wq*4]   ; signed 32-bit offset of the .wN label
4505*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4                ; zero vector required by MASK
4506*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+bilin_v_perm64] ; qword permutation MASK applies to the mask bytes
4507*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_2048]    ; pmulhrsw by 2048 == (x + 8) >> 4 (assuming words of 2048)
4508*c0909341SAndroid Build Coastguard Worker    add                  wq, r7                ; offset -> absolute jump target
4509*c0909341SAndroid Build Coastguard Worker    BIDIR_FN           MASK
4510*c0909341SAndroid Build Coastguard Worker
; W_MASK dst, mask, tmp_offset1, tmp_offset2 [, 4:4:4 = 0]
; Blends two vectors' worth of the intermediate buffers with a weight derived
; from their absolute difference, and returns both the blended pixels and the
; per-pixel weights.
;   dst          index of the register receiving the packed 8-bit pixels
;   mask         index of the register receiving the weights: each word holds
;                the (64 - m) value of the first input vector in its high byte
;                and that of the second input vector in its low byte
;   tmp_offset1/2  offsets (in vectors) of the two input vectors to process
;   4:4:4        when non-zero the weight bytes are replaced by m5 - weight
;                (presumably turning 64 - m into m; m5 is set by the caller -
;                TODO confirm)
;   in:       tmp1q, tmp2q, m6 = constant the absolute difference is subtracted
;             from (saturating), m7 = rounding multiplier for the final pmulhrsw
;   clobbers: m1, m2, m3
4511*c0909341SAndroid Build Coastguard Worker%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
4512*c0909341SAndroid Build Coastguard Worker    mova                m%1, [tmp1q+mmsize*%3]
4513*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp2q+mmsize*%3]
4514*c0909341SAndroid Build Coastguard Worker    psubw                m1, m%1                ; tmp2 - tmp1
4515*c0909341SAndroid Build Coastguard Worker    pabsw               m%2, m1                 ; |tmp2 - tmp1|
4516*c0909341SAndroid Build Coastguard Worker    psubusw             m%2, m6, m%2            ; m6 - |diff|, clamped at 0
4517*c0909341SAndroid Build Coastguard Worker    psrlw               m%2, 8 ; 64 - m
4518*c0909341SAndroid Build Coastguard Worker    psllw                m2, m%2, 10            ; (64 - m) << 10
4519*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m2                 ; ((tmp2 - tmp1) * (64 - m)) >> 6
4520*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m1                 ; + tmp1
4521*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+mmsize*%4]  ; second input vector, same computation
4522*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp2q+mmsize*%4]
4523*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
4524*c0909341SAndroid Build Coastguard Worker    pabsw                m3, m2
4525*c0909341SAndroid Build Coastguard Worker    psubusw              m3, m6, m3
4526*c0909341SAndroid Build Coastguard Worker    vpshldw             m%2, m3, 8              ; word = (first weight << 8) | (m3 >> 8): fuses the shift with the merge
4527*c0909341SAndroid Build Coastguard Worker    psllw                m3, m%2, 10            ; first weight is shifted out, leaving (second weight) << 10
4528*c0909341SAndroid Build Coastguard Worker%if %5
4529*c0909341SAndroid Build Coastguard Worker    psubb               m%2, m5, m%2
4530*c0909341SAndroid Build Coastguard Worker%endif
4531*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m3
4532*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
4533*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m7                 ; final rounding shift
4534*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
4535*c0909341SAndroid Build Coastguard Worker    packuswb            m%1, m1                 ; saturate to u8; interleaves per 128-bit lane
4536*c0909341SAndroid Build Coastguard Worker%endmacro
4537*c0909341SAndroid Build Coastguard Worker
4538*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
4539*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_420_avx512icl_table
4540*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_420_avx512icl_table]
4541*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4542*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
4543*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4544*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
4545*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
4546*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_2048]
4547*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pb_m64]             ; -1 << 6
4548*c0909341SAndroid Build Coastguard Worker    mova               ym10, [base+wm_420_mask+32]
4549*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
4550*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
4551*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
4552*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4553*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4554*c0909341SAndroid Build Coastguard Worker.w4:
4555*c0909341SAndroid Build Coastguard Worker    mova                 m5, [wm_420_perm4]
4556*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
4557*c0909341SAndroid Build Coastguard Worker    jg .w4_h16
4558*c0909341SAndroid Build Coastguard Worker    WRAP_YMM W_MASK       0, 4, 0, 1
4559*c0909341SAndroid Build Coastguard Worker    vinserti128         ym5, [wm_420_perm4+32], 1
4560*c0909341SAndroid Build Coastguard Worker    vpermb              ym4, ym5, ym4
4561*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym8, ym4, ym9
4562*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m0, 1
4563*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
4564*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
4565*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
4566*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
4567*c0909341SAndroid Build Coastguard Worker    jl .w4_end
4568*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4569*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
4570*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
4571*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
4572*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
4573*c0909341SAndroid Build Coastguard Worker.w4_end:
4574*c0909341SAndroid Build Coastguard Worker    vpermb              ym8, ym10, ym8
4575*c0909341SAndroid Build Coastguard Worker    movq            [maskq], xm8
4576*c0909341SAndroid Build Coastguard Worker    RET
4577*c0909341SAndroid Build Coastguard Worker.w4_h16:
4578*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, strided
4579*c0909341SAndroid Build Coastguard Worker    pmulld              m11, [bidir_sctr_w4]
4580*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4581*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m5, m4
4582*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m8, m4, m9
4583*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1
4584*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m10, m8
4585*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm8
4586*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+m11]{k1}, m0
4587*c0909341SAndroid Build Coastguard Worker    RET
4588*c0909341SAndroid Build Coastguard Worker.w8:
4589*c0909341SAndroid Build Coastguard Worker    mova                 m5, [wm_420_perm8]
4590*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4591*c0909341SAndroid Build Coastguard Worker    jne .w8_h8
4592*c0909341SAndroid Build Coastguard Worker    WRAP_YMM W_MASK       0, 4, 0, 1
4593*c0909341SAndroid Build Coastguard Worker    vinserti128         ym5, [wm_420_perm8+32], 1
4594*c0909341SAndroid Build Coastguard Worker    vpermb              ym4, ym5, ym4
4595*c0909341SAndroid Build Coastguard Worker    vpdpbusd            ym8, ym4, ym9
4596*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m10, m8
4597*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm8
4598*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4599*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4600*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4601*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
4602*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
4603*c0909341SAndroid Build Coastguard Worker    RET
4604*c0909341SAndroid Build Coastguard Worker.w8_loop:
4605*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4606*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4607*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
4608*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4609*c0909341SAndroid Build Coastguard Worker.w8_h8:
4610*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4611*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m5, m4
4612*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4613*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m4, m9
4614*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
4615*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm1
4616*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4617*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
4618*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m0, 3
4619*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4620*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4621*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
4622*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm3
4623*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4624*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
4625*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
4626*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
4627*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
4628*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
4629*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4630*c0909341SAndroid Build Coastguard Worker    RET
4631*c0909341SAndroid Build Coastguard Worker.w16:
4632*c0909341SAndroid Build Coastguard Worker    mova                 m5, [wm_420_perm16]
4633*c0909341SAndroid Build Coastguard Worker.w16_loop:
4634*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4635*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m5, m4
4636*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4637*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m4, m9
4638*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4639*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4640*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
4641*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
4642*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm1
4643*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
4644*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
4645*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
4646*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
4647*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
4648*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4649*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4650*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4651*c0909341SAndroid Build Coastguard Worker    RET
4652*c0909341SAndroid Build Coastguard Worker.w32:
4653*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m5, [pb_02461357]
4654*c0909341SAndroid Build Coastguard Worker.w32_loop:
4655*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4656*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4657*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m1, m4, m9
4658*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4659*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4660*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
4661*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, m0
4662*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm1
4663*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
4664*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
4665*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
4666*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4667*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4668*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
4669*c0909341SAndroid Build Coastguard Worker    RET
4670*c0909341SAndroid Build Coastguard Worker.w64:
4671*c0909341SAndroid Build Coastguard Worker    pmovzxbq            m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
4672*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 4          ; 1, 3, 5, 7, 9, 11, 13, 15
4673*c0909341SAndroid Build Coastguard Worker.w64_loop:
4674*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 2
4675*c0909341SAndroid Build Coastguard Worker    W_MASK               11, 5, 1, 3
4676*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
4677*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m2, m4, m9
4678*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
4679*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m3, m5, m9
4680*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 256
4681*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 256
4682*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m10, m3
4683*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4684*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m12, m11
4685*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m13, m11
4686*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym2
4687*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
4688*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
4689*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
4690*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4691*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4692*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
4693*c0909341SAndroid Build Coastguard Worker    RET
4694*c0909341SAndroid Build Coastguard Worker.w128:
4695*c0909341SAndroid Build Coastguard Worker    pmovzxbq            m14, [wm_420_perm64]
4696*c0909341SAndroid Build Coastguard Worker    mova                m10, [wm_420_mask]
4697*c0909341SAndroid Build Coastguard Worker    psrlq               m15, m14, 4
4698*c0909341SAndroid Build Coastguard Worker.w128_loop:
4699*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 12, 0, 4
4700*c0909341SAndroid Build Coastguard Worker    W_MASK               11, 13, 1, 5
4701*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8
4702*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m12, m9
4703*c0909341SAndroid Build Coastguard Worker    mova                 m5, m8
4704*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m13, m9
4705*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4706*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m14, m11
4707*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m15, m11
4708*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*0], m0
4709*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*0], m1
4710*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 12, 2, 6
4711*c0909341SAndroid Build Coastguard Worker    W_MASK               11, 13, 3, 7
4712*c0909341SAndroid Build Coastguard Worker    vprold               m4, 16
4713*c0909341SAndroid Build Coastguard Worker    vprold               m5, 16
4714*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m4, m12, m9
4715*c0909341SAndroid Build Coastguard Worker    vpdpbusd             m5, m13, m9
4716*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 512
4717*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 512
4718*c0909341SAndroid Build Coastguard Worker    vpermt2b             m4, m10, m5
4719*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4720*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m14, m11
4721*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m15, m11
4722*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
4723*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
4724*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*1], m0
4725*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*1], m1
4726*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4727*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4728*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
4729*c0909341SAndroid Build Coastguard Worker    RET
4730*c0909341SAndroid Build Coastguard Worker
; w_mask_422_8bpc: dispatches on block width through a jump table indexed by
; tzcnt(w), runs the W_MASK macro on the tmp1/tmp2 inputs, stores the macro's
; first output register row by row to dst and a reduced byte mask to mask.
; In every looping path 32 mask bytes are written per 128 bytes consumed from
; each tmp buffer.
; NOTE(review): W_MASK is defined outside this chunk. The comments below assume
; its arguments are (dst reg, mask reg, tmp offset 1, tmp offset 2), that it
; reads tmp1q/tmp2q and m6/m7, and that it may use m1-m3 as scratch -- confirm
; against the macro definition.
; Register roles set up in the prologue:
;   m6  = pw_6903, m7 = pw_2048 (not referenced again in this function body)
;   m8  = dword selected from wm_sign by the sign argument; accumulator seed
;   m9  = pw_m128; second multiplicand of every vpdpwssd below
;   m10 = wm_422_mask; byte-index table for vpermb/vpermt2b
;   m11 = pb_127; AND-ed into the mask bytes before they are stored
4731*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
4732*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx512icl_table
4733*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_422_avx512icl_table]
; wd = log2(w) assuming w is a power of two; used as the jump-table index.
4734*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4735*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
4736*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
; Table entries are 32-bit offsets relative to the table base (r7 is added below).
4737*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r7+wq*4]
4738*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
4739*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_2048]
4740*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pw_m128]
4741*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+wm_422_mask]
4742*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pb_127]
4743*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
; r6 still holds the sign argument: pick the dword at wm_sign + 4 + sign*4.
4744*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+wm_sign+4+r6*4]
4745*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
4746*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4747*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 4 ----------------------------------------------------------------
; h > 8 goes to the 512-bit scatter path; h <= 8 is handled in ymm registers.
4748*c0909341SAndroid Build Coastguard Worker.w4:
4749*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
4750*c0909341SAndroid Build Coastguard Worker    jg .w4_h16
4751*c0909341SAndroid Build Coastguard Worker    WRAP_YMM W_MASK       0, 4, 0, 1
; Replace the high qword of xm10 with table bytes 16..23 for the 256-bit case.
4752*c0909341SAndroid Build Coastguard Worker    movhps             xm10, [wm_422_mask+16]
; m8 is accumulated into directly: this path does not loop, so the seed is
; not needed again.
4753*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym8, ym4, ym9
4754*c0909341SAndroid Build Coastguard Worker    vpermb              ym8, ym10, ym8
; Rows 0-3: dwords 0/1 of 128-bit lane 0 (xm0) and lane 1 (xm1).
4755*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m0, 1
4756*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
4757*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
4758*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
4759*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
; This jl consumes the flags of `cmp hd, 8` above (h < 8: only four rows).
; It relies on nothing in between, including the W_MASK expansion, writing
; EFLAGS.
4760*c0909341SAndroid Build Coastguard Worker    jl .w4_end
; Rows 4-7: dwords 2/3 of the same two lanes.
4761*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4762*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
4763*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
4764*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
4765*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
; 16 mask bytes are stored whether four or eight rows were written.
4766*c0909341SAndroid Build Coastguard Worker.w4_end:
4767*c0909341SAndroid Build Coastguard Worker    pand                xm8, xm11
4768*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm8
4769*c0909341SAndroid Build Coastguard Worker    RET
; w == 4, h > 8: one 512-bit pass, written with a 16-element dword scatter.
4770*c0909341SAndroid Build Coastguard Worker.w4_h16:
; m5 = stride * bidir_sctr_w4: per-dword byte offsets from dstq.
4771*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, strided
4772*c0909341SAndroid Build Coastguard Worker    pmulld               m5, [bidir_sctr_w4]
4773*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4774*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m4, m9
; k1 = all ones, enabling all 16 scatter elements.
4775*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1
4776*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m10, m8
4777*c0909341SAndroid Build Coastguard Worker    pand                ym8, ym11
4778*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym8
4779*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+m5]{k1}, m0
4780*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 8 ----------------------------------------------------------------
; h == 4 is a single 256-bit pass; any other height uses the 8-rows-per-
; iteration loop entered at .w8_h8.
4781*c0909341SAndroid Build Coastguard Worker.w8:
4782*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4783*c0909341SAndroid Build Coastguard Worker    jne .w8_h8
4784*c0909341SAndroid Build Coastguard Worker    WRAP_YMM W_MASK       0, 4, 0, 1
4785*c0909341SAndroid Build Coastguard Worker    movhps             xm10, [wm_422_mask+16]
4786*c0909341SAndroid Build Coastguard Worker    vpdpwssd            ym8, ym4, ym9
4787*c0909341SAndroid Build Coastguard Worker    vpermb              ym8, ym10, ym8
4788*c0909341SAndroid Build Coastguard Worker    pand                xm8, xm11
4789*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm8
; Rows 0/1 take the low qwords of lanes 0/1, rows 2/3 the high qwords.
4790*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4791*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4792*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4793*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
4794*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
4795*c0909341SAndroid Build Coastguard Worker    RET
; Loop back-edge: advance the inputs and the mask, and step dst by the second
; group of four rows (the body already advanced it by the first four).
4796*c0909341SAndroid Build Coastguard Worker.w8_loop:
4797*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4798*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4799*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
4800*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4801*c0909341SAndroid Build Coastguard Worker.w8_h8:
4802*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
; Accumulate into a copy so the seed in m8 survives for the next iteration.
4803*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4804*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, m9
4805*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
4806*c0909341SAndroid Build Coastguard Worker    pand                ym1, ym11
4807*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym1
; m1 is free again once the mask is stored; reuse it for lane 1 of m0.
4808*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4809*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
4810*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m0, 3
; Rows 0-3: low qword of lanes 0-3.
4811*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4812*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4813*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
4814*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm3
; Rows 4-7: high qword of lanes 0-3.
4815*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4816*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
4817*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
4818*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
4819*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
4820*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
4821*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4822*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 16: four rows per iteration --------------------------------------
; The pointer updates sit ahead of the entry label so the first pass skips them.
4823*c0909341SAndroid Build Coastguard Worker.w16_loop:
4824*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4825*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4826*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
4827*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4828*c0909341SAndroid Build Coastguard Worker.w16:
4829*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4830*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4831*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, m9
4832*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
; Swap the two middle qwords inside each 256-bit half of m0.
4833*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
4834*c0909341SAndroid Build Coastguard Worker    pand                ym1, ym11
4835*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym1
; After the qword swap the rows sit in 128-bit lanes 0, 2, 1, 3.
4836*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
4837*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
4838*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
4839*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
4840*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4841*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4842*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 32: two rows per iteration ---------------------------------------
4843*c0909341SAndroid Build Coastguard Worker.w32:
; m5 = the eight bytes at pb_02461357 zero-extended to qwords; used as the
; qword index vector of the vpermq below.
4844*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m5, [pb_02461357]
4845*c0909341SAndroid Build Coastguard Worker.w32_loop:
4846*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4847*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4848*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, m9
4849*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4850*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4851*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
4852*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, m0
4853*c0909341SAndroid Build Coastguard Worker    pand                ym1, ym11
4854*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym1
4855*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
; Low 256 bits -> row 0, high 256 bits -> row 1.
4856*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
4857*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
4858*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4859*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4860*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
4861*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 64: one 64-byte row per iteration --------------------------------
4862*c0909341SAndroid Build Coastguard Worker.w64:
4863*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m5, [pb_02461357]
4864*c0909341SAndroid Build Coastguard Worker.w64_loop:
4865*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4866*c0909341SAndroid Build Coastguard Worker    mova                 m1, m8
4867*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, m9
4868*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4869*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4870*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m10, m1
4871*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m5, m0
4872*c0909341SAndroid Build Coastguard Worker    pand                ym1, ym11
4873*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym1
4874*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
4875*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
4876*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4877*c0909341SAndroid Build Coastguard Worker    dec                  hd
4878*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
4879*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 128: one row per iteration, as two 64-byte halves ----------------
4880*c0909341SAndroid Build Coastguard Worker.w128:
4881*c0909341SAndroid Build Coastguard Worker    pmovzxbq            m13, [pb_02461357]
4882*c0909341SAndroid Build Coastguard Worker.w128_loop:
; Two W_MASK expansions: outputs m0/m4 for tmp offsets 0,1 and m12/m5 for
; offsets 2,3.
4883*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
4884*c0909341SAndroid Build Coastguard Worker    W_MASK               12, 5, 2, 3
4885*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
4886*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m4, m9
4887*c0909341SAndroid Build Coastguard Worker    mova                 m3, m8
4888*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m5, m9
4889*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 256
4890*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 256
; Two-source byte permute: gather bytes from both accumulators (m2 and m3)
; into one 64-byte mask vector, using m10 as the index table.
4891*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m10, m3
4892*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m13, m0
4893*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m13, m12
4894*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11
4895*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m2
4896*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
4897*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
4898*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
4899*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4900*c0909341SAndroid Build Coastguard Worker    dec                  hd
4901*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
4902*c0909341SAndroid Build Coastguard Worker    RET
4903*c0909341SAndroid Build Coastguard Worker
; w_mask_444_8bpc: dispatches on block width through a jump table indexed by
; tzcnt(w), runs the W_MASK macro (with its fifth argument set to 1) on the
; tmp1/tmp2 inputs, stores the macro's first output register row by row to dst
; and the byte-permuted mask register to mask.
; In every looping path 64 mask bytes are written per 128 bytes consumed from
; each tmp buffer, and no cross-element reduction is applied to the mask here.
; NOTE(review): W_MASK is defined outside this chunk. The comments below assume
; its arguments are (dst reg, mask reg, tmp offset 1, tmp offset 2, 4:4:4 flag),
; that it reads tmp1q/tmp2q and m5/m6/m7, and that it may use m1-m3 as scratch
; -- confirm against the macro definition.
; Register roles set up in the prologue:
;   m5 = pb_64, m6 = pw_6903, m7 = pw_2048 (not referenced again in this body)
;   m8 = wm_444_mask; byte-index table for every vpermb below
4904*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
4905*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx512icl_table
4906*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_444_avx512icl_table]
; wd = log2(w) assuming w is a power of two; used as the jump-table index.
4907*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4908*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
; Table entries are 32-bit offsets relative to the table base (r7 is added below).
4909*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r7+wq*4]
4910*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
4911*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pb_64]
4912*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_2048]
4913*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+wm_444_mask]
4914*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
4915*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
4916*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4917*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 4 ----------------------------------------------------------------
; h > 8 goes to the 512-bit scatter path; h <= 8 is handled in ymm registers.
4918*c0909341SAndroid Build Coastguard Worker.w4:
4919*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
4920*c0909341SAndroid Build Coastguard Worker    jg .w4_h16
4921*c0909341SAndroid Build Coastguard Worker    WRAP_YMM W_MASK       0, 4, 0, 1, 1
; Replace the upper 128 bits of ym8 with table bytes 32..47 for the 256-bit case.
4922*c0909341SAndroid Build Coastguard Worker    vinserti128         ym8, [wm_444_mask+32], 1
4923*c0909341SAndroid Build Coastguard Worker    vpermb              ym4, ym8, ym4
; 32 mask bytes are stored here unconditionally, before the h < 8 check below.
4924*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym4
; Rows 0-3: dwords 0/1 of 128-bit lane 0 (xm0) and lane 1 (xm1).
4925*c0909341SAndroid Build Coastguard Worker    vextracti32x4      xm1, m0, 1
4926*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
4927*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
4928*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
4929*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
; This jl consumes the flags of `cmp hd, 8` above (h < 8: only four rows).
; It relies on nothing in between, including the W_MASK expansion, writing
; EFLAGS.
4930*c0909341SAndroid Build Coastguard Worker    jl .w4_end
; Rows 4-7: dwords 2/3 of the same two lanes.
4931*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4932*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
4933*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
4934*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
4935*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
4936*c0909341SAndroid Build Coastguard Worker.w4_end:
4937*c0909341SAndroid Build Coastguard Worker    RET
; w == 4, h > 8: one 512-bit pass, written with a 16-element dword scatter.
4938*c0909341SAndroid Build Coastguard Worker.w4_h16:
; m9 = stride * bidir_sctr_w4: per-dword byte offsets from dstq.
4939*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, strided
4940*c0909341SAndroid Build Coastguard Worker    pmulld               m9, [bidir_sctr_w4]
4941*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
4942*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m4
; k1 = all ones, enabling all 16 scatter elements.
4943*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1
4944*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
4945*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+m9]{k1}, m0
4946*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 8 ----------------------------------------------------------------
; h == 4 is a single 256-bit pass; any other height uses the 8-rows-per-
; iteration loop entered at .w8_h8.
4947*c0909341SAndroid Build Coastguard Worker.w8:
4948*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4949*c0909341SAndroid Build Coastguard Worker    jne .w8_h8
4950*c0909341SAndroid Build Coastguard Worker    WRAP_YMM W_MASK       0, 4, 0, 1, 1
4951*c0909341SAndroid Build Coastguard Worker    vinserti128         ym8, [wm_444_mask+32], 1
4952*c0909341SAndroid Build Coastguard Worker    vpermb              ym4, ym8, ym4
4953*c0909341SAndroid Build Coastguard Worker    mova            [maskq], ym4
; Rows 0/1 take the low qwords of lanes 0/1, rows 2/3 the high qwords.
4954*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4955*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4956*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4957*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
4958*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
4959*c0909341SAndroid Build Coastguard Worker    RET
; Loop back-edge: advance the inputs and the mask, and step dst by the second
; group of four rows (the body already advanced it by the first four).
4960*c0909341SAndroid Build Coastguard Worker.w8_loop:
4961*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4962*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4963*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
4964*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4965*c0909341SAndroid Build Coastguard Worker.w8_h8:
4966*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
4967*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m4
4968*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
4969*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
4970*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 2
4971*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m0, 3
; Rows 0-3: low qword of lanes 0-3.
4972*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4973*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
4974*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
4975*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm3
; Rows 4-7: high qword of lanes 0-3.
4976*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4977*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
4978*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
4979*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
4980*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
4981*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
4982*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4983*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 16: four rows per iteration --------------------------------------
; The pointer updates sit ahead of the entry label so the first pass skips them.
4984*c0909341SAndroid Build Coastguard Worker.w16_loop:
4985*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
4986*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
4987*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
4988*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4989*c0909341SAndroid Build Coastguard Worker.w16:
4990*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
4991*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m4
; Swap the two middle qwords inside each 256-bit half of m0.
4992*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
4993*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
; After the qword swap the rows sit in 128-bit lanes 0, 2, 1, 3.
4994*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
4995*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
4996*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
4997*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
4998*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4999*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5000*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 32: two rows per iteration ---------------------------------------
5001*c0909341SAndroid Build Coastguard Worker.w32:
; m9 = the eight bytes at pb_02461357 zero-extended to qwords; used as the
; qword index vector of the vpermq below.
5002*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m9, [pb_02461357]
5003*c0909341SAndroid Build Coastguard Worker.w32_loop:
5004*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
5005*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m4
5006*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
5007*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
5008*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m9, m0
5009*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
5010*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
; Low 256 bits -> row 0, high 256 bits -> row 1.
5011*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
5012*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
5013*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5014*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5015*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5016*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 64: one 64-byte row per iteration --------------------------------
5017*c0909341SAndroid Build Coastguard Worker.w64:
5018*c0909341SAndroid Build Coastguard Worker    pmovzxbq             m9, [pb_02461357]
5019*c0909341SAndroid Build Coastguard Worker.w64_loop:
5020*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
5021*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m4
5022*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 128
5023*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 128
5024*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m9, m0
5025*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
5026*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
5027*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5028*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5029*c0909341SAndroid Build Coastguard Worker    dec                  hd
5030*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5031*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 128: one row per iteration, as two 64-byte halves ----------------
5032*c0909341SAndroid Build Coastguard Worker.w128:
5033*c0909341SAndroid Build Coastguard Worker    pmovzxbq            m11, [pb_02461357]
5034*c0909341SAndroid Build Coastguard Worker.w128_loop:
; Two W_MASK expansions: outputs m0/m4 for tmp offsets 0,1 and m10/m9 for
; offsets 2,3.
5035*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
5036*c0909341SAndroid Build Coastguard Worker    W_MASK               10, 9, 2, 3, 1
5037*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, m4
5038*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m8, m9
5039*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 256
5040*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 256
5041*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m11, m0
5042*c0909341SAndroid Build Coastguard Worker    vpermq              m10, m11, m10
; 128 mask bytes per row: one 64-byte vector for each half.
5043*c0909341SAndroid Build Coastguard Worker    mova       [maskq+64*0], m4
5044*c0909341SAndroid Build Coastguard Worker    mova       [maskq+64*1], m9
5045*c0909341SAndroid Build Coastguard Worker    add               maskq, 128
5046*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5047*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m10
5048*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5049*c0909341SAndroid Build Coastguard Worker    dec                  hd
5050*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
5051*c0909341SAndroid Build Coastguard Worker    RET
5052*c0909341SAndroid Build Coastguard Worker
5053*c0909341SAndroid Build Coastguard Workercglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
5054*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx512icl_table
5055*c0909341SAndroid Build Coastguard Worker    lea                  r6, [blend_avx512icl_table]
5056*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5057*c0909341SAndroid Build Coastguard Worker    movifnidn         maskq, maskmp
5058*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5059*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
5060*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pb_64]
5061*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_512]
5062*c0909341SAndroid Build Coastguard Worker    sub                tmpq, maskq
5063*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
5064*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dsq*3]
5065*c0909341SAndroid Build Coastguard Worker    jmp                  wq
5066*c0909341SAndroid Build Coastguard Worker.w4:
5067*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]
5068*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [dstq+dsq*1], 1
5069*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm1, [dstq+dsq*2]
5070*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm1, [dstq+r6   ], 3
5071*c0909341SAndroid Build Coastguard Worker    mova               xmm4, [maskq]
5072*c0909341SAndroid Build Coastguard Worker    mova               xmm5, [maskq+tmpq]
5073*c0909341SAndroid Build Coastguard Worker    add               maskq, 4*4
5074*c0909341SAndroid Build Coastguard Worker    psubb              xmm3, xm6, xmm4
5075*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm5
5076*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm2, xmm3, xmm4
5077*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm1, xmm5
5078*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm3, xmm4
5079*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm2
5080*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xmm3
5081*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm7
5082*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm7
5083*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm1
5084*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
5085*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
5086*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*2], xmm0, 2
5087*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+r6   ], xmm0, 3
5088*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5089*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5090*c0909341SAndroid Build Coastguard Worker    jg .w4
5091*c0909341SAndroid Build Coastguard Worker    RET
5092*c0909341SAndroid Build Coastguard Worker.w8:
5093*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [dstq+dsq*0]
5094*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       xmm1, [dstq+dsq*1]
5095*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm2, [dstq+dsq*2]
5096*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm3, [dstq+r6   ]
5097*c0909341SAndroid Build Coastguard Worker    mova               ymm4, [maskq]
5098*c0909341SAndroid Build Coastguard Worker    mova               ymm5, [maskq+tmpq]
5099*c0909341SAndroid Build Coastguard Worker    add               maskq, 8*4
5100*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm0, ymm2, 0x30
5101*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm1, ymm3, 0xc0
5102*c0909341SAndroid Build Coastguard Worker    psubb              ymm3, ym6, ymm4
5103*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm0, ymm5
5104*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm2, ymm3, ymm4
5105*c0909341SAndroid Build Coastguard Worker    punpckhbw          ymm1, ymm5
5106*c0909341SAndroid Build Coastguard Worker    punpckhbw          ymm3, ymm4
5107*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm0, ymm2
5108*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm1, ymm3
5109*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm0, ym7
5110*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm1, ym7
5111*c0909341SAndroid Build Coastguard Worker    packuswb           ymm0, ymm1
5112*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm1, ymm0, 1
5113*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
5114*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
5115*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*2], xmm1
5116*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+r6   ], xmm1
5117*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5118*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5119*c0909341SAndroid Build Coastguard Worker    jg .w8
5120*c0909341SAndroid Build Coastguard Worker    vzeroupper
5121*c0909341SAndroid Build Coastguard Worker    RET
5122*c0909341SAndroid Build Coastguard Worker.w16:
5123*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*0]
5124*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [dstq+dsq*1], 1
5125*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [dstq+dsq*2], 2
5126*c0909341SAndroid Build Coastguard Worker    mova                 m4, [maskq]
5127*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [dstq+r6   ], 3
5128*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq+tmpq]
5129*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*4
5130*c0909341SAndroid Build Coastguard Worker    psubb                m3, m6, m4
5131*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m5
5132*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
5133*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m5
5134*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
5135*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
5136*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5137*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
5138*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
5139*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5140*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
5141*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], ym0, 1
5142*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*2], m0, 2
5143*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r6   ], m0, 3
5144*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5145*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5146*c0909341SAndroid Build Coastguard Worker    jg .w16
5147*c0909341SAndroid Build Coastguard Worker    RET
5148*c0909341SAndroid Build Coastguard Worker.w32:
5149*c0909341SAndroid Build Coastguard Worker    mova                ym1, [dstq+dsq*0]
5150*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [dstq+dsq*1], 1
5151*c0909341SAndroid Build Coastguard Worker    mova                 m4, [maskq]
5152*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq+tmpq]
5153*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*2
5154*c0909341SAndroid Build Coastguard Worker    psubb                m3, m6, m4
5155*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m5
5156*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
5157*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m5
5158*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
5159*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
5160*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5161*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
5162*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
5163*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5164*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
5165*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
5166*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5167*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5168*c0909341SAndroid Build Coastguard Worker    jg .w32
5169*c0909341SAndroid Build Coastguard Worker    RET
5170*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; blend_v_8bpc: blends tmp into dst in place, two rows per iteration,
; with one weight pair per column taken from obmc_masks.
;   dstq, dsq  destination pointer and row stride ([dstq+dsq*1] is the next row)
;   tmpq       second source, read as tightly packed rows (advances 2*w bytes
;              per two-row iteration)
;   w, h       read through wm/hm; tzcnt(w) picks the per-width path, h counts rows
;   maskq      scratch only, never loaded from an argument: it is the sixth
;              named register and therefore aliases r5 (x86inc naming)
; Per pixel: punpck{l,h}bw interleaves (dst, tmp) bytes, pmaddubsw multiplies
; them by the signed byte pair held in the mask register and sums each pair,
; pmulhrsw by m5 rescales with rounding (a rounded >>6 if pw_512 holds 512 in
; every word - TODO confirm against the constant), packuswb saturates to bytes.
; The weights are loaded once before each loop, so they vary by column only.
;-----------------------------------------------------------------------
5171*c0909341SAndroid Build Coastguard Workercglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
5172*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_avx512icl_table
5173*c0909341SAndroid Build Coastguard Worker    lea                  r5, [blend_v_avx512icl_table]
5174*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; index into the jump table = number of trailing zero bits of w
5175*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5176*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; 32-bit signed offset, relative to the table base
5177*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_512]
5178*c0909341SAndroid Build Coastguard Worker    add                  wq, r5 ; offset -> absolute address of the .wN entry point
5179*c0909341SAndroid Build Coastguard Worker    add               maskq, obmc_masks-blend_v_avx512icl_table ; r5: table base -> address of obmc_masks
5180*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; --- w == 2: 4 bytes of weights (2 columns x 2 coefficients), replicated ---
5181*c0909341SAndroid Build Coastguard Worker.w2:
5182*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xmm2, [maskq+2*2]
5183*c0909341SAndroid Build Coastguard Worker.w2_s0_loop:
5184*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]
5185*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm0, [dstq+dsq*1], 1 ; word 0 = row 0, word 1 = row 1
5186*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [tmpq]
5187*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*2
5188*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm1 ; (dst, tmp) byte pairs
5189*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm2
5190*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm5
5191*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0
5192*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm0, 0
5193*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm0, 1
5194*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5195*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5196*c0909341SAndroid Build Coastguard Worker    jg .w2_s0_loop
5197*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 4: 8 bytes of weights, replicated so both rows see the same columns ---
5198*c0909341SAndroid Build Coastguard Worker.w4:
5199*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       xmm2, [maskq+4*2]
5200*c0909341SAndroid Build Coastguard Worker.w4_loop:
5201*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]
5202*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [dstq+dsq*1], 1 ; dword 0 = row 0, dword 1 = row 1
5203*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tmpq]
5204*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
5205*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm1
5206*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm2
5207*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm5
5208*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0
5209*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
5210*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
5211*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5212*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5213*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
5214*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 8: 16 bytes of weights; row 0 is handled by the low unpack, row 1 by the high ---
5215*c0909341SAndroid Build Coastguard Worker.w8:
5216*c0909341SAndroid Build Coastguard Worker    mova               xmm3, [maskq+8*2]
5217*c0909341SAndroid Build Coastguard Worker.w8_loop:
5218*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [dstq+dsq*0]
5219*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       xmm1, [dstq+dsq*1] ; row 1 in both qwords; only the high one is consumed
5220*c0909341SAndroid Build Coastguard Worker    mova               xmm2, [tmpq] ; low 8 bytes pair with row 0, high 8 bytes with row 1
5221*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
5222*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm2
5223*c0909341SAndroid Build Coastguard Worker    punpckhbw          xmm1, xmm2
5224*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm3
5225*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xmm3
5226*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm5
5227*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm1, xm5
5228*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm1
5229*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xmm0
5230*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xmm0
5231*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5232*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5233*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5234*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 16: one row per 128-bit lane; ym3 weights the low 8 columns, ym4 the high 8 ---
5235*c0909341SAndroid Build Coastguard Worker.w16:
5236*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym3, [maskq+16*2]
5237*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [maskq+16*3]
5238*c0909341SAndroid Build Coastguard Worker.w16_loop:
5239*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*0]
5240*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [dstq+dsq*1], 1 ; lane 0 = row 0, lane 1 = row 1
5241*c0909341SAndroid Build Coastguard Worker    mova                ym2, [tmpq]
5242*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
5243*c0909341SAndroid Build Coastguard Worker    punpcklbw           ym0, ym1, ym2
5244*c0909341SAndroid Build Coastguard Worker    punpckhbw           ym1, ym2
5245*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym3
5246*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym4
5247*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym5
5248*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym5
5249*c0909341SAndroid Build Coastguard Worker    packuswb            ym0, ym1
5250*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
5251*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], m0, 1
5252*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5253*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5254*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5255*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 32: row 0 occupies lanes 0-1, row 1 lanes 2-3. The 64 weight bytes are
; split so that m3 holds source lanes {0,2,0,2} (matching what punpcklbw emits)
; and m4 holds source lanes {1,3,1,3} (matching punpckhbw). ---
5256*c0909341SAndroid Build Coastguard Worker.w32:
5257*c0909341SAndroid Build Coastguard Worker    mova                 m4, [maskq+32*2]
5258*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m4, m4, q2020
5259*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m4, q3131
5260*c0909341SAndroid Build Coastguard Worker.w32_loop:
5261*c0909341SAndroid Build Coastguard Worker    mova                ym1, [dstq+dsq*0]
5262*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [dstq+dsq*1], 1
5263*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5264*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
5265*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5266*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5267*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5268*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
5269*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5270*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5271*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5272*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
5273*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
5274*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5275*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5276*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5277*c0909341SAndroid Build Coastguard Worker    RET
5278*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; blend_h_8bpc: blends tmp into dst in place using one weight pair per row,
; read from obmc_masks starting at byte offset h*2 (2 bytes per row).
;   dstq, dsq  destination pointer and row stride
;   tmpq       second source, read as tightly packed rows of w bytes
;   w, h       read through wm/hm; tzcnt(w) picks the per-width path
;   maskq      scratch register (not loaded from an argument)
; Only h*3/4 rows are processed: hq is turned into a negative counter that runs
; up to zero, and maskq is pre-biased so that [maskq+hq*2] addresses the
; current row's weight pair.
; Paths w2..w32 handle two rows per iteration; w64 and w128 handle one.
; Per pixel the arithmetic is: interleave (dst, tmp) bytes, pmaddubsw with the
; signed weight pair, pmulhrsw by m5 (rounded >>6 if pw_512 is 512 in every
; word - TODO confirm), packuswb.
;-----------------------------------------------------------------------
5279*c0909341SAndroid Build Coastguard Workercglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
5280*c0909341SAndroid Build Coastguard Worker%define base r6-blend_h_avx512icl_table
5281*c0909341SAndroid Build Coastguard Worker    lea                  r6, [blend_h_avx512icl_table]
5282*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; jump-table index = number of trailing zero bits of w
5283*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
5284*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4] ; 32-bit signed offset, relative to the table base
5285*c0909341SAndroid Build Coastguard Worker    lea               maskq, [base+obmc_masks+hq*2] ; first weight pair for this block height
5286*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_512]
5287*c0909341SAndroid Build Coastguard Worker    lea                  hd, [hq*3]
5288*c0909341SAndroid Build Coastguard Worker    add                  wq, r6 ; offset -> absolute address of the .wN entry point
5289*c0909341SAndroid Build Coastguard Worker    shr                  hd, 2 ; h * 3/4
5290*c0909341SAndroid Build Coastguard Worker    lea               maskq, [maskq+hq*2] ; point just past the last weight pair that will be used
5291*c0909341SAndroid Build Coastguard Worker    neg                  hq ; count from -(h*3/4) up to 0
5292*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; --- w == 2 ---
5293*c0909341SAndroid Build Coastguard Worker.w2:
5294*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]
5295*c0909341SAndroid Build Coastguard Worker    pinsrw             xmm0, [dstq+dsq*1], 1 ; word 0 = row 0, word 1 = row 1
5296*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [maskq+hq*2] ; weight pairs of the two rows
5297*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [tmpq]
5298*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*2
5299*c0909341SAndroid Build Coastguard Worker    punpcklwd          xmm2, xmm2 ; duplicate each row's pair for its two pixels
5300*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm1
5301*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm2
5302*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm5
5303*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0
5304*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xmm0, 0
5305*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xmm0, 1
5306*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5307*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5308*c0909341SAndroid Build Coastguard Worker    jl .w2
5309*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 4: blend_shuf (defined outside this section) is the pshufb control
; that spreads the two rows' weight pairs over the register; presumably its low
; 8 bytes select row 0's pair and its high 8 bytes row 1's - TODO confirm ---
5310*c0909341SAndroid Build Coastguard Worker.w4:
5311*c0909341SAndroid Build Coastguard Worker    mova               xmm3, [blend_shuf]
5312*c0909341SAndroid Build Coastguard Worker.w4_loop:
5313*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+dsq*0]
5314*c0909341SAndroid Build Coastguard Worker    pinsrd             xmm0, [dstq+dsq*1], 1
5315*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [maskq+hq*2]
5316*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tmpq]
5317*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
5318*c0909341SAndroid Build Coastguard Worker    pshufb             xmm2, xmm3
5319*c0909341SAndroid Build Coastguard Worker    punpcklbw          xmm0, xmm1
5320*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xmm2
5321*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xm5
5322*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0
5323*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xmm0
5324*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xmm0, 1
5325*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5326*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5327*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
5328*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 8: rows are held in swapped order - lane 0 carries row 1 and lane 1
; carries row 0 (see the loads below and the movhps/movq stores). shufpd 0x03
; accordingly gives lane 0 the high qword of blend_shuf and lane 1 the low one. ---
5329*c0909341SAndroid Build Coastguard Worker.w8:
5330*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     ymm4, [blend_shuf]
5331*c0909341SAndroid Build Coastguard Worker    shufpd             ymm4, ymm4, 0x03
5332*c0909341SAndroid Build Coastguard Worker.w8_loop:
5333*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ymm1, [dstq+dsq*0]
5334*c0909341SAndroid Build Coastguard Worker    movq               xmm0, [dstq+dsq*1]
5335*c0909341SAndroid Build Coastguard Worker    vpblendd           ymm0, ymm1, 0x30 ; dwords 4-5 (low qword of lane 1) <- row 0
5336*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ymm3, [maskq+hq*2]
5337*c0909341SAndroid Build Coastguard Worker    movq               xmm1, [tmpq+8*1] ; tmp for row 1 -> lane 0
5338*c0909341SAndroid Build Coastguard Worker    vinserti128        ymm1, [tmpq+8*0], 1 ; tmp for row 0 -> low qword of lane 1
5339*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
5340*c0909341SAndroid Build Coastguard Worker    pshufb             ymm3, ymm4
5341*c0909341SAndroid Build Coastguard Worker    punpcklbw          ymm0, ymm1
5342*c0909341SAndroid Build Coastguard Worker    pmaddubsw          ymm0, ymm3
5343*c0909341SAndroid Build Coastguard Worker    pmulhrsw           ymm0, ym5
5344*c0909341SAndroid Build Coastguard Worker    vextracti128       xmm1, ymm0, 1
5345*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm1 ; low 8 bytes = row 1, high 8 bytes = row 0
5346*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*0], xmm0
5347*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*1], xmm0
5348*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5349*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5350*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
; NOTE(review): explicit vzeroupper, presumably because this path uses the raw
; ymmN register names rather than the x86inc mN/ymN aliases - confirm against RET.
5351*c0909341SAndroid Build Coastguard Worker    vzeroupper
5352*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 16: lane 0 = row 0, lane 1 = row 1. shufpd 0x0c keeps the low qword
; of blend_shuf in lane 0 and the high qword in lane 1. ---
5353*c0909341SAndroid Build Coastguard Worker.w16:
5354*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym4, [blend_shuf]
5355*c0909341SAndroid Build Coastguard Worker    shufpd              ym4, ym4, 0x0c
5356*c0909341SAndroid Build Coastguard Worker.w16_loop:
5357*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*0]
5358*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [dstq+dsq*1], 1
5359*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym3, [maskq+hq*2]
5360*c0909341SAndroid Build Coastguard Worker    mova                ym2, [tmpq]
5361*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
5362*c0909341SAndroid Build Coastguard Worker    pshufb              ym3, ym4
5363*c0909341SAndroid Build Coastguard Worker    punpcklbw           ym0, ym1, ym2
5364*c0909341SAndroid Build Coastguard Worker    punpckhbw           ym1, ym2
5365*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym0, ym3
5366*c0909341SAndroid Build Coastguard Worker    pmaddubsw           ym1, ym3
5367*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym5
5368*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym5
5369*c0909341SAndroid Build Coastguard Worker    packuswb            ym0, ym1
5370*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], xm0
5371*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+dsq*1], m0, 1
5372*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5373*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5374*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
5375*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 32: lanes 0-1 = row 0, lanes 2-3 = row 1. shufpd 0xf0 keeps the low
; qword of blend_shuf in lanes 0-1 and the high qword in lanes 2-3. ---
5376*c0909341SAndroid Build Coastguard Worker.w32:
5377*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [blend_shuf]
5378*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m4, 0xf0
5379*c0909341SAndroid Build Coastguard Worker.w32_loop:
5380*c0909341SAndroid Build Coastguard Worker    mova                ym1, [dstq+dsq*0]
5381*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [dstq+dsq*1], 1
5382*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [maskq+hq*2]
5383*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5384*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
5385*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
5386*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5387*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5388*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5389*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5390*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5391*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5392*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5393*c0909341SAndroid Build Coastguard Worker    mova          [dstq+dsq*0], ym0
5394*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+dsq*1], m0, 1
5395*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5396*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5397*c0909341SAndroid Build Coastguard Worker    jl .w32_loop
5398*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 64: one row per iteration; the row's weight pair is broadcast to every word ---
5399*c0909341SAndroid Build Coastguard Worker.w64:
5400*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [maskq+hq*2]
5401*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq]
5402*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5403*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2 ; 64 bytes = the single row just loaded
5404*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5405*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5406*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5407*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5408*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5409*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5410*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5411*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5412*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5413*c0909341SAndroid Build Coastguard Worker    inc                  hq
5414*c0909341SAndroid Build Coastguard Worker    jl .w64
5415*c0909341SAndroid Build Coastguard Worker    RET
; --- w == 128: one row per iteration, processed as two independent 64-byte halves ---
5416*c0909341SAndroid Build Coastguard Worker.w128:
5417*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [maskq+hq*2]
5418*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+64*0]
5419*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmpq+64*0]
5420*c0909341SAndroid Build Coastguard Worker    mova                 m3, [dstq+64*1]
5421*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmpq+64*1]
5422*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64*2
5423*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2, m1
5424*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m1
5425*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m6
5426*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
5427*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3, m4 ; m1 (first-half tmp) is dead by now and is reused
5428*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
5429*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
5430*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
; Same pmulhrsw-by-m5 rescale as the other paths, applied to all four partial results.
5431*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m2, m1, m3
5432*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
5433*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3
5434*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
5435*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
5436*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5437*c0909341SAndroid Build Coastguard Worker    inc                  hq
5438*c0909341SAndroid Build Coastguard Worker    jl .w128
5439*c0909341SAndroid Build Coastguard Worker    RET
5440*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; resize_8bpc: horizontal resampling of 8-bit rows with an 8-tap filter,
; producing 16 output pixels per inner-loop iteration.
;   dst, dst_stride  output row pointer and per-row increment
;   src, src_stride  input row pointer and per-row increment
;   dst_w            output width; the x loop runs in steps of 16 while x < dst_w
;   h                number of rows
;   src_w, dx, mx0   used through their memory operands (src_wm/dxm/mx0m);
;                    mx0m and src_wm are modified in place on entry
; Source positions are fixed point with 14 fractional bits: >>14 gives the
; source x offset, and bits 8 and up, masked with pd_63, give the filter index
; (filters are 8 bytes apiece, see the *8 scale in the gathers).
; NOTE(review): every iteration stores a full 16 bytes with mova, so dst rows
; presumably must be 16-byte aligned and padded to a multiple of 16 - confirm
; against the callers.
;-----------------------------------------------------------------------
5441*c0909341SAndroid Build Coastguard Workercglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
5442*c0909341SAndroid Build Coastguard Worker                                dst_w, h, src_w, dx, mx0
5443*c0909341SAndroid Build Coastguard Worker    sub          dword mx0m, 4<<14 ; move the start position back by 4 source pixels (14-bit fraction)
5444*c0909341SAndroid Build Coastguard Worker    sub        dword src_wm, 8 ; upper clip bound is src_w-8 (see iclip below)
5445*c0909341SAndroid Build Coastguard Worker    mov                  r6, ~0
5446*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, dxm
5447*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, mx0m
5448*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, src_wm
5449*c0909341SAndroid Build Coastguard Worker    kmovq                k3, r6 ; all-ones template: gathers clear their mask, so k1/k2 are reloaded from k3
5450*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
5451*c0909341SAndroid Build Coastguard Worker    LEA                  r7, $$
5452*c0909341SAndroid Build Coastguard Worker%define base r7-$$
5453*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+pw_m256]
5454*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pd_63]
5455*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m15, [base+pb_8x0_8x8]
5456*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
5457*c0909341SAndroid Build Coastguard Worker    pslld                m5, 4                      ; dx*16
5458*c0909341SAndroid Build Coastguard Worker    pslld                m6, 14 ; clip bound in the same fixed-point scale as mx
5459*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2 ; zero: lower clip bound and accumulator seed
5460*c0909341SAndroid Build Coastguard Worker    mova                m16, [base+resize_permA]
5461*c0909341SAndroid Build Coastguard Worker    mova                m17, [base+resize_permB]
5462*c0909341SAndroid Build Coastguard Worker    mova               xm18, [base+resize_permC]
5463*c0909341SAndroid Build Coastguard Worker.loop_y:
5464*c0909341SAndroid Build Coastguard Worker    xor                  xd, xd
5465*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8     ; per-line working version of mx
5466*c0909341SAndroid Build Coastguard Worker.loop_x:
5467*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m4, m2
5468*c0909341SAndroid Build Coastguard Worker    psrad                m9, m4, 8  ; filter offset (unmasked)
5469*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
5470*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m0 ; pshufb offset
5471*c0909341SAndroid Build Coastguard Worker    psrad                m0, 14     ; clipped src_x offset
5472*c0909341SAndroid Build Coastguard Worker    psrad                m1, 14     ; pshufb edge_emu offset
5473*c0909341SAndroid Build Coastguard Worker    vptestmd             k4, m1, m1 ; k4 bit set for each pixel whose position was clipped
5474*c0909341SAndroid Build Coastguard Worker    pand                 m9, m7     ; filter offset (masked)
5475*c0909341SAndroid Build Coastguard Worker    ktestw               k4, k4
5476*c0909341SAndroid Build Coastguard Worker    jz .load ; no pixel in this group needed clipping: take the plain gather path
; Edge path: fetch 8 source bytes per pixel at the clipped offset, then remap
; them with a per-pixel pshufb pattern gathered from resize_shuf (table
; contents are defined outside this section).
5477*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym12, m0, 1
5478*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym13, m1, 1
5479*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k3
5480*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k3
5481*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m10{k1}, [srcq+ym0] ; 8 bytes for each of pixels 0-7
5482*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m11{k2}, [srcq+ym12] ; 8 bytes for each of pixels 8-15
5483*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k3
5484*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k3
5485*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m14{k1}, [base+resize_shuf+4+ym1]
5486*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m0{k2}, [base+resize_shuf+4+ym13]
5487*c0909341SAndroid Build Coastguard Worker    mova                m12, m16
5488*c0909341SAndroid Build Coastguard Worker    mova                m13, m17
; pb_8x0_8x8 presumably adds 0 to the first and 8 to the second qword of each
; 128-bit lane so that each pshufb pattern indexes its own pixel - TODO confirm.
5489*c0909341SAndroid Build Coastguard Worker    paddb               m14, m15
5490*c0909341SAndroid Build Coastguard Worker    paddb                m0, m15
5491*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14
5492*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m0
; Rearrange into the layout .filter consumes (m12/m13, one dword per pixel);
; presumably the same split that .load produces - verify against resize_permA/B.
5493*c0909341SAndroid Build Coastguard Worker    vpermi2d            m12, m10, m11
5494*c0909341SAndroid Build Coastguard Worker    vpermi2d            m13, m10, m11
5495*c0909341SAndroid Build Coastguard Worker    jmp .filter
5496*c0909341SAndroid Build Coastguard Worker.load:
5497*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k3
5498*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k3
5499*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m12{k1}, [srcq+m0+0] ; source bytes 0-3 of each pixel's window
5500*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m13{k2}, [srcq+m0+4] ; source bytes 4-7 of each pixel's window
5501*c0909341SAndroid Build Coastguard Worker.filter:
5502*c0909341SAndroid Build Coastguard Worker    kmovq                k1, k3
5503*c0909341SAndroid Build Coastguard Worker    kmovq                k2, k3
5504*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m10{k1}, [base+resize_filter+m9*8+0] ; coefficients 0-3 per pixel
5505*c0909341SAndroid Build Coastguard Worker    vpgatherdd      m11{k2}, [base+resize_filter+m9*8+4] ; coefficients 4-7 per pixel
5506*c0909341SAndroid Build Coastguard Worker    mova                m14, m2
5507*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m14, m12, m10 ; += unsigned pixels * signed coefficients, 4 taps per dword
5508*c0909341SAndroid Build Coastguard Worker    vpdpbusd            m14, m13, m11 ; remaining 4 taps
5509*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m14
5510*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m3 ; rescale; presumably pw_m256 = -256, i.e. negate and rounded >>7 - TODO confirm
5511*c0909341SAndroid Build Coastguard Worker    packuswb            m14, m14
5512*c0909341SAndroid Build Coastguard Worker    vpermd              m14, m18, m14 ; collect the 16 result bytes into the low 128 bits
5513*c0909341SAndroid Build Coastguard Worker    mova          [dstq+xq], xm14
5514*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5 ; advance all 16 positions by dx*16
5515*c0909341SAndroid Build Coastguard Worker    add                  xd, 16
5516*c0909341SAndroid Build Coastguard Worker    cmp                  xd, dst_wd
5517*c0909341SAndroid Build Coastguard Worker    jl .loop_x
5518*c0909341SAndroid Build Coastguard Worker    add                dstq, dst_strideq
5519*c0909341SAndroid Build Coastguard Worker    add                srcq, src_strideq
5520*c0909341SAndroid Build Coastguard Worker    dec                  hd
5521*c0909341SAndroid Build Coastguard Worker    jg .loop_y
5522*c0909341SAndroid Build Coastguard Worker    RET
5523*c0909341SAndroid Build Coastguard Worker
5524*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
5525