; xref: /aosp_15_r20/external/libdav1d/src/x86/mc16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2020, VideoLAN and dav1d authors
2; Copyright © 2020, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
; Constant tables for the 16 bpc AVX-512 (Ice Lake) motion-compensation
; functions. The shuffle tables hold byte indices for vpermb/pshufb-style
; permutes: values 0-63 address a single 64-byte zmm source; tables whose
; indices reach 64-127 are presumably for two-source permutes
; (vpermt2b/vpermi2b) -- TODO confirm at each use site.
31SECTION_RODATA 64
32
; Horizontal subpel shuffles: each dword holds two overlapping 16-bit
; pixel pairs ((0,1),(1,2),(2,3),...), i.e. the window advances by one
; pixel per pair -- NOTE(review): the pattern matches pmaddwd with
; interleaved coefficient pairs; confirm against the h-filter code.
33spel_h_shufA:  db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
34               db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
35spel_h_shufC:  db  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
36               db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
37               db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
38               db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
39spel_h_shufB:  db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
40               db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
41spel_h_shufD:  db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
42               db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
43               db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
44               db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
; Vertical-stage interleave shuffles: pair 16-bit elements of adjacent
; rows (e.g. word 0 with word 8/16) ahead of the vertical madd.
45spel_v_shuf8:  db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
46               db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
47               db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
48               db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
49spel_v_shuf16: db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
50               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
51               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
52               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
; prep_end*: select bytes 1 and 2 of every dword, i.e. extract the
; 16-bit value (dword >> 8) when narrowing 32-bit intermediates back to
; words (rounding is presumably applied before this shuffle). The B/C
; variants additionally reorder 8-byte/16-byte groups across the two
; source registers.
53prep_endA:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
54               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
55               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
56               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
57prep_endB:     db  1,  2,  5,  6,  9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
58               db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
59               db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
60               db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
61prep_endC:     db  1,  2,  5,  6,  9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
62               db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
63               db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
64               db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
; spel_shuf*: width-specific interleave patterns for the subpel HV
; stages; like prep_end* they pick byte 1-2 of each dword (>>8 extract)
; while merging two source registers.
65spel_shuf4a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
66               db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
67               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
68               db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
69spel_shuf4b:   db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
70               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
71               db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
72               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
73spel_shuf8a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
74               db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
75               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
76               db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
77spel_shuf8b:   db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
78               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
79               db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
80               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
81spel_shuf16:   db  1,  2, 33, 34,  5,  6, 37, 38,  9, 10, 41, 42, 13, 14, 45, 46
82               db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
83               db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
84               db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
85spel_shuf32:   db  1,  2, 65, 66,  5,  6, 69, 70,  9, 10, 73, 74, 13, 14, 77, 78
86               db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
87               db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
88               db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
89spel_h_shuf2b: db  1,  2, 17, 18,  5,  6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
90               db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50,  9, 10, 53, 54, 13, 14
91               db  9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
92spel_shuf2:    db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
93spel_h_shuf2a: db  0,  1,  2,  3,  2,  3,  4,  5, 16, 17, 18, 19, 18, 19, 20, 21
94               db  4,  5,  6,  7,  6,  7,  8,  9, 20, 21, 22, 23, 22, 23, 24, 25
; w_mask output packing: every 4th byte for the 420/422 paths, every
; 2nd byte for 444.
95w_mask_end42x: db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
96               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
97w_mask_end444: db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
98               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
99               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
100               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
101w_mask_shuf4:  db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
102               db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
103               db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
104               db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
105w_mask_shuf8:  db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
106               db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
107               db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
108               db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
109w_mask_shuf16: db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
110               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
111               db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
112               db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
; warp8x8 permutes: combine a dword from the low 32 bytes with the
; matching dword 32 bytes higher (i.e. pair two rows), sliding by one
; word per step; permB continues where permA's last row left off.
113warp8x8_permA: db  0,  1,  2,  3, 32, 33, 34, 35,  2,  3,  4,  5, 34, 35, 36, 37
114               db  4,  5,  6,  7, 36, 37, 38, 39,  6,  7,  8,  9, 38, 39, 40, 41
115               db  8,  9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
116               db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
117warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
118               db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
119               db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
120               db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
121warp8x8_end:   db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
122               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
123               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
124               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; deint_q_shuf deliberately overlaps the constants below: a qword
; permute only consumes the low 3 bits of each qword index, and the
; dwords of pd_0to7 / dd 1 / pw_2048 / dd 3 / pw_8192 / avg_shift /
; pw_27615 happen to form qwords whose low 3 bits read 0,2,4,6,1,3,5,7
; (the commented-out dq line). NOTE(review): this is only valid while
; the exact layout and order of the next nine lines is preserved.
125deint_q_shuf: ;dq  0,  2,  4,  6,  1,  3,  5,  7
126pd_0to7:       dd  0,  1,  2,  3,  4,  5,  6,  7
127               dd  1
128pw_2048:       times 2 dw 2048
129               dd  3
130pw_8192:       times 2 dw 8192
131avg_shift:     dw  5,  5,  3,  3
132pw_27615:      times 2 dw 27615
133pw_32766:      times 2 dw 32766
; Index byte -1 (0x80+) produces a zero lane under the pshufb sign-bit
; rule, so these widen bytes to words with a zero byte interleaved.
134warp8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
135warp8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
136warp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
137blend_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
; Dword/qword permutes and byte shuffles for the resize kernels.
138resize_permA:  dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
139resize_permB:  dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
140resize_permC:  dq  0,  1,  4,  5,  8,  9, 12, 13
141resize_permD:  dq  2,  3,  6,  7, 10, 11, 14, 15
142resize_permE:  dq  0,  2,  4,  6
143resize_shufA:  db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
144resize_shufB:  db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
145rescale_mul:   dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
; Edge clamp for resize: replicate the first pixel on the left and the
; last pixel on the right of a 16-pixel row.
146resize_shuf:   db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
147               db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
148
; Rounding/scaling constants. Most entries come in (10-bit, 12-bit)
; pairs: the bilin .h path below selects between them by indexing with
; (bitdepth_max >> 11), which is 0 for 10-bit (1023) and 1 for 12-bit
; (4095); the other paired tables are presumably selected the same way
; -- confirm at each use site.
149prep_hv_shift:    dq  6,  4
150put_bilin_h_rnd:  dw  8,  8, 10, 10
151prep_mul:         dw 16, 16,  4,  4
152put_8tap_h_rnd:   dd 34, 40
153prep_8tap_rnd:    dd 128 - (8192 << 8)
154warp_8x8_rnd_h:   dd 512, 2048
155warp_8x8_rnd_v:   dd 262144, 65536
156warp_8x8t_rnd_v:  dd 16384 - (8192 << 15)
157avg_round:        dw -16400, -16400, -16388, -16388
158w_avg_round:      dd 128 + (8192 << 4),  32 + (8192 << 4)
159mask_round:       dd 512 + (8192 << 6), 128 + (8192 << 6)
160w_mask_round:     dd 128, 64
161bidir_shift:      dw  6,  6,  4,  4
162
; Simple broadcast constants (pb = bytes, pw = words, pd = dwords).
163pb_64:    times 4 db 64
164pw_m512:  times 2 dw -512
165pw_2:     times 2 dw 2
166pw_64:    times 2 dw 64
167pd_32:    dd 32
168pd_63:    dd 63
169pd_128:   dd 128
170pd_640:   dd 640
171pd_2176:  dd 2176
172pd_16384: dd 16384
173pd_0_4:   dd 0, 4
174
; Storage-saving aliases: pw_16 reuses the leading {16,16} words of
; prep_mul, and pd_512 the leading dword (512) of warp_8x8_rnd_h.
175%define pw_16 prep_mul
176%define pd_512 warp_8x8_rnd_h
177
; BASE_JMP_TABLE name, suffix, w0 [, w1 ...]
; Emits one 16-bit offset per width argument:
;     dw name_suffix_wN - name_suffix
; and defines `name_suffix_table` as the table address biased by -w0.
; The bias makes tzcnt-based indexing work directly: the dispatcher
; reads word [table_sym + tzcnt(w)*2], and for the power-of-two first
; widths used here (2 or 4) tzcnt(w0)*2 == w0, so w == w0 lands exactly
; on entry 0.
178%macro BASE_JMP_TABLE 3-*
179    %xdefine %1_%2_table (%%table - %3)
180    %xdefine %%base %1_%2
181    %%table:
182    %rep %0 - 2
183        dw %%base %+ _w%3 - %%base
184        %rotate 1
185    %endrep
186%endmacro
187
; HV_JMP_TABLE fn, type, suffix, types_mask, w0 [, w1 ...]
; Emits up to three 16-bit-offset jump tables (horizontal, vertical,
; horizontal+vertical) for the subpel variants of `fn`, gated by bits
; 1, 2 and 4 of types_mask. Offsets are taken relative to
; %%base = fn_suffix (an %xdefine'd entry label such as put_avx512icl),
; and each table symbol is biased by -w0, same tzcnt indexing scheme as
; BASE_JMP_TABLE.
188%macro HV_JMP_TABLE 5-*
189    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
190    %xdefine %%base %1_%3
191    %assign %%types %4
192    %if %%types & 1
193        %xdefine %1_%2_h_%3_table  (%%h  - %5)
194        %%h:
195        %rep %0 - 4
196            dw %%prefix %+ .h_w%5 - %%base
197            %rotate 1
198        %endrep
        ; the %rep above rotated the args %0-4 times; rotating 4 more
        ; completes a full cycle so the width list is at %5 again
199        %rotate 4
200    %endif
201    %if %%types & 2
202        %xdefine %1_%2_v_%3_table  (%%v  - %5)
203        %%v:
204        %rep %0 - 4
205            dw %%prefix %+ .v_w%5 - %%base
206            %rotate 1
207        %endrep
208        %rotate 4
209    %endif
210    %if %%types & 4
211        %xdefine %1_%2_hv_%3_table (%%hv - %5)
212        %%hv:
213        %rep %0 - 4
214            dw %%prefix %+ .hv_w%5 - %%base
215            %rotate 1
216        %endrep
217    %endif
218%endmacro
219
; BIDIR_JMP_TABLE fn, suffix, w0 [, w1 ...]
; Emits 32-bit offsets for fn's .wN entry points. The table symbol is
; biased by -2*w0 (dd entries are 4 bytes, so tzcnt(w)*4 indexing lands
; on entry 0 for w == w0 when w0 is 2 or 4), and each entry is stored
; relative to the biased symbol itself (%%base = fn_suffix_table);
; the dispatcher presumably adds the same biased address back to form
; the jump target -- confirm at the use sites.
220%macro BIDIR_JMP_TABLE 2-*
221    %xdefine %1_%2_table (%%table - 2*%3)
222    %xdefine %%base %1_%2_table
223    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
224    %%table:
225    %rep %0 - 2
226        dd %%prefix %+ .w%3 - %%base
227        %rotate 1
228    %endrep
229%endmacro
230
; Jump-table bases: the .put/.prep labels inside the bilin functions
; double as the base address that every put_*/prep_* table offset is
; measured from.
231%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
232%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
233
; Per-function width dispatch tables. For HV_JMP_TABLE, the mask arg
; is 7 (h+v+hv) for bilin and 2 (v only) for 6tap/8tap; prep variants
; start at width 4, put variants at width 2.
234BIDIR_JMP_TABLE avg,        avx512icl,       4, 8, 16, 32, 64, 128
235BIDIR_JMP_TABLE w_avg,      avx512icl,       4, 8, 16, 32, 64, 128
236BIDIR_JMP_TABLE mask,       avx512icl,       4, 8, 16, 32, 64, 128
237BIDIR_JMP_TABLE w_mask_420, avx512icl,       4, 8, 16, 32, 64, 128
238BIDIR_JMP_TABLE w_mask_422, avx512icl,       4, 8, 16, 32, 64, 128
239BIDIR_JMP_TABLE w_mask_444, avx512icl,       4, 8, 16, 32, 64, 128
240BIDIR_JMP_TABLE blend,      avx512icl,       4, 8, 16, 32
241BIDIR_JMP_TABLE blend_v,    avx512icl,    2, 4, 8, 16, 32
242BIDIR_JMP_TABLE blend_h,    avx512icl,    2, 4, 8, 16, 32, 64, 128
243BASE_JMP_TABLE put,         avx512icl,    2, 4, 8, 16, 32, 64, 128
244BASE_JMP_TABLE prep,        avx512icl,       4, 8, 16, 32, 64, 128
245HV_JMP_TABLE   put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
246HV_JMP_TABLE   prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
247HV_JMP_TABLE   put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
248HV_JMP_TABLE   put,  8tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
249HV_JMP_TABLE   prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
250HV_JMP_TABLE   prep, 8tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
251
; table_offset(type, fn) -> displacement of fn's jump table from the
; type's base label, usable as a constant in an addressing expression.
252%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
253
254cextern mc_subpel_filters
; -8 byte bias on the shared filter table -- NOTE(review): presumably
; compensates for a filter index that starts at 1 (8 bytes per filter);
; confirm against the subpel filter-fetch code.
255%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
256
257cextern mc_warp_filter
258cextern obmc_masks_avx2
259cextern resize_filter
260
261SECTION .text
262
; t0 is a call-clobbered scratch register that must not alias any of
; the named parameter registers: x86inc register 4 on Win64, register 8
; on SysV -- TODO confirm the exact hardware-register mapping in
; x86inc.asm for each ABI.
263%if WIN64
264DECLARE_REG_TMP 4
265%else
266DECLARE_REG_TMP 8
267%endif
268
; All code below uses 512-bit (zmm) registers and gets the
; _avx512icl function-name suffix.
269INIT_ZMM avx512icl
270cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
271    mov                mxyd, r6m ; mx
272    lea                  r7, [put_avx512icl]
273    tzcnt               t0d, wm
274    movifnidn            hd, hm
275    test               mxyd, mxyd
276    jnz .h
277    mov                mxyd, r7m ; my
278    test               mxyd, mxyd
279    jnz .v
280.put:
281    movzx               t0d, word [r7+t0*2+table_offset(put,)]
282    add                  t0, r7
283    jmp                  t0
284.put_w2:
285    mov                 r6d, [srcq+ssq*0]
286    mov                 r7d, [srcq+ssq*1]
287    lea                srcq, [srcq+ssq*2]
288    mov        [dstq+dsq*0], r6d
289    mov        [dstq+dsq*1], r7d
290    lea                dstq, [dstq+dsq*2]
291    sub                  hd, 2
292    jg .put_w2
293    RET
294.put_w4:
295    mov                  r6, [srcq+ssq*0]
296    mov                  r7, [srcq+ssq*1]
297    lea                srcq, [srcq+ssq*2]
298    mov        [dstq+dsq*0], r6
299    mov        [dstq+dsq*1], r7
300    lea                dstq, [dstq+dsq*2]
301    sub                  hd, 2
302    jg .put_w4
303    RET
304.put_w8:
305    movu               xmm0, [srcq+ssq*0]
306    movu               xmm1, [srcq+ssq*1]
307    lea                srcq, [srcq+ssq*2]
308    mova       [dstq+dsq*0], xmm0
309    mova       [dstq+dsq*1], xmm1
310    lea                dstq, [dstq+dsq*2]
311    sub                  hd, 2
312    jg .put_w8
313    RET
314.put_w16:
315    movu                ym0, [srcq+ssq*0]
316    movu                ym1, [srcq+ssq*1]
317    lea                srcq, [srcq+ssq*2]
318    mova       [dstq+dsq*0], ym0
319    mova       [dstq+dsq*1], ym1
320    lea                dstq, [dstq+dsq*2]
321    sub                  hd, 2
322    jg .put_w16
323    RET
324.put_w32:
325    movu                 m0, [srcq+ssq*0]
326    movu                 m1, [srcq+ssq*1]
327    lea                srcq, [srcq+ssq*2]
328    mova       [dstq+dsq*0], m0
329    mova       [dstq+dsq*1], m1
330    lea                dstq, [dstq+dsq*2]
331    sub                  hd, 2
332    jg .put_w32
333    RET
334.put_w64:
335    movu                 m0, [srcq+ssq*0+64*0]
336    movu                 m1, [srcq+ssq*0+64*1]
337    movu                 m2, [srcq+ssq*1+64*0]
338    movu                 m3, [srcq+ssq*1+64*1]
339    lea                srcq, [srcq+ssq*2]
340    mova  [dstq+dsq*0+64*0], m0
341    mova  [dstq+dsq*0+64*1], m1
342    mova  [dstq+dsq*1+64*0], m2
343    mova  [dstq+dsq*1+64*1], m3
344    lea                dstq, [dstq+dsq*2]
345    sub                  hd, 2
346    jg .put_w64
347    RET
348.put_w128:
349    movu                 m0, [srcq+64*0]
350    movu                 m1, [srcq+64*1]
351    movu                 m2, [srcq+64*2]
352    movu                 m3, [srcq+64*3]
353    add                srcq, ssq
354    mova        [dstq+64*0], m0
355    mova        [dstq+64*1], m1
356    mova        [dstq+64*2], m2
357    mova        [dstq+64*3], m3
358    add                dstq, dsq
359    dec                  hd
360    jg .put_w128
361    RET
362.h:
363    vpbroadcastw         m5, mxyd
364    mov                mxyd, r7m ; my
365    vpbroadcastd         m4, [pw_16]
366    psubw                m4, m5
367    test               mxyd, mxyd
368    jnz .hv
369    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
370    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
371    mov                 r6d, r8m ; bitdepth_max
372    add                  t0, r7
373    shr                 r6d, 11
374    vpbroadcastd         m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
375    jmp                  t0
376.h_w2:
377    movq               xmm1, [srcq+ssq*0]
378    movhps             xmm1, [srcq+ssq*1]
379    lea                srcq, [srcq+ssq*2]
380    pmullw             xmm0, xmm1, xm4
381    psrlq              xmm1, 16
382    pmullw             xmm1, xm5
383    paddw              xmm0, xm6
384    paddw              xmm0, xmm1
385    psrlw              xmm0, 4
386    movd       [dstq+dsq*0], xmm0
387    pextrd     [dstq+dsq*1], xmm0, 2
388    lea                dstq, [dstq+dsq*2]
389    sub                  hd, 2
390    jg .h_w2
391    RET
392.h_w4:
393    movq               xmm0, [srcq+ssq*0+0]
394    movhps             xmm0, [srcq+ssq*1+0]
395    movq               xmm1, [srcq+ssq*0+2]
396    movhps             xmm1, [srcq+ssq*1+2]
397    lea                srcq, [srcq+ssq*2]
398    pmullw             xmm0, xm4
399    pmullw             xmm1, xm5
400    paddw              xmm0, xm6
401    paddw              xmm0, xmm1
402    psrlw              xmm0, 4
403    movq       [dstq+dsq*0], xmm0
404    movhps     [dstq+dsq*1], xmm0
405    lea                dstq, [dstq+dsq*2]
406    sub                  hd, 2
407    jg .h_w4
408    RET
409.h_w8:
410    movu                xm0, [srcq+ssq*0+0]
411    vinserti32x4        ym0, [srcq+ssq*1+0], 1
412    movu                xm1, [srcq+ssq*0+2]
413    vinserti32x4        ym1, [srcq+ssq*1+2], 1
414    lea                srcq, [srcq+ssq*2]
415    pmullw              ym0, ym4
416    pmullw              ym1, ym5
417    paddw               ym0, ym6
418    paddw               ym0, ym1
419    psrlw               ym0, 4
420    mova          [dstq+dsq*0], xm0
421    vextracti32x4 [dstq+dsq*1], ym0, 1
422    lea                dstq, [dstq+dsq*2]
423    sub                  hd, 2
424    jg .h_w8
425    RET
426.h_w16:
427    movu                ym0, [srcq+ssq*0+0]
428    vinserti32x8         m0, [srcq+ssq*1+0], 1
429    movu                ym1, [srcq+ssq*0+2]
430    vinserti32x8         m1, [srcq+ssq*1+2], 1
431    lea                srcq, [srcq+ssq*2]
432    pmullw               m0, m4
433    pmullw               m1, m5
434    paddw                m0, m6
435    paddw                m0, m1
436    psrlw                m0, 4
437    mova          [dstq+dsq*0], ym0
438    vextracti32x8 [dstq+dsq*1], m0, 1
439    lea                dstq, [dstq+dsq*2]
440    sub                  hd, 2
441    jg .h_w16
442    RET
443.h_w32:
444    pmullw               m0, m4, [srcq+ssq*0+0]
445    pmullw               m2, m5, [srcq+ssq*0+2]
446    pmullw               m1, m4, [srcq+ssq*1+0]
447    pmullw               m3, m5, [srcq+ssq*1+2]
448    lea                srcq, [srcq+ssq*2]
449    paddw                m0, m6
450    paddw                m1, m6
451    paddw                m0, m2
452    paddw                m1, m3
453    psrlw                m0, 4
454    psrlw                m1, 4
455    mova       [dstq+dsq*0], m0
456    mova       [dstq+dsq*1], m1
457    lea                dstq, [dstq+dsq*2]
458    sub                  hd, 2
459    jg .h_w32
460    RET
461.h_w64:
462    pmullw               m0, m4, [srcq+64*0+0]
463    pmullw               m2, m5, [srcq+64*0+2]
464    pmullw               m1, m4, [srcq+64*1+0]
465    pmullw               m3, m5, [srcq+64*1+2]
466    add                srcq, ssq
467    paddw                m0, m6
468    paddw                m1, m6
469    paddw                m0, m2
470    paddw                m1, m3
471    psrlw                m0, 4
472    psrlw                m1, 4
473    mova        [dstq+64*0], m0
474    mova        [dstq+64*1], m1
475    add                dstq, dsq
476    dec                  hd
477    jg .h_w64
478    RET
479.h_w128:
480    pmullw               m0, m4, [srcq+64*0+0]
481    pmullw               m7, m5, [srcq+64*0+2]
482    pmullw               m1, m4, [srcq+64*1+0]
483    pmullw               m8, m5, [srcq+64*1+2]
484    pmullw               m2, m4, [srcq+64*2+0]
485    pmullw               m9, m5, [srcq+64*2+2]
486    pmullw               m3, m4, [srcq+64*3+0]
487    pmullw              m10, m5, [srcq+64*3+2]
488    add                srcq, ssq
489    REPX      {paddw x, m6}, m0, m1, m2, m3
490    paddw                m0, m7
491    paddw                m1, m8
492    paddw                m2, m9
493    paddw                m3, m10
494    REPX       {psrlw x, 4}, m0, m1, m2, m3
495    mova        [dstq+64*0], m0
496    mova        [dstq+64*1], m1
497    mova        [dstq+64*2], m2
498    mova        [dstq+64*3], m3
499    add                dstq, dsq
500    dec                  hd
501    jg .h_w128
502    RET
503.v:
504    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
505    shl                mxyd, 11
506    vpbroadcastw         m8, mxyd
507    add                  t0, r7
508    jmp                  t0
509.v_w2:
510    movd               xmm0, [srcq+ssq*0]
511.v_w2_loop:
512    movd               xmm1, [srcq+ssq*1]
513    lea                srcq, [srcq+ssq*2]
514    punpckldq          xmm2, xmm0, xmm1
515    movd               xmm0, [srcq+ssq*0]
516    punpckldq          xmm1, xmm0
517    psubw              xmm1, xmm2
518    pmulhrsw           xmm1, xm8
519    paddw              xmm1, xmm2
520    movd       [dstq+dsq*0], xmm1
521    pextrd     [dstq+dsq*1], xmm1, 1
522    lea                dstq, [dstq+dsq*2]
523    sub                  hd, 2
524    jg .v_w2_loop
525    RET
526.v_w4:
527    movq               xmm0, [srcq+ssq*0]
528.v_w4_loop:
529    movq               xmm1, [srcq+ssq*1]
530    lea                srcq, [srcq+ssq*2]
531    punpcklqdq         xmm2, xmm0, xmm1
532    movq               xmm0, [srcq+ssq*0]
533    punpcklqdq         xmm1, xmm0
534    psubw              xmm1, xmm2
535    pmulhrsw           xmm1, xm8
536    paddw              xmm1, xmm2
537    movq       [dstq+dsq*0], xmm1
538    movhps     [dstq+dsq*1], xmm1
539    lea                dstq, [dstq+dsq*2]
540    sub                  hd, 2
541    jg .v_w4_loop
542    RET
543.v_w8:
544    movu               xmm0, [srcq+ssq*0]
545.v_w8_loop:
546    vbroadcasti128     ymm1, [srcq+ssq*1]
547    lea                srcq, [srcq+ssq*2]
548    vpblendd           ymm2, ymm0, ymm1, 0xf0
549    vbroadcasti128     ymm0, [srcq+ssq*0]
550    vpblendd           ymm1, ymm0, 0xf0
551    psubw              ymm1, ymm2
552    pmulhrsw           ymm1, ym8
553    paddw              ymm1, ymm2
554    mova         [dstq+dsq*0], xmm1
555    vextracti128 [dstq+dsq*1], ymm1, 1
556    lea                dstq, [dstq+dsq*2]
557    sub                  hd, 2
558    jg .v_w8_loop
559    vzeroupper
560    RET
561.v_w16:
562    movu                ym0, [srcq+ssq*0]
563.v_w16_loop:
564    movu                ym3, [srcq+ssq*1]
565    lea                srcq, [srcq+ssq*2]
566    psubw               ym1, ym3, ym0
567    pmulhrsw            ym1, ym8
568    paddw               ym1, ym0
569    movu                ym0, [srcq+ssq*0]
570    psubw               ym2, ym0, ym3
571    pmulhrsw            ym2, ym8
572    paddw               ym2, ym3
573    mova       [dstq+dsq*0], ym1
574    mova       [dstq+dsq*1], ym2
575    lea                dstq, [dstq+dsq*2]
576    sub                  hd, 2
577    jg .v_w16_loop
578    RET
579.v_w32:
580    movu                 m0, [srcq+ssq*0]
581.v_w32_loop:
582    movu                 m3, [srcq+ssq*1]
583    lea                srcq, [srcq+ssq*2]
584    psubw                m1, m3, m0
585    pmulhrsw             m1, m8
586    paddw                m1, m0
587    movu                 m0, [srcq+ssq*0]
588    psubw                m2, m0, m3
589    pmulhrsw             m2, m8
590    paddw                m2, m3
591    mova       [dstq+dsq*0], m1
592    mova       [dstq+dsq*1], m2
593    lea                dstq, [dstq+dsq*2]
594    sub                  hd, 2
595    jg .v_w32_loop
596    RET
597.v_w64:
598    movu                 m0, [srcq+ssq*0+64*0]
599    movu                 m1, [srcq+ssq*0+64*1]
600.v_w64_loop:
601    movu                 m2, [srcq+ssq*1+64*0]
602    movu                 m3, [srcq+ssq*1+64*1]
603    lea                srcq, [srcq+ssq*2]
604    psubw                m4, m2, m0
605    pmulhrsw             m4, m8
606    paddw                m4, m0
607    movu                 m0, [srcq+ssq*0+64*0]
608    psubw                m5, m3, m1
609    pmulhrsw             m5, m8
610    paddw                m5, m1
611    movu                 m1, [srcq+ssq*0+64*1]
612    psubw                m6, m0, m2
613    pmulhrsw             m6, m8
614    psubw                m7, m1, m3
615    pmulhrsw             m7, m8
616    mova  [dstq+dsq*0+64*0], m4
617    mova  [dstq+dsq*0+64*1], m5
618    paddw                m6, m2
619    paddw                m7, m3
620    mova  [dstq+dsq*1+64*0], m6
621    mova  [dstq+dsq*1+64*1], m7
622    lea                dstq, [dstq+dsq*2]
623    sub                  hd, 2
624    jg .v_w64_loop
625    RET
626.v_w128:
627    movu                 m0, [srcq+ssq*0+64*0]
628    movu                 m1, [srcq+ssq*0+64*1]
629    movu                 m2, [srcq+ssq*0+64*2]
630    movu                 m3, [srcq+ssq*0+64*3]
631.v_w128_loop:
632    movu                 m4, [srcq+ssq*1+64*0]
633    movu                 m5, [srcq+ssq*1+64*1]
634    movu                 m6, [srcq+ssq*1+64*2]
635    movu                 m7, [srcq+ssq*1+64*3]
636    lea                srcq, [srcq+ssq*2]
637    psubw                m9, m4, m0
638    pmulhrsw             m9, m8
639    paddw                m9, m0
640    movu                 m0, [srcq+ssq*0+64*0]
641    psubw               m10, m5, m1
642    pmulhrsw            m10, m8
643    paddw               m10, m1
644    movu                 m1, [srcq+ssq*0+64*1]
645    psubw               m11, m6, m2
646    pmulhrsw            m11, m8
647    paddw               m11, m2
648    movu                 m2, [srcq+ssq*0+64*2]
649    psubw               m12, m7, m3
650    pmulhrsw            m12, m8
651    paddw               m12, m3
652    movu                 m3, [srcq+ssq*0+64*3]
653    mova  [dstq+dsq*0+64*0], m9
654    psubw                m9, m0, m4
655    pmulhrsw             m9, m8
656    mova  [dstq+dsq*0+64*1], m10
657    psubw               m10, m1, m5
658    pmulhrsw            m10, m8
659    mova  [dstq+dsq*0+64*2], m11
660    psubw               m11, m2, m6
661    pmulhrsw            m11, m8
662    mova  [dstq+dsq*0+64*3], m12
663    psubw               m12, m3, m7
664    pmulhrsw            m12, m8
665    paddw                m9, m4
666    paddw               m10, m5
667    mova  [dstq+dsq*1+64*0], m9
668    mova  [dstq+dsq*1+64*1], m10
669    paddw               m11, m6
670    paddw               m12, m7
671    mova  [dstq+dsq*1+64*2], m11
672    mova  [dstq+dsq*1+64*3], m12
673    lea                dstq, [dstq+dsq*2]
674    sub                  hd, 2
675    jg .v_w128_loop
676    RET
677.hv:
678    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
679    shl                mxyd, 11
680    vpbroadcastd         m6, [pw_2]
681    vpbroadcastw         m7, mxyd
682    vpbroadcastd         m8, [pw_8192]
683    add                  t0, r7
684    test          dword r8m, 0x800
685    jnz .hv_12bpc
686    psllw                m4, 2
687    psllw                m5, 2
688    vpbroadcastd         m8, [pw_2048]
689.hv_12bpc:
690    jmp                  t0
691.hv_w2:
692    vpbroadcastq       xmm1, [srcq+ssq*0]
693    pmullw             xmm0, xmm1, xm4
694    psrlq              xmm1, 16
695    pmullw             xmm1, xm5
696    paddw              xmm0, xm6
697    paddw              xmm0, xmm1
698    psrlw              xmm0, 2
699.hv_w2_loop:
700    movq               xmm2, [srcq+ssq*1]
701    lea                srcq, [srcq+ssq*2]
702    movhps             xmm2, [srcq+ssq*0]
703    pmullw             xmm1, xmm2, xm4
704    psrlq              xmm2, 16
705    pmullw             xmm2, xm5
706    paddw              xmm1, xm6
707    paddw              xmm1, xmm2
708    psrlw              xmm1, 2                ; 1 _ 2 _
709    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
710    mova               xmm0, xmm1
711    psubw              xmm1, xmm2
712    paddw              xmm1, xmm1
713    pmulhw             xmm1, xm7
714    paddw              xmm1, xmm2
715    pmulhrsw           xmm1, xm8
716    movd       [dstq+dsq*0], xmm1
717    pextrd     [dstq+dsq*1], xmm1, 2
718    lea                dstq, [dstq+dsq*2]
719    sub                  hd, 2
720    jg .hv_w2_loop
721    RET
722.hv_w4:
723    pmullw             xmm0, xm4, [srcq+ssq*0-8]
724    pmullw             xmm1, xm5, [srcq+ssq*0-6]
725    paddw              xmm0, xm6
726    paddw              xmm0, xmm1
727    psrlw              xmm0, 2
728.hv_w4_loop:
729    movq               xmm1, [srcq+ssq*1+0]
730    movq               xmm2, [srcq+ssq*1+2]
731    lea                srcq, [srcq+ssq*2]
732    movhps             xmm1, [srcq+ssq*0+0]
733    movhps             xmm2, [srcq+ssq*0+2]
734    pmullw             xmm1, xm4
735    pmullw             xmm2, xm5
736    paddw              xmm1, xm6
737    paddw              xmm1, xmm2
738    psrlw              xmm1, 2                ; 1 2
739    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 1
740    mova               xmm0, xmm1
741    psubw              xmm1, xmm2
742    paddw              xmm1, xmm1
743    pmulhw             xmm1, xm7
744    paddw              xmm1, xmm2
745    pmulhrsw           xmm1, xm8
746    movq       [dstq+dsq*0], xmm1
747    movhps     [dstq+dsq*1], xmm1
748    lea                dstq, [dstq+dsq*2]
749    sub                  hd, 2
750    jg .hv_w4_loop
751    RET
752.hv_w8:
753    pmullw             xmm0, xm4, [srcq+ssq*0+0]
754    pmullw             xmm1, xm5, [srcq+ssq*0+2]
755    paddw              xmm0, xm6
756    paddw              xmm0, xmm1
757    psrlw              xmm0, 2
758    vinserti32x4        ym0, xmm0, 1
759.hv_w8_loop:
760    movu                xm1, [srcq+ssq*1+0]
761    movu                xm2, [srcq+ssq*1+2]
762    lea                srcq, [srcq+ssq*2]
763    vinserti32x4        ym1, [srcq+ssq*0+0], 1
764    vinserti32x4        ym2, [srcq+ssq*0+2], 1
765    pmullw              ym1, ym4
766    pmullw              ym2, ym5
767    paddw               ym1, ym6
768    paddw               ym1, ym2
769    psrlw               ym1, 2              ; 1 2
770    vshufi32x4          ym2, ym0, ym1, 0x01 ; 0 1
771    mova                ym0, ym1
772    psubw               ym1, ym2
773    paddw               ym1, ym1
774    pmulhw              ym1, ym7
775    paddw               ym1, ym2
776    pmulhrsw            ym1, ym8
777    mova          [dstq+dsq*0], xm1
778    vextracti32x4 [dstq+dsq*1], ym1, 1
779    lea                dstq, [dstq+dsq*2]
780    sub                  hd, 2
781    jg .hv_w8_loop
782    RET
783.hv_w16:
784    pmullw              ym0, ym4, [srcq+ssq*0+0]
785    pmullw              ym1, ym5, [srcq+ssq*0+2]
786    paddw               ym0, ym6
787    paddw               ym0, ym1
788    psrlw               ym0, 2
789    vinserti32x8         m0, ym0, 1
790.hv_w16_loop:
791    movu                ym1, [srcq+ssq*1+0]
792    movu                ym2, [srcq+ssq*1+2]
793    lea                srcq, [srcq+ssq*2]
794    vinserti32x8         m1, [srcq+ssq*0+0], 1
795    vinserti32x8         m2, [srcq+ssq*0+2], 1
796    pmullw               m1, m4
797    pmullw               m2, m5
798    paddw                m1, m6
799    paddw                m1, m2
800    psrlw                m1, 2             ; 1 2
801    vshufi32x4           m2, m0, m1, q1032 ; 0 1
802    mova                 m0, m1
803    psubw                m1, m2
804    paddw                m1, m1
805    pmulhw               m1, m7
806    paddw                m1, m2
807    pmulhrsw             m1, m8
808    mova          [dstq+dsq*0], ym1
809    vextracti32x8 [dstq+dsq*1], m1, 1
810    lea                dstq, [dstq+dsq*2]
811    sub                  hd, 2
812    jg .hv_w16_loop
813    RET
814.hv_w32:
815.hv_w64:
816.hv_w128:
817    movifnidn            wd, wm
818    lea                 r6d, [hq+wq*8-256]
819    mov                  r4, srcq
820    mov                  r7, dstq
821.hv_w32_loop0:
822    pmullw               m0, m4, [srcq+ssq*0+0]
823    pmullw               m1, m5, [srcq+ssq*0+2]
824    paddw                m0, m6
825    paddw                m0, m1
826    psrlw                m0, 2
827.hv_w32_loop:
828    pmullw               m3, m4, [srcq+ssq*1+0]
829    pmullw               m1, m5, [srcq+ssq*1+2]
830    lea                srcq, [srcq+ssq*2]
831    paddw                m3, m6
832    paddw                m3, m1
833    psrlw                m3, 2
834    psubw                m1, m3, m0
835    paddw                m1, m1
836    pmulhw               m1, m7
837    paddw                m1, m0
838    pmullw               m0, m4, [srcq+ssq*0+0]
839    pmullw               m2, m5, [srcq+ssq*0+2]
840    paddw                m0, m6
841    paddw                m0, m2
842    psrlw                m0, 2
843    psubw                m2, m0, m3
844    paddw                m2, m2
845    pmulhw               m2, m7
846    paddw                m2, m3
847    pmulhrsw             m1, m8
848    pmulhrsw             m2, m8
849    mova       [dstq+dsq*0], m1
850    mova       [dstq+dsq*1], m2
851    lea                dstq, [dstq+dsq*2]
852    sub                  hd, 2
853    jg .hv_w32_loop
854    add                  r4, 64
855    add                  r7, 64
856    movzx                hd, r6b
857    mov                srcq, r4
858    mov                dstq, r7
859    sub                 r6d, 1<<8
860    jg .hv_w32_loop0
861    RET
862
; prep_bilin: write pixels into the 16-bit intermediate (compound) buffer,
; with optional bilinear subpel filtering. mx (r5m) selects the horizontal
; filter path, my (r6m) the vertical one; both zero -> plain scaled copy.
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    lea                  r6, [prep_avx512icl]
    tzcnt                wd, wm  ; log2(w) indexes the width jump tables
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    ; copy path: tmp = src * prep_mul - 8192, specialized per width below
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    mov                 r5d, r7m ; bitdepth_max
    vpbroadcastd         m5, [r6-prep_avx512icl+pw_8192]
    add                  wq, r6
    shr                 r5d, 11  ; bitdepth_max>>11 selects the prep_mul entry
    vpbroadcastd         m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    mov                 r3d, 0x0c
    kmovb                k1, r3d ; write-mask: merge row 3 into qword lane 1 half
.prep_w4_loop:
    ; pack 4 rows of 4 pixels into a single ymm before scaling
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    vpbroadcastq        ym1, [srcq+strideq*2]
    vpunpcklqdq     ym0{k1}, ym1, [srcq+stride3q] {1to4}
    lea                srcq, [srcq+strideq*4]
    pmullw              ym0, ym4
    psubw               ym0, ym5
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .prep_w4_loop
    RET
.prep_w8:
    ; 4 rows per iteration gathered into one zmm store
    movu                xm0, [srcq+strideq*0]
    vinserti32x4        ym0, [srcq+strideq*1], 1
    vinserti32x4         m0, [srcq+strideq*2], 2
    vinserti32x4         m0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    psubw                m0, m5
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    ; 4 rows per iteration, two zmm stores
    movu                ym0, [srcq+strideq*0]
    vinserti32x8         m0, [srcq+strideq*1], 1
    movu                ym1, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m4
    psubw                m0, m5
    psubw                m1, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 4
    jg .prep_w16
    RET
.prep_w32:
    ; one zmm per row, 4 rows per iteration
    pmullw               m0, m4, [srcq+strideq*0]
    pmullw               m1, m4, [srcq+strideq*1]
    pmullw               m2, m4, [srcq+strideq*2]
    pmullw               m3, m4, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 4
    jg .prep_w32
    RET
.prep_w64:
    ; two zmm per row, 2 rows per iteration
    pmullw               m0, m4, [srcq+strideq*0+64*0]
    pmullw               m1, m4, [srcq+strideq*0+64*1]
    pmullw               m2, m4, [srcq+strideq*1+64*0]
    pmullw               m3, m4, [srcq+strideq*1+64*1]
    lea                srcq, [srcq+strideq*2]
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 2
    jg .prep_w64
    RET
.prep_w128:
    ; four zmm per row, 1 row per iteration
    pmullw               m0, m4, [srcq+64*0]
    pmullw               m1, m4, [srcq+64*1]
    pmullw               m2, m4, [srcq+64*2]
    pmullw               m3, m4, [srcq+64*3]
    add                srcq, strideq
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    dec                  hd
    jg .prep_w128
    RET
.h:
    ; horizontal bilinear: out = (p0*(16-mx) + p1*mx - 32766) >> 2
    ; coefficients are pre-shifted left by 2 for 10-bit input so the same
    ; >>2 yields the intermediate precision for both bitdepths
    vpbroadcastw         m5, mxyd
    mov                mxyd, r6m ; my
    vpbroadcastd         m4, [pw_16]
    vpbroadcastd         m6, [pw_32766]
    psubw                m4, m5  ; m4 = 16-mx, m5 = mx
    test          dword r7m, 0x800 ; bit 11 of bitdepth_max -> 12 bpc
    jnz .h_12bpc
    psllw                m4, 2
    psllw                m5, 2
.h_12bpc:
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.h_w4:
    ; 4 rows per iteration; p0 in the low qwords, p1 obtained by a
    ; 2-byte right shift of the same loads
    movu                xm1, [srcq+strideq*0]
    vinserti32x4        ym1, [srcq+strideq*2], 1
    movu                xm2, [srcq+strideq*1]
    vinserti32x4        ym2, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    punpcklqdq          ym0, ym1, ym2
    psrldq              ym1, 2
    psrldq              ym2, 2
    pmullw              ym0, ym4
    punpcklqdq          ym1, ym2
    pmullw              ym1, ym5
    psubw               ym0, ym6
    paddw               ym0, ym1
    psraw               ym0, 2
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4
    RET
.h_w8:
    ; p1 re-loaded from src+2 instead of shifting; 4 rows per zmm
    movu                xm0, [srcq+strideq*0+0]
    movu                xm1, [srcq+strideq*0+2]
    vinserti32x4        ym0, [srcq+strideq*1+0], 1
    vinserti32x4        ym1, [srcq+strideq*1+2], 1
    vinserti32x4         m0, [srcq+strideq*2+0], 2
    vinserti32x4         m1, [srcq+strideq*2+2], 2
    vinserti32x4         m0, [srcq+stride3q +0], 3
    vinserti32x4         m1, [srcq+stride3q +2], 3
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m5
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8
    RET
.h_w16:
    ; 2 rows per iteration
    movu                ym0, [srcq+strideq*0+0]
    vinserti32x8         m0, [srcq+strideq*1+0], 1
    movu                ym1, [srcq+strideq*0+2]
    vinserti32x8         m1, [srcq+strideq*1+2], 1
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m2, m5, [srcq+strideq*0+2]
    pmullw               m1, m4, [srcq+strideq*1+0]
    pmullw               m3, m5, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 2
    jg .h_w32
    RET
.h_w64:
    pmullw               m0, m4, [srcq+ 0]
    pmullw               m2, m5, [srcq+ 2]
    pmullw               m1, m4, [srcq+64]
    pmullw               m3, m5, [srcq+66]
    add                srcq, strideq
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    ; full row in 4 zmm pairs; p0 products in m0-m3, p1 products in m7-m10
    pmullw               m0, m4, [srcq+  0]
    pmullw               m7, m5, [srcq+  2]
    pmullw               m1, m4, [srcq+ 64]
    pmullw               m8, m5, [srcq+ 66]
    pmullw               m2, m4, [srcq+128]
    pmullw               m9, m5, [srcq+130]
    pmullw               m3, m4, [srcq+192]
    pmullw              m10, m5, [srcq+194]
    add                srcq, strideq
    REPX      {psubw x, m6}, m0, m1, m2, m3
    paddw                m0, m7
    paddw                m1, m8
    paddw                m2, m9
    paddw                m3, m10
    REPX       {psraw x, 2}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    dec                  hd
    jg .h_w128
    RET
.v:
    ; vertical bilinear: out = (row0*(16-my) + row1*my - 32766) >> 2
    ; (coefficients pre-shifted left by 2 for 10-bit input, as in .h)
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    vpbroadcastw         m9, mxyd
    vpbroadcastd         m8, [pw_16]
    vpbroadcastd        m10, [pw_32766]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    psubw                m8, m9  ; m8 = 16-my, m9 = my
    test          dword r7m, 0x800
    jnz .v_12bpc
    psllw                m8, 2
    psllw                m9, 2
.v_12bpc:
    jmp                  wq
.v_w4:
    ; xmm0 carries the last processed row across iterations
    movq               xmm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastq       xmm2, [srcq+strideq*1]
    vpbroadcastq       ymm1, [srcq+strideq*2]
    vpbroadcastq       ymm3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd           ymm2, ymm1, 0x30
    vpblendd           ymm2, ymm3, 0xc0
    vpblendd           ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
    movq               xmm0, [srcq+strideq*0]
    valignq            ymm2, ymm0, ymm2, 1    ; 1 2 3 4
    pmullw             ymm1, ym8
    pmullw             ymm2, ym9
    psubw              ymm1, ym10
    paddw              ymm1, ymm2
    psraw              ymm1, 2
    mova             [tmpq], ymm1
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    vzeroupper ; this path used ymm regs; clear upper state before returning
    RET
.v_w8:
    ; 4 rows per zmm; valignq shifts in the next iteration's first row
    movu                xm0, [srcq+strideq*0]
.v_w8_loop:
    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
    vinserti32x4         m1, [srcq+strideq*2], 2
    vinserti32x4         m1, [srcq+stride3q ], 3 ; 0 1 2 3
    lea                srcq, [srcq+strideq*4]
    movu                xm0, [srcq+strideq*0]
    valignq              m2, m0, m1, 2           ; 1 2 3 4
    pmullw               m1, m8
    pmullw               m2, m9
    psubw                m1, m10
    paddw                m1, m2
    psraw                m1, 2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; 2 rows per zmm; ym0 carries row 4 into the next iteration as row 0
    movu                ym0, [srcq+strideq*0]
.v_w16_loop:
    vinserti32x8         m1, m0, [srcq+strideq*1], 1 ; 0 1
    movu                ym3, [srcq+strideq*2]
    vinserti32x8         m2, m3, [srcq+stride3q ], 1 ; 2 3
    lea                srcq, [srcq+strideq*4]
    movu                ym0, [srcq+strideq*0]
    vshufi32x4           m3, m1, m3, q1032           ; 1 2
    vshufi32x4           m4, m2, m0, q1032           ; 3 4
    pmullw               m1, m8
    pmullw               m2, m8
    pmullw               m3, m9
    pmullw               m4, m9
    psubw                m1, m10
    psubw                m2, m10
    paddw                m1, m3
    paddw                m2, m4
    psraw                m1, 2
    psraw                m2, 2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    ; one zmm per row; m0 carries the bottom row between iterations
    movu                 m0, [srcq+strideq*0]
.v_w32_loop:
    movu                 m3, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m8, m0
    movu                 m0, [srcq+strideq*0]
    pmullw               m2, m8, m3
    pmullw               m3, m9
    pmullw               m4, m9, m0
    psubw                m1, m10
    psubw                m2, m10
    paddw                m1, m3
    paddw                m2, m4
    psraw                m1, 2
    psraw                m2, 2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    ; two zmm per row, 1 output row per iteration
    movu                 m0, [srcq+64*0]
    movu                 m1, [srcq+64*1]
.v_w64_loop:
    add                srcq, strideq
    pmullw               m2, m8, m0
    movu                 m0, [srcq+64*0]
    pmullw               m3, m8, m1
    movu                 m1, [srcq+64*1]
    pmullw               m4, m9, m0
    pmullw               m5, m9, m1
    psubw                m2, m10
    psubw                m3, m10
    paddw                m2, m4
    paddw                m3, m5
    psraw                m2, 2
    psraw                m3, 2
    mova        [tmpq+64*0], m2
    mova        [tmpq+64*1], m3
    add                tmpq, 64*2
    dec                  hd
    jg .v_w64_loop
    RET
.v_w128:
    ; four zmm per row; previous row kept in m0-m3, next row loaded in-place
    movu                 m0, [srcq+64*0]
    movu                 m1, [srcq+64*1]
    movu                 m2, [srcq+64*2]
    movu                 m3, [srcq+64*3]
.v_w128_loop:
    add                srcq, strideq
    pmullw               m4, m8, m0
    movu                 m0, [srcq+64*0]
    pmullw               m5, m8, m1
    movu                 m1, [srcq+64*1]
    pmullw               m6, m8, m2
    movu                 m2, [srcq+64*2]
    pmullw               m7, m8, m3
    movu                 m3, [srcq+64*3]
    pmullw              m11, m9, m0
    pmullw              m12, m9, m1
    pmullw              m13, m9, m2
    pmullw              m14, m9, m3
    REPX     {psubw x, m10}, m4, m5, m6, m7
    paddw                m4, m11
    paddw                m5, m12
    paddw                m6, m13
    paddw                m7, m14
    REPX       {psraw x, 2}, m4, m5, m6, m7
    mova        [tmpq+64*0], m4
    mova        [tmpq+64*1], m5
    mova        [tmpq+64*2], m6
    mova        [tmpq+64*3], m7
    add                tmpq, 64*4
    dec                  hd
    jg .v_w128_loop
    RET
.hv:
    ; 2-stage bilinear, reached from .h with m4 = 16-mx, m5 = mx and
    ; m6 = pw_32766 already set up. The horizontal pass is identical to .h;
    ; the vertical blend is out = prev + (cur-prev)*my/16, computed via
    ; pmulhrsw against m7 = my<<11 (pmulhrsw gives (a*b*2+0x8000)>>16).
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl                mxyd, 11
    vpbroadcastw         m7, mxyd
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.hv_w4:
    ; filter row 0 once, then 4 output rows per loop iteration
    movq               xmm0, [srcq+strideq*0+0]
    movq               xmm1, [srcq+strideq*0+2]
    pmullw             xmm0, xm4
    pmullw             xmm1, xm5
    psubw              xmm0, xm6
    paddw              xmm0, xmm1
    psraw              xmm0, 2
    vpbroadcastq        ym0, xmm0
.hv_w4_loop:
    movu                xm1, [srcq+strideq*1]
    vinserti128         ym1, [srcq+stride3q ], 1
    movu                xm2, [srcq+strideq*2]
    lea                srcq, [srcq+strideq*4]
    vinserti128         ym2, [srcq+strideq*0], 1
    punpcklqdq          ym3, ym1, ym2
    psrldq              ym1, 2
    psrldq              ym2, 2
    pmullw              ym3, ym4
    punpcklqdq          ym1, ym2
    pmullw              ym1, ym5
    psubw               ym3, ym6
    paddw               ym1, ym3
    psraw               ym1, 2           ; 1 2 3 4
    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
    mova                ym0, ym1         ; carry bottom row to next iteration
    psubw               ym1, ym2
    pmulhrsw            ym1, ym7
    paddw               ym1, ym2
    mova             [tmpq], ym1
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    ; row 0 goes into zmm lane 3 so valignq can rotate it in as "row 0"
    pmullw              xm0, xm4, [srcq+strideq*0+0]
    pmullw              xm1, xm5, [srcq+strideq*0+2]
    psubw               xm0, xm6
    paddw               xm0, xm1
    psraw               xm0, 2
    vinserti32x4         m0, xm0, 3
.hv_w8_loop:
    movu                xm1, [srcq+strideq*1+0]
    movu                xm2, [srcq+strideq*1+2]
    vinserti32x4        ym1, [srcq+strideq*2+0], 1
    vinserti32x4        ym2, [srcq+strideq*2+2], 1
    vinserti32x4         m1, [srcq+stride3q +0], 2
    vinserti32x4         m2, [srcq+stride3q +2], 2
    lea                srcq, [srcq+strideq*4]
    vinserti32x4         m1, [srcq+strideq*0+0], 3
    vinserti32x4         m2, [srcq+strideq*0+2], 3
    pmullw               m1, m4
    pmullw               m2, m5
    psubw                m1, m6
    paddw                m1, m2
    psraw                m1, 2         ; 1 2 3 4
    valignq              m2, m1, m0, 6 ; 0 1 2 3
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m7
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    ; row 0 duplicated into the upper zmm half; 2 output rows per iteration
    pmullw              ym0, ym4, [srcq+strideq*0+0]
    pmullw              ym1, ym5, [srcq+strideq*0+2]
    psubw               ym0, ym6
    paddw               ym0, ym1
    psraw               ym0, 2
    vinserti32x8         m0, ym0, 1
.hv_w16_loop:
    movu                ym1, [srcq+strideq*1+0]
    movu                ym2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+strideq*0+0], 1
    vinserti32x8         m2, [srcq+strideq*0+2], 1
    pmullw               m1, m4
    pmullw               m2, m5
    psubw                m1, m6
    paddw                m1, m2
    psraw                m1, 2             ; 1 2
    vshufi32x4           m2, m0, m1, q1032 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m7
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    ; full-width rows: m0 holds the previous filtered row, m3 the middle one
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m1, m5, [srcq+strideq*0+2]
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
.hv_w32_loop:
    pmullw               m3, m4, [srcq+strideq*1+0]
    pmullw               m1, m5, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    psubw                m3, m6
    paddw                m3, m1
    psraw                m3, 2
    psubw                m1, m3, m0
    pmulhrsw             m1, m7
    paddw                m1, m0
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m2, m5, [srcq+strideq*0+2]
    psubw                m0, m6
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m3
    pmulhrsw             m2, m7
    paddw                m2, m3
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    ; previous filtered row kept in m0/m1 (two zmm per row)
    pmullw               m0, m4, [srcq+ 0]
    pmullw               m2, m5, [srcq+ 2]
    pmullw               m1, m4, [srcq+64]
    pmullw               m3, m5, [srcq+66]
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
.hv_w64_loop:
    add                srcq, strideq
    pmullw               m2, m4, [srcq+ 0]
    pmullw               m8, m5, [srcq+ 2]
    pmullw               m3, m4, [srcq+64]
    pmullw               m9, m5, [srcq+66]
    psubw                m2, m6
    psubw                m3, m6
    paddw                m2, m8
    paddw                m3, m9
    psraw                m2, 2
    psraw                m3, 2
    psubw                m8, m2, m0
    psubw                m9, m3, m1
    pmulhrsw             m8, m7
    pmulhrsw             m9, m7
    paddw                m8, m0
    mova                 m0, m2
    paddw                m9, m1
    mova                 m1, m3
    mova        [tmpq+64*0], m8
    mova        [tmpq+64*1], m9
    add                tmpq, 64*2
    dec                  hd
    jg .hv_w64_loop
    RET
.hv_w128:
    ; previous filtered row kept in m0-m3 (four zmm per row)
    pmullw               m0, m4, [srcq+  0]
    pmullw               m8, m5, [srcq+  2]
    pmullw               m1, m4, [srcq+ 64]
    pmullw               m9, m5, [srcq+ 66]
    pmullw               m2, m4, [srcq+128]
    pmullw              m10, m5, [srcq+130]
    pmullw               m3, m4, [srcq+192]
    pmullw              m11, m5, [srcq+194]
    REPX      {psubw x, m6}, m0, m1, m2, m3
    paddw                m0, m8
    paddw                m1, m9
    paddw                m2, m10
    paddw                m3, m11
    REPX       {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
    add                srcq, strideq
    pmullw               m8, m4, [srcq+  0]
    pmullw              m12, m5, [srcq+  2]
    pmullw               m9, m4, [srcq+ 64]
    pmullw              m13, m5, [srcq+ 66]
    pmullw              m10, m4, [srcq+128]
    pmullw              m14, m5, [srcq+130]
    pmullw              m11, m4, [srcq+192]
    pmullw              m15, m5, [srcq+194]
    REPX      {psubw x, m6}, m8, m9, m10, m11
    paddw                m8, m12
    paddw                m9, m13
    paddw               m10, m14
    paddw               m11, m15
    REPX       {psraw x, 2}, m8, m9, m10, m11
    psubw               m12, m8, m0
    psubw               m13, m9, m1
    psubw               m14, m10, m2
    psubw               m15, m11, m3
    REPX   {pmulhrsw x, m7}, m12, m13, m14, m15
    paddw               m12, m0
    mova                 m0, m8
    paddw               m13, m1
    mova                 m1, m9
    mova        [tmpq+64*0], m12
    mova        [tmpq+64*1], m13
    paddw               m14, m2
    mova                 m2, m10
    paddw               m15, m3
    mova                 m3, m11
    mova        [tmpq+64*2], m14
    mova        [tmpq+64*3], m15
    add                tmpq, 64*4
    dec                  hd
    jg .hv_w128_loop
    RET
1485
; int8_t subpel_filters[5][15][8]
; Each FILTER_* constant packs two byte offsets into subpel_filters:
; the sharp/regular variant in the high word and the base row in the low word.
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; Emit a put entry point that loads the horizontal (t0d) and vertical (t1d)
; filter-type selectors, then jumps to the shared implementation. When the
; fifth argument is omitted the stub falls through into the code that follows.
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d ; same type both directions: reuse t0d
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
1503
; ABI-dependent scratch: t0/t1 hold the FILTER_* selectors loaded by FN,
; and `buf` is a small caller-safe stack area used for filter spills.
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif
1511
; put_8tap_<type>_16bpc entry stubs; smooth/regular combinations share the
; 6-tap implementation. The last one omits the jump target and falls through
; into put_6tap_16bpc below.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR
1517
1518cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
1519%define base r8-put_avx512icl
1520    imul                mxd, mxm, 0x010101
1521    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1522    imul                myd, mym, 0x010101
1523    add                 myd, t1d ; 6tap_v, my, 4tap_v
1524    lea                  r8, [put_avx512icl]
1525    movifnidn            wd, wm
1526    movifnidn            hd, hm
1527    test                mxd, 0xf00
1528    jnz .h
1529    test                myd, 0xf00
1530    jnz .v
1531.put:
1532    tzcnt                wd, wd
1533    movzx                wd, word [r8+wq*2+table_offset(put,)]
1534    add                  wq, r8
1535%if WIN64
1536    pop                  r8
1537%endif
1538    jmp                  wq
1539.h_w8:
1540    mova                 m4, [spel_h_shufA]
1541    movu                 m5, [spel_h_shufB]
1542    movu                 m6, [spel_h_shufC]
1543.h_w8_loop:
1544    movu                ym2, [srcq+ssq*0]
1545    vinserti32x8         m2, [srcq+ssq*1], 1
1546    lea                srcq, [srcq+ssq*2]
1547    mova                 m0, m8
1548    vpermb               m1, m4, m2
1549    vpdpwssd             m0, m10, m1
1550    vpermb               m1, m5, m2
1551    vpdpwssd             m0, m11, m1
1552    vpermb               m1, m6, m2
1553    vpdpwssd             m0, m12, m1
1554    psrad                m0, 6
1555    vextracti32x8       ym1, m0, 1
1556    packusdw            ym0, ym1
1557    pminsw              ym0, ym15
1558    mova          [dstq+dsq*0], xm0
1559    vextracti32x4 [dstq+dsq*1], ym0, 1
1560    lea                dstq, [dstq+dsq*2]
1561    sub                  hd, 2
1562    jg .h_w8_loop
1563    RET
; Horizontal-only 6-tap subpel filter, 16bpc. Entered when mx has a
; fractional part; branches to .hv below if my has one too.
; m15 = pixel-max clamp (broadcast from the r8m stack arg),
; m8  = bitdepth-dependent horizontal rounding bias,
; m10/m11/m12 = the three coefficient pairs (2 taps per vpdpwssd).
.h:
    vpbroadcastw        m15, r8m        ; pixel max for the final clamp
    test                myd, 0xf00
    jnz .hv                             ; vertical subpel too -> 2-D filter
    mov                 r7d, r8m
    shr                 r7d, 11         ; bit 11+ of pixel max indexes the rounding table
    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
    cmp                  wd, 4
    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4 ; narrow widths share the 8tap path
    shr                 mxd, 16
    sub                srcq, 4          ; back up 2 pixels (2 bytes each) for the left taps
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8] ; 6 taps (8-tap entry offset by 1), widened to words
    mova              [buf], xmm0       ; NOTE(review): buf looks like prologue-defined stack scratch — defined outside this view
    vpbroadcastd        m10, xmm0       ; taps 0+1
    vpbroadcastd        m12, [buf+8]    ; taps 4+5
    vpbroadcastd        m11, [buf+4]    ; taps 2+3
    sub                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    jg .h_w32
; w == 16: two rows per iteration; left/right source halves are filtered
; separately and spliced with shufpd for the crossover taps.
.h_w16_loop:
    movu                ym2, [srcq+ssq*0+ 0]
    vinserti32x8         m2, [srcq+ssq*1+ 0], 1 ; left halves of rows a/b
    movu                ym3, [srcq+ssq*0+12]
    vinserti32x8         m3, [srcq+ssq*1+12], 1 ; right halves of rows a/b
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8         ; accumulators start at the rounding bias
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0  b0
    pshufb               m4, m3, m7
    vpdpwssd             m1, m12, m4 ; a2' b2'
    pshufb               m2, m7
    pshufb               m3, m6
    vpdpwssd             m0, m11, m2 ; a1  b1
    vpdpwssd             m1, m11, m3 ; a1' b1'
    shufpd               m2, m3, 0x55   ; splice left/right for the shared middle taps
    vpdpwssd             m0, m12, m2 ; a2  b2
    vpdpwssd             m1, m10, m2 ; a0' b0'
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1         ; 32 -> 16 bit, unsigned saturation
    pminsw               m0, m15        ; clamp to pixel max
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET
; w > 16 (32/64/128): wq holds w-16 on entry (subtracted in .h). src/dst
; are advanced towards the row tail and r6 walks each row upward from a
; negative offset, emitting 32 pixels (one zmm store) per inner iteration;
; one source/dest row per outer iteration.
.h_w32:
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq               ; r6 below counts from -(w-16) up to 0
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+12]
    mova                 m0, m8           ; accumulators = rounding bias
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m7
    vpdpwssd             m1, m12, m4 ; b2
    pshufb               m2, m7
    pshufb               m3, m6
    vpdpwssd             m0, m11, m2 ; a1
    vpdpwssd             m1, m11, m3 ; b1
    shufpd               m2, m3, 0x55     ; splice halves for the crossover taps
    vpdpwssd             m0, m12, m2 ; a2
    vpdpwssd             m1, m10, m2 ; b0
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15          ; clamp to pixel max
    mova        [dstq+r6*2], m0
    add                  r6, 32           ; 32 pixels per store
    jl .h_w32_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w32_loop0
    RET
; Vertical-only 6-tap subpel filter: set up coefficients and dispatch
; through the _6tap_v width jump table. m11 = pd_32 rounding bias (for the
; final >>6), m15 = pixel-max clamp, m12/m13/m14 = the three tap pairs.
; r6 = -ssq so srcq+r6*2 / srcq+r6*1 address the two rows above srcq.
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd          ; h < 6 selects the other filter bank (low byte of my)
    vpbroadcastd        m11, [pd_32]
    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8] ; 6 taps, sign-extended to words
    tzcnt               r7d, wd
    vpbroadcastw        m15, r8m          ; pixel max for the final clamp
    mov                  r6, ssq
    movzx               r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
    neg                  r6               ; negative stride to reach rows above
    mova [rsp+stack_offset+8], xmm0       ; spill so dword pairs can be rebroadcast
    vpbroadcastd        m12, xmm0         ; taps 0+1
    add                  r7, r8
    vpbroadcastd        m13, [rsp+stack_offset+12] ; taps 2+3
    vpbroadcastd        m14, [rsp+stack_offset+16] ; taps 4+5
    jmp                  r7               ; dispatch on log2(w)
; 6-tap vertical, width 2: rows are dwords packed into xmm regs. Each loop
; iteration produces two output rows via three dual-tap vpdpwssd steps on
; interleaved row pairs, then rounds (>>6), packs and clamps.
.v_w2:
    movd               xmm2, [srcq+r6 *2]
    pinsrd             xmm2, [srcq+r6 *1], 1
    pinsrd             xmm2, [srcq+ssq*0], 2
    pinsrd             xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    movd               xmm0, [srcq+ssq*0]
    palignr            xmm3, xmm0, xmm2, 4   ; 1 2 3 4
    punpcklwd          xmm1, xmm2, xmm3      ; 01 12
    punpckhwd          xmm2, xmm3            ; 23 34
.v_w2_loop:
    movd               xmm3, [srcq+ssq*1]
    mova               xmm4, xm11            ; accumulator = rounding bias
    vpdpwssd           xmm4, xmm1, xm12      ; a0 b0
    lea                srcq, [srcq+ssq*2]
    mova               xmm1, xmm2            ; slide the row-pair window
    vpdpwssd           xmm4, xmm2, xm13      ; a1 b1
    punpckldq          xmm2, xmm0, xmm3      ; 4 5
    movd               xmm0, [srcq+ssq*0]
    punpckldq          xmm3, xmm0            ; 5 6
    punpcklwd          xmm2, xmm3            ; 45 56
    vpdpwssd           xmm4, xmm2, xm14      ; a2 b2
    psrad              xmm4, 6
    packusdw           xmm4, xmm4
    pminsw             xmm4, xm15            ; clamp to pixel max
    movd       [dstq+dsq*0], xmm4
    pextrd     [dstq+dsq*1], xmm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
; 6-tap vertical, width 4: rows live in ymm halves, blended/interleaved
; into row-pair operands; two output rows per iteration. Uses legacy ymm
; registers, hence the vzeroupper before returning.
.v_w4:
    movq               xmm1, [srcq+r6 *2]
    vpbroadcastq       ymm3, [srcq+r6 *1]
    vpbroadcastq       ymm2, [srcq+ssq*0]
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm1, ymm3, 0x30
    vpblendd           ymm3, ymm2, 0x30
    punpcklwd          ymm1, ymm3       ; 01 12
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm0, 0x30
    punpcklwd          ymm2, ymm4       ; 23 34
.v_w4_loop:
    vpbroadcastq       ymm3, [srcq+ssq*1]
    mova               ymm4, ym11       ; accumulator = rounding bias
    vpdpwssd           ymm4, ymm1, ym12 ; a0 b0
    lea                srcq, [srcq+ssq*2]
    mova               ymm1, ymm2       ; slide the row-pair window
    vpdpwssd           ymm4, ymm2, ym13 ; a1 b1
    vpblendd           ymm2, ymm0, ymm3, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm3, ymm0, 0x30
    punpcklwd          ymm2, ymm3       ; 45 56
    vpdpwssd           ymm4, ymm2, ym14 ; a2 b2
    psrad              ymm4, 6
    vextracti128       xmm3, ymm4, 1
    packusdw           xmm4, xmm3
    pminsw             xmm4, xm15       ; clamp to pixel max
    movq       [dstq+dsq*0], xmm4
    movhps     [dstq+dsq*1], xmm4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    vzeroupper
    RET
; 6-tap vertical, width 8: four rows per zmm, vpermb with spel_v_shuf8
; interleaves consecutive rows into "01 12"-style pair operands; two
; output rows per iteration.
.v_w8:
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vinserti32x4         m1, m0, [srcq+r6 *2], 0
    vinserti32x4         m1, [srcq+r6 *1], 1 ; 0 1 2
    vinserti32x4        ym0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m5, [spel_v_shuf8]
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 2 3 4
    vpermb               m1, m5, m1          ; 01 12
    vpermb               m2, m5, m0          ; 23 34
.v_w8_loop:
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    movu                xm3, [srcq+ssq*0]
    mova                 m4, m11             ; accumulator = rounding bias
    vpdpwssd             m4, m12, m1         ; a0 b0
    vshufi32x4           m0, m3, q1032       ; 4 5 6
    mova                 m1, m2              ; slide the row-pair window
    vpdpwssd             m4, m13, m2         ; a1 b1
    vpermb               m2, m5, m0          ; 45 56
    vpdpwssd             m4, m14, m2         ; a2 b2
    psrad                m4, 6
    vextracti32x8       ym3, m4, 1
    packusdw            ym4, ym3
    pminsw              ym4, ym15            ; clamp to pixel max
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
; 6-tap vertical, width 16: two rows per zmm, interleaved word-wise by
; spel_v_shuf16; odd row pairs are derived from even ones with vpshrdd
; (16-bit funnel shift). Two output rows per iteration; deint_q_shuf
; undoes the interleave before the store.
.v_w16:
    vbroadcasti32x8      m0, [srcq+r6 *1]
    vinserti32x8         m1, m0, [srcq+ssq*0], 1
    vinserti32x8         m0, [srcq+r6*2], 0
    mova                 m6, [spel_v_shuf16]
    movu                ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m3, [srcq+ssq*0], 1
    vpermb               m1, m6, m1     ; 12
    vpermb               m0, m6, m0     ; 01
    vpermb               m3, m6, m3     ; 34
    mova                 m7, [deint_q_shuf]
    vpshrdd              m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova                 m5, m11        ; accumulators = rounding bias
    vpdpwssd             m5, m12, m1    ; b0
    mova                 m4, m11
    vpdpwssd             m4, m12, m0    ; a0
    mova                 m1, m3         ; slide the row-pair window
    vpdpwssd             m5, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m4, m13, m2    ; a1
    movu                ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m3, [srcq+ssq*0], 1
    vpermb               m3, m6, m3     ; 56
    vpshrdd              m2, m1, m3, 16 ; 45
    vpdpwssd             m5, m14, m3    ; b2
    vpdpwssd             m4, m14, m2    ; a2
    psrad                m5, 6
    psrad                m4, 6
    packusdw             m4, m5
    pminsw               m4, m15        ; clamp to pixel max
    vpermq               m4, m7, m4     ; deinterleave back to row order
    mova          [dstq+dsq*0], ym4
    vextracti32x8 [dstq+dsq*1], m4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
; 6-tap vertical, w >= 32: process the image in 64-byte (32-pixel) column
; strips. wd is repacked as (w*8-256)+h so the low byte restores h per
; strip (movzx hd, wb) and the high part counts remaining strips.
; punpcklwd/punpckhwd split each strip into low/high word interleaves
; (plain vs 16/17/18/19 accumulators); two output rows per iteration.
.v_w32:
.v_w64:
.v_w128:
    lea                  wd, [hq+wq*8-256]
.v_w32_loop0:
    movu                m16, [srcq+r6 *2]
    movu                m17, [srcq+r6 *1]
    lea                  r7, [srcq+ssq*2]
    movu                m18, [srcq+ssq*0]
    movu                m19, [srcq+ssq*1]
    mov                  r8, dstq           ; per-strip dst cursor
    movu                m20, [r7  +ssq*0]
    punpcklwd            m0, m16, m17 ; 01
    punpckhwd           m16, m17
    punpcklwd            m1, m17, m18 ; 12
    punpckhwd           m17, m18
    punpcklwd            m2, m18, m19 ; 23
    punpckhwd           m18, m19
    punpcklwd            m3, m19, m20 ; 34
    punpckhwd           m19, m20
.v_w32_loop:
    mova                 m4, m11            ; accumulators = rounding bias
    vpdpwssd             m4, m12, m0  ; a0
    mova                 m6, m11
    vpdpwssd             m6, m12, m16
    mova                 m5, m11
    vpdpwssd             m5, m12, m1  ; b0
    mova                 m7, m11
    vpdpwssd             m7, m12, m17
    mova                 m0, m2             ; slide the row-pair windows
    vpdpwssd             m4, m13, m2  ; a1
    mova                m16, m18
    vpdpwssd             m6, m13, m18
    mova                 m1, m3
    vpdpwssd             m5, m13, m3  ; b1
    mova                m17, m19
    vpdpwssd             m7, m13, m19
    movu                m19, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    punpcklwd            m2, m20, m19 ; 45
    punpckhwd           m18, m20, m19
    movu                m20, [r7+ssq*0]
    vpdpwssd             m4, m14, m2  ; a2
    vpdpwssd             m6, m14, m18
    punpcklwd            m3, m19, m20 ; 56
    punpckhwd           m19, m20
    vpdpwssd             m5, m14, m3  ; b2
    vpdpwssd             m7, m14, m19
    REPX       {psrad x, 6}, m4, m6, m5, m7
    packusdw             m4, m6
    packusdw             m5, m7
    pminsw               m4, m15            ; clamp to pixel max
    pminsw               m5, m15
    mova         [r8+dsq*0], m4
    mova         [r8+dsq*1], m5
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    add                srcq, 64             ; next 32-pixel column strip
    add                dstq, 64
    movzx                hd, wb             ; restore h for the next strip
    sub                  wd, 1<<8
    jg .v_w32_loop0
    vzeroupper
    RET
; 2-D (horizontal + vertical) 6-tap path for w <= 4. The horizontal pass
; uses tap pairs m8/m9 with combined rounding bias m10 (pd_2176 vs pd_640)
; and bitdepth-dependent coefficient pre-shifts; the vertical pass uses
; ym12-ym14 and finishes with >>10. Bit 11 of the pixel-max arg (r8m)
; distinguishes 12-bit content from 10-bit.
.hv:
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]   ; horizontal taps (narrow-width bank)
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd         ; h < 6 selects the other vertical filter bank
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8] ; 6 vertical taps
    mov                  r6, ssq
    sub                srcq, 2           ; back up 1 pixel for the left taps
    neg                  r6              ; negative stride to reach rows above
    test          dword r8m, 0x800       ; 12-bit content?
    jnz .hv_12bit
    vpbroadcastd        m10, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd        m10, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_main:
    movu                xm4, [srcq+r6 *2]
    vinserti32x4        ym4, [srcq+r6 *1], 1
    vinserti32x4         m4, [srcq+ssq*0], 2
    vbroadcasti32x4      m6, [spel_h_shufA]
    vinserti32x4         m4, [srcq+ssq*1], 3 ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    movu                xm5, [srcq+ssq*0]    ; 4
    mova           [buf+ 0], xmm0            ; spill taps so dword pairs can be rebroadcast
    mova           [buf+16], xmm1
    vpbroadcastd         m8, [buf+ 4]
    vpbroadcastd         m9, [buf+ 8]
    vpbroadcastd       ym12, xmm1             ; vertical taps 0+1
    vpbroadcastd       ym13, [buf+20]         ; vertical taps 2+3
    vpbroadcastd       ym14, [buf+24]         ; vertical taps 4+5
    cmp                  wd, 4
    je .hv_w4
; w == 2 setup: horizontally filter rows 0-4, then permute into the
; "01 12" / "23 34" vertical pair layout.
    vbroadcasti32x4      m2, [spel_h_shufA]
    mova                 m3, [spel_h_shuf2b]
    mova                 m1, m10
    pshufb               m4, m6
    pshufb              xm5, xm6
    punpcklqdq           m2, m4, m5
    vpdpwssd             m1, m8, m2    ; 04 1_ 2_ 3_
    mova                ym6, [spel_h_shuf2a]
    punpckhqdq           m4, m5
    mova                xm5, [spel_shuf2]
    vpdpwssd             m1, m9, m4
    vpermb               m1, m3, m1    ; 01 12
    vextracti32x4       xm2, ym1, 1    ; 23 34
.hv_w2_loop:
    movu                xm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym3, [srcq+ssq*0], 1
    vpermb              ym3, ym6, ym3
    pmaddwd            xmm0, xm12, xm1 ; a0 b0
    mova                xm4, xm10
    vpdpwssd            xm4, xm8, xm3  ; horizontal pass for the two new rows
    vextracti32x4       xm3, ym3, 1
    mova                xm1, xm2       ; slide the vertical window
    vpdpwssd           xmm0, xm13, xm2 ; a1 b1
    vpdpwssd            xm4, xm9, xm3  ; 5 6
    vpermt2b            xm2, xm5, xm4  ; 45 56
    vpdpwssd           xmm0, xm14, xm2 ; a2 b2
    psrad              xmm0, 10        ; combined h+v rounding shift
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15      ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
; 2-D 6-tap, width 4: horizontally filter the 5 setup rows (rows 0-3 in
; m4, row 4 in xm5 from .hv_main), permute into vertical pair layout,
; then produce two output rows per loop iteration.
.hv_w4:
    vbroadcasti32x4      m7, [spel_h_shufB]
    mova                ym0, [spel_shuf4a]
    pshufb               m1, m4, m6
    mova                 m2, m10        ; accumulators = combined rounding bias
    vpdpwssd             m2, m8, m1
    pshufb              xm1, xm5, xm6
    mova                xm3, xm10
    vpdpwssd            xm3, xm8, xm1
    pshufb               m4, m7
    pshufb              xm5, xm7
    vpdpwssd             m2, m9, m4    ; 0 1 2 3
    vpdpwssd            xm3, xm9, xm5  ; 4
    mova                ym5, [spel_shuf4b]
    vpermb               m1, m0, m2    ; 01 12
    vshufi32x4           m2, m3, q1032 ; 2 3 4
    vpermb               m2, m0, m2    ; 23 34
.hv_w4_loop:
    movu                xm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym3, [srcq+ssq*0], 1
    pmaddwd             ym0, ym12, ym1 ; a0 b0
    mova                ym1, ym2       ; slide the vertical window
    pshufb              ym4, ym3, ym6
    mova                ym2, ym10
    vpdpwssd            ym2, ym8, ym4  ; horizontal pass for the two new rows
    pshufb              ym3, ym7
    vpdpwssd            ym0, ym13, ym1 ; a1 b1
    vpdpwssd            ym2, ym9, ym3  ; 5 6
    vpermt2b            ym2, ym5, ym1  ; 45 56
    vpdpwssd            ym0, ym14, ym2 ; a2 b2
    psrad               ym0, 10        ; combined h+v rounding shift
    vextracti32x4       xm4, ym0, 1
    packusdw            xm0, xm4
    pminsw             xmm0, xm0, xm15 ; clamp to pixel max
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
; 2-D 6-tap for w >= 8: shared coefficient setup (m9/m10/m11 horizontal,
; m12/m13/m14 vertical, m8 = combined rounding bias), then the w == 8
; body; w >= 16 branches to .hv_w16. Two rows per zmm; spel_h_shufA/C
; pick the in-row source windows, spel_shuf8a/b build vertical pairs.
.hv_w8:
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8] ; 6 horizontal taps
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd        ; h < 6 selects the other vertical filter bank
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8] ; 6 vertical taps
    mov                  r6, ssq
    sub                srcq, 4          ; back up 2 pixels for the left taps
    neg                  r6             ; negative stride to reach rows above
    test          dword r8m, 0x800      ; 12-bit content?
    jnz .hv_w8_12bit
    vpbroadcastd         m8, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd         m8, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_w8_main:
    mova           [buf+ 0], xmm0       ; spill taps so dword pairs can be rebroadcast
    mova           [buf+16], xmm1
    vpbroadcastd         m9, xmm0       ; horizontal taps 0+1
    vpbroadcastd        m10, [buf+ 4]   ; horizontal taps 2+3
    vpbroadcastd        m11, [buf+ 8]   ; horizontal taps 4+5
    vpbroadcastd        m12, xmm1       ; vertical taps 0+1
    vpbroadcastd        m13, [buf+20]   ; vertical taps 2+3
    vpbroadcastd        m14, [buf+24]   ; vertical taps 4+5
    cmp                  wd, 16
    jge .hv_w16
; w == 8 setup: horizontally filter rows 0-4 (two rows per zmm).
    mova                 m6, [spel_h_shufA]
    movu               ym16, [srcq+r6 *2]
    vinserti32x8        m16, [srcq+r6 *1], 1 ; 0 1
    movu               ym17, [srcq+ssq*0]
    vinserti32x8        m17, [srcq+ssq*1], 1 ; 2 3
    lea                srcq, [srcq+ssq*2]
    movu               ym18, [srcq+ssq*0]    ; 4
    movu                 m7, [spel_h_shufC]
    vpermb               m3, m6, m16
    mova                 m1, m8
    vpermb               m4, m6, m17
    vpdpwssd             m1, m9, m3   ; a0 b0
    mova                 m2, m8
    vpermb               m5, m6, m18
    vpdpwssd             m2, m9, m4   ; c0 d0
    mova                 m0, m8
    vpermb              m16, m7, m16
    vpdpwssd             m0, m9, m5   ; e0
    vpermb              m17, m7, m17
    vpdpwssd             m1, m11, m16 ; a2 b2
    vpermb              m18, m7, m18
    vpdpwssd             m2, m11, m17 ; c2 d2
    shufpd               m3, m16, 0x55 ; splice for the middle taps
    vpdpwssd             m0, m11, m18 ; e2
    mova                m16, [spel_shuf8a]
    shufpd               m4, m17, 0x55
    vpdpwssd             m1, m10, m3  ; a1 b1
    shufpd               m5, m18, 0x55
    vpdpwssd             m2, m10, m4  ; c1 d1
    vpdpwssd             m0, m10, m5  ; e1
    mova                 m5, [spel_shuf8b]
    vpermt2b             m1, m16, m2  ; 01 12
    vpermt2b             m2, m16, m0  ; 23 34
.hv_w8_loop:
    movu               ym18, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0], 1
    mova                 m0, m8
    vpermb              m17, m6, m18
    vpdpwssd             m0, m9, m17  ; f0 g0
    vpermb              m18, m7, m18
    pmaddwd             m16, m12, m1  ; A0 B0
    vpdpwssd             m0, m11, m18 ; f2 g2
    shufpd              m17, m18, 0x55
    mova                 m1, m2       ; slide the vertical window
    vpdpwssd            m16, m13, m2  ; A1 B1
    vpdpwssd             m0, m10, m17 ; f1 g1
    vpermt2b             m2, m5, m0   ; 45 56
    vpdpwssd            m16, m14, m2  ; A2 B2
    psrad               m16, 10       ; combined h+v rounding shift
    vextracti32x8      ym17, m16, 1
    packusdw           ym16, ym17
    pminsw             ym16, ym15     ; clamp to pixel max
    mova         [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
; 2-D 6-tap, width 16 (w > 16 branches on to .hv_w32). Two rows per zmm;
; left/right in-row halves use spel_h_shufA/B and are spliced with shufpd
; for the crossover taps. spel_shuf16 + vpshrdd build the interleaved
; vertical row pairs ("01"/"12"/...); two output rows per loop iteration.
.hv_w16:
    vbroadcasti32x4     m20, [spel_h_shufA]
    vbroadcasti32x4     m21, [spel_h_shufB]
    jg .hv_w32
; Setup: horizontally filter rows 0-4 (row 0 alone, then rows 1/2 and 3/4
; paired two-per-zmm).
    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
    movu               ym16, [srcq+r6 *1+ 0]
    movu               ym17, [srcq+r6 *1+12]
    vinserti32x8        m16, [srcq+ssq*0+ 0], 1
    vinserti32x8        m17, [srcq+ssq*0+12], 1 ; 1 2
    movu               ym18, [srcq+ssq*1+ 0]
    movu               ym19, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 3 4
    pshufb               m2, m20
    mova                 m1, m8         ; accumulators = combined rounding bias
    pshufb               m3, m16, m20
    vpdpwssd             m1, m11, m2    ; a2
    mova                 m2, m8
    pshufb               m4, m17, m21
    vpdpwssd             m2, m9, m3     ; b0  c0
    mova                 m3, m8
    pshufb               m5, m18, m20
    vpdpwssd             m3, m11, m4    ; b2' c2'
    mova                 m4, m8
    pshufb               m7, m19, m21
    vpdpwssd             m4, m9, m5     ; d0  e0
    mova                 m5, m8
    pshufb               m0, m6, m20
    vpdpwssd             m5, m11, m7    ; d2' e2'
    mova                 m7, [spel_shuf16]
    pshufb              m16, m21
    vpdpwssd             m1, m9, m0     ; a0
    pshufb              m17, m20
    vpdpwssd             m2, m10, m16   ; b1  c1
    pshufb              m18, m21
    vpdpwssd             m3, m10, m17   ; b1' c1'
    pshufb              m19, m20
    vpdpwssd             m4, m10, m18   ; d1  e1
    pshufb               m6, m21
    vpdpwssd             m5, m10, m19   ; d1' e1'
    shufpd              m16, m17, 0x55  ; splice halves for the crossover taps
    vpdpwssd             m1, m10, m6    ; a1
    shufpd              m18, m19, 0x55
    vpdpwssd             m2, m11, m16   ; b2  c2
    vpdpwssd             m3, m9, m16    ; b0' c0'
    vpdpwssd             m4, m11, m18   ; d2  e2
    vpdpwssd             m5, m9, m18    ; d0' e0'
    pslldq               m1, 1
    vpermt2b             m2, m7, m3     ; 12
    vpermt2b             m4, m7, m5     ; 34
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu               ym18, [srcq+ssq*1+ 0]
    movu               ym19, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1
    mova                 m5, m8
    mova                 m6, m8
    pshufb              m17, m18, m20
    vpdpwssd             m5, m9, m17    ; f0  g0
    pshufb              m16, m19, m21
    vpdpwssd             m6, m11, m16   ; f2' g2'
    pmaddwd             m17, m12, m2    ; B0
    mova                 m2, m4         ; slide the vertical window
    pmaddwd             m16, m12, m1    ; A0
    mova                 m1, m3
    pshufb              m18, m21
    vpdpwssd             m5, m10, m18   ; f1  g1
    pshufb              m19, m20
    vpdpwssd             m6, m10, m19   ; f1' g1'
    vpdpwssd            m17, m13, m4    ; B1
    vpdpwssd            m16, m13, m3    ; A1
    shufpd              m18, m19, 0x55
    vpdpwssd             m5, m11, m18   ; f2  g2
    vpdpwssd             m6, m9, m18    ; f0' g0'
    mova                 m4, m7
    vpermi2b             m4, m5, m6     ; 56
    vpshrdd              m3, m2, m4, 16 ; 45
    vpdpwssd            m17, m14, m4    ; B2
    vpdpwssd            m16, m14, m3    ; A2
    psrad               m16, 10         ; combined h+v rounding shift
    psrad               m17, 10
    vshufi32x4          m18, m16, m17, q3232
    vinserti32x8        m16, ym17, 1
    packusdw            m16, m18
    pminsw              m16, m15        ; clamp to pixel max
    mova          [dstq+dsq*0], ym16
    vextracti32x8 [dstq+dsq*1], m16, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
; 2-D 6-tap, w >= 32: process 64-byte (32-pixel) column strips, two full
; zmm rows per iteration with duplicated low/high pipelines (plain vs
; primed accumulator tags). Uses m16-m27, so Win64 callee-saved xmm regs
; are spilled. wd is repacked as (w*8-256)+h, same scheme as .v_w32:
; low byte restores h per strip, high part counts strips.
.hv_w32:
    WIN64_SPILL_XMM      28
    mova                m27, [spel_shuf32]
    lea                  wd, [hq+wq*8-256]
.hv_w32_loop0:
; Per-strip setup: horizontally filter rows 0-4 (a-e) into the vertical
; pair layout 01/01', 23/23', with 12/12' and 34/34' held for the loop.
    movu                m16, [srcq+r6 *2+ 0]
    movu                 m7, [srcq+r6 *2+12]
    movu                 m6, [srcq+r6 *1+ 0]
    movu                m18, [srcq+r6 *1+12]
    lea                  r7, [srcq+ssq*2]
    movu                m17, [srcq+ssq*0+ 0]
    movu                m19, [srcq+ssq*0+12]
    movu                m22, [srcq+ssq*1+ 0]
    movu                m24, [srcq+ssq*1+12]
    mov                  r8, dstq      ; per-strip dst cursor
    movu                m23, [r7  +ssq*0+ 0]
    movu                m25, [r7  +ssq*0+12]
    pshufb               m1, m16, m20
    mova                 m0, m8        ; accumulators = combined rounding bias
    pshufb               m2, m7, m21
    vpdpwssd             m0, m9, m1     ; a0
    mova                 m1, m8
    pshufb               m4, m6, m20
    vpdpwssd             m1, m11, m2    ; a2'
    mova                 m2, m8
    pshufb               m3, m17, m20
    vpdpwssd             m2, m9, m4     ; b0
    mova                 m4, m8
    pshufb               m5, m18, m21
    vpdpwssd             m4, m9, m3     ; c0
    mova                 m3, m8
    pshufb              m26, m19, m21
    vpdpwssd             m3, m11, m5    ; b2'
    mova                 m5, m8
    pshufb              m16, m21
    vpdpwssd             m5, m11, m26   ; c2'
    pshufb               m7, m20
    vpdpwssd             m0, m10, m16   ; a1
    pshufb               m6, m21
    vpdpwssd             m1, m10, m7    ; a1'
    pshufb              m17, m21
    vpdpwssd             m2, m10, m6    ; b1
    pshufb              m18, m20
    vpdpwssd             m4, m10, m17   ; c1
    pshufb              m19, m20
    vpdpwssd             m3, m10, m18   ; b1'
    shufpd              m16, m7, 0x55   ; splice halves for the crossover taps
    vpdpwssd             m5, m10, m19   ; c1'
    shufpd               m6, m18, 0x55
    vpdpwssd             m0, m11, m16   ; a2
    shufpd              m17, m19, 0x55
    vpdpwssd             m1, m9, m16    ; a0'
    pshufb              m16, m22, m20
    vpdpwssd             m2, m11, m6    ; b2
    pshufb               m7, m23, m20
    vpdpwssd             m4, m11, m17   ; c2
    vpdpwssd             m3, m9, m6     ; b0'
    mova                 m6, m8
    vpdpwssd             m5, m9, m17    ; c0'
    pshufb              m17, m24, m21
    vpdpwssd             m6, m9, m16    ; d0
    mova                m16, m8
    pshufb              m26, m25, m21
    vpdpwssd            m16, m9, m7     ; e0
    mova                 m7, m8
    pshufb              m22, m21
    vpdpwssd             m7, m11, m17   ; d2'
    mova                m17, m8
    pshufb              m23, m21
    vpdpwssd            m17, m11, m26   ; e2'
    pshufb              m24, m20
    vpdpwssd             m6, m10, m22   ; d1
    pshufb              m25, m20
    vpdpwssd            m16, m10, m23   ; e1
    shufpd              m22, m24, 0x55
    vpdpwssd             m7, m10, m24   ; d1'
    shufpd              m23, m25, 0x55
    vpdpwssd            m17, m10, m25   ; e1'
    pslldq               m0, 1
    vpdpwssd             m6, m11, m22   ; d2
    pslldq               m1, 1
    vpdpwssd            m16, m11, m23   ; e2
    vpermt2b             m2, m27, m4    ; 12
    vpdpwssd             m7, m9, m22    ; d0'
    vpermt2b             m3, m27, m5    ; 12'
    vpdpwssd            m17, m9, m23    ; e0'
    vpshrdd              m0, m2, 16     ; 01
    vpermt2b             m6, m27, m16   ; 34
    vpshrdd              m1, m3, 16     ; 01'
    vpermt2b             m7, m27, m17   ; 34'
    vpshrdd              m4, m2, m6, 16 ; 23
    vpshrdd              m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    movu                m22, [r7+ssq*1+ 0]
    movu                m24, [r7+ssq*1+12]
    lea                  r7, [r7+ssq*2]
    movu                m23, [r7+ssq*0+ 0]
    movu                m25, [r7+ssq*0+12]
    pmaddwd             m17, m12, m2    ; B0
    mova                 m2, m6         ; slide the vertical windows
    pmaddwd             m19, m12, m3    ; B0'
    mova                 m3, m7
    pmaddwd             m16, m12, m0    ; A0
    mova                 m0, m4
    pmaddwd             m18, m12, m1    ; A0'
    mova                 m1, m5
    vpdpwssd            m17, m13, m6    ; B1
    vpdpwssd            m19, m13, m7    ; B1'
    mova                 m6, m8
    vpdpwssd            m16, m13, m4    ; A1
    pshufb               m4, m22, m20
    vpdpwssd            m18, m13, m5    ; A1'
    pshufb               m7, m23, m20
    vpdpwssd             m6, m9, m4     ; f0
    mova                 m4, m8
    pshufb               m5, m24, m21
    vpdpwssd             m4, m9, m7     ; g0
    mova                 m7, m8
    pshufb              m26, m25, m21
    vpdpwssd             m7, m11, m5    ; f2'
    mova                 m5, m8
    pshufb              m22, m21
    vpdpwssd             m5, m11, m26   ; g2'
    pshufb              m23, m21
    vpdpwssd             m6, m10, m22   ; f1
    pshufb              m24, m20
    vpdpwssd             m4, m10, m23   ; g1
    pshufb              m25, m20
    vpdpwssd             m7, m10, m24   ; f1'
    shufpd              m22, m24, 0x55
    vpdpwssd             m5, m10, m25   ; g1'
    shufpd              m23, m25, 0x55
    vpdpwssd             m6, m11, m22   ; f2
    vpdpwssd             m4, m11, m23   ; g2
    vpdpwssd             m7, m9, m22    ; f0'
    vpdpwssd             m5, m9, m23    ; g0'
    vpermt2b             m6, m27, m4    ; 56
    vpermt2b             m7, m27, m5    ; 56'
    vpdpwssd            m17, m14, m6    ; B2
    vpshrdd              m4, m2, m6, 16 ; 45
    vpdpwssd            m19, m14, m7    ; B2'
    vpshrdd              m5, m3, m7, 16 ; 45'
    vpdpwssd            m16, m14, m4    ; A2
    vpdpwssd            m18, m14, m5    ; A2'
    REPX      {psrad x, 10}, m17, m19, m16, m18 ; combined h+v rounding shift
    packusdw            m17, m19
    packusdw            m16, m18
    pminsw              m17, m15        ; clamp to pixel max
    pminsw              m16, m15
    mova         [r8+dsq*0], m16
    mova         [r8+dsq*1], m17
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w32_loop
    add                srcq, 64         ; next 32-pixel column strip
    add                dstq, 64
    movzx                hd, wb         ; restore h for the next strip
    sub                  wd, 1<<8
    jg .hv_w32_loop0
    RET
2334
; Entry points for the filter combinations that need the full 8-tap code
; (any combination involving SHARP). NOTE(review): the last invocation
; presumably omits the trailing function-name argument because the actual
; put_8tap_16bpc implementation immediately follows below — verify
; against the PUT_8TAP_FN macro definition (outside this view).
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
2340
; 8-tap put, 16bpc: combine the filter-type bits (t0d/t1d, set by
; PUT_8TAP_FN) with mx/my, then dispatch: no horizontal subpel -> .h is
; skipped; no subpel at all -> plain copy via the 6tap entry's .put.
; The vertical path uses four tap pairs (m12-m15), pd_32 bias in m10,
; pixel-max clamp in m11, and needs three rows above srcq (r6 = ssq*3).
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r8, [put_avx512icl]
    movifnidn            wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put ; no subpel: plain copy
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd        ; h < 6 selects the other filter bank
    vpbroadcastd        m10, [pd_32]    ; rounding bias for the final >>6
    pmovsxbw           xmm0, [base+subpel_filters+myq*8] ; 8 taps, sign-extended to words
    tzcnt               r7d, wd
    vpbroadcastw        m11, r8m        ; pixel max for the final clamp
    lea                  r6, [ssq*3]
    movzx               r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
    sub                srcq, r6         ; start three rows above
    mova [rsp+stack_offset+8], xmm0     ; spill so dword pairs can be rebroadcast
    vpbroadcastd        m12, xmm0       ; taps 0+1
    add                  r7, r8
    vpbroadcastd        m13, [rsp+stack_offset+12] ; taps 2+3
    vpbroadcastd        m14, [rsp+stack_offset+16] ; taps 4+5
    vpbroadcastd        m15, [rsp+stack_offset+20] ; taps 6+7
    jmp                  r7             ; dispatch on log2(w)
; 8-tap vertical, width 2: rows as packed dwords in xmm regs; each loop
; iteration produces two output rows via four dual-tap vpdpwssd steps,
; then rounds (>>6), packs and clamps against m11.
.v_w2:
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    pinsrd             xmm2, [srcq+ssq*2], 2
    add                srcq, r6
    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
    movd               xmm3, [srcq+ssq*1]
    vpbroadcastd       xmm1, [srcq+ssq*2]
    add                srcq, r6
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm3, xmm1, 0x02       ; 4 5
    vpblendd           xmm1, xmm0, 0x02       ; 5 6
    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
    punpcklwd          xmm3, xmm1             ; 45 56
    punpcklwd          xmm1, xmm2, xmm4       ; 01 12
    punpckhwd          xmm2, xmm4             ; 23 34
.v_w2_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova               xmm5, xm10             ; accumulator = rounding bias
    vpdpwssd           xmm5, xm12, xmm1       ; a0 b0
    mova               xmm1, xmm2             ; slide the row-pair window
    vpdpwssd           xmm5, xm13, xmm2       ; a1 b1
    mova               xmm2, xmm3
    vpdpwssd           xmm5, xm14, xmm3       ; a2 b2
    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 7 8
    punpcklwd          xmm3, xmm4             ; 67 78
    vpdpwssd           xmm5, xm15, xmm3       ; a3 b3
    psrad              xmm5, 6
    packusdw           xmm5, xmm5
    pminsw             xmm5, xm11             ; clamp to pixel max
    movd       [dstq+dsq*0], xmm5
    pextrd     [dstq+dsq*1], xmm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    ; 4-pixel columns, same sliding-window scheme as .v_w2 but with ymm
    ; registers holding two interleaved row pairs per register.
    movq               xmm1, [srcq+ssq*0]
    vpbroadcastq       ymm0, [srcq+ssq*1]
    vpbroadcastq       ymm2, [srcq+ssq*2]
    add                srcq, r6
    vpbroadcastq       ymm4, [srcq+ssq*0]
    vpbroadcastq       ymm3, [srcq+ssq*1]
    vpbroadcastq       ymm5, [srcq+ssq*2]
    add                srcq, r6
    vpblendd           ymm1, ymm0, 0x30
    vpblendd           ymm0, ymm2, 0x30
    punpcklwd          ymm1, ymm0       ; 01 12
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm3, 0x30
    punpcklwd          ymm2, ymm4       ; 23 34
    vpblendd           ymm3, ymm5, 0x30
    vpblendd           ymm5, ymm0, 0x30
    punpcklwd          ymm3, ymm5       ; 45 56
.v_w4_loop:
    vpbroadcastq       ymm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova               ymm4, ym10       ; rounding bias
    vpdpwssd           ymm4, ym12, ymm1 ; a0 b0
    mova               ymm1, ymm2
    vpdpwssd           ymm4, ym13, ymm2 ; a1 b1
    mova               ymm2, ymm3
    vpdpwssd           ymm4, ym14, ymm3 ; a2 b2
    vpblendd           ymm3, ymm0, ymm5, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm5, ymm0, 0x30
    punpcklwd          ymm3, ymm5       ; 67 78
    vpdpwssd           ymm4, ym15, ymm3 ; a3 b3
    psrad              ymm4, 6
    vextracti128       xmm5, ymm4, 1
    packusdw           xmm4, xmm5
    pminsw             xmm4, xm11       ; clamp to pixel max
    movq       [dstq+dsq*0], xmm4
    movhps     [dstq+dsq*1], xmm4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    vzeroupper                          ; raw ymm registers were used
    RET
.v_w8:
    ; 8-pixel columns. spel_v_shuf8 (vpermb) interleaves two adjacent rows
    ; into word pairs within one zmm; m1/m2/m3 = 01|12, 23|34, 45|56.
    vbroadcasti32x4      m2, [srcq+ssq*2]
    vinserti32x4         m1, m2, [srcq+ssq*0], 0
    vinserti32x4         m1, [srcq+ssq*1], 1 ; 0 1 2
    add                srcq, r6
    vinserti32x4        ym2, [srcq+ssq*0], 1
    vinserti32x4         m2, [srcq+ssq*1], 2 ; 2 3 4
    mova                 m6, [spel_v_shuf8]
    movu                xm0, [srcq+ssq*1]
    vinserti32x4        ym0, [srcq+ssq*2], 1
    add                srcq, r6
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
    vpermb               m1, m6, m1          ; 01 12
    vpermb               m2, m6, m2          ; 23 34
    vpermb               m3, m6, m0          ; 45 56
.v_w8_loop:
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    movu                xm5, [srcq+ssq*0]
    mova                 m4, m10             ; rounding bias
    vpdpwssd             m4, m12, m1         ; a0 b0
    mova                 m1, m2
    vshufi32x4           m0, m5, q1032       ; 6 7 8
    vpdpwssd             m4, m13, m2         ; a1 b1
    mova                 m2, m3
    vpdpwssd             m4, m14, m3         ; a2 b2
    vpermb               m3, m6, m0          ; 67 78
    vpdpwssd             m4, m15, m3         ; a3 b3
    psrad                m4, 6
    vextracti32x8       ym5, m4, 1
    packusdw            ym4, ym5
    pminsw              ym4, ym11            ; clamp to pixel max
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    ; 16-pixel rows: one zmm holds a full interleaved row pair; vpshrdd
    ; splices adjacent pairs (e.g. 23 out of 12/34) instead of reloading.
    ; Two independent accumulators (a = even row, b = odd row) per iter.
    vbroadcasti32x8      m0, [srcq+ssq*1]
    vinserti32x8         m1, m0, [srcq+ssq*2], 1
    vinserti32x8         m0, [srcq+ssq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+ssq*0]
    vinserti32x8         m3, [srcq+ssq*1], 1
    movu                ym5, [srcq+ssq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+ssq*0], 1
    vpermb               m1, m8, m1     ; 12
    vpermb               m0, m8, m0     ; 01
    vpermb               m3, m8, m3     ; 34
    vpermb               m5, m8, m5     ; 56
    mova                 m9, [deint_q_shuf]
    vpshrdd              m2, m1, m3, 16 ; 23
    vpshrdd              m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova                 m7, m10        ; rounding bias
    vpdpwssd             m7, m12, m1    ; b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; a0
    mova                 m1, m3
    vpdpwssd             m7, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m6, m13, m2    ; a1
    mova                 m3, m5
    vpdpwssd             m7, m14, m5    ; b2
    mova                 m2, m4
    vpdpwssd             m6, m14, m4    ; a2
    movu                ym5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m5, [srcq+ssq*0], 1
    vpermb               m5, m8, m5     ; 78
    vpshrdd              m4, m3, m5, 16 ; 67
    vpdpwssd             m7, m15, m5    ; b3
    vpdpwssd             m6, m15, m4    ; a3
    psrad                m7, 6
    psrad                m6, 6
    packusdw             m6, m7
    pminsw               m6, m11        ; clamp to pixel max
    vpermq               m6, m9, m6     ; undo pack interleaving for store
    mova          [dstq+dsq*0], ym6
    vextracti32x8 [dstq+dsq*1], m6, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
    ; Wide blocks: process one 64-byte (32-pixel) strip at a time, 2 rows
    ; per iteration, keeping a 7-row window as separate low/high word
    ; interleavings (m0-m5 low halves, m16-m21 high halves). The outer
    ; loop advances 64 bytes per strip; wd packs {strips-1, h}.
    WIN64_SPILL_XMM      23
    lea                  wd, [hq+wq*8-256] ; low byte = h, high bits = strip count
.v_w32_loop0:
    movu                m16, [srcq+ssq*0]
    movu                m17, [srcq+ssq*1]
    lea                  r7, [srcq+r6   ]
    movu                m18, [srcq+ssq*2]
    movu                m19, [r7  +ssq*0]
    mov                  r8, dstq
    movu                m20, [r7  +ssq*1]
    movu                m21, [r7  +ssq*2]
    add                  r7, r6
    movu                m22, [r7  +ssq*0]
    punpcklwd            m0, m16, m17 ; 01l
    punpckhwd           m16, m17      ; 01h
    punpcklwd            m1, m17, m18 ; 12l
    punpckhwd           m17, m18      ; 12h
    punpcklwd            m2, m18, m19 ; 23l
    punpckhwd           m18, m19      ; 23h
    punpcklwd            m3, m19, m20 ; 34l
    punpckhwd           m19, m20      ; 34h
    punpcklwd            m4, m20, m21 ; 45l
    punpckhwd           m20, m21      ; 45h
    punpcklwd            m5, m21, m22 ; 56l
    punpckhwd           m21, m22      ; 56h
.v_w32_loop:
    mova                 m6, m10      ; four accumulators: a/b rows x l/h halves
    vpdpwssd             m6, m12, m0  ; a0l
    mova                 m8, m10
    vpdpwssd             m8, m12, m16 ; a0h
    mova                 m7, m10
    vpdpwssd             m7, m12, m1  ; b0l
    mova                 m9, m10
    vpdpwssd             m9, m12, m17 ; b0h
    mova                 m0, m2
    vpdpwssd             m6, m13, m2  ; a1l
    mova                m16, m18
    vpdpwssd             m8, m13, m18 ; a1h
    mova                 m1, m3
    vpdpwssd             m7, m13, m3  ; b1l
    mova                m17, m19
    vpdpwssd             m9, m13, m19 ; b1h
    mova                 m2, m4
    vpdpwssd             m6, m14, m4  ; a2l
    mova                m18, m20
    vpdpwssd             m8, m14, m20 ; a2h
    mova                 m3, m5
    vpdpwssd             m7, m14, m5  ; b2l
    mova                m19, m21
    vpdpwssd             m9, m14, m21 ; b2h
    movu                m21, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    punpcklwd            m4, m22, m21 ; 67l
    punpckhwd           m20, m22, m21 ; 67h
    movu                m22, [r7+ssq*0]
    vpdpwssd             m6, m15, m4  ; a3l
    vpdpwssd             m8, m15, m20 ; a3h
    punpcklwd            m5, m21, m22 ; 78l
    punpckhwd           m21, m22      ; 78h
    vpdpwssd             m7, m15, m5  ; b3l
    vpdpwssd             m9, m15, m21 ; b3h
    REPX       {psrad x, 6}, m6, m8, m7, m9
    packusdw             m6, m8
    packusdw             m7, m9
    pminsw               m6, m11      ; clamp to pixel max
    pminsw               m7, m11
    mova         [r8+dsq*0], m6
    mova         [r8+dsq*1], m7
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    add                srcq, 64       ; next 32-pixel strip
    add                dstq, 64
    movzx                hd, wb       ; reload h from the packed counter
    sub                  wd, 1<<8
    jg .v_w32_loop0
    RET
.h_w2:
    ; Horizontal, 2-pixel rows; entered from .h_w4 when w == 2 (xmm0
    ; already holds the filter). Only coefficient pairs 1 and 2 are used,
    ; i.e. this is effectively a 4-tap filter. m8 = rounding bias and
    ; m15 = clamp were set up in .h.
    RESET_STACK_STATE
    mova                ym2, [spel_h_shuf2a]
    sub                srcq, 2       ; center the 4 taps on the pixel
    pshufd             xmm3, xmm0, q1111 ; coefficient pair 1
    pshufd             xmm4, xmm0, q2222 ; coefficient pair 2
.h_w2_loop:
    movu                xm1, [srcq+ssq*0]
    vinserti32x4        ym1, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova               xmm0, xm8
    vpermb              ym1, ym2, ym1
    vpdpwssd           xmm0, xmm3, xm1
    vextracti32x4       xm1, ym1, 1
    vpdpwssd           xmm0, xmm4, xm1
    psrad              xmm0, 6
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15    ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    ; Horizontal, 4-pixel rows (also the entry for w == 2). Uses only the
    ; middle two coefficient pairs (4-tap).
    movzx               mxd, mxb
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    ; flags are still valid from 'cmp wd, 4' in .h; movzx and pmovsxbw
    ; do not modify EFLAGS
    jl .h_w2
    vbroadcasti32x4     ym4, [spel_h_shufA]
    vbroadcasti32x4     ym5, [spel_h_shufB]
    sub                srcq, 2
    pshufd             xmm0, xmm0, q2211 ; {pair1, pair1, pair2, pair2}
    vpbroadcastq        ym6, xmm0         ; pair 1 everywhere
    vpermq              ym7, ymm0, q1111  ; pair 2 everywhere
.h_w4_loop:
    movu                xm2, [srcq+ssq*0]
    vinserti32x4        ym2, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                ym0, ym8          ; rounding bias
    pshufb              ym1, ym2, ym4
    vpdpwssd            ym0, ym6, ym1
    pshufb              ym2, ym5
    vpdpwssd            ym0, ym7, ym2
    psrad               ym0, 6
    vextracti32x4       xm1, ym0, 1
    packusdw            xm0, xm1
    pminsw             xmm0, xm0, xm15    ; clamp to pixel max
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    ; Horizontal, 8-pixel rows, full 8-tap: four byte-permutes (shufA-D)
    ; produce the four source-word pairings for coefficient pairs m10-m13
    ; (broadcast in .h). Two rows per iteration in one zmm.
    mova                 m4, [spel_h_shufA]
    movu                 m5, [spel_h_shufB]
    movu                 m6, [spel_h_shufC]
    mova                 m7, [spel_h_shufD]
.h_w8_loop:
    movu                ym2, [srcq+ssq*0]
    vinserti32x8         m2, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8          ; rounding bias
    vpermb               m1, m4, m2
    vpdpwssd             m0, m10, m1
    vpermb               m1, m5, m2
    vpdpwssd             m0, m11, m1
    vpermb               m1, m6, m2
    vpdpwssd             m0, m12, m1
    vpermb               m1, m7, m2
    vpdpwssd             m0, m13, m1
    psrad                m0, 6
    vextracti32x8       ym1, m0, 1
    packusdw            ym0, ym1
    pminsw              ym0, ym15        ; clamp to pixel max
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8_loop
    RET
.h:
    ; Horizontal-only filtering: set up clamp + rounding, then dispatch
    ; on width. 'buf' is a stack scratch area defined earlier in the file.
    vpbroadcastw        m15, r8m       ; output clamp
    test                myd, 0xf00
    jnz .hv
    mov                 r7d, r8m
    shr                 r7d, 11        ; bitdepth_max >> 11: 0 = 10-bit, 1 = 12-bit
    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
    cmp                  wd, 4
    jle .h_w4
    shr                 mxd, 16
    sub                srcq, 6         ; center the 8 taps on the pixel
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mova              [buf], xmm0      ; spill so pairs can be broadcast as dwords
    vpbroadcastd        m10, xmm0
    vpbroadcastd        m11, [buf+ 4]
    vpbroadcastd        m12, [buf+ 8]
    vpbroadcastd        m13, [buf+12]
    sub                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    jg .h_w32
.h_w16_loop:
    ; 16-pixel rows, 2 rows per iteration. The +0 and +16 loads overlap;
    ; shufpd splices their middle qwords to cover taps crossing the seam.
    movu                ym2, [srcq+ssq*0+ 0]
    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
    movu                ym3, [srcq+ssq*0+16]
    vinserti32x8         m3, [srcq+ssq*1+16], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m12, m4 ; b2
    pshufb               m4, m2, m7
    vpdpwssd             m0, m11, m4 ; a1
    pshufb               m4, m3, m7
    vpdpwssd             m1, m13, m4 ; b3
    shufpd               m2, m3, 0x55 ; middle source words for both halves
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a2
    vpdpwssd             m1, m10, m4 ; b0
    pshufb               m2, m7
    vpdpwssd             m0, m13, m2 ; a3
    vpdpwssd             m1, m11, m2 ; b1
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    ; 32+ pixel rows, one row per iteration. r6 walks from -w to 0 so the
    ; inner loop needs no separate counter; overlapping +0/+8/+16 loads
    ; cover the 8-tap footprint of each 32-pixel chunk.
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+ 8]
    mova                 m0, m8       ; rounding bias
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m10, m4 ; b0
    vpdpwssd             m0, m12, m4 ; a2
    movu                 m4, [srcq+r6*2+16]
    pshufb               m3, m7
    vpdpwssd             m1, m11, m3 ; b1
    vpdpwssd             m0, m13, m3 ; a3
    pshufb               m3, m4, m6
    vpdpwssd             m1, m12, m3 ; b2
    pshufb               m2, m7
    vpdpwssd             m0, m11, m2 ; a1
    pshufb               m4, m7
    vpdpwssd             m1, m13, m4 ; b3
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova        [dstq+r6*2], m0
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w32_loop0
    RET
.hv:
    ; 2D (horizontal+vertical) filtering, w <= 4 path. Coefficients are
    ; pre-scaled per bitdepth (psllw 6 for 10-bit, 4/2 for 12-bit) so the
    ; horizontal stage keeps extra fractional bits; the vertical stage
    ; then does a single final >> 10 with the matching bias in m10.
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd     ; h < 6: 4-tap vertical variant
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [ssq*3]
    sub                srcq, 2       ; center horizontal taps
    sub                srcq, r6      ; back up 3 rows for vertical taps
    test          dword r8m, 0x800  ; bit 11 of bitdepth_max -> 12-bit
    jnz .hv_12bit
    vpbroadcastd        m10, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd        m10, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_main:
    ; spill both filters; m8/m9 = horizontal pairs, ym11-ym14 = vertical pairs
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd         m8, [buf+ 4]
    vpbroadcastd         m9, [buf+ 8]
    vpbroadcastd       ym11, xmm1
    vpbroadcastd       ym12, [buf+20]
    vpbroadcastd       ym13, [buf+24]
    vpbroadcastd       ym14, [buf+28]
    movu                xm4, [srcq+ssq*0]
    vinserti32x4        ym4, [srcq+ssq*1], 1
    vinserti32x4         m4, [srcq+ssq*2], 2
    add                srcq, r6
    vinserti32x4         m4, [srcq+ssq*0], 3 ; 0 1 2 3
    movu                xm0, [srcq+ssq*1]
    vinserti32x4        ym0, [srcq+ssq*2], 1
    add                srcq, r6
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
    cmp                  wd, 4
    je .hv_w4
    ; w == 2: filter rows 0-6 horizontally in one shot, then run the
    ; vertical 8-tap over the packed row pairs.
    vbroadcasti32x4      m2, [spel_h_shufA]
    mova                 m3, [spel_h_shuf2b]
    mova                ym6, [spel_h_shuf2a]
    mova                xm7, [spel_shuf2]
    mova                 m1, m10
    pshufb               m4, m2
    pshufb               m0, m2
    punpcklqdq           m2, m4, m0
    vpdpwssd             m1, m8, m2    ; 04 15 26 3_
    punpckhqdq           m4, m0
    vpdpwssd             m1, m9, m4
    vpermb               m1, m3, m1    ; 01 12
    vextracti32x4       xm2, ym1, 1    ; 23 34
    vextracti32x4       xm3, m1, 2     ; 45 56
.hv_w2_loop:
    movu                xm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym5, [srcq+ssq*0], 1
    mova                xm4, xm10
    vpermb              ym5, ym6, ym5
    pmaddwd            xmm0, xm11, xm1 ; a0 b0
    vpdpwssd            xm4, xm8, xm5
    vextracti32x4       xm5, ym5, 1
    mova                xm1, xm2
    vpdpwssd           xmm0, xm12, xm2 ; a1 b1
    vpdpwssd            xm4, xm9, xm5  ; 7 8
    mova                xm2, xm3
    vpdpwssd           xmm0, xm13, xm3 ; a2 b2
    vpermt2b            xm3, xm7, xm4  ; 67 78
    vpdpwssd           xmm0, xm14, xm3 ; a3 b3
    psrad              xmm0, 10        ; combined h+v rounding shift
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15      ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    ; 2D filtering, 4-pixel rows. Rows 0-6 (already loaded in m4/m0) are
    ; filtered horizontally, then permuted into the vertical sliding
    ; window ym1/ym2/ym3 = 01|12, 23|34, 45|56.
    vbroadcasti32x4     m19, [spel_h_shufA]
    vbroadcasti32x4     m20, [spel_h_shufB]
    mova                ym6, [spel_shuf4a]
    mova                ym7, [spel_shuf4b]
    mova                 m2, m10
    mova                 m3, m10
    pshufb               m1, m4, m19
    vpdpwssd             m2, m8, m1
    pshufb               m1, m0, m19
    vpdpwssd             m3, m8, m1
    pshufb               m4, m20
    vpdpwssd             m2, m9, m4
    pshufb               m0, m20
    vpdpwssd             m3, m9, m0
    vpermb               m1, m6, m2    ; 01 12
    vshufi32x4           m2, m3, q1032
    vpermb               m3, m6, m3    ; 45 56
    vpermb               m2, m6, m2    ; 23 34
.hv_w4_loop:
    movu               xm18, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128        ym18, [srcq+ssq*0], 1
    pmaddwd            ym16, ym11, ym1 ; a0 b0
    mova                ym1, ym2
    mova                ym2, ym3
    pshufb             ym17, ym18, ym19
    mova                ym3, ym10
    vpdpwssd            ym3, ym8, ym17
    pshufb             ym18, ym20
    vpdpwssd           ym16, ym12, ym1 ; a1 b1
    vpdpwssd            ym3, ym9, ym18 ; 7 8
    vpdpwssd           ym16, ym13, ym2 ; a2 b2
    vpermt2b            ym3, ym7, ym2  ; 67 78
    vpdpwssd           ym16, ym14, ym3 ; a3 b3
    psrad              ym16, 10        ; combined h+v rounding shift
    vextracti128       xm17, ym16, 1
    packusdw           xm16, xm17
    pminsw             xm16, xm15      ; clamp to pixel max
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    vzeroupper                         ; raw ymm registers were used
    RET
.hv_w8:
    ; 2D filtering, w >= 8. Shared setup: load and pre-scale both filters
    ; (see .hv for the bitdepth-dependent scaling scheme), broadcast
    ; horizontal pairs into m11-m14 and vertical pairs into m16-m19.
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd       ; h < 6: 4-tap vertical variant
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [ssq*3]
    sub                srcq, 6         ; center horizontal taps
    sub                srcq, r6        ; back up 3 rows for vertical taps
    test          dword r8m, 0x800    ; bit 11 of bitdepth_max -> 12-bit
    jnz .hv_w8_12bit
    vpbroadcastd        m10, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd        m10, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_w8_main:
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd        m11, xmm0
    vpbroadcastd        m12, [buf+ 4]
    vpbroadcastd        m13, [buf+ 8]
    vpbroadcastd        m14, [buf+12]
    vpbroadcastd        m16, xmm1
    vpbroadcastd        m17, [buf+20]
    vpbroadcastd        m18, [buf+24]
    vpbroadcastd        m19, [buf+28]
    cmp                  wd, 8
    jg .hv_w16
    ; w == 8: horizontally filter rows 0-6 (pairs a/b, c/d, e/f, g), then
    ; permute into the vertical window m1/m2/m3 = 01|12, 23|34, 45|56.
    mova                 m5, [spel_h_shufA]
    movu                ym0, [srcq+ssq*0]
    vinserti32x8         m0, [srcq+ssq*1], 1 ; 0 1
    movu                ym9, [srcq+ssq*2]
    add                srcq, r6
    vinserti32x8         m9, [srcq+ssq*0], 1 ; 2 3
    movu               ym20, [srcq+ssq*1]
    vinserti32x8        m20, [srcq+ssq*2], 1 ; 4 5
    add srcq, r6
    movu               ym21, [srcq+ssq*0]    ; 6
    movu                 m6, [spel_h_shufB]
    movu                 m7, [spel_h_shufC]
    vpermb               m8, m5, m0
    mova                 m1, m10
    vpdpwssd             m1, m11, m8  ; a0 b0
    vpermb               m8, m5, m9
    mova                 m2, m10
    vpdpwssd             m2, m11, m8  ; c0 d0
    vpermb               m8, m5, m20
    mova                 m3, m10
    vpdpwssd             m3, m11, m8  ; e0 f0
    vpermb               m8, m5, m21
    mova                 m4, m10
    vpdpwssd             m4, m11, m8  ; g0
    vpermb               m8, m6, m0
    vpdpwssd             m1, m12, m8  ; a1 b1
    vpermb               m8, m6, m9
    vpdpwssd             m2, m12, m8  ; c1 d1
    vpermb               m8, m6, m20
    vpdpwssd             m3, m12, m8  ; e1 f1
    vpermb               m8, m6, m21
    vpdpwssd             m4, m12, m8  ; g1
    vpermb               m8, m7, m0
    vpdpwssd             m1, m13, m8  ; a2 b2
    vpermb               m8, m7, m9
    vpdpwssd             m2, m13, m8  ; c2 d2
    vpermb               m8, m7, m20
    vpdpwssd             m3, m13, m8  ; e2 f2
    vpermb               m8, m7, m21
    vpdpwssd             m4, m13, m8  ; g2
    mova                 m8, [spel_h_shufD]
    vpermb               m0, m8, m0
    vpdpwssd             m1, m14, m0  ; a3 b3
    mova                 m0, [spel_shuf8a]
    vpermb               m9, m8, m9
    vpdpwssd             m2, m14, m9  ; c3 d3
    mova                 m9, [spel_shuf8b]
    vpermb              m20, m8, m20
    vpdpwssd             m3, m14, m20 ; e3 f3
    vpermb              m21, m8, m21
    vpdpwssd             m4, m14, m21 ; g3
    vpermt2b             m1, m0, m2   ; 01 12
    vpermt2b             m2, m0, m3   ; 23 34
    vpermt2b             m3, m0, m4   ; 45 56
.hv_w8_loop:
    ; 2 new rows per iteration: horizontal filter (m4) feeds the vertical
    ; accumulator m20 through the sliding window m1/m2/m3.
    movu                ym0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m0, [srcq+ssq*0], 1
    mova                 m4, m10
    vpermb              m21, m5, m0
    vpdpwssd             m4, m11, m21 ; h0 i0
    vpermb              m21, m6, m0
    pmaddwd             m20, m16, m1  ; A0 B0
    vpdpwssd             m4, m12, m21 ; h1 i1
    vpermb              m21, m7, m0
    mova                 m1, m2
    vpdpwssd            m20, m17, m2  ; A1 B1
    vpdpwssd             m4, m13, m21 ; h2 i2
    vpermb              m21, m8, m0
    mova                 m2, m3
    vpdpwssd            m20, m18, m3  ; A2 B2
    vpdpwssd             m4, m14, m21 ; h3 i3
    vpermt2b             m3, m9, m4   ; 67 78
    vpdpwssd            m20, m19, m3  ; A3 B3
    psrad               m20, 10       ; combined h+v rounding shift
    vextracti32x8      ym21, m20, 1
    packusdw           ym20, ym21
    pminsw             ym20, ym15     ; clamp to pixel max
    mova         [dstq+dsq*0], xm20
    vextracti128 [dstq+dsq*1], ym20, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    ; 2D filtering, w >= 16: process 16-pixel-wide strips, 2 rows per
    ; iteration. Like .h_w16, overlapping +0/+16 loads plus shufpd cover
    ; the horizontal tap footprint; like .v_w32, wd packs {strips-1, h}
    ; (wd is doubled first since strips advance 32 bytes).
    WIN64_SPILL_XMM 26
    vbroadcasti32x4     m20, [spel_h_shufA]
    vbroadcasti32x4     m21, [spel_h_shufB]
    add                  wd, wd
    mova                 m9, [spel_shuf16]
    lea                  wd, [hq+wq*8-256]
.hv_w16_loop0:
    ; Horizontally filter rows 0-6 of this strip (accumulators a..g),
    ; then build the vertical window m1..m6 = 01,12,23,34,45,56.
    vbroadcasti32x8      m5, [srcq+ssq*0+ 8]
    vinserti32x8         m4, m5, [srcq+ssq*0+ 0], 0
    vinserti32x8         m5, [srcq+ssq*0+16], 1 ; 0
    movu                ym6, [srcq+ssq*1+ 0]
    movu                ym7, [srcq+ssq*1+16]
    lea                  r7, [srcq+r6]
    vinserti32x8         m6, [srcq+ssq*2+ 0], 1
    vinserti32x8         m7, [srcq+ssq*2+16], 1 ; 1 2
    movu               ym22, [r7  +ssq*0+ 0]
    movu               ym23, [r7  +ssq*0+16]
    mov                  r8, dstq
    vinserti32x8        m22, [r7  +ssq*1+ 0], 1
    vinserti32x8        m23, [r7  +ssq*1+16], 1 ; 3 4
    movu               ym24, [r7  +ssq*2+ 0]
    movu               ym25, [r7  +ssq*2+16]
    add                  r7, r6
    vinserti32x8        m24, [r7  +ssq*0+ 0], 1
    vinserti32x8        m25, [r7  +ssq*0+16], 1 ; 5 6
    pshufb               m0, m4, m20
    mova                 m1, m10
    vpdpwssd             m1, m11, m0    ; a0
    pshufb               m0, m6, m20
    mova                 m2, m10
    vpdpwssd             m2, m11, m0    ; b0
    pshufb               m0, m7, m20
    mova                 m3, m10
    vpdpwssd             m3, m13, m0    ; c2
    pshufb               m0, m4, m21
    vpdpwssd             m1, m12, m0    ; a1
    pshufb               m0, m6, m21
    vpdpwssd             m2, m12, m0    ; b1
    pshufb               m0, m7, m21
    vpdpwssd             m3, m14, m0    ; c3
    pshufb               m0, m5, m20
    vpdpwssd             m1, m13, m0    ; a2
    shufpd               m6, m7, 0x55   ; middle source words (rows 1/2)
    pshufb               m7, m6, m20
    vpdpwssd             m2, m13, m7    ; b2
    vpdpwssd             m3, m11, m7    ; c0
    pshufb               m5, m21
    vpdpwssd             m1, m14, m5    ; a3
    pshufb               m6, m21
    vpdpwssd             m2, m14, m6    ; b3
    vpdpwssd             m3, m12, m6    ; c1
    pshufb               m0, m22, m20
    mova                 m4, m10
    vpdpwssd             m4, m11, m0    ; d0
    pshufb               m0, m23, m20
    mova                 m5, m10
    vpdpwssd             m5, m13, m0    ; e2
    pshufb               m0, m24, m20
    mova                 m6, m10
    vpdpwssd             m6, m11, m0    ; f0
    pshufb               m0, m25, m20
    mova                 m7, m10
    vpdpwssd             m7, m13, m0    ; g2
    pshufb               m0, m22, m21
    vpdpwssd             m4, m12, m0    ; d1
    pshufb               m0, m23, m21
    vpdpwssd             m5, m14, m0    ; e3
    pshufb               m0, m24, m21
    vpdpwssd             m6, m12, m0    ; f1
    pshufb               m0, m25, m21
    vpdpwssd             m7, m14, m0    ; g3
    shufpd              m22, m23, 0x55  ; middle source words (rows 3/4)
    pshufb              m23, m22, m20
    vpdpwssd             m4, m13, m23   ; d2
    vpdpwssd             m5, m11, m23   ; e0
    shufpd              m24, m25, 0x55  ; middle source words (rows 5/6)
    pshufb              m25, m24, m20
    vpdpwssd             m6, m13, m25   ; f2
    vpdpwssd             m7, m11, m25   ; g0
    pshufb              m22, m21
    vpdpwssd             m4, m14, m22   ; d3
    vpdpwssd             m5, m12, m22   ; e1
    pshufb              m24, m21
    vpdpwssd             m6, m14, m24   ; f3
    vpdpwssd             m7, m12, m24   ; g1
    pslldq               m1, 1          ; position row-0 sums for the word merge
    vpermt2b             m2, m9, m3     ; 12
    vpermt2b             m4, m9, m5     ; 34
    vpermt2b             m6, m9, m7     ; 56
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
    vpshrdd              m5, m4, m6, 16 ; 45
.hv_w16_loop:
    ; 2 new rows: horizontal filter into m7/m8, vertical 8-tap over the
    ; window m1..m6, two accumulators A/B per iteration.
    movu               ym24, [r7+ssq*1+ 0]
    movu               ym25, [r7+ssq*1+16]
    lea                  r7, [r7+ssq*2]
    vinserti32x8        m24, [r7+ssq*0+ 0], 1
    vinserti32x8        m25, [r7+ssq*0+16], 1
    mova                 m7, m10
    mova                 m8, m10
    pshufb               m0, m24, m20
    vpdpwssd             m7, m11, m0    ; h0
    pshufb               m0, m25, m20
    vpdpwssd             m8, m13, m0    ; i2
    pmaddwd             m22, m16, m1    ; A0
    mova                 m1, m3
    pmaddwd             m23, m16, m2    ; B0
    mova                 m2, m4
    pshufb               m0, m24, m21
    vpdpwssd             m7, m12, m0    ; h1
    pshufb               m0, m25, m21
    vpdpwssd             m8, m14, m0    ; i3
    vpdpwssd            m22, m17, m3    ; A1
    mova                 m3, m5
    vpdpwssd            m23, m17, m4    ; B1
    mova                 m4, m6
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m7, m13, m25   ; h2
    vpdpwssd             m8, m11, m25   ; i0
    vpdpwssd            m22, m18, m5    ; A2
    vpdpwssd            m23, m18, m6    ; B2
    pshufb              m24, m21
    vpdpwssd             m7, m14, m24   ; h3
    vpdpwssd             m8, m12, m24   ; i1
    vpermt2b             m7, m9, m8     ; 78
    vpshrdd              m5, m6, m7, 16 ; 67
    vpdpwssd            m22, m19, m5    ; A3
    vpdpwssd            m23, m19, m7    ; B3
    mova                 m6, m7
    psrad               m22, 10         ; combined h+v rounding shift
    psrad               m23, 10
    vshufi32x4           m0, m22, m23, q3232
    vinserti32x8        m22, ym23, 1
    packusdw            m22, m0
    pminsw              m22, m15        ; clamp to pixel max
    mova          [r8+dsq*0], ym22
    vextracti32x8 [r8+dsq*1], m22, 1
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 32         ; next 16-pixel strip
    add                dstq, 32
    movzx                hd, wb         ; reload h from the packed counter
    sub                  wd, 1<<8
    jg .hv_w16_loop0
    RET
3189
; t0/t1 name the registers carrying the packed horizontal/vertical filter
; type bases added to mxd/myd below; Win64 and SysV differ in which GPRs
; are available here, hence the per-ABI selection.
3190%if WIN64
3191DECLARE_REG_TMP 6, 4
3192%else
3193DECLARE_REG_TMP 6, 7
3194%endif

; prep_8tap entry points for filter combinations served by the 6-tap
; kernel: variants with a 4th argument alias into prep_6tap_16bpc, while
; "regular" (no 4th argument) falls through into the cglobal that follows.
3196%define PREP_8TAP_FN FN prep_8tap,
3197PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
3198PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
3199PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
3200PREP_8TAP_FN regular,        REGULAR, REGULAR
3201
;-----------------------------------------------------------------------
; prep_6tap_16bpc(tmp, src, ss, w, h, mx, my)
; Intermediate ("prep") prediction with 6-tap subpel filters, 16 bpc,
; AVX-512ICL. mx/my select the subpel phase; multiplying by 0x010101
; replicates the phase into three bytes so that adding the packed
; filter-type base (t0d/t1d) forms a combined filter selector in one
; register. Bits 0xf00 of the result are nonzero iff subpel filtering
; is needed in that direction.
;-----------------------------------------------------------------------
3202cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
3203%define base r7-prep_avx512icl
3204    imul                mxd, mxm, 0x010101
3205    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
3206    imul                myd, mym, 0x010101
3207    add                 myd, t1d ; 6tap_v, my, 4tap_v
3208    lea                  r7, [prep_avx512icl]
3209    mov                  wd, wm
3210    movifnidn            hd, hm
3211    test                mxd, 0xf00
3212    jnz .h
3213    test                myd, 0xf00
3214    jnz .v
; No subpel in either direction: straight copy/scale path, dispatched
; through the per-width "prep" jump table. m4 = per-bitdepth multiplier
; (r7m>>11 indexes prep_mul), m5 = pw_8192 rounding/offset constant.
3215.prep:
3216    tzcnt                wd, wd
3217    mov                 r5d, r7m ; bitdepth_max
3218    vpbroadcastd         m5, [pw_8192]
3219    movzx                wd, word [r7+wq*2+table_offset(prep,)]
3220    shr                 r5d, 11
3221    vpbroadcastd         m4, [r7-prep_avx512icl+prep_mul+r5*4]
3222    add                  wq, r7
3223    lea                  r6, [ssq*3]
3224%if WIN64
3225    pop                  r7
3226%endif
3227    jmp                  wq
; Horizontal-only 6-tap, w == 8: processes 4 rows per iteration
; (ssq*0/1/2 and r6 = ssq*3, set up by .h). m10 = rounding bias,
; m12/m13/m14 = broadcast coefficient pairs (taps 0-1, 2-3, 4-5);
; vpdpwssd accumulates two taps per instruction as 16x16->32 dot
; products. The final vpermt2b (prep_endB) packs both accumulator
; registers into the 64-byte output block.
3228.h_w8:
3229    mova                 m6, [spel_h_shufA]
3230    movu                 m7, [spel_h_shufC]
3231    mova                 m8, [prep_endB]
3232.h_w8_loop:
3233    movu                ym4, [srcq+ssq*0]
3234    vinserti32x8         m4, [srcq+ssq*1], 1
3235    movu                ym5, [srcq+ssq*2]
3236    vinserti32x8         m5, [srcq+r6   ], 1
3237    lea                srcq, [srcq+ssq*4]
3238    mova                 m0, m10
3239    mova                 m1, m10
3240    vpermb               m2, m6, m4
3241    vpermb               m3, m6, m5
3242    vpdpwssd             m0, m12, m2 ; a0 b0
3243    vpdpwssd             m1, m12, m3 ; c0 d0
3244    vpermb               m4, m7, m4
3245    vpermb               m5, m7, m5
3246    vpdpwssd             m0, m14, m4 ; a2 b2
3247    vpdpwssd             m1, m14, m5 ; c2 d2
; The middle-tap inputs are formed by splicing halves of the two
; already-shuffled source registers rather than a third shuffle.
3248    shufpd               m2, m4, 0x55
3249    shufpd               m3, m5, 0x55
3250    vpdpwssd             m0, m13, m2 ; a1 b1
3251    vpdpwssd             m1, m13, m3 ; c1 d1
3252    vpermt2b             m0, m8, m1
3253    mova             [tmpq], m0
3254    add                tmpq, 64
3255    sub                  hd, 4
3256    jg .h_w8_loop
3257    RET
; Horizontal filter setup. Coefficients: 8 signed bytes are loaded at
; offset +1 into the 8-tap subpel_filters entry (the 6 taps actually
; used end up in the first three dword pairs), sign-extended to words
; and scaled by the per-bitdepth prep_hv_shift (r5d = bitdepth_max>>11).
; [tmpq] is used as scratch here to rebroadcast the coefficient pairs;
; it is overwritten before any output is stored. w == 4 is delegated to
; the shared prep_8tap 4-wide path.
3258.h:
3259    vpbroadcastd        m10, [prep_8tap_rnd]
3260    test                myd, 0xf00
3261    jnz .hv
3262    lea                  r6, [ssq*3]
3263    cmp                  wd, 4
3264    je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
3265    shr                 mxd, 16
3266    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
3267    mov                 r5d, r7m
3268    sub                srcq, 4
3269    shr                 r5d, 11
3270    psllw              xmm0, [base+prep_hv_shift+r5*8]
3271    mova             [tmpq], xmm0
3272    vpbroadcastd        m12, xmm0
3273    vpbroadcastd        m13, [tmpq+ 4]
3274    vpbroadcastd        m14, [tmpq+ 8]
3275    cmp                  wd, 16
3276    jl .h_w8
3277    vbroadcasti32x4      m5, [spel_h_shufA]
3278    vbroadcasti32x4      m6, [spel_h_shufB]
3279    mova                 m7, [prep_endC]
3280    jg .h_w32
; w == 16: 2 rows per iteration; primed (') comments denote the upper
; 8-pixel half computed from the +12-byte source load.
3281.h_w16_loop:
3282    movu                ym2, [srcq+ssq*0+ 0]
3283    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
3284    movu                ym3, [srcq+ssq*0+12]
3285    vinserti32x8         m3, [srcq+ssq*1+12], 1
3286    lea                srcq, [srcq+ssq*2]
3287    mova                 m0, m10
3288    mova                 m1, m10
3289    pshufb               m4, m2, m5   ; 01
3290    vpdpwssd             m0, m12, m4  ; a0  b0
3291    pshufb               m4, m3, m6   ; 89
3292    vpdpwssd             m1, m14, m4  ; a2' b2'
3293    pshufb               m2, m6       ; 23
3294    pshufb               m3, m5       ; 67
3295    vpdpwssd             m0, m13, m2  ; a1  b1
3296    vpdpwssd             m1, m13, m3  ; a1' b1'
3297    shufpd               m2, m3, 0x55 ; 45
3298    vpdpwssd             m0, m14, m2  ; a2  b2
3299    vpdpwssd             m1, m12, m2  ; a0' b0'
3300    vpermt2b             m0, m7, m1
3301    mova             [tmpq], m0
3302    add                tmpq, 64
3303    sub                  hd, 2
3304    jg .h_w16_loop
3305    RET
; w >= 32: one row per outer iteration, 32 pixels per inner iteration.
; r6 counts columns from -w up to 0 (srcq is pre-biased by wq*2) so the
; inner loop terminates on sign via jl. Same tap schedule as .h_w16.
3306.h_w32:
3307    lea                srcq, [srcq+wq*2]
3308    neg                  wq
3309.h_w32_loop0:
3310    mov                  r6, wq
3311.h_w32_loop:
3312    movu                 m2, [srcq+r6*2+ 0]
3313    movu                 m3, [srcq+r6*2+12]
3314    mova                 m0, m10
3315    mova                 m1, m10
3316    pshufb               m4, m2, m5
3317    vpdpwssd             m0, m12, m4
3318    pshufb               m4, m3, m6
3319    vpdpwssd             m1, m14, m4
3320    pshufb               m2, m6
3321    pshufb               m3, m5
3322    vpdpwssd             m0, m13, m2
3323    vpdpwssd             m1, m13, m3
3324    shufpd               m2, m3, 0x55
3325    vpdpwssd             m0, m14, m2
3326    vpdpwssd             m1, m12, m2
3327    vpermt2b             m0, m7, m1
3328    mova             [tmpq], m0
3329    add                tmpq, 64
3330    add                  r6, 32
3331    jl .h_w32_loop
3332    add                srcq, ssq
3333    dec                  hd
3334    jg .h_w32_loop0
3335    RET
; Vertical filter setup. For h <= 4 the 4-tap variant (low byte of myd)
; is selected via cmove. Coefficients are loaded/scaled exactly as in
; .h (offset +1 into the 8-tap entry, prep_hv_shift scaling, [tmpq] as
; scratch for rebroadcasting pairs into m12-m14). r6 = -ssq so rows
; above the current position can be addressed as srcq+r6*1 / srcq+r6*2.
; Dispatch is through the per-width _6tap_v jump table.
3336.v:
3337    movzx               mxd, myb
3338    shr                 myd, 16
3339    cmp                  hd, 4
3340    cmove               myd, mxd
3341    mov                 r5d, r7m
3342    vpbroadcastd        m10, [prep_8tap_rnd]
3343    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8]
3344    tzcnt               r6d, wd
3345    shr                 r5d, 11
3346    movzx               r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
3347    psllw              xmm0, [base+prep_hv_shift+r5*8]
3348    add                  r7, r6
3349    mova             [tmpq], xmm0
3350    vpbroadcastd        m12, xmm0
3351    mov                  r6, ssq
3352    vpbroadcastd        m13, [tmpq+ 4]
3353    neg                  r6
3354    vpbroadcastd        m14, [tmpq+ 8]
3355    jmp                  r7
; Vertical 6-tap, w == 4: 4 output rows per iteration. Mask k1
; (0x330c) merge-blends broadcast rows into specific 128-bit lanes so
; that punpcklwd produces interleaved row pairs (01 12 23 34) suitable
; for the paired-tap vpdpwssd accumulation. valignq shifts the row
; window forward between iterations.
3356.v_w4:
3357    mov                 r3d, 0x330c
3358    movq                xm1, [srcq+r6 *2]
3359    kmovw                k1, r3d
3360    vpbroadcastq    ym1{k1}, [srcq+r6 *1]
3361    vpbroadcastq         m2, [srcq+ssq*0]
3362    vinserti32x4     m1{k1}, m2, [srcq+ssq*1], 3
3363    movq                xm0, [srcq+ssq*2]
3364    mova                ym4, [prep_endA]
3365    valignq              m0, m1, 2
3366    punpcklwd            m1, m0        ; 01 12 23 34
3367.v_w4_loop:
3368    lea                srcq, [srcq+ssq*4]
3369    movq                xm2, [srcq+r6 *1]
3370    vpbroadcastq    ym2{k1}, [srcq+ssq*0]
3371    vpbroadcastq         m3, [srcq+ssq*1]
3372    vinserti32x4     m2{k1}, m3, [srcq+ssq*2], 3
3373    mova                 m3, m10
3374    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
3375    valignq              m0, m2, m0, 6 ; 4 5 6 7
3376    punpcklwd            m0, m2        ; 45 56 67 78
3377    vpdpwssd             m3, m14, m0   ; a2 b2 c2 d2
3378    vshufi32x4           m1, m0, q1032 ; 23 34 45 56
3379    vpdpwssd             m3, m13, m1   ; a1 b1 c1 d1
3380    mova                 m1, m0
3381    mova                 m0, m2
3382    vpermb               m3, m4, m3
3383    mova             [tmpq], ym3
3384    add                tmpq, 32
3385    sub                  hd, 4
3386    jg .v_w4_loop
3387    RET
; Vertical 6-tap, w == 8: 4 output rows per iteration. Mask k1 (0x33)
; selects the low 128-bit lane of each 256-bit half for merge-inserts,
; building registers holding three consecutive rows; spel_v_shuf8 then
; interleaves them into row-pair form (01 12, 23 34, ...).
3388.v_w8:
3389    vbroadcasti32x4     ym1, [srcq+r6 *1]
3390    mov                 r3d, 0x33
3391    vbroadcasti32x4      m2, [srcq+ssq*0]
3392    kmovb                k1, r3d
3393    mova                 m6, [spel_v_shuf8]
3394    vinserti64x2     m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
3395    vbroadcasti32x4     ym0, [srcq+ssq*1]
3396    vinserti64x2     m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
3397    mova                 m7, [prep_endB]
3398    vpermb               m1, m6, m1  ; 01 12
3399    vpermb               m2, m6, m0  ; 23 34
3400.v_w8_loop:
3401    lea                srcq, [srcq+ssq*4]
3402    vbroadcasti32x4     ym3, [srcq+r6 *1]
3403    movu                xm4, [srcq+ssq*0]
3404    vshufi64x2       m3{k1}, m0, m4, q1032       ; 4 5 6
3405    vbroadcasti32x4     ym0, [srcq+ssq*1]
3406    vinserti64x2     m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
3407    mova                 m4, m10
3408    vpdpwssd             m4, m12, m1 ; a0 b0
3409    mova                 m5, m10
3410    vpdpwssd             m5, m12, m2 ; c0 d0
3411    vpermb               m1, m6, m3  ; 45 56
3412    vpdpwssd             m4, m13, m2 ; a1 b1
3413    vpermb               m2, m6, m0  ; 67 78
3414    vpdpwssd             m5, m13, m1 ; c1 d1
3415    vpdpwssd             m4, m14, m1 ; a2 b2
3416    vpdpwssd             m5, m14, m2 ; c2 d2
3417    vpermt2b             m4, m7, m5
3418    mova             [tmpq], m4
3419    add                tmpq, 64
3420    sub                  hd, 4
3421    jg .v_w8_loop
3422    RET
; Vertical 6-tap, w == 16: 2 output rows per iteration. spel_v_shuf16
; interleaves two adjacent rows per register; vpshrdd with a 16-bit
; shift derives the intermediate row pair (e.g. 23 from 12 and 34)
; without an extra shuffle constant.
3423.v_w16:
3424    vbroadcasti32x8      m0, [srcq+r6 *1]
3425    vinserti32x8         m1, m0, [srcq+ssq*0], 1 ; 1 2
3426    vinserti32x8         m0, [srcq+r6 *2], 0     ; 0 1
3427    mova                 m6, [spel_v_shuf16]
3428    movu                ym3, [srcq+ssq*1]
3429    lea                srcq, [srcq+ssq*2]
3430    vinserti32x8         m3, [srcq+ssq*0], 1     ; 3 4
3431    mova                 m7, [prep_endA]
3432    vpermb               m1, m6, m1     ; 12
3433    vpermb               m0, m6, m0     ; 01
3434    vpermb               m3, m6, m3     ; 34
3435    vpshrdd              m2, m1, m3, 16 ; 23
3436.v_w16_loop:
3437    mova                 m5, m10
3438    vpdpwssd             m5, m12, m1    ; b0
3439    mova                 m4, m10
3440    vpdpwssd             m4, m12, m0    ; a0
3441    mova                 m1, m3
3442    vpdpwssd             m5, m13, m3    ; b1
3443    movu                ym3, [srcq+ssq*1]
3444    lea                srcq, [srcq+ssq*2]
3445    vpdpwssd             m4, m13, m2    ; a1
3446    vinserti32x8         m3, [srcq+ssq*0], 1
3447    mova                 m0, m2
3448    vpermb               m3, m6, m3     ; 56
3449    vpshrdd              m2, m1, m3, 16 ; 45
3450    vpdpwssd             m5, m14, m3    ; b2
3451    vpdpwssd             m4, m14, m2    ; a2
3452    vpermt2b             m4, m7, m5
3453    mova             [tmpq], m4
3454    add                tmpq, 64
3455    sub                  hd, 2
3456    jg .v_w16_loop
3457    RET
; Vertical 6-tap for w >= 32, processed in 32-pixel column strips of 2
; rows per iteration. r5 packs the per-strip counters: low byte = h
; (reloaded via movzx each strip), upper bits = remaining strips
; (decremented by 1<<8). Row pairs are kept as punpckl/punpckhwd halves
; (low/high 16 pixels) so each strip uses two accumulator pairs.
; r8 walks the output in w-sized row strides (r8+wq*0 / r8+wq*2);
; it is callee-saved on Win64, hence the push/pop.
3458.v_w32:
3459.v_w64:
3460.v_w128:
3461%if WIN64
3462    push                 r8
3463%endif
3464    mova                m11, [prep_endC]
3465    lea                  r5, [hq+wq*8-256]
3466.v_w32_loop0:
3467    movu                 m4, [srcq+r6 *2]
3468    movu                 m5, [srcq+r6 *1]
3469    lea                  r7, [srcq+ssq*2]
3470    movu                 m6, [srcq+ssq*0]
3471    movu                 m7, [srcq+ssq*1]
3472    mov                  r8, tmpq
3473    movu                 m8, [r7  +ssq*0]
3474    punpcklwd            m0, m4, m5  ; 01
3475    punpckhwd            m4, m5
3476    punpcklwd            m1, m5, m6  ; 12
3477    punpckhwd            m5, m6
3478    punpcklwd            m2, m6, m7  ; 23
3479    punpckhwd            m6, m7
3480    punpcklwd            m3, m7, m8  ; 34
3481    punpckhwd            m7, m8
3482.v_w32_loop:
3483    mova                m16, m10
3484    movu                 m9, [r7+ssq*1]
3485    mova                m18, m10
3486    vpdpwssd            m16, m12, m0 ; a0
3487    mova                m17, m10
3488    vpdpwssd            m18, m12, m4
3489    mova                m19, m10
3490    vpdpwssd            m17, m12, m1 ; b0
3491    lea                  r7, [r7+ssq*2]
3492    vpdpwssd            m19, m12, m5
3493    mova                 m0, m2
3494    vpdpwssd            m16, m13, m2 ; a1
3495    punpcklwd            m2, m8, m9  ; 45
3496    mova                 m4, m6
3497    vpdpwssd            m18, m13, m6
3498    punpckhwd            m6, m8, m9
3499    movu                 m8, [r7+ssq*0]
3500    vpdpwssd            m17, m13, m3 ; b1
3501    mova                 m1, m3
3502    vpdpwssd            m19, m13, m7
3503    mova                 m5, m7
3504    vpdpwssd            m16, m14, m2 ; a2
3505    punpcklwd            m3, m9, m8  ; 56
3506    vpdpwssd            m18, m14, m6
3507    punpckhwd            m7, m9, m8
3508    vpdpwssd            m17, m14, m3 ; b2
3509    vpdpwssd            m19, m14, m7
3510    vpermt2b            m16, m11, m18
3511    vpermt2b            m17, m11, m19
3512    mova          [r8+wq*0], m16
3513    mova          [r8+wq*2], m17
3514    lea                  r8, [r8+wq*4]
3515    sub                  hd, 2
3516    jg .v_w32_loop
3517    add                srcq, 64
3518    add                tmpq, 64
3519    movzx                hd, r5b
3520    sub                 r5d, 1<<8
3521    jg .v_w32_loop0
3522%if WIN64
3523    pop                  r8
3524%endif
; m16+ were used; restore SSE/AVX transition state before returning.
3525    vzeroupper
3526    RET
; 2D (horizontal+vertical) filter, w == 4, 4 output rows per iteration.
; Horizontal: only the middle coefficient pairs are broadcast
; (m8 = [tmpq+4], m9 = [tmpq+8]), i.e. a 4-tap horizontal filter for
; the 4-wide case. Vertical: 6 taps in m12/m13/m14 ([tmpq+16..]);
; vertical coefficients are pre-scaled by 2 (psllw xmm1, 2) and
; rounded with m11 (pd_128, set in .hv). Mask k1 = 0xf0 merge-inserts
; the 4th row into the top 128-bit lane. The setup filters rows 0-4
; and packs them into 01 12 23 34 pairs via spel_shuf4a.
3527.hv_w4:
3528    movzx               mxd, mxb
3529    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
3530    movzx               mxd, myb
3531    shr                 myd, 16
3532    cmp                  hd, 4
3533    cmove               myd, mxd
3534    mov                 r5d, r7m
3535    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
3536    mov                  r6, ssq
3537    sub                srcq, 2
3538    shr                 r5d, 11
3539    neg                  r6
3540    psllw              xmm0, [base+prep_hv_shift+r5*8]
3541    psllw              xmm1, 2
3542    mova          [tmpq+ 0], xmm0
3543    mova          [tmpq+16], xmm1
3544    vpbroadcastd         m8, [tmpq+ 4]
3545    mov                 r3d, 0xf0
3546    vpbroadcastd         m9, [tmpq+ 8]
3547    vpbroadcastd        m12, xmm1
3548    movu                xm3, [srcq+r6 *2]
3549    kmovb                k1, r3d
3550    vinserti32x4        ym3, [srcq+r6 *1], 1
3551    vbroadcasti32x4      m2, [srcq+ssq*0]
3552    vinserti64x2     m3{k1}, m2, [srcq+ssq*1], 3
3553    movu                xm4, [srcq+ssq*2]
3554    vbroadcasti32x4      m5, [spel_h_shufA]
3555    vbroadcasti32x4      m6, [spel_h_shufB]
3556    mova                 m1, m11
3557    mova                m15, [spel_shuf4a]
3558    mova                xm2, xm11
3559    pshufb               m0, m3, m5
3560    vpdpwssd             m1, m8, m0
3561    pshufb              xm0, xm4, xm5
3562    vpdpwssd            xm2, xm8, xm0
3563    vpbroadcastd        m13, [tmpq+20]
3564    pshufb               m3, m6
3565    vpbroadcastd        m14, [tmpq+24]
3566    pshufb              xm4, xm6
3567    mova                 m7, [spel_shuf4b]
3568    vpdpwssd             m1, m9, m3    ; 0 1 2 3
3569    vpdpwssd            xm2, xm9, xm4  ; 4
3570    vpermt2b             m1, m15, m2   ; 01 12 23 34
3571    mova               ym15, [prep_endA]
3572.hv_w4_loop:
3573    lea                srcq, [srcq+ssq*4]
3574    movu                xm4, [srcq+r6 *1]
3575    vinserti32x4        ym4, [srcq+ssq*0], 1
3576    vbroadcasti32x4      m3, [srcq+ssq*1]
3577    vinserti64x2     m4{k1}, m3, [srcq+ssq*2], 3
3578    mova                 m2, m11
3579    pshufb               m3, m4, m5
3580    vpdpwssd             m2, m8, m3
3581    mova                 m3, m10
3582    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
3583    pshufb               m4, m6
3584    vpdpwssd             m2, m9, m4    ; 5 6 7 8
3585    mova                 m4, m1
3586    vpermt2b             m1, m7, m2    ; 45 56 67 78
3587    vpdpwssd             m3, m14, m1   ; a2 b2 c2 d2
3588    vshufi32x4           m4, m1, q1032 ; 23 34 45 56
3589    vpdpwssd             m3, m13, m4   ; a1 b1 c1 d1
3590    vpermb               m3, m15, m3
3591    mova             [tmpq], ym3
3592    add                tmpq, 32
3593    sub                  hd, 4
3594    jg .hv_w4_loop
3595    RET
; 2D 6-tap filter, w == 8, 4 output rows per iteration. Setup filters
; source rows 0-4 horizontally (lowercase comments a..e = horizontal
; intermediates per row; digits = tap index) and interleaves them into
; vertical row pairs (01 12, 23 34) via spel_shuf8a. The loop produces
; new horizontal rows f..i, then runs the vertical pass (uppercase
; A..D = output rows) with m15/m16/m17 vertical taps and m11 (pd_128)
; rounding.
3596.hv_w8:
3597    mova                 m8, [spel_h_shufA]
3598    movu               ym18, [srcq+r6 *2]
3599    vinserti32x8        m18, [srcq+r6 *1], 1 ; 0 1
3600    movu               ym19, [srcq+ssq*0]
3601    vinserti32x8        m19, [srcq+ssq*1], 1 ; 2 3
3602    movu               ym20, [srcq+ssq*2]    ; 4
3603    movu                 m9, [spel_h_shufC]
3604    mova                m21, [spel_shuf8a]
3605    mova                 m0, [spel_shuf8b]
3606    vpermb               m4, m8, m18
3607    mova                 m1, m10
3608    vpermb               m5, m8, m19
3609    vpdpwssd             m1, m12, m4  ; a0 b0
3610    mova                 m2, m10
3611    vpermb               m6, m8, m20
3612    vpdpwssd             m2, m12, m5  ; c0 d0
3613    mova                 m3, m10
3614    vpermb              m18, m9, m18
3615    vpdpwssd             m3, m12, m6  ; e0
3616    mova                 m7, [prep_endB]
3617    vpermb              m19, m9, m19
3618    vpdpwssd             m1, m14, m18 ; a2 b2
3619    vpermb              m20, m9, m20
3620    vpdpwssd             m2, m14, m19 ; c2 d2
3621    shufpd               m4, m18, 0x55
3622    vpdpwssd             m3, m14, m20 ; e2
3623    shufpd               m5, m19, 0x55
3624    vpdpwssd             m1, m13, m4  ; a1 b1
3625    shufpd               m6, m20, 0x55
3626    vpdpwssd             m2, m13, m5  ; c1 d1
3627    vpdpwssd             m3, m13, m6  ; e1
3628    vpermt2b             m1, m21, m2  ; 01 12
3629    vpermt2b             m2, m21, m3  ; 23 34
3630.hv_w8_loop:
3631    lea                srcq, [srcq+ssq*4]
3632    movu               ym18, [srcq+r6 *1]
3633    vinserti32x8        m18, [srcq+ssq*0], 1
3634    movu               ym19, [srcq+ssq*1]
3635    vinserti32x8        m19, [srcq+ssq*2], 1
3636    mova                 m3, m10
3637    vpermb               m5, m8, m18
3638    mova                 m4, m10
3639    vpermb               m6, m8, m19
3640    vpdpwssd             m3, m12, m5  ; f0 g0
3641    mova                m20, m11
3642    vpdpwssd             m4, m12, m6  ; h0 i0
3643    mova                m21, m11
3644    vpdpwssd            m20, m15, m1  ; A0 B0
3645    vpermb              m18, m9, m18
3646    vpdpwssd            m21, m15, m2  ; C0 D0
3647    vpermb              m19, m9, m19
3648    vpdpwssd             m3, m14, m18 ; f2 g2
3649    vpdpwssd             m4, m14, m19 ; h2 i2
3650    shufpd               m5, m18, 0x55
3651    vpdpwssd            m20, m16, m2  ; A1 B1
3652    shufpd               m6, m19, 0x55
3653    vpdpwssd             m3, m13, m5  ; f1 g1
3654    vpdpwssd             m4, m13, m6  ; h1 i1
3655    vpermt2b             m2, m0, m3   ; 45 56
3656    vpdpwssd            m21, m16, m2  ; C1 D1
3657    mova                 m1, m2
3658    vpermt2b             m2, m0, m4   ; 67 78
3659    vpdpwssd            m20, m17, m1  ; A2 B2
3660    vpdpwssd            m21, m17, m2  ; C2 D2
3661    vpermt2b            m20, m7, m21
3662    mova             [tmpq], m20
3663    add                tmpq, 64
3664    sub                  hd, 4
3665    jg .hv_w8_loop
3666    vzeroupper
3667    RET
; 2D filter setup for w >= 8. Horizontal taps -> m12/m13/m14, vertical
; taps (pre-scaled by 2) -> m15/m16/m17; m11 = pd_128 vertical rounding
; (m10 = prep_8tap_rnd was set in .h). For h < 6 the 4-tap vertical
; variant is selected via cmovs. Dispatch: w == 4 -> .hv_w4,
; w == 8 -> .hv_w8, w == 16 falls through, w >= 32 -> .hv_w32.
3668.hv:
3669    vpbroadcastd        m11, [pd_128]
3670    cmp                  wd, 4
3671    je .hv_w4
3672    shr                 mxd, 16
3673    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
3674    movzx               mxd, myb
3675    shr                 myd, 16
3676    cmp                  hd, 6
3677    cmovs               myd, mxd
3678    mov                 r5d, r7m
3679    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
3680    mov                  r6, ssq
3681    sub                srcq, 4
3682    shr                 r5d, 11
3683    neg                  r6
3684    psllw              xmm0, [base+prep_hv_shift+r5*8]
3685    psllw              xmm1, 2
3686    mova          [tmpq+ 0], xmm0
3687    mova          [tmpq+16], xmm1
3688    vpbroadcastd        m12, xmm0
3689    vpbroadcastd        m13, [tmpq+ 4]
3690    vpbroadcastd        m14, [tmpq+ 8]
3691    vpbroadcastd        m15, xmm1
3692    vpbroadcastd        m16, [tmpq+20]
3693    vpbroadcastd        m17, [tmpq+24]
3694    cmp                  wd, 16
3695    jl .hv_w8
3696    vbroadcasti32x4      m8, [spel_h_shufA]
3697    vbroadcasti32x4      m9, [spel_h_shufB]
3698    jg .hv_w32
; w == 16: setup filters rows 0-4 horizontally. Lowercase a..e =
; per-row horizontal intermediates; primed (') = upper 8-pixel half
; from the +12 source load. Rows are merged into vertical pairs
; (01, 12, 23, 34) via spel_shuf16 + vpshrdd. pslldq m1, 1 pre-shifts
; row 0 so the vpshrdd word-merge lines it up with row pair 12.
3699    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
3700    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
3701    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
3702    movu               ym18, [srcq+r6 *1+ 0]
3703    movu               ym19, [srcq+r6 *1+12]
3704    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
3705    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 1 2
3706    movu               ym20, [srcq+ssq*1+ 0]
3707    movu               ym21, [srcq+ssq*1+12]
3708    lea                srcq, [srcq+ssq*2]
3709    vinserti32x8        m20, [srcq+ssq*0+ 0], 1
3710    vinserti32x8        m21, [srcq+ssq*0+12], 1 ; 3 4
3711    pshufb               m2, m8
3712    mova                 m1, m10
3713    pshufb               m3, m18, m8
3714    vpdpwssd             m1, m14, m2    ; a2
3715    mova                 m2, m10
3716    pshufb               m4, m19, m9
3717    vpdpwssd             m2, m12, m3    ; b0  c0
3718    mova                 m3, m10
3719    pshufb               m5, m20, m8
3720    vpdpwssd             m3, m14, m4    ; b2' c2'
3721    mova                 m4, m10
3722    pshufb               m7, m21, m9
3723    vpdpwssd             m4, m12, m5    ; d0  e0
3724    mova                 m5, m10
3725    pshufb               m0, m6, m8
3726    vpdpwssd             m5, m14, m7    ; d2' e2'
3727    mova                 m7, [spel_shuf16]
3728    pshufb              m18, m9
3729    vpdpwssd             m1, m12, m0    ; a0
3730    pshufb              m19, m8
3731    vpdpwssd             m2, m13, m18   ; b1  c1
3732    pshufb              m20, m9
3733    vpdpwssd             m3, m13, m19   ; b1' c1'
3734    pshufb              m21, m8
3735    vpdpwssd             m4, m13, m20   ; d1  e1
3736    pshufb               m6, m9
3737    vpdpwssd             m5, m13, m21   ; d1' e1'
3738    mova                 m0, [prep_endB]
3739    shufpd              m18, m19, 0x55
3740    vpdpwssd             m1, m13, m6    ; a1
3741    shufpd              m20, m21, 0x55
3742    vpdpwssd             m2, m14, m18   ; b2  c2
3743    vpdpwssd             m3, m12, m18   ; b0' c0'
3744    vpdpwssd             m4, m14, m20   ; d2  e2
3745    vpdpwssd             m5, m12, m20   ; d0' e0'
3746    pslldq               m1, 1
3747    vpermt2b             m2, m7, m3     ; 12
3748    vpermt2b             m4, m7, m5     ; 34
3749    vpshrdd              m1, m2, 16     ; 01
3750    vpshrdd              m3, m2, m4, 16 ; 23
; w == 16 loop: 2 output rows per iteration. f/g = new horizontal
; rows, A/B = vertical outputs.
3751.hv_w16_loop:
3752    movu               ym18, [srcq+ssq*1+ 0]
3753    movu               ym19, [srcq+ssq*1+12]
3754    lea                srcq, [srcq+ssq*2]
3755    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
3756    vinserti32x8        m19, [srcq+ssq*0+12], 1
3757    mova                 m5, m10
3758    mova                 m6, m10
3759    pshufb              m21, m18, m8
3760    vpdpwssd             m5, m12, m21   ; f0  g0
3761    pshufb              m20, m19, m9
3762    mova                m21, m11
3763    vpdpwssd             m6, m14, m20   ; f2' g2'
3764    mova                m20, m11
3765    vpdpwssd            m21, m15, m2    ; B0
3766    mova                 m2, m4
3767    vpdpwssd            m20, m15, m1    ; A0
3768    mova                 m1, m3
3769    pshufb              m18, m9
3770    vpdpwssd             m5, m13, m18   ; f1  g1
3771    pshufb              m19, m8
3772    vpdpwssd             m6, m13, m19   ; f1' g1'
3773    vpdpwssd            m21, m16, m4    ; B1
3774    vpdpwssd            m20, m16, m3    ; A1
3775    shufpd              m18, m19, 0x55
3776    vpdpwssd             m5, m14, m18   ; f2  g2
3777    vpdpwssd             m6, m12, m18   ; f0' g0'
3778    mova                 m4, m7
3779    vpermi2b             m4, m5, m6     ; 56
3780    vpshrdd              m3, m2, m4, 16 ; 45
3781    vpdpwssd            m21, m17, m4    ; B2
3782    vpdpwssd            m20, m17, m3    ; A2
3783    vpermt2b            m20, m0, m21
3784    mova             [tmpq], m20
3785    add                tmpq, 64
3786    sub                  hd, 2
3787    jg .hv_w16_loop
3788    vzeroupper
3789    RET
; 2D 6-tap filter, w >= 32: 32-pixel column strips, 2 output rows per
; inner iteration. Counter packing in r5d mirrors .v_w32 (low byte = h,
; upper bits = strip count, stepped by 1<<8). Per strip, setup filters
; rows 0-4 horizontally into row pairs 01/12/23/34 (primed ' = the
; upper 16 pixels from the +12 loads); the loop adds rows f/g and runs
; the vertical pass (A/B = the two output rows). r8 walks the output in
; w-sized strides and is callee-saved on Win64 (push/pop); xmm spills
; are declared via WIN64_SPILL_XMM since m16-m28 are used.
3790.hv_w32:
3791    WIN64_SPILL_XMM      29
3792%if WIN64
3793    push                 r8
3794%endif
3795    mova                m27, [spel_shuf32]
3796    lea                 r5d, [hq+wq*8-256]
3797    mova                m28, [prep_endC]
3798.hv_w32_loop0:
3799    movu                m18, [srcq+r6 *2+ 0]
3800    movu                 m7, [srcq+r6 *2+12]
3801    movu                 m6, [srcq+r6 *1+ 0]
3802    movu                m20, [srcq+r6 *1+12]
3803    lea                  r7, [srcq+ssq*2]
3804    movu                m19, [srcq+ssq*0+ 0]
3805    movu                m21, [srcq+ssq*0+12]
3806    movu                m22, [srcq+ssq*1+ 0]
3807    movu                m24, [srcq+ssq*1+12]
3808    mov                  r8, tmpq
3809    movu                m23, [r7  +ssq*0+ 0]
3810    movu                m25, [r7  +ssq*0+12]
3811    pshufb               m1, m18, m8
3812    mova                 m0, m10
3813    pshufb               m2, m7, m9
3814    vpdpwssd             m0, m12, m1    ; a0
3815    mova                 m1, m10
3816    pshufb               m4, m6, m8
3817    vpdpwssd             m1, m14, m2    ; a2'
3818    mova                 m2, m10
3819    pshufb               m3, m19, m8
3820    vpdpwssd             m2, m12, m4    ; b0
3821    mova                 m4, m10
3822    pshufb               m5, m20, m9
3823    vpdpwssd             m4, m12, m3    ; c0
3824    mova                 m3, m10
3825    pshufb              m26, m21, m9
3826    vpdpwssd             m3, m14, m5    ; b2'
3827    mova                 m5, m10
3828    pshufb              m18, m9
3829    vpdpwssd             m5, m14, m26   ; c2'
3830    pshufb               m7, m8
3831    vpdpwssd             m0, m13, m18   ; a1
3832    pshufb               m6, m9
3833    vpdpwssd             m1, m13, m7    ; a1'
3834    pshufb              m19, m9
3835    vpdpwssd             m2, m13, m6    ; b1
3836    pshufb              m20, m8
3837    vpdpwssd             m4, m13, m19   ; c1
3838    pshufb              m21, m8
3839    vpdpwssd             m3, m13, m20   ; b1'
3840    shufpd              m18, m7, 0x55
3841    vpdpwssd             m5, m13, m21   ; c1'
3842    shufpd               m6, m20, 0x55
3843    vpdpwssd             m0, m14, m18   ; a2
3844    shufpd              m19, m21, 0x55
3845    vpdpwssd             m1, m12, m18   ; a0'
3846    pshufb              m18, m22, m8
3847    vpdpwssd             m2, m14, m6    ; b2
3848    pshufb               m7, m23, m8
3849    vpdpwssd             m4, m14, m19   ; c2
3850    vpdpwssd             m3, m12, m6    ; b0'
3851    mova                 m6, m10
3852    vpdpwssd             m5, m12, m19   ; c0'
3853    pshufb              m19, m24, m9
3854    vpdpwssd             m6, m12, m18   ; d0
3855    mova                m18, m10
3856    pshufb              m26, m25, m9
3857    vpdpwssd            m18, m12, m7    ; e0
3858    mova                 m7, m10
3859    pshufb              m22, m9
3860    vpdpwssd             m7, m14, m19   ; d2'
3861    mova                m19, m10
3862    pshufb              m23, m9
3863    vpdpwssd            m19, m14, m26   ; e2'
3864    pshufb              m24, m8
3865    vpdpwssd             m6, m13, m22   ; d1
3866    pshufb              m25, m8
3867    vpdpwssd            m18, m13, m23   ; e1
3868    shufpd              m22, m24, 0x55
3869    vpdpwssd             m7, m13, m24   ; d1'
3870    shufpd              m23, m25, 0x55
3871    vpdpwssd            m19, m13, m25   ; e1'
; pslldq pre-shifts row-0 intermediates so the following vpshrdd
; word-merges align them with row pair 12 (cf. the w==16 path).
3872    pslldq               m0, 1
3873    vpdpwssd             m6, m14, m22   ; d2
3874    pslldq               m1, 1
3875    vpdpwssd            m18, m14, m23   ; e2
3876    vpermt2b             m2, m27, m4    ; 12
3877    vpdpwssd             m7, m12, m22   ; d0'
3878    vpermt2b             m3, m27, m5    ; 12'
3879    vpdpwssd            m19, m12, m23   ; e0'
3880    vpshrdd              m0, m2, 16     ; 01
3881    vpermt2b             m6, m27, m18   ; 34
3882    vpshrdd              m1, m3, 16     ; 01'
3883    vpermt2b             m7, m27, m19   ; 34'
3884    vpshrdd              m4, m2, m6, 16 ; 23
3885    vpshrdd              m5, m3, m7, 16 ; 23'
3886.hv_w32_loop:
3887    movu                m22, [r7+ssq*1+ 0]
3888    movu                m24, [r7+ssq*1+12]
3889    lea                  r7, [r7+ssq*2]
3890    movu                m23, [r7+ssq*0+ 0]
3891    movu                m25, [r7+ssq*0+12]
3892    mova                m19, m11
3893    vpdpwssd            m19, m15, m2    ; B0
3894    mova                m21, m11
3895    vpdpwssd            m21, m15, m3    ; B0'
3896    mova                m18, m11
3897    vpdpwssd            m18, m15, m0    ; A0
3898    mova                m20, m11
3899    vpdpwssd            m20, m15, m1    ; A0'
3900    mova                 m2, m6
3901    vpdpwssd            m19, m16, m6    ; B1
3902    mova                 m3, m7
3903    vpdpwssd            m21, m16, m7    ; B1'
3904    mova                 m0, m4
3905    vpdpwssd            m18, m16, m4    ; A1
3906    mova                 m1, m5
3907    pshufb               m4, m22, m8
3908    vpdpwssd            m20, m16, m5    ; A1'
3909    mova                 m6, m10
3910    pshufb               m7, m23, m8
3911    vpdpwssd             m6, m12, m4    ; f0
3912    mova                 m4, m10
3913    pshufb               m5, m24, m9
3914    vpdpwssd             m4, m12, m7    ; g0
3915    mova                 m7, m10
3916    pshufb              m26, m25, m9
3917    vpdpwssd             m7, m14, m5    ; f2'
3918    mova                 m5, m10
3919    pshufb              m22, m9
3920    vpdpwssd             m5, m14, m26   ; g2'
3921    pshufb              m23, m9
3922    vpdpwssd             m6, m13, m22   ; f1
3923    pshufb              m24, m8
3924    vpdpwssd             m4, m13, m23   ; g1
3925    pshufb              m25, m8
3926    vpdpwssd             m7, m13, m24   ; f1'
3927    shufpd              m22, m24, 0x55
3928    vpdpwssd             m5, m13, m25   ; g1'
3929    shufpd              m23, m25, 0x55
3930    vpdpwssd             m6, m14, m22   ; f2
3931    vpdpwssd             m4, m14, m23   ; g2
3932    vpdpwssd             m7, m12, m22   ; f0'
3933    vpdpwssd             m5, m12, m23   ; g0'
3934    vpermt2b             m6, m27, m4    ; 56
3935    vpermt2b             m7, m27, m5    ; 56'
3936    vpdpwssd            m19, m17, m6    ; B2
3937    vpshrdd              m4, m2, m6, 16 ; 45
3938    vpdpwssd            m21, m17, m7    ; B2'
3939    vpshrdd              m5, m3, m7, 16 ; 45'
3940    vpdpwssd            m18, m17, m4    ; A2
3941    vpdpwssd            m20, m17, m5    ; A2'
3942    vpermt2b            m19, m28, m21
3943    vpermt2b            m18, m28, m20
3944    mova          [r8+wq*0], m18
3945    mova          [r8+wq*2], m19
3946    lea                  r8, [r8+wq*4]
3947    sub                  hd, 2
3948    jg .hv_w32_loop
3949    add                srcq, 64
3950    add                tmpq, 64
3951    movzx                hd, r5b
3952    sub                 r5d, 1<<8
3953    jg .hv_w32_loop0
3954%if WIN64
3955    pop                  r8
3956%endif
3957    RET
3958
; Entry points for each h/v filter-type combination. The 4th argument (when
; present) names the shared code body; the final 3-argument invocation is the
; combination whose body follows as the cglobal function below.
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP
3964
cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx512icl
    ; Build combined filter-type/subpel indices: t0d/t1d hold the filter-type
    ; bits injected by PREP_8TAP_FN; mx/my select the subpel phase.
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r7, [prep_avx512icl]
    mov                  wd, wm
    movifnidn            hd, hm
    ; Dispatch: horizontal subpel -> .h, vertical only -> .v,
    ; neither -> shared 6tap copy path.
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep
.v:
    ; Vertical-only 8-tap filtering. For h == 4 use the 4-tap variant of the
    ; vertical filter (low byte of myd).
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    mov                 r5d, r7m
    vpbroadcastd        m10, [prep_8tap_rnd]
    pmovsxbw           xmm0, [base+subpel_filters+myq*8]
    tzcnt               r6d, wd
    ; r7m = pixel_max; >> 11 yields the bitdepth index into prep_hv_shift.
    shr                 r5d, 11
    movzx               r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    add                  r7, r6
    lea                  r6, [strideq*3]
    sub                srcq, r6 ; back up 3 rows for the 8-tap window
    ; Stash shifted coefficients in tmp[] so the four tap pairs can be
    ; broadcast into m12-m15.
    mova             [tmpq], xmm0
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    jmp                  r7 ; jump to the width-specific .v_w* case
; Vertical 8-tap, width 4: four rows per iteration. k1 (0x330c) merges
; 64-bit row loads into one zmm holding 4 rows.
.v_w4:
    mov                 r3d, 0x330c
    movq                xm1, [srcq+strideq*0]
    kmovw                k1, r3d
    vpbroadcastq    ym1{k1}, [srcq+strideq*1]
    vpbroadcastq         m0, [srcq+r6       ]
    vinserti32x4     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq    ym0{k1}, [srcq+strideq*0]
    vpbroadcastq         m2, [srcq+strideq*1]
    vinserti32x4     m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6
    mova                ym5, [prep_endA]
    vshufi32x4           m3, m1, m0, q1021 ; 1 2 3 4
    vshufi32x4           m2, m1, m0, q2132 ; 2 3 4 5
    ; Interleave adjacent rows so vpdpwssd can consume tap pairs.
    punpcklwd            m1, m3            ; 01 12 23 34
    punpcklwd            m2, m0            ; 23 34 45 56
.v_w4_loop:
    movq                xm4, [srcq+r6       ]
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq    ym4{k1}, [srcq+strideq*0]
    vpbroadcastq         m3, [srcq+strideq*1]
    vinserti32x4     m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a
    mova                 m3, m10
    vpdpwssd             m3, m12, m1       ; a0 b0 c0 d0
    valignq              m1, m4, m0, 6     ; 6 7 8 9
    vpdpwssd             m3, m13, m2       ; a1 b1 c1 d1
    mova                 m0, m4
    punpcklwd            m4, m1, m4        ; 67 78 89 9a
    vpdpwssd             m3, m15, m4       ; a3 b3 c3 d3
    vshufi32x4           m1, m2, m4, q1032 ; 45 56 67 78
    vpdpwssd             m3, m14, m1       ; a2 b2 c2 d2
    mova                 m2, m4
    vpermb               m3, m5, m3
    mova             [tmpq], ym3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
; Vertical 8-tap, width 8: four rows per iteration, two row-pairs (a/b in m5,
; c/d in m6) accumulated in parallel.
.v_w8:
    movu                xm0, [srcq+strideq*0]
    mov                 r3d, 0x33
    vbroadcasti32x4     ym1, [srcq+strideq*1]
    kmovb                k1, r3d
    mova                 m7, [spel_v_shuf8]
    vinserti64x2     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
    add                srcq, r6
    vbroadcasti32x4     ym2, [srcq+strideq*0]
    vbroadcasti32x4      m3, [srcq+strideq*1]
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vshufi64x2       m2{k1}, m1, m3, q1032    ; 2 3 4
    vinserti64x2     m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
    mova                 m8, [prep_endB]
    vpermb               m1, m7, m1  ; 01 12
    vpermb               m2, m7, m2  ; 23 34
    vpermb               m3, m7, m0  ; 45 56
.v_w8_loop:
    lea                srcq, [srcq+strideq*4]
    vbroadcasti32x4     ym4, [srcq+strideq*0]
    movu                xm5, [srcq+strideq*1]
    vshufi64x2       m4{k1}, m0, m5, q1032    ; 6 7 8
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vinserti64x2     m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
    mova                 m5, m10
    vpdpwssd             m5, m12, m1 ; a0 b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m2 ; c0 d0
    mova                 m1, m3
    vpdpwssd             m5, m13, m2 ; a1 b1
    vpdpwssd             m6, m13, m3 ; c1 d1
    vpermb               m2, m7, m4  ; 67 78
    vpdpwssd             m5, m14, m3 ; a2 b2
    vpermb               m3, m7, m0  ; 89 9a
    vpdpwssd             m6, m14, m2 ; c2 d2
    vpdpwssd             m5, m15, m2 ; a3 b3
    vpdpwssd             m6, m15, m3 ; c3 d3
    vpermt2b             m5, m8, m6
    mova             [tmpq], m5
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
; Vertical 8-tap, width 16: two rows per iteration. Row pairs are built with
; vpermb; vpshrdd merges adjacent pairs (e.g. 12 + 34 -> 23) without reloads.
.v_w16:
    vbroadcasti32x8      m0, [srcq+strideq*1]
    vinserti32x8         m1, m0, [srcq+strideq*2], 1
    vinserti32x8         m0, [srcq+strideq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+strideq*0]
    vinserti32x8         m3, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+strideq*0], 1
    mova                m11, [prep_endA]
    vpermb               m1, m8, m1     ; 12
    vpermb               m0, m8, m0     ; 01
    vpermb               m3, m8, m3     ; 34
    vpermb               m5, m8, m5     ; 56
    vpshrdd              m2, m1, m3, 16 ; 23
    vpshrdd              m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova                 m7, m10
    vpdpwssd             m7, m12, m1    ; b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; a0
    mova                 m1, m3
    vpdpwssd             m7, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m6, m13, m2    ; a1
    mova                 m3, m5
    vpdpwssd             m7, m14, m5    ; b2
    mova                 m2, m4
    vpdpwssd             m6, m14, m4    ; a2
    movu                ym5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m5, [srcq+strideq*0], 1
    vpermb               m5, m8, m5     ; 78
    vpshrdd              m4, m3, m5, 16 ; 67
    vpdpwssd             m7, m15, m5    ; b3
    vpdpwssd             m6, m15, m4    ; a3
    vpermt2b             m6, m11, m7
    mova             [tmpq], m6
    add                tmpq, 64
    sub                  hd, 2
    jg .v_w16_loop
    RET
; Vertical 8-tap, w >= 32: processed in 32-pixel columns. r5 packs the column
; counter (high bits) with the row count (low byte) for the outer loop.
.v_w32:
.v_w64:
.v_w128:
    WIN64_PUSH_XMM       23
%if WIN64
    push                 r8
%endif
    mova                m11, [prep_endC]
    lea                  r5, [hq+wq*8-256]
.v_w32_loop0:
    ; Load the initial 7-row window and pre-interleave all row pairs,
    ; split into low (m0-m5) and high (m16-m21) halves.
    movu                m16, [srcq+strideq*0]
    movu                m17, [srcq+strideq*1]
    lea                  r7, [srcq+r6]
    movu                m18, [srcq+strideq*2]
    movu                m19, [r7  +strideq*0]
    mov                  r8, tmpq
    movu                m20, [r7  +strideq*1]
    movu                m21, [r7  +strideq*2]
    add                  r7, r6
    movu                m22, [r7  +strideq*0]
    punpcklwd            m0, m16, m17 ; 01l
    punpckhwd           m16, m17      ; 01h
    punpcklwd            m1, m17, m18 ; 12l
    punpckhwd           m17, m18      ; 12h
    punpcklwd            m2, m18, m19 ; 23l
    punpckhwd           m18, m19      ; 23h
    punpcklwd            m3, m19, m20 ; 34l
    punpckhwd           m19, m20      ; 34h
    punpcklwd            m4, m20, m21 ; 45l
    punpckhwd           m20, m21      ; 45h
    punpcklwd            m5, m21, m22 ; 56l
    punpckhwd           m21, m22      ; 56h
.v_w32_loop:
    mova                 m6, m10
    vpdpwssd             m6, m12, m0  ; a0l
    mova                 m8, m10
    vpdpwssd             m8, m12, m16 ; a0h
    mova                 m7, m10
    vpdpwssd             m7, m12, m1  ; b0l
    mova                 m9, m10
    vpdpwssd             m9, m12, m17 ; b0h
    mova                 m0, m2
    vpdpwssd             m6, m13, m2  ; a1l
    mova                m16, m18
    vpdpwssd             m8, m13, m18 ; a1h
    mova                 m1, m3
    vpdpwssd             m7, m13, m3  ; b1l
    mova                m17, m19
    vpdpwssd             m9, m13, m19 ; b1h
    mova                 m2, m4
    vpdpwssd             m6, m14, m4  ; a2l
    mova                m18, m20
    vpdpwssd             m8, m14, m20 ; a2h
    mova                 m3, m5
    vpdpwssd             m7, m14, m5  ; b2l
    mova                m19, m21
    vpdpwssd             m9, m14, m21 ; b2h
    movu                m21, [r7+strideq*1]
    lea                  r7, [r7+strideq*2]
    punpcklwd            m4, m22, m21 ; 67l
    punpckhwd           m20, m22, m21 ; 67h
    movu                m22, [r7+strideq*0]
    vpdpwssd             m6, m15, m4  ; a3l
    vpdpwssd             m8, m15, m20 ; a3h
    punpcklwd            m5, m21, m22 ; 78l
    punpckhwd           m21, m22      ; 78h
    vpdpwssd             m7, m15, m5  ; b3l
    vpdpwssd             m9, m15, m21 ; b3h
    vpermt2b             m6, m11, m8
    vpermt2b             m7, m11, m9
    mova          [r8+wq*0], m6
    mova          [r8+wq*2], m7
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .v_w32_loop
    ; Advance to the next 32-pixel column; restore h from r5's low byte.
    add                srcq, 64
    add                tmpq, 64
    movzx                hd, r5b
    sub                 r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop                  r8
%endif
    RET
; Horizontal 4-tap, width 4: four rows per iteration packed into one zmm.
.h_w4:
    RESET_STACK_STATE
    movzx               mxd, mxb ; low byte = 4-tap filter index
    sub                srcq, 2   ; back up 1 pixel for the 4-tap window
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mov                 r5d, r7m
    vbroadcasti32x4      m4, [spel_h_shufA]
    vbroadcasti32x4      m5, [spel_h_shufB]
    shr                 r5d, 11  ; bitdepth index (see .v)
    mova                ym9, [prep_endA]
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0
    vpbroadcastd         m6, [tmpq+4] ; middle tap pairs only (4-tap)
    vpbroadcastd         m7, [tmpq+8]
.h_w4_loop:
    movu                xm2, [srcq+strideq*0]
    vinserti32x4        ym2, [srcq+strideq*1], 1
    vinserti32x4         m2, [srcq+strideq*2], 2
    vinserti32x4         m2, [srcq+r6       ], 3
    lea                srcq, [srcq+strideq*4]
    mova                 m0, m10
    pshufb               m1, m2, m4
    vpdpwssd             m0, m6, m1
    pshufb               m2, m5
    vpdpwssd             m0, m7, m2
    vpermb               m0, m9, m0
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
; Horizontal 8-tap, width 8: four rows per iteration, two row-pairs (m0/m1).
; Each spel_h_shuf* permute aligns the source window for one tap pair.
.h_w8:
    mova                 m6, [spel_h_shufA]
    movu                 m7, [spel_h_shufB]
    movu                 m8, [spel_h_shufC]
    mova                 m9, [spel_h_shufD]
    mova                m11, [prep_endB]
.h_w8_loop:
    movu                ym4, [srcq+strideq*0]
    vinserti32x8         m4, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    vinserti32x8         m5, [srcq+r6       ], 1
    lea                srcq, [srcq+strideq*4]
    mova                 m0, m10
    mova                 m1, m10
    vpermb               m2, m6, m4
    vpermb               m3, m6, m5
    vpdpwssd             m0, m12, m2
    vpdpwssd             m1, m12, m3
    vpermb               m2, m7, m4
    vpermb               m3, m7, m5
    vpdpwssd             m0, m13, m2
    vpdpwssd             m1, m13, m3
    vpermb               m2, m8, m4
    vpermb               m3, m8, m5
    vpdpwssd             m0, m14, m2
    vpdpwssd             m1, m14, m3
    vpermb               m2, m9, m4
    vpermb               m3, m9, m5
    vpdpwssd             m0, m15, m2
    vpdpwssd             m1, m15, m3
    vpermt2b             m0, m11, m1
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8_loop
    RET
; Horizontal-only filtering: setup + dispatch by width, falling through to
; the w == 16 loop when neither .h_w8 nor .h_w32 is taken.
.h:
    vpbroadcastd        m10, [prep_8tap_rnd]
    test                myd, 0xf00
    jnz .hv
    lea                  r6, [strideq*3]
    cmp                  wd, 4
    je .h_w4
    shr                 mxd, 16 ; 8-tap filter index
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mov                 r5d, r7m
    sub                srcq, 6  ; back up 3 pixels for the 8-tap window
    shr                 r5d, 11 ; bitdepth index (see .v)
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    cmp                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    mova                m11, [prep_endC]
    jg .h_w32
.h_w16_loop:
    ; Two rows per iteration; the +0/+16 loads overlap so shufpd can form
    ; the shifted window shared by the a2/b0 and a3/b1 taps.
    movu                ym2, [srcq+strideq*0+ 0]
    vinserti32x8         m2, [srcq+strideq*1+ 0], 1
    movu                ym3, [srcq+strideq*0+16]
    vinserti32x8         m3, [srcq+strideq*1+16], 1
    lea                srcq, [srcq+strideq*2]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m14, m4 ; b2
    pshufb               m4, m2, m7
    vpdpwssd             m0, m13, m4 ; a1
    pshufb               m4, m3, m7
    vpdpwssd             m1, m15, m4 ; b3
    shufpd               m2, m3, 0x55
    pshufb               m4, m2, m6
    vpdpwssd             m0, m14, m4 ; a2
    vpdpwssd             m1, m12, m4 ; b0
    pshufb               m2, m7
    vpdpwssd             m0, m15, m2 ; a3
    vpdpwssd             m1, m13, m2 ; b1
    vpermt2b             m0, m11, m1
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16_loop
    RET
; Horizontal 8-tap, w >= 32: one row at a time, 32 pixels per inner
; iteration. r6 counts up from -w to 0 so [srcq+r6*2] walks the row.
.h_w32:
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+ 8]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m12, m4 ; b0
    vpdpwssd             m0, m14, m4 ; a2
    movu                 m4, [srcq+r6*2+16]
    pshufb               m3, m7
    vpdpwssd             m1, m13, m3 ; b1
    vpdpwssd             m0, m15, m3 ; a3
    pshufb               m3, m4, m6
    vpdpwssd             m1, m14, m3 ; b2
    pshufb               m2, m7
    vpdpwssd             m0, m13, m2 ; a1
    pshufb               m4, m7
    vpdpwssd             m1, m15, m4 ; b3
    vpermt2b             m0, m11, m1
    mova             [tmpq], m0
    add                tmpq, 64
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, strideq
    dec                  hd
    jg .h_w32_loop0
    RET
; Combined horizontal+vertical filtering. w == 4 is handled inline below;
; larger widths go to .hv_w8. Vertical coefficients are pre-shifted by 2 and
; applied to the 32-bit-rounded horizontal output (rounded via m11 = pd_128).
.hv:
    vpbroadcastd        m11, [pd_128]
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb ; 4-tap horizontal filter
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd ; h == 4: 4-tap vertical filter
    mov                 r5d, r7m
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [strideq*3]
    sub                srcq, 2
    shr                 r5d, 11
    sub                srcq, r6
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2
    mova          [tmpq+ 0], xmm0
    mova          [tmpq+16], xmm1
    vpbroadcastd        m12, xmm1
    ; Gather the initial 7 rows (0-6) with masked inserts.
    movu               xm16, [srcq+strideq*0]
    mov                 r3d, 0xff0
    vinserti128        ym16, [srcq+strideq*1], 1
    kmovw                k1, r3d
    vbroadcasti32x4     m18, [srcq+strideq*2]
    add                srcq, r6
    vinserti64x2    m16{k1}, m18, [srcq+strideq*0], 3
    movu               xm17, [srcq+strideq*1]
    vbroadcasti32x4    ym18, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x4    m17{k1}, m18, [srcq+strideq*0], 2
    vbroadcasti32x4      m5, [spel_h_shufA]
    vbroadcasti32x4      m6, [spel_h_shufB]
    vpbroadcastd         m8, [tmpq+ 4]
    vpbroadcastd         m9, [tmpq+ 8]
    mova                 m1, m10
    mova                m19, [spel_shuf4a]
    mova                 m2, m10
    pshufb               m0, m16, m5
    vpdpwssd             m1, m8, m0
    pshufb               m0, m17, m5
    vpdpwssd             m2, m8, m0
    vpbroadcastd        m13, [tmpq+20]
    pshufb              m16, m6
    vpbroadcastd        m14, [tmpq+24]
    pshufb              m17, m6
    vpbroadcastd        m15, [tmpq+28]
    vpdpwssd             m1, m9, m16       ; 0 1 2 3
    vpdpwssd             m2, m9, m17       ; 4 5 6
    mova                 m7, [spel_shuf4b]
    vpermt2b             m1, m19, m2       ; 01 12 23 34
    vpermb               m2, m19, m2       ; 45 56
    mova               ym19, [prep_endA]
    vshufi32x4           m2, m1, m2, q1032 ; 23 34 45 56
.hv_w4_loop:
    ; Horizontally filter rows 7-a, then run the vertical 4/8-tap pass.
    movu               xm17, [srcq+strideq*1]
    vinserti128        ym17, [srcq+strideq*2], 1
    vbroadcasti32x4     m16, [srcq+r6       ]
    lea                srcq, [srcq+strideq*4]
    vinserti64x2    m17{k1}, m16, [srcq+strideq*0], 3
    mova                m18, m10
    pshufb              m16, m17, m5
    vpdpwssd            m18, m8, m16
    mova                m16, m11
    vpdpwssd            m16, m12, m1       ; a0 b0 c0 d0
    pshufb              m17, m6
    vpdpwssd            m18, m9, m17       ; 7 8 9 a
    mova                 m1, m2
    vpdpwssd            m16, m13, m2       ; a1 b1 c1 d1
    vpermt2b             m2, m7, m18       ; 67 78 89 9a
    vpdpwssd            m16, m15, m2       ; a3 b3 c3 d3
    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
    vpdpwssd            m16, m14, m1       ; a2 b2 c2 d2
    vpermb              m16, m19, m16
    mova             [tmpq], ym16
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    vzeroupper
    RET
; HV filtering, w == 8 (and shared coefficient setup for .hv_w16).
; m12-m15 = horizontal taps, m16-m19 = vertical taps.
.hv_w8:
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd ; h < 6: 4-tap vertical filter
    mov                 r5d, r7m
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [strideq*3]
    sub                srcq, 6
    shr                 r5d, 11
    sub                srcq, r6
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2
    mova          [tmpq+ 0], xmm0
    mova          [tmpq+16], xmm1
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    vpbroadcastd        m16, xmm1
    vpbroadcastd        m17, [tmpq+20]
    vpbroadcastd        m18, [tmpq+24]
    vpbroadcastd        m19, [tmpq+28]
    cmp                  wd, 8
    jg .hv_w16
    WIN64_SPILL_XMM      23
    ; Horizontally filter the initial rows 0-6, two rows per register.
    mova                 m5, [spel_h_shufA]
    movu                ym0, [srcq+strideq*0]
    vinserti32x8         m0, [srcq+strideq*1], 1 ; 0 1
    movu                ym9, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x8         m9, [srcq+strideq*0], 1 ; 2 3
    movu               ym20, [srcq+strideq*1]
    vinserti32x8        m20, [srcq+strideq*2], 1 ; 4 5
    add                srcq, r6
    movu               ym21, [srcq+strideq*0]    ; 6
    movu                 m6, [spel_h_shufB]
    movu                 m7, [spel_h_shufC]
    mova               ym22, [prep_endB]
    vpermb               m8, m5, m0
    mova                 m1, m10
    vpdpwssd             m1, m12, m8  ; a0 b0
    vpermb               m8, m5, m9
    mova                 m2, m10
    vpdpwssd             m2, m12, m8  ; c0 d0
    vpermb               m8, m5, m20
    mova                 m3, m10
    vpdpwssd             m3, m12, m8  ; e0 f0
    vpermb               m8, m5, m21
    mova                 m4, m10
    vpdpwssd             m4, m12, m8  ; g0
    vpermb               m8, m6, m0
    vpdpwssd             m1, m13, m8  ; a1 b1
    vpermb               m8, m6, m9
    vpdpwssd             m2, m13, m8  ; c1 d1
    vpermb               m8, m6, m20
    vpdpwssd             m3, m13, m8  ; e1 f1
    vpermb               m8, m6, m21
    vpdpwssd             m4, m13, m8  ; g1
    vpermb               m8, m7, m0
    vpdpwssd             m1, m14, m8  ; a2 b2
    vpermb               m8, m7, m9
    vpdpwssd             m2, m14, m8  ; c2 d2
    vpermb               m8, m7, m20
    vpdpwssd             m3, m14, m8  ; e2 f2
    vpermb               m8, m7, m21
    vpdpwssd             m4, m14, m8  ; g2
    mova                 m8, [spel_h_shufD]
    vpermb               m0, m8, m0
    vpdpwssd             m1, m15, m0  ; a3 b3
    mova                 m0, [spel_shuf8a]
    vpermb               m9, m8, m9
    vpdpwssd             m2, m15, m9  ; c3 d3
    mova                 m9, [spel_shuf8b]
    vpermb              m20, m8, m20
    vpdpwssd             m3, m15, m20 ; e3 f3
    vpermb              m21, m8, m21
    vpdpwssd             m4, m15, m21 ; g3
    ; Repack horizontal results into vertical row-pair operands.
    vpermt2b             m1, m0, m2   ; 01 12
    vpermt2b             m2, m0, m3   ; 23 34
    vpermt2b             m3, m0, m4   ; 45 56
.hv_w8_loop:
    ; Per iteration: horizontal pass on 2 new rows (m4), vertical pass on
    ; the sliding row-pair window (m20).
    movu                ym0, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m0, [srcq+strideq*0], 1
    mova                 m4, m10
    mova                m20, m11
    vpermb              m21, m5, m0
    vpdpwssd             m4, m12, m21 ; h0 i0
    vpermb              m21, m6, m0
    vpdpwssd            m20, m16, m1  ; A0 B0
    vpdpwssd             m4, m13, m21 ; h1 i1
    vpermb              m21, m7, m0
    mova                 m1, m2
    vpdpwssd            m20, m17, m2  ; A1 B1
    vpdpwssd             m4, m14, m21 ; h2 i2
    vpermb              m21, m8, m0
    mova                 m2, m3
    vpdpwssd            m20, m18, m3  ; A2 B2
    vpdpwssd             m4, m15, m21 ; h3 i3
    vpermt2b             m3, m9, m4   ; 67 78
    vpdpwssd            m20, m19, m3  ; A3 B3
    vpermb              m20, m22, m20
    mova             [tmpq], ym20
    add                tmpq, 32
    sub                  hd, 2
    jg .hv_w8_loop
    RET
; HV filtering, w >= 16: 16-pixel columns, two rows per inner iteration.
; Coefficients (m12-m19) were loaded by the .hv_w8 setup above. r5 packs the
; column counter with the row count, as in .v_w32.
.hv_w16:
    WIN64_SPILL_XMM      27
%if WIN64
    push                 r8
%endif
    vbroadcasti32x4     m20, [spel_h_shufA]
    vbroadcasti32x4     m21, [spel_h_shufB]
    add                  wd, wd ; w *= 2 (bytes per output row)
    mova                 m9, [spel_shuf16]
    mova                m26, [prep_endB]
    lea                 r5d, [hq+wq*8-256]
.hv_w16_loop0:
    ; Horizontally filter the initial rows 0-6. Each row needs +0/+8/+16
    ; loads; shufpd builds the shared shifted window (cf. .h_w16_loop).
    vbroadcasti32x8      m5, [srcq+strideq*0+ 8]
    vinserti32x8         m4, m5, [srcq+strideq*0+ 0], 0
    vinserti32x8         m5, [srcq+strideq*0+16], 1 ; 0
    movu                ym6, [srcq+strideq*1+ 0]
    movu                ym7, [srcq+strideq*1+16]
    lea                  r7, [srcq+r6]
    vinserti32x8         m6, [srcq+strideq*2+ 0], 1
    vinserti32x8         m7, [srcq+strideq*2+16], 1 ; 1 2
    movu               ym22, [r7  +strideq*0+ 0]
    movu               ym23, [r7  +strideq*0+16]
    mov                  r8, tmpq
    vinserti32x8        m22, [r7  +strideq*1+ 0], 1
    vinserti32x8        m23, [r7  +strideq*1+16], 1 ; 3 4
    movu               ym24, [r7  +strideq*2+ 0]
    movu               ym25, [r7  +strideq*2+16]
    add                  r7, r6
    vinserti32x8        m24, [r7  +strideq*0+ 0], 1
    vinserti32x8        m25, [r7  +strideq*0+16], 1 ; 5 6
    pshufb               m0, m4, m20
    mova                 m1, m10
    vpdpwssd             m1, m12, m0    ; a0
    pshufb               m0, m6, m20
    mova                 m2, m10
    vpdpwssd             m2, m12, m0    ; b0
    pshufb               m0, m7, m20
    mova                 m3, m10
    vpdpwssd             m3, m14, m0    ; c2
    pshufb               m0, m4, m21
    vpdpwssd             m1, m13, m0    ; a1
    pshufb               m0, m6, m21
    vpdpwssd             m2, m13, m0    ; b1
    pshufb               m0, m7, m21
    vpdpwssd             m3, m15, m0    ; c3
    pshufb               m0, m5, m20
    vpdpwssd             m1, m14, m0    ; a2
    shufpd               m6, m7, 0x55
    pshufb               m7, m6, m20
    vpdpwssd             m2, m14, m7    ; b2
    vpdpwssd             m3, m12, m7    ; c0
    pshufb               m5, m21
    vpdpwssd             m1, m15, m5    ; a3
    pshufb               m6, m21
    vpdpwssd             m2, m15, m6    ; b3
    vpdpwssd             m3, m13, m6    ; c1
    pshufb               m0, m22, m20
    mova                 m4, m10
    vpdpwssd             m4, m12, m0    ; d0
    pshufb               m0, m23, m20
    mova                 m5, m10
    vpdpwssd             m5, m14, m0    ; e2
    pshufb               m0, m24, m20
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; f0
    pshufb               m0, m25, m20
    mova                 m7, m10
    vpdpwssd             m7, m14, m0    ; g2
    pshufb               m0, m22, m21
    vpdpwssd             m4, m13, m0    ; d1
    pshufb               m0, m23, m21
    vpdpwssd             m5, m15, m0    ; e3
    pshufb               m0, m24, m21
    vpdpwssd             m6, m13, m0    ; f1
    pshufb               m0, m25, m21
    vpdpwssd             m7, m15, m0    ; g3
    shufpd              m22, m23, 0x55
    pshufb              m23, m22, m20
    vpdpwssd             m4, m14, m23   ; d2
    vpdpwssd             m5, m12, m23   ; e0
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m6, m14, m25   ; f2
    vpdpwssd             m7, m12, m25   ; g0
    pshufb              m22, m21
    vpdpwssd             m4, m15, m22   ; d3
    vpdpwssd             m5, m13, m22   ; e1
    pshufb              m24, m21
    vpdpwssd             m6, m15, m24   ; f3
    vpdpwssd             m7, m13, m24   ; g1
    ; Build the vertical row-pair operands 01/12/23/34/45/56.
    pslldq               m1, 1
    vpermt2b             m2, m9, m3     ; 12
    vpermt2b             m4, m9, m5     ; 34
    vpermt2b             m6, m9, m7     ; 56
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
    vpshrdd              m5, m4, m6, 16 ; 45
.hv_w16_loop:
    movu               ym24, [r7+strideq*1+ 0]
    movu               ym25, [r7+strideq*1+16]
    lea                  r7, [r7+strideq*2]
    vinserti32x8        m24, [r7+strideq*0+ 0], 1
    vinserti32x8        m25, [r7+strideq*0+16], 1
    mova                 m7, m10
    mova                 m8, m10
    pshufb               m0, m24, m20
    vpdpwssd             m7, m12, m0    ; h0
    mova                m22, m11
    pshufb               m0, m25, m20
    vpdpwssd             m8, m14, m0    ; i2
    mova                m23, m11
    vpdpwssd            m22, m16, m1    ; A0
    mova                 m1, m3
    vpdpwssd            m23, m16, m2    ; B0
    mova                 m2, m4
    pshufb               m0, m24, m21
    vpdpwssd             m7, m13, m0    ; h1
    pshufb               m0, m25, m21
    vpdpwssd             m8, m15, m0    ; i3
    vpdpwssd            m22, m17, m3    ; A1
    mova                 m3, m5
    vpdpwssd            m23, m17, m4    ; B1
    mova                 m4, m6
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m7, m14, m25   ; h2
    vpdpwssd             m8, m12, m25   ; i0
    vpdpwssd            m22, m18, m5    ; A2
    vpdpwssd            m23, m18, m6    ; B2
    pshufb              m24, m21
    vpdpwssd             m7, m15, m24   ; h3
    vpdpwssd             m8, m13, m24   ; i1
    vpermt2b             m7, m9, m8     ; 78
    vpshrdd              m5, m6, m7, 16 ; 67
    vpdpwssd            m22, m19, m5    ; A3
    vpdpwssd            m23, m19, m7    ; B3
    mova                 m6, m7
    vpermt2b            m22, m26, m23
    mova          [r8+wq*0], ym22
    vextracti32x8 [r8+wq*1], m22, 1
    lea                  r8, [r8+wq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    ; Next 16-pixel column; restore h from r5's low byte.
    add                srcq, 32
    add                tmpq, 32
    movzx                hd, r5b
    sub                 r5d, 1<<8
    jg .hv_w16_loop0
%if WIN64
    pop                  r8
%endif
    RET
4705
; Alias t0 to a volatile (caller-saved) GPR for the following warp
; functions: r5 on Win64, r7 on SysV (where r5/r6 still carry args).
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
4711
; Prep (intermediate-output) variant of the 8x8 warp. It shares the
; .main/.main2 filter core and the .end store code with
; warp_affine_8x8_16bpc below, but uses the warp_8x8t_rnd_v rounding
; constant, shifts the vertical accumulators by 15 and packs with
; *signed* saturation, producing int16_t intermediates instead of
; clamped pixels. In the shared .end, dst/ds map to tmp/ts here.
cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
%define base r6-pd_0to7
    mov                 t0d, r7m             ; pixel_max
    lea                  r6, [pd_0to7]
    shr                 t0d, 11              ; pixel_max>>11 selects per-bitdepth constants
    vpbroadcastd         m8, [base+warp_8x8t_rnd_v] ; bitdepth-independent for the t variant
    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
    psrad               m14, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad               m16, 15
    packssdw            m14, m16             ; rows 0-3, signed saturation
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad               m15, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    add                 tsq, tsq             ; int16_t output: element stride -> byte stride
    psrad               m16, 15
    packssdw            m15, m16             ; rows 4-7
    jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
4731
; 8x8 affine warp, pixel output. Applies an 8-tap horizontal filter per
; row (filters gathered from mc_warp_filter by the per-column x position,
; advanced by alpha/beta) followed by an 8-tap vertical filter (advanced
; by gamma/delta). Each .main2 call produces two output rows ("a b") as
; 32-bit sums in m16. Reuses the "base r6-pd_0to7" %define from the
; 8x8t function above (single-file NASM %defines are global).
cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
    mov                 t0d, r7m ; pixel_max
    lea                  r6, [pd_0to7]
    shr                 t0d, 11  ; per-bitdepth constant index
    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
    vpbroadcastd         m8, [base+warp_8x8_rnd_v+t0*4]
    call .main
    psrad               m14, m16, 13
    call .main2
    psrad               m16, 13
    packusdw            m14, m16            ; rows 0-3, unsigned saturation
    call .main2
    psrad               m15, m16, 13
    call .main2
    vpbroadcastd         m0, [base+bidir_shift+t0*4]
    vpsrlvw             m14, m0             ; final per-bitdepth downshift to pixels
    psrad               m16, 13
    packusdw            m15, m16            ; rows 4-7
    vpsrlvw             m15, m0
.end:
    ; Store 8 rows of 8 pixels; also the tail of warp_affine_8x8t
    ; (which jumps here with dst/ds holding tmp/ts).
    mova                 m0, [base+warp8x8_end]
    vpermb              m16, m0, m14        ; reorder packed row pairs into row order
    lea                  r2, [dsq*3]
    mova          [dstq+dsq*0], xm16
    vextracti128  [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2   ], m16, 3
    vpermb              m16, m0, m15
    lea                dstq, [dstq+dsq*4]
    mova          [dstq+dsq*0], xm16
    vextracti128  [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2   ], m16, 3
    RET
.main:
    ; Setup: load filter tables/permutes, initialize tmx/tmy from the
    ; mx/my arguments plus per-column alpha/gamma steps, then run the
    ; horizontal filter over the first rows, leaving the packed row
    ; pairs 01 12 / 23 34 / 45 56 in m1/m2/m3 for the vertical stage.
    vpbroadcastd        ym3, [base+pd_512]  ; horizontal rounding bias
%if WIN64
    mov               abcdq, r5mp           ; abcd is a stack arg on Win64
    vpaddd             ym18, ym3, r6m {1to8} ; mx
%else
    add                 r5d, 512
    vpbroadcastd       ym18, r5d
%endif
    vpaddd             ym20, ym3, r7m {1to8} ; my
    mova               ym16, [base+pd_0to7]
    vpbroadcastd       ym19, [abcdq+4*0]     ; alpha
    vpbroadcastd       ym21, [abcdq+4*1]     ; gamma
    lea                  r4, [ssq*3+6]
    vpdpwssd           ym18, ym19, ym16      ; tmx = mx + alpha*[0..7]
    vpdpwssd           ym20, ym21, ym16      ; tmy = my + gamma*[0..7]
    sub                srcq, r4              ; back up 3 rows and 3 pixels (filter support)
    mova                m10, [base+warp8x8_permA]
    lea                  r4, [mc_warp_filter+64*8]
    vbroadcasti32x4     m12, [base+warp8x8_permC]
    kxnorb               k1, k1, k1          ; all-ones gather mask
    vbroadcasti32x4     m13, [base+warp8x8_permD]
    movu                ym5, [srcq+0]
    vinserti32x8         m5, [srcq+8], 1
    psrad              ym17, ym18, 10        ; filter index = tmx >> 10
    mova                m11, [base+warp8x8_permB]
    kmovb                k2, k1              ; keep a spare mask (gathers clear k1)
    vpgatherdq       m3{k1}, [r4+ym17*8]    ; filter_x0
    psrad              ym19, 16             ; beta
    psrad              ym21, 16             ; delta
    paddd              ym18, ym19            ; mx += beta
    vpermb               m4, m10, m5
    vpbroadcastq         m9, [base+warp_shift_h+t0*8]
    pshufd               m3, m3, q3120
    paddd                m7, m1, m1          ; m7 = 2*rnd_h, used by .h
    pshufb               m2, m3, m12
    vpdpwssd             m1, m4, m2
    vpermb               m5, m11, m5
    vshufi32x4           m4, m5, q1021
    pshufb               m3, m13
    vpdpwssd             m1, m4, m3
    call .h
    psllq                m2, m1, 32
    paddd                m1, m2
    vpmultishiftqb       m1, m9, m1          ; per-bitdepth horizontal downshift
    vpshrdq              m1, m0, 48          ; 01 12
    call .h
    vpshrdq              m2, m1, m0, 48      ; 23 34
    call .h
    vpshrdq              m3, m2, m0, 48      ; 45 56
.main2:
    ; One vertical step: filter two more rows horizontally (.h), gather
    ; two vertical filters by tmy, and accumulate the 8-tap vertical sum
    ; into m16 (two rows "a b", rnd_v pre-added via m8).
    call .h
    psrad               ym6, ym20, 10
    kmovb                k1, k2
    paddd              ym17, ym20, ym21      ; my += delta
    vpgatherdq      m20{k2}, [r4+ym6*8]      ; filter_y0
    psrad              ym16, ym17, 10
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_y1
    shufps               m5, m20, m6, q2020
    mova                m16, m8              ; start from vertical rounding constant
    pshufb               m4, m5, m12
    vpdpwssd            m16, m1, m4          ; a0 b0
    pshufb               m5, m13
    mova                 m1, m2              ; shift the row-pair pipeline down
    vpdpwssd            m16, m2, m5          ; a1 b1
    shufps               m6, m20, m6, q3131
    paddd              ym20, ym17, ym21      ; my += delta (second row)
    pshufb               m4, m6, m12
    mova                 m2, m3
    vpdpwssd            m16, m3, m4          ; a2 b2
    vpshrdq              m3, m0, 48          ; 67 78
    pshufb               m6, m13
    vpdpwssd            m16, m3, m6          ; a3 b3
    ret
ALIGN function_align
.h:
    ; Horizontal pass for the next two source rows: gather two sets of
    ; per-column filters by tmx (advancing mx by beta twice), do the
    ; 8-tap dot products and return the downshifted result in m0.
    movu               ym16, [srcq+ssq*1]
    psrad               ym6, ym18, 10
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m5, m16, [srcq+ssq*0], 1 ; two rows stacked in one zmm
    kmovb                k1, k2
    paddd              ym17, ym18, ym19      ; mx += beta
    vpgatherdq      m18{k2}, [r4+ym6*8]      ; filter_x1
    psrad              ym16, ym17, 10
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_x2
    vpermb               m4, m10, m5
    shufps              m16, m18, m6, q2020
    shufps               m6, m18, m6, q3131
    mova                 m0, m7              ; start from 2*rnd_h
    pshufb              m18, m16, m12
    vpdpwssd             m0, m4, m18         ; a0 b0
    vpermb               m5, m11, m5
    pshufb              m18, m6, m13
    vpdpwssd             m0, m5, m18         ; a3 b3
    paddd              ym18, ym17, ym19      ; mx += beta (second row)
    vshufi32x4          m17, m4, m5, q1021
    pshufb              m16, m13
    vpdpwssd             m0, m17, m16        ; a1 b1
    vshufi32x4           m4, m5, q2132
    pshufb               m6, m12
    vpdpwssd             m0, m4, m6          ; a2 b2
    vpmultishiftqb       m0, m9, m0          ; a a b b (per-bitdepth downshift)
    ret
4871
; Shared store loop for the bidir compositing functions (avg, w_avg,
; mask). The invoking function must have set up:
;   wq       - jump-table target for the block width
;   hd       - row count
;   .main    - produces 64 output pixels per call in m0 (first 32) and
;              m1 (second 32), advancing tmp1q/tmp2q itself
; Rows per .main call therefore depend on width: 16 rows at w4,
; 8 at w8, 4 at w16, 2 at w32, 1 at w64, 1/2 at w128.
%macro BIDIR_FN 0
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    ; m0/m1 hold up to 16 rows of 4 pixels; h may be 4, 8, or >= 16.
    movq   [dstq          ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp                  hd, 8
    jl .w4_end
    vextracti32x4       xm2, m0, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4       xm0, ym1, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    vextracti32x4       xm0, m1, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 8
    jl .w8_end                              ; h == 4: only m0 was needed
    lea                dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w32:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    ; Two .main calls per row of 128 pixels.
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    call .main
    mova        [dstq+64*2], m0
    mova        [dstq+64*3], m1
    dec                  hd
    jg .w128_loop
    RET
%endmacro
4967
; Re-select the t0 scratch alias for the bidir functions below
; (r5 on Win64, r7 on SysV).
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
4973
; Plain bidir average: dst = ((tmp1 + tmp2) + rnd) >> shift, using
; per-bitdepth avg_round/avg_shift constants. Store loop via BIDIR_FN.
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea                  r6, [avg_avx512icl_table]
    tzcnt                wd, wm
    mov                 t0d, r6m ; pixel_max
    movsxd               wq, [r6+wq*4]
    shr                 t0d, 11  ; per-bitdepth constant index
    vpbroadcastd         m2, [base+avg_round+t0*4]
    vpbroadcastd         m3, [base+avg_shift+t0*4]
    movifnidn            hd, hm
    add                  wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 64 pixels in m0/m1; advances tmp1q/tmp2q.
    mova                 m0, [tmp1q+64*0]
    paddsw               m0, [tmp2q+64*0]    ; saturating sum of intermediates
    mova                 m1, [tmp1q+64*1]
    paddsw               m1, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    ; Clamp the sum at the rounding constant, then subtract it again:
    ; together with the saturating add this biases/clips the signed
    ; intermediates so the word shift below yields the final pixel.
    pmaxsw               m0, m2
    pmaxsw               m1, m2
    psubsw               m0, m2
    psubsw               m1, m2
    vpsrlvw              m0, m3              ; per-bitdepth downshift
    vpsrlvw              m1, m3
    ret
5001
; Weighted bidir average:
;   dst = ((tmp1*weight + tmp2*(16-weight)) + rnd) >> (4 + bidir shift)
; The two 16-bit weights are packed into one dword (m6) so a single
; vpdpwssd per interleaved pair does the blend. Store loop via BIDIR_FN.
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea                  r6, [w_avg_avx512icl_table]
    tzcnt                wd, wm
    mov                 t0d, r7m ; pixel_max
    shr                 t0d, 11
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m5, [base+w_avg_round+t0*4]
    vpbroadcastd         m7, [base+bidir_shift+t0*4]
    add                  wq, r6
    mov                 r6d, r6m ; weight
    lea                 t0d, [r6-16]
    shl                 r6d, 16
    sub                 r6d, t0d ; 16-weight, weight
    movifnidn            hd, hm
    vpbroadcastd         m6, r6d
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 64 pixels in m0/m1; advances tmp1q/tmp2q.
    mova                 m3, [tmp1q+64*0]
    mova                 m1, [tmp2q+64*0]
    mova                 m0, [tmp1q+64*1]
    mova                 m4, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    ; Interleave tmp2/tmp1 words so each dword pairs one sample from
    ; each source for the packed-weight dot product.
    punpcklwd            m2, m1, m3
    punpckhwd            m1, m3
    punpcklwd            m3, m4, m0
    punpckhwd            m4, m0
    mova                 m0, m5              ; accumulators start at w_avg_round
    vpdpwssd             m0, m6, m2
    mova                 m2, m5
    vpdpwssd             m2, m6, m1
    mova                 m1, m5
    vpdpwssd             m1, m6, m3
    mova                 m3, m5
    vpdpwssd             m3, m6, m4
    REPX       {psrad x, 2}, m0, m2, m1, m3
    packusdw             m0, m2
    packusdw             m1, m3
    vpsrlvw              m0, m7              ; per-bitdepth downshift
    vpsrlvw              m1, m7
    ret
5045
; Per-pixel masked blend:
;   dst = ((tmp1*m + tmp2*(64-m)) + rnd) >> (4 + bidir shift)
; with an 8-bit mask (0..64) read from maskq. Store loop via BIDIR_FN.
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea                  r7, [mask_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m8, [base+pw_64]
    vpbroadcastd         m9, [base+mask_round+r6*4]
    vpbroadcastd        m10, [base+bidir_shift+r6*4]
    mov               maskq, maskmp
    add                  wq, r7
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 64 pixels in m0/m1; advances maskq/tmp1q/tmp2q.
    pmovzxbw             m1, [maskq+32*0]    ; m, widened to words
    mova                 m4, [tmp1q+64*0]
    mova                 m2, [tmp2q+64*0]
    pmovzxbw             m6, [maskq+32*1]
    mova                 m5, [tmp1q+64*1]
    mova                 m3, [tmp2q+64*1]
    add               maskq, 32*2
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    punpcklwd            m7, m4, m2          ; interleave tmp1/tmp2 samples
    punpckhwd            m4, m2
    psubw                m0, m8, m1          ; 64-m
    punpcklwd            m2, m1, m0 ; m, 64-m
    punpckhwd            m1, m0
    mova                 m0, m9              ; accumulators start at mask_round
    vpdpwssd             m0, m7, m2
    mova                 m2, m9
    vpdpwssd             m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
    punpcklwd            m7, m5, m3
    punpckhwd            m5, m3
    psubw                m1, m8, m6
    punpcklwd            m3, m6, m1
    punpckhwd            m6, m1
    mova                 m1, m9
    vpdpwssd             m1, m7, m3
    mova                 m3, m9
    vpdpwssd             m3, m5, m6
    REPX       {psrad x, 4}, m0, m2, m1, m3
    packusdw             m0, m2
    packusdw             m1, m3
    vpsrlvw              m0, m10             ; per-bitdepth downshift
    vpsrlvw              m1, m10
    ret
5095
; Blend with a self-derived mask (w_mask), 4:2:0 mask output. .main
; derives per-pixel weights m from |tmp1-tmp2|, blends, and returns the
; weights in m2/m3; each width path then averages the weights over 2x2
; blocks (vertical + horizontal subsampling) with sign-dependent
; rounding (w_mask_round) and stores one mask byte per 2x2 block.
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea                  r7, [w_mask_420_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11
    movsxd               wq, [r7+wq*4]
    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd        m11, [base+pw_64]
    vpbroadcastd        m12, [base+mask_round+r6*4]
    vpbroadcastd        m13, [base+bidir_shift+r6*4]
    mov                 r6d, r7m ; sign
    vpbroadcastd        m14, [base+w_mask_round+r6*4]
    mova               ym15, [w_mask_end42x]  ; byte-permute extracting the final mask
    mov               maskq, maskmp
    add                  wq, r7
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    ; Gather the 2x2 weight groups bytewise, then one vpdpbusd with 64
    ; sums each group of 4 into a dword before the final byte extract.
    mova                 m4, [w_mask_shuf4]
    vpermt2b             m2, m4, m3
    mova                 m3, m14
    vpdpbusd             m3, m2, [pb_64] {1to16}
    vpermb               m3, m15, m3
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    mova            [maskq], xm3
    cmp                  hd, 8
    jl .w4_end
    vextracti32x4       xm2, m0, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4       xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4       xm2, m1, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8:
    mova                 m8, [w_mask_shuf8]
    vpbroadcastd         m9, [pb_64]
    jmp .w8_start
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w8_start:
    vpermt2b             m2, m8, m3
    mova                 m3, m14
    vpdpbusd             m3, m2, m9
    vpermb               m3, m15, m3
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    mova            [maskq], xm3
    sub                  hd, 8
    jl .w8_end
    lea                dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16:
    mova                 m8, [w_mask_shuf16]
    vpbroadcastd         m9, [pb_64]
    jmp .w16_start
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w16_start:
    vpermt2b             m2, m8, m3
    mova                 m3, m14
    vpdpbusd             m3, m2, m9
    vpermb               m3, m15, m3
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    mova            [maskq], xm3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w32:
    ; Two .main calls cover 4 rows; vertical sum via paddw, horizontal
    ; pair-sum (x64) via vpdpwssd with pw_64 before the byte extract.
    paddw                m2, m3
    mova                 m8, m14
    vpdpwssd             m8, m11, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    call .main
    paddw                m2, m3
    mova                 m3, m14
    vpdpwssd             m3, m11, m2
    vpermt2b             m8, m15, m3
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m1
    mova            [maskq], ym8
    sub                  hd, 4
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w64:
    ; Hold the first row's weights in m8/m9 until the second row arrives.
    mova                 m8, m2
    mova                 m9, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    paddw                m8, m2
    paddw                m9, m3
    mova                 m2, m14
    vpdpwssd             m2, m11, m8
    mova                 m3, m14
    vpdpwssd             m3, m11, m9
    vpermt2b             m2, m15, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    mova            [maskq], ym2
    sub                  hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 64
.w128:
    ; Four .main calls cover 2 rows of 128; weights of the upper row are
    ; parked in m16/m8 (left half) and m17/m9 (right half).
    mova               m16, m2
    mova                m8, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    mova                m17, m2
    mova                 m9, m3
    mova [dstq+strideq*0+64*2], m0
    mova [dstq+strideq*0+64*3], m1
    call .main
    paddw                m2, m16
    paddw                m3, m8
    mova                m16, m14
    vpdpwssd            m16, m11, m2
    mova                 m8, m14
    vpdpwssd             m8, m11, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    call .main
    paddw                m2, m17
    paddw                m3, m9
    mova                m17, m14
    vpdpwssd            m17, m11, m2
    mova                 m9, m14
    vpdpwssd             m9, m11, m3
    vpermt2b            m16, m15, m8
    vpermt2b            m17, m15, m9
    mova [dstq+strideq*1+64*2], m0
    mova [dstq+strideq*1+64*3], m1
    mova       [maskq+32*0], ym16
    mova       [maskq+32*1], ym17
    sub                  hd, 2
    jg .w128_loop
    vzeroupper
    RET
ALIGN function_align
.main:
    ; Outputs: m0/m1 = 64 blended pixels, m2/m3 = the per-pixel weights
    ; (0..64) used, for mask subsampling by the caller. Advances
    ; tmp1q/tmp2q.
    mova                 m1, [tmp1q+64*0]
    mova                 m3, [tmp2q+64*0]
    mova                 m4, [tmp1q+64*1]
    mova                 m7, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    psubsw               m6, m1, m3
    punpcklwd            m5, m3, m1
    pabsw                m6, m6
    punpckhwd            m3, m1
    psubusw              m6, m10, m6 ; weight from |diff|, clamped at 0
    psrlw                m6, 10      ; 64-m
    psubw                m2, m11, m6 ; m
    punpcklwd            m1, m6, m2
    punpckhwd            m6, m2
    mova                 m0, m12     ; accumulators start at mask_round
    vpdpwssd             m0, m5, m1
    mova                 m1, m12
    vpdpwssd             m1, m3, m6
    psubsw               m5, m4, m7
    punpcklwd            m6, m7, m4
    pabsw                m5, m5
    punpckhwd            m7, m4
    psubusw              m5, m10, m5
    psrlw                m5, 10
    psubw                m3, m11, m5
    punpcklwd            m4, m5, m3
    psrad                m0, 4
    punpckhwd            m5, m3
    psrad                m1, 4
    packusdw             m0, m1
    mova                 m1, m12
    vpdpwssd             m1, m6, m4
    mova                 m4, m12
    vpdpwssd             m4, m7, m5
    psrad                m1, 4
    psrad                m4, 4
    packusdw             m1, m4
    vpsrlvw              m0, m13     ; per-bitdepth downshift
    vpsrlvw              m1, m13
    ret
5329
; Blend with a self-derived mask (w_mask), 4:2:2 mask output. Unlike the
; 420 variant, .main both blends and writes the horizontally-subsampled
; mask itself (pairwise weight sums via pw_128 dot products plus the
; sign-dependent w_mask_round), so the width paths only store pixels.
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea                  r7, [w_mask_422_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd         m9, [base+pw_64]
    vpbroadcastd        m10, [base+mask_round+r6*4]
    vpbroadcastd        m11, [base+bidir_shift+r6*4]
    mov                 r6d, r7m ; sign
    vpbroadcastd        m12, [base+w_mask_round+r6*4]
    mova               ym13, [w_mask_end42x]
    mov               maskq, maskmp
    add                  wq, r7
    paddw               m14, m9, m9 ; pw_128
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp                  hd, 8
    jl .w4_end
    vextracti32x4       xm2, m0, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4       xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4       xm2, m1, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 8
    jl .w8_end
    lea                dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w32:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    call .main
    mova        [dstq+64*2], m0
    mova        [dstq+64*3], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Produce 64 pixels in m0/m1 and store 32 subsampled mask bytes;
    ; advances tmp1q/tmp2q/maskq.
    mova                 m1, [tmp1q+64*0]
    mova                 m3, [tmp2q+64*0]
    mova                 m4, [tmp1q+64*1]
    mova                 m7, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    psubsw               m6, m1, m3
    punpcklwd            m5, m3, m1
    pabsw                m6, m6
    punpckhwd            m3, m1
    psubusw              m6, m8, m6  ; weight from |diff|, clamped at 0
    psrlw                m6, 10      ; 64-m
    psubw                m2, m9, m6  ; m
    punpcklwd            m1, m6, m2
    punpckhwd            m6, m2
    mova                 m0, m10     ; accumulators start at mask_round
    vpdpwssd             m0, m5, m1
    mova                 m1, m10
    vpdpwssd             m1, m3, m6
    psubsw               m5, m4, m7
    punpcklwd            m6, m7, m4
    pabsw                m5, m5
    punpckhwd            m7, m4
    psubusw              m5, m8, m5
    psrlw                m5, 10
    psubw                m3, m9, m5
    punpcklwd            m4, m5, m3
    psrad                m0, 4
    punpckhwd            m5, m3
    psrad                m1, 4
    packusdw             m0, m1
    mova                 m1, m10
    vpdpwssd             m1, m6, m4
    mova                 m4, m10
    vpdpwssd             m4, m7, m5
    ; Horizontal 2:1 mask subsample: pairwise sums of m (x128) plus the
    ; sign-dependent rounding, then extract bytes via w_mask_end42x.
    mova                 m5, m12
    vpdpwssd             m5, m14, m2
    mova                 m2, m12
    vpdpwssd             m2, m14, m3
    psrad                m1, 4
    psrad                m4, 4
    packusdw             m1, m4
    vpermt2b             m5, m13, m2
    vpsrlvw              m0, m11     ; per-bitdepth downshift
    vpsrlvw              m1, m11
    mova            [maskq], ym5
    add               maskq, 32
    ret
5491
5492cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
5493%define base r7-w_mask_444_avx512icl_table
5494    lea                  r7, [w_mask_444_avx512icl_table]
5495    tzcnt                wd, wm
5496    mov                 r6d, r8m ; pixel_max
5497    movifnidn            hd, hm
5498    shr                 r6d, 11
5499    movsxd               wq, [r7+wq*4]
5500    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
5501    vpbroadcastd         m9, [base+pw_64]
5502    vpbroadcastd        m10, [base+mask_round+r6*4]
5503    mova                m11, [w_mask_end444]
5504    vpbroadcastd        m12, [base+bidir_shift+r6*4]
5505    mov               maskq, maskmp
5506    add                  wq, r7
5507    call .main
5508    lea            stride3q, [strideq*3]
5509    jmp                  wq
5510.w4:
5511    movq   [dstq+strideq*0], xm0
5512    movhps [dstq+strideq*1], xm0
5513    vextracti32x4       xm2, ym0, 1
5514    movq   [dstq+strideq*2], xm2
5515    movhps [dstq+stride3q ], xm2
5516    cmp                  hd, 8
5517    jl .w4_end
5518    vextracti32x4       xm2, m0, 2
5519    lea                dstq, [dstq+strideq*4]
5520    movq   [dstq+strideq*0], xm2
5521    movhps [dstq+strideq*1], xm2
5522    vextracti32x4       xm0, m0, 3
5523    movq   [dstq+strideq*2], xm0
5524    movhps [dstq+stride3q ], xm0
5525    je .w4_end
5526    lea                dstq, [dstq+strideq*4]
5527    movq   [dstq+strideq*0], xm1
5528    movhps [dstq+strideq*1], xm1
5529    vextracti32x4       xm2, ym1, 1
5530    movq   [dstq+strideq*2], xm2
5531    movhps [dstq+stride3q ], xm2
5532    vextracti32x4       xm2, m1, 2
5533    lea                dstq, [dstq+strideq*4]
5534    movq   [dstq+strideq*0], xm2
5535    movhps [dstq+strideq*1], xm2
5536    vextracti32x4       xm1, m1, 3
5537    movq   [dstq+strideq*2], xm1
5538    movhps [dstq+stride3q ], xm1
5539.w4_end:
5540    RET
5541.w8_loop:
5542    call .main
5543    lea                dstq, [dstq+strideq*4]
5544.w8:
5545    mova          [dstq+strideq*0], xm0
5546    vextracti32x4 [dstq+strideq*1], ym0, 1
5547    vextracti32x4 [dstq+strideq*2], m0, 2
5548    vextracti32x4 [dstq+stride3q ], m0, 3
5549    sub                  hd, 8
5550    jl .w8_end
5551    lea                dstq, [dstq+strideq*4]
5552    mova          [dstq+strideq*0], xm1
5553    vextracti32x4 [dstq+strideq*1], ym1, 1
5554    vextracti32x4 [dstq+strideq*2], m1, 2
5555    vextracti32x4 [dstq+stride3q ], m1, 3
5556    jg .w8_loop
5557.w8_end:
5558    RET
5559.w16_loop:
5560    call .main
5561    lea                dstq, [dstq+strideq*4]
5562.w16:
5563    mova          [dstq+strideq*0], ym0
5564    vextracti32x8 [dstq+strideq*1], m0, 1
5565    mova          [dstq+strideq*2], ym1
5566    vextracti32x8 [dstq+stride3q ], m1, 1
5567    sub                  hd, 4
5568    jg .w16_loop
5569    RET
5570.w32_loop:
5571    call .main
5572    lea                dstq, [dstq+strideq*2]
5573.w32:
5574    mova   [dstq+strideq*0], m0
5575    mova   [dstq+strideq*1], m1
5576    sub                  hd, 2
5577    jg .w32_loop
5578    RET
5579.w64_loop:
5580    call .main
5581    add                dstq, strideq
5582.w64:
5583    mova        [dstq+64*0], m0
5584    mova        [dstq+64*1], m1
5585    dec                  hd
5586    jg .w64_loop
5587    RET
5588.w128_loop:
5589    call .main
5590    add                dstq, strideq
5591.w128:
5592    mova        [dstq+64*0], m0
5593    mova        [dstq+64*1], m1
5594    call .main
5595    mova        [dstq+64*2], m0
5596    mova        [dstq+64*3], m1
5597    dec                  hd
5598    jg .w128_loop
5599    RET
5600ALIGN function_align
5601.main:
5602    mova                 m1, [tmp1q+64*0]
5603    mova                 m3, [tmp2q+64*0]
5604    mova                 m4, [tmp1q+64*1]
5605    mova                 m7, [tmp2q+64*1]
5606    add               tmp1q, 64*2
5607    add               tmp2q, 64*2
5608    psubsw               m6, m1, m3
5609    punpcklwd            m5, m3, m1
5610    pabsw                m6, m6
5611    punpckhwd            m3, m1
5612    psubusw              m6, m8, m6
5613    psrlw                m6, 10
5614    psubw                m2, m9, m6
5615    punpcklwd            m1, m6, m2
5616    punpckhwd            m6, m2
5617    mova                 m0, m10
5618    vpdpwssd             m0, m5, m1
5619    mova                 m1, m10
5620    vpdpwssd             m1, m3, m6
5621    psubsw               m5, m4, m7
5622    punpcklwd            m6, m7, m4
5623    pabsw                m5, m5
5624    punpckhwd            m7, m4
5625    psubusw              m5, m8, m5
5626    psrlw                m5, 10
5627    psubw                m3, m9, m5
5628    punpcklwd            m4, m5, m3
5629    psrad                m0, 4
5630    punpckhwd            m5, m3
5631    psrad                m1, 4
5632    packusdw             m0, m1
5633    mova                 m1, m10
5634    vpdpwssd             m1, m6, m4
5635    mova                 m4, m10
5636    vpdpwssd             m4, m7, m5
5637    vpermt2b             m2, m11, m3
5638    psrad                m1, 4
5639    psrad                m4, 4
5640    packusdw             m1, m4
5641    vpsrlvw              m0, m12
5642    vpsrlvw              m1, m12
5643    mova            [maskq], m2
5644    add               maskq, 64
5645    ret
5646
;-----------------------------------------------------------------------
; void blend_16bpc(pixel *dst, ptrdiff_t ds, const pixel *tmp,
;                  int w, int h, const uint8_t *mask)
; Per-pixel masked blend of tmp into dst. The mask bytes are widened to
; words and multiplied by -512 == -(1<<15)/64, so that
;   pmulhrsw(dst - tmp, mask * -512) == (tmp - dst) * mask/64 (rounded)
; giving dst += (tmp - dst) * mask/64, i.e. a 6-bit-weight blend.
; 4 rows are processed per iteration for w <= 16, 2 rows for w == 32.
;-----------------------------------------------------------------------
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea                  r6, [blend_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]      ; load branch offset for this width
    movifnidn         maskq, maskmp
    vpbroadcastd         m6, [base+pw_m512] ; -512: turns pmulhrsw into *mask/64
    add                  wq, r6
    lea                  r6, [dsq*3]        ; ds*3, for addressing the 4th row
    jmp                  wq
.w4:
    pmovzxbw           ym19, [maskq]        ; 16 mask bytes -> 16 words (4x4 px)
    movq               xm16, [dstq+dsq*0]
    movhps             xm16, [dstq+dsq*1]
    vpbroadcastq       ym17, [dstq+dsq*2]
    vpbroadcastq       ym18, [dstq+r6   ]
    pmullw             ym19, ym6            ; mask * -512
    vpblendd           ym16, ym17, 0x30     ; pack rows 0-3 into one ymm
    vpblendd           ym16, ym18, 0xc0
    psubw              ym17, ym16, [tmpq]   ; dst - tmp
    add               maskq, 16
    add                tmpq, 32
    pmulhrsw           ym17, ym19           ; (tmp-dst)*mask/64, rounded
    paddw              ym16, ym17
    vextracti128       xm17, ym16, 1
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    movq       [dstq+dsq*2], xm17
    movhps     [dstq+r6   ], xm17
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w4
    vzeroupper                              ; used ymm16+; clear upper state
    RET
.w8:
    pmovzxbw             m2, [maskq]        ; 32 mask bytes (4 rows of 8)
    mova                xm0, [dstq+dsq*0]
    vinserti32x4        ym0, [dstq+dsq*1], 1
    vinserti32x4         m0, [dstq+dsq*2], 2
    vinserti32x4         m0, [dstq+r6   ], 3 ; 4 rows stacked in one zmm
    pmullw               m2, m6
    psubw                m1, m0, [tmpq]     ; dst - tmp
    add               maskq, 32
    add                tmpq, 64
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6   ], m0, 3
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    pmovzxbw             m4, [maskq+32*0]   ; masks for rows 0-1
    pmovzxbw             m5, [maskq+32*1]   ; masks for rows 2-3
    mova                ym0, [dstq+dsq*0]
    vinserti32x8         m0, [dstq+dsq*1], 1
    mova                ym1, [dstq+dsq*2]
    vinserti32x8         m1, [dstq+r6   ], 1
    pmullw               m4, m6
    pmullw               m5, m6
    psubw                m2, m0, [tmpq+64*0]
    psubw                m3, m1, [tmpq+64*1]
    add               maskq, 32*2
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    mova          [dstq+dsq*2], ym1
    vextracti32x8 [dstq+r6   ], m1, 1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32:
    pmovzxbw             m4, [maskq+32*0]   ; masks for row 0
    pmovzxbw             m5, [maskq+32*1]   ; masks for row 1
    mova                 m0, [dstq+dsq*0]
    mova                 m1, [dstq+dsq*1]
    pmullw               m4, m6
    pmullw               m5, m6
    psubw                m2, m0, [tmpq+ 64*0]
    psubw                m3, m1, [tmpq+ 64*1]
    add               maskq, 32*2
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w32
    RET
5748
;-----------------------------------------------------------------------
; void blend_v_16bpc(pixel *dst, ptrdiff_t ds, const pixel *tmp,
;                    int w, int h)
; OBMC blend along a vertical edge: the blend weight is constant per
; column, taken from the obmc_masks_avx2 table at offset w*2 (the table
; presumably stores pre-scaled pmulhrsw coefficients -- no pw_m512
; multiply is done here, unlike blend_16bpc). 2 rows per iteration.
;-----------------------------------------------------------------------
cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
    lea                  r5, [blend_v_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]      ; load branch offset for this width
    add                  wq, r5
    jmp                  wq
.w2:
    vpbroadcastd       xmm2, [obmc_masks_avx2+2*2] ; weights for the 2 columns
.w2_loop:
    movd               xmm0, [dstq+dsq*0]
    pinsrd             xmm0, [dstq+dsq*1], 1 ; 2 rows x 2 pixels in one xmm
    movq               xmm1, [tmpq]
    add                tmpq, 4*2
    psubw              xmm1, xmm0, xmm1     ; dst - tmp
    pmulhrsw           xmm1, xmm2
    paddw              xmm0, xmm1
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    vpbroadcastq       xmm2, [obmc_masks_avx2+4*2] ; weights for the 4 columns
.w4_loop:
    movq               xmm0, [dstq+dsq*0]
    movhps             xmm0, [dstq+dsq*1]   ; 2 rows x 4 pixels
    psubw              xmm1, xmm0, [tmpq]   ; dst - tmp
    add                tmpq, 8*2
    pmulhrsw           xmm1, xmm2
    paddw              xmm0, xmm1
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4     ym2, [obmc_masks_avx2+8*2] ; same 8 weights per row
.w8_loop:
    mova                xm0, [dstq+dsq*0]
    vinserti32x4        ym0, [dstq+dsq*1], 1 ; 2 rows x 8 pixels
    psubw               ym1, ym0, [tmpq]    ; dst - tmp
    add                tmpq, 16*2
    pmulhrsw            ym1, ym2
    paddw               ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x8      m2, [obmc_masks_avx2+16*2] ; same 16 weights per row
.w16_loop:
    mova                ym0, [dstq+dsq*0]
    vinserti32x8         m0, [dstq+dsq*1], 1 ; 2 rows x 16 pixels
    psubw                m1, m0, [tmpq]     ; dst - tmp
    add                tmpq, 32*2
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
.w32:
    mova                 m4, [obmc_masks_avx2+32*2] ; 32 per-column weights
.w32_loop:
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 64*0] ; dst - tmp, row 0
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 64*1] ; dst - tmp, row 1
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
5835
;-----------------------------------------------------------------------
; void blend_h_16bpc(pixel *dst, ptrdiff_t ds, const pixel *tmp,
;                    int w, int h, const uint8_t *mask)
; OBMC blend along a horizontal edge: the blend weight is constant per
; row, taken from obmc_masks_avx2 starting at offset h*2. Only the top
; h*3/4 rows are processed (the loop counter is set to -(h*3/4) and
; counts up to zero); the remaining rows are left untouched. The mask
; pointer is pre-biased by (h*3/4)*2 so [maskq+hq*2] indexes forward
; through the table as hq rises toward 0.
;-----------------------------------------------------------------------
cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
    lea                  r6, [$$]           ; PIC base for table addressing
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [base+blend_h_avx512icl_table+wq*4]
    lea               maskq, [base+obmc_masks_avx2+hq*2]
    lea                  hd, [hq*3]
    lea                  wq, [base+blend_h_avx512icl_table+wq]
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]   ; bias so maskq+hq*2 starts at h*2
    neg                  hq                 ; count -(3h/4)..0
    jmp                  wq
.w2:
    movd               xmm0, [dstq+dsq*0]
    pinsrd             xmm0, [dstq+dsq*1], 1 ; 2 rows x 2 pixels
    movd               xmm2, [maskq+hq*2]   ; weights of rows 0 and 1
    movq               xmm1, [tmpq]
    add                tmpq, 4*2
    punpcklwd          xmm2, xmm2           ; duplicate each row weight per px
    psubw              xmm1, xmm0, xmm1     ; dst - tmp
    pmulhrsw           xmm1, xmm2
    paddw              xmm0, xmm1
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
.w4:
    mova               xmm3, [blend_shuf]   ; splats row weights across lanes
.w4_loop:
    movq               xmm0, [dstq+dsq*0]
    movhps             xmm0, [dstq+dsq*1]   ; 2 rows x 4 pixels
    movd               xmm2, [maskq+hq*2]
    psubw              xmm1, xmm0, [tmpq]   ; dst - tmp
    add                tmpq, 8*2
    pshufb             xmm2, xmm3           ; per-row weight -> per-pixel words
    pmulhrsw           xmm1, xmm2
    paddw              xmm0, xmm1
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
.w8:
    vbroadcasti32x4     ym3, [blend_shuf]
    shufpd              ym3, ym3, 0x0c      ; row0 weight in lane 0, row1 in lane 1
.w8_loop:
    mova                xm0, [dstq+dsq*0]
    vinserti32x4        ym0, [dstq+dsq*1], 1 ; 2 rows x 8 pixels
    vpbroadcastd        ym2, [maskq+hq*2]
    psubw               ym1, ym0, [tmpq]    ; dst - tmp
    add                tmpq, 16*2
    pshufb              ym2, ym3
    pmulhrsw            ym1, ym2
    paddw               ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x4      m3, [blend_shuf]
    shufpd               m3, m3, 0xf0       ; row0 weight low half, row1 high half
.w16_loop:
    mova                ym0, [dstq+dsq*0]
    vinserti32x8         m0, [dstq+dsq*1], 1 ; 2 rows x 16 pixels
    vpbroadcastd         m2, [maskq+hq*2]
    psubw                m1, m0, [tmpq]     ; dst - tmp
    add                tmpq, 32*2
    pshufb               m2, m3
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w16_loop
    RET
.w32:
    vpbroadcastw         m4, [maskq+hq*2]   ; weight of row 0
    vpbroadcastw         m5, [maskq+hq*2+2] ; weight of row 1
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 64*0] ; dst - tmp, row 0
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 64*1] ; dst - tmp, row 1
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w32
    RET
.w64:
    vpbroadcastw         m4, [maskq+hq*2]   ; one row (64 px) per iteration
    mova                 m0,     [dstq+64*0]
    psubw                m2, m0, [tmpq+64*0]
    mova                 m1,     [dstq+64*1]
    psubw                m3, m1, [tmpq+64*1]
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    add                dstq, dsq
    inc                  hq
    jl .w64
    RET
.w128:
    vpbroadcastw         m8, [maskq+hq*2]   ; one row (128 px) per iteration
    mova                 m0,     [dstq+64*0]
    psubw                m4, m0, [tmpq+64*0]
    mova                 m1,     [dstq+64*1]
    psubw                m5, m1, [tmpq+64*1]
    mova                 m2,     [dstq+64*2]
    psubw                m6, m2, [tmpq+64*2]
    mova                 m3,     [dstq+64*3]
    psubw                m7, m3, [tmpq+64*3]
    add                tmpq, 64*4
    REPX   {pmulhrsw x, m8}, m4, m5, m6, m7
    paddw                m0, m4
    paddw                m1, m5
    paddw                m2, m6
    paddw                m3, m7
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    mova        [dstq+64*2], m2
    mova        [dstq+64*3], m3
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET
5977
;-----------------------------------------------------------------------
; void resize_16bpc(pixel *dst, ptrdiff_t dst_stride,
;                   const pixel *src, ptrdiff_t src_stride, int dst_w,
;                   int h, int src_w, int dx, int mx0, int pxmax)
; Horizontal scaling with an 8-tap filter per output pixel.
; Source positions are tracked in fixed point with 14 fractional bits
; (mx0 + dx*x); the filter phase is (mx >> 8) & 63, selecting one of 64
; entries in resize_filter. 16 output pixels are produced per inner
; iteration via dword/qword gathers. When the 8-tap window would read
; outside [0, src_w-8] (nonzero pshufb edge_emu offset), a slower path
; gathers qwords and uses resize_shuf lookups to replicate edge pixels.
; Accumulation is (16384 - sum) >> 15 (filter coefficients are presumably
; stored negated -- TODO confirm against resize_filter), packed unsigned
; and clamped to pxmax.
;-----------------------------------------------------------------------
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub          dword mx0m, 4<<14          ; bias mx so the 8-tap window is centered
    sub        dword src_wm, 8              ; max legal src_x for a full window
    mov                  r6, ~0
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm
    kmovq                k6, r6              ; all-ones mask, reloaded before each gather
 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA                  r7, $$
%define base r7-$$
    vpbroadcastd         m3, [base+pd_16384]
    vpbroadcastd         m7, [base+pd_63]
    mova                m24, [base+resize_permA]
    mova                m25, [base+resize_permB]
    mova                m26, [base+resize_permC]
    mova                m27, [base+resize_permD]
    vbroadcasti32x4     m28, [base+resize_shufA]
    vbroadcasti32x4     m29, [base+resize_shufB]
    mova                m30, [base+resize_permE]
    vpbroadcastw       ym31, pxmaxm          ; per-pixel clamp (bitdepth max)
    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld                m5, 4                      ; dx*16
    pslld                m6, 14                     ; (src_w-8) << 14
    pxor                 m2, m2                     ; zero, also vpdpwssd seed
.loop_y:
    xor                  xd, xd
    mova                 m4, m8     ; per-line working version of mx
.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8  ; filter offset (unmasked)
    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0 ; pshufb offset
    psrad                m0, 14     ; clipped src_x offset
    psrad                m1, 14     ; pshufb edge_emu offset
    vptestmd             k5, m1, m1 ; nonzero => window clipped at a border
    pand                 m9, m7     ; filter offset (masked)
    ktestw               k5, k5
    jz .load                        ; fast path: no edge emulation needed
    vpbroadcastq        m14, [base+pd_0_4]
    vpermq              m10, m0, q1100
    vpermq              m11, m0, q3322
    vpermq              m20, m1, q1100
    vpermq              m21, m1, q3322
    punpckldq           m10, m10
    punpckldq           m11, m11
    punpckldq           m20, m20
    punpckldq           m21, m21
    paddd               m10, m14   ; two dword offsets (x, x+4) per pixel
    paddd               m11, m14
    paddd               m20, m14
    paddd               m21, m14
    vextracti32x8      ym12, m10, 1
    vextracti32x8      ym13, m11, 1
    vextracti32x8      ym22, m20, 1
    vextracti32x8      ym23, m21, 1
    kmovq                k1, k6     ; gathers consume the mask; refresh each time
    kmovq                k2, k6
    kmovq                k3, k6
    kmovq                k4, k6
    vpgatherdq      m16{k1}, [srcq+ym10*2] ; 0 1 2 3
    vpgatherdq      m17{k2}, [srcq+ym11*2] ; 4 5 6 7
    vpgatherdq      m18{k3}, [srcq+ym12*2] ; 8 9 A B
    vpgatherdq      m19{k4}, [srcq+ym13*2] ; C D E F
    kmovq                k1, k6
    kmovq                k2, k6
    kmovq                k3, k6
    kmovq                k4, k6
    vpgatherdq       m0{k1}, [base+resize_shuf+8+ym20*2] ; edge-replication shuffles
    vpgatherdq       m1{k2}, [base+resize_shuf+8+ym21*2]
    vpgatherdq      m14{k3}, [base+resize_shuf+8+ym22*2]
    vpgatherdq      m15{k4}, [base+resize_shuf+8+ym23*2]
    pshufb              m16, m0
    pshufb              m17, m1
    pshufb              m18, m14
    pshufb              m19, m15
    mova                m20, m24   ; transpose gathered rows into tap order
    mova                m22, m24
    mova                m21, m25
    mova                m23, m25
    vpermi2d            m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
    vpermi2d            m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
    vpermi2d            m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
    vpermi2d            m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
    mova                m15, m26
    mova                m17, m26
    mova                m16, m27
    mova                m18, m27
    vpermi2q            m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
    vpermi2q            m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
    vpermi2q            m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
    vpermi2q            m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
    kmovq                k1, k6
    kmovq                k2, k6
    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; taps 0-3 per pixel
    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4] ; taps 4-7 per pixel
    pshufb              m10, m11, m28
    pshufb              m11, m11, m29
    pshufb              m12, m13, m28
    pshufb              m13, m13, m29
    jmp .filter
.load:
    kmovq                k1, k6
    kmovq                k2, k6
    kmovq                k3, k6
    kmovq                k4, k6
    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; taps 0-3 per pixel
    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4] ; taps 4-7 per pixel
    pshufb              m10, m11, m28
    pshufb              m11, m11, m29
    pshufb              m12, m13, m28
    pshufb              m13, m13, m29
    vpgatherdd      m15{k3}, [srcq+m0*2+ 0] ; src pixels 0-1 of each window
    vpgatherdd      m16{k4}, [srcq+m0*2+ 4] ; src pixels 2-3
    kmovq                k1, k6
    kmovq                k2, k6
    vpgatherdd      m17{k1}, [srcq+m0*2+ 8] ; src pixels 4-5
    vpgatherdd      m18{k2}, [srcq+m0*2+12] ; src pixels 6-7
.filter:
    mova                m14, m2     ; zero accumulator
    vpdpwssd            m14, m15, m10
    vpdpwssd            m14, m16, m11
    vpdpwssd            m14, m17, m12
    vpdpwssd            m14, m18, m13
    psubd               m14, m3, m14 ; (16384 - sum), also rounds the >>15
    psrad               m14, 15
    packusdw            m14, m14
    vpermq              m14, m30, m14 ; compact 16 results into the low ymm
    pminsw             ym14, ym31     ; clamp to pxmax
    mova        [dstq+xq*2], ym14
    paddd                m4, m5       ; mx += dx*16
    add                  xd, 16
    cmp                  xd, dst_wd
    jl .loop_x
    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET
6118
6119%endif ; ARCH_X86_64
6120