; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018-2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; dav1d_obmc_masks[] with 64-x interleaved
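; (pairing each mask m with its complement 64-m means a single pmaddubsw
; can blend two interleaved source bytes as m*a + (64-m)*b per pixel)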
obmc_masks:     db  0,  0,  0,  0
                ; 2
                db 45, 19, 64,  0
                ; 4
                db 39, 25, 50, 14, 59,  5, 64,  0
                ; 8
                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
                ; 16
                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
                ; 32
                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0

warp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
warp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
pb_8x0_8x8:     db  0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8,  8,  8,  8,  8,  8
bdct_lb_dw:     db  0,  0,  0,  0,  4,  4,  4,  4,  8,  8,  8,  8, 12, 12, 12, 12
wswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7

wm_420_sign:    dd 0x01020102, 0x01010101
wm_422_sign:    dd 0x80808080, 0x7f7f7f7f

pb_64:   times 4 db 64
pw_m256: times 2 dw -256
pw_15:   times 2 dw 15
pw_32:   times 2 dw 32
pw_34:   times 2 dw 34
pw_258:  times 2 dw 258
pw_512:  times 2 dw 512
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32:           dd 32
pd_63:           dd 63
pd_512:          dd 512
pd_32768:        dd 32768
pd_0x3ff:        dd 0x3ff
pd_0x4000:       dd 0x4000
pq_0x40000000:   dq 0x40000000

cextern mc_subpel_filters
cextern mc_warp_filter2
cextern resize_filter
cextern z_filter_s

%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
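; the -8 compensates for subpel positions being 1-based: filter i (i >= 1)
; is then addressable as [subpel_filters+iq*8] without decrementing the index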

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
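; callers index these 16-bit tables by log2(w), then add the base back in:
; movzx wd, word [base+wq*2+...]; add wq, base; jmp wq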

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
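; %4 is a bitmask selecting which sub-tables to emit: 1 = .h, 2 = .v, 4 = .hv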

%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
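; (32-bit entries here, presumably because the offsets are relative to the
; table symbol itself rather than a nearby code label and may not fit in 16 bits)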

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro
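; the dy1/dy2 tables provide entry points specialized for vertical steps of
; exactly one (dy == 1024) and two (dy == 2048) source rows per output row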

%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

BASE_JMP_TABLE   put,  avx2,            2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE   prep, avx2,               4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  bilin, avx2,  7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, bilin, avx2,  7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  6tap,  avx2,  3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  8tap,  avx2,  3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, 6tap,  avx2,  1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, 8tap,  avx2,  1,    4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,   4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  avg, avx2,                4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_avg, avx2,              4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  mask, avx2,               4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_420, avx2,         4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_422, avx2,         4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_444, avx2,         4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  blend, avx2,              4, 8, 16, 32
BIDIR_JMP_TABLE  blend_v, avx2,         2, 4, 8, 16, 32
BIDIR_JMP_TABLE  blend_h, avx2,         2, 4, 8, 16, 32, 32, 32

SECTION .text

INIT_XMM avx2
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn          mxyd, r6m ; mx
    lea                  r7, [put_avx2]
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    movzx                wd, word [r7+wq*2+table_offset(put,)]
    add                  wq, r7
    jmp                  wq
.put_w2:
    movzx               r6d, word [srcq+ssq*0]
    movzx               r7d, word [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6w
    mov        [dstq+dsq*1], r7w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    mov                 r6d, [srcq+ssq*0]
    mov                 r7d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6d
    mov        [dstq+dsq*1], r7d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    mov                  r6, [srcq+ssq*0]
    mov                  r7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6
    mov        [dstq+dsq*1], r7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
INIT_YMM avx2
.put_w32:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+ssq*0+32*0]
    movu                 m1, [srcq+ssq*0+32*1]
    movu                 m2, [srcq+ssq*1+32*0]
    movu                 m3, [srcq+ssq*1+32*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+32*0], m0
    mova  [dstq+dsq*0+32*1], m1
    mova  [dstq+dsq*1+32*0], m2
    mova  [dstq+dsq*1+32*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w64
    RET
.put_w128:
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
    movu                 m2, [srcq+32*2]
    movu                 m3, [srcq+32*3]
    add                srcq, ssq
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
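    ; mxy*255 + 16 == (mxy << 8) | (16 - mxy), so broadcasting the resulting
    ; word gives the {16-mx, mx} byte pairs consumed by pmaddubsw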
    imul               mxyd, 255
    vbroadcasti128       m4, [z_filter_s+2]
    add                mxyd, 16
    movd                xm5, mxyd
    mov                mxyd, r7m ; my
    vpbroadcastw         m5, xm5
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    vpbroadcastd         m3, [pw_2048]
    add                  wq, r7
    jmp                  wq
.h_w2:
    movd                xm0, [srcq+ssq*0]
    pinsrd              xm0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
    pmulhrsw            xm0, xm3
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    mova                xm4, [bilin_h_shuf4]
.h_w4_loop:
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
    pmulhrsw            xm0, xm3
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    movu                xm0, [srcq+ssq*0]
    movu                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pshufb              xm1, xm4
    pmaddubsw           xm0, xm5
    pmaddubsw           xm1, xm5
    pmulhrsw            xm0, xm3
    pmulhrsw            xm1, xm3
    packuswb            xm0, xm1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                xm0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*1+8*0], 1
    movu                xm1, [srcq+ssq*0+8*1]
    vinserti128          m1, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    add                srcq, ssq
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w32
    RET
.h_w64:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    movu                 m1, [srcq+8*4]
    movu                 m2, [srcq+8*5]
    add                srcq, ssq
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    packuswb             m1, m2
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    mov                  r6, -32*3
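    ; r6 walks from -96 to 0 in 32-byte steps so that [srcq+r6+32*3] covers
    ; all four 32-byte blocks of a 128-pixel row before advancing a line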
.h_w128_loop:
    movu                 m0, [srcq+r6+32*3+8*0]
    movu                 m1, [srcq+r6+32*3+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova     [dstq+r6+32*3], m0
    add                  r6, 32
    jle .h_w128_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w128
    RET
.v:
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    imul               mxyd, 255
    vpbroadcastd         m5, [pw_2048]
    add                mxyd, 16
    add                  wq, r7
    movd                xm4, mxyd
    vpbroadcastw         m4, xm4
    jmp                  wq
.v_w2:
    movd                xm0,      [srcq+ssq*0]
.v_w2_loop:
    pinsrw              xm1, xm0, [srcq+ssq*1], 1 ; 0 1
    lea                srcq,      [srcq+ssq*2]
    pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
    pshuflw             xm1, xm1, q2301           ; 1 0
    punpcklbw           xm1, xm0
    pmaddubsw           xm1, xm4
    pmulhrsw            xm1, xm5
    packuswb            xm1, xm1
    pextrw     [dstq+dsq*0], xm1, 1
    pextrw     [dstq+dsq*1], xm1, 0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                xm0, [srcq+ssq*0]
.v_w4_loop:
    vpbroadcastd        xm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm1, xm2, xm0, 0x01 ; 0 1
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm2, xm0, 0x02      ; 1 2
    punpcklbw           xm1, xm2
    pmaddubsw           xm1, xm4
    pmulhrsw            xm1, xm5
    packuswb            xm1, xm1
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm0, [srcq+ssq*0]
.v_w8_loop:
    movq                xm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw           xm1, xm0, xm2
    movq                xm0, [srcq+ssq*0]
    punpcklbw           xm2, xm0
    pmaddubsw           xm1, xm4
    pmaddubsw           xm2, xm4
    pmulhrsw            xm1, xm5
    pmulhrsw            xm2, xm5
    packuswb            xm1, xm2
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    movu                xm0, [srcq+ssq*0]
.v_w16_loop:
    vbroadcasti128       m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd             m2, m3, m0, 0x0f ; 0 1
    vbroadcasti128       m0, [srcq+ssq*0]
    vpblendd             m3, m0, 0xf0     ; 1 2
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32:
%macro PUT_BILIN_V_W32 0
    movu                 m0, [srcq+ssq*0]
%%loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw            m1, m0, m3
    punpckhbw            m2, m0, m3
    movu                 m0, [srcq+ssq*0]
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m2, m4
    pmaddubsw            m3, m4
    pmulhrsw             m2, m5
    pmulhrsw             m3, m5
    packuswb             m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg %%loop
%endmacro
    PUT_BILIN_V_W32
    RET
.v_w64:
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
.v_w64_loop:
    add                srcq, ssq
    movu                 m3, [srcq+32*0]
    punpcklbw            m2, m0, m3
    punpckhbw            m0, m3
    pmaddubsw            m2, m4
    pmaddubsw            m0, m4
    pmulhrsw             m2, m5
    pmulhrsw             m0, m5
    packuswb             m2, m0
    mova                 m0, m3
    movu                 m3, [srcq+32*1]
    mova        [dstq+32*0], m2
    punpcklbw            m2, m1, m3
    punpckhbw            m1, m3
    pmaddubsw            m2, m4
    pmaddubsw            m1, m4
    pmulhrsw             m2, m5
    pmulhrsw             m1, m5
    packuswb             m2, m1
    mova                 m1, m3
    mova        [dstq+32*1], m2
    add                dstq, dsq
    dec                  hd
    jg .v_w64_loop
    RET
.v_w128:
    lea                 r6d, [hq+(3<<8)]
    mov                  r4, srcq
    mov                  r7, dstq
.v_w128_loop:
    PUT_BILIN_V_W32
    add                  r4, 32
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
    jg .v_w128_loop
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
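    ; the horizontal coefficients are doubled (paddb m5, m5) so the h
    ; intermediates sit at 32x pel scale; pmulhw by my << 11 then yields
    ; exactly my*(b - a), and pavgw against pw_15 computes (a + 16) >> 1,
    ; folding in the +8 rounding while returning to 16x scale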
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM       8
    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    vpbroadcastd         m7, [pw_15]
    movd                xm6, mxyd
    add                  wq, r7
    paddb                m5, m5
    vpbroadcastw         m6, xm6
    jmp                  wq
.hv_w2:
    vpbroadcastd        xm0, [srcq+ssq*0]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
.hv_w2_loop:
    movd                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pinsrd              xm1, [srcq+ssq*0], 1
    pshufb              xm1, xm4
    pmaddubsw           xm1, xm5             ; 1 _ 2 _
    shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
    mova                xm0, xm1
    psubw               xm1, xm2
    pmulhw              xm1, xm6
    pavgw               xm2, xm7
    paddw               xm1, xm2
    psrlw               xm1, 4
    packuswb            xm1, xm1
    pextrw     [dstq+dsq*0], xm1, 0
    pextrw     [dstq+dsq*1], xm1, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                xm4, [bilin_h_shuf4]
    movddup             xm0, [srcq+ssq*0]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
.hv_w4_loop:
    movq                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps              xm1, [srcq+ssq*0]
    pshufb              xm1, xm4
    pmaddubsw           xm1, xm5             ; 1 2
    shufps              xm2, xm0, xm1, q1032 ; 0 1
    mova                xm0, xm1
    psubw               xm1, xm2
    pmulhw              xm1, xm6
    pavgw               xm2, xm7
    paddw               xm1, xm2
    psrlw               xm1, 4
    packuswb            xm1, xm1
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128       m0, [srcq+ssq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128          m1, [srcq+ssq*0], 1
    pshufb               m1, m4
    pmaddubsw            m1, m5           ; 1 2
    vperm2i128           m2, m0, m1, 0x21 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhw               m1, m6
    pavgw                m2, m7
    paddw                m1, m2
    psrlw                m1, 4
    vextracti128        xm2, m1, 1
    packuswb            xm1, xm2
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    movu                 m0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*0+8*1], 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w16_loop:
    movu                xm2, [srcq+ssq*1+8*0]
    vinserti128          m2, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    movu                xm3, [srcq+ssq*0+8*0]
    vinserti128          m3, [srcq+ssq*0+8*1], 1
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    psubw                m1, m2, m0
    pmulhw               m1, m6
    pavgw                m0, m7
    paddw                m1, m0
    pmaddubsw            m0, m3, m5
    psubw                m3, m0, m2
    pmulhw               m3, m6
    pavgw                m2, m7
    paddw                m3, m2
    psrlw                m1, 4
    psrlw                m3, 4
    packuswb             m1, m3
    vpermq               m1, m1, q3120
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w128:
    lea                 r6d, [hq+(3<<16)]
    jmp .hv_w32_start
.hv_w64:
    lea                 r6d, [hq+(1<<16)]
.hv_w32_start:
    mov                  r4, srcq
    mov                  r7, dstq
.hv_w32:
%if WIN64
    movaps              r4m, xmm8
%endif
.hv_w32_loop0:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w32_loop:
    add                srcq, ssq
    movu                 m2, [srcq+8*0]
    movu                 m3, [srcq+8*1]
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    psubw                m8, m2, m0
    pmulhw               m8, m6
    pavgw                m0, m7
    paddw                m8, m0
    mova                 m0, m2
    psubw                m2, m3, m1
    pmulhw               m2, m6
    pavgw                m1, m7
    paddw                m2, m1
    mova                 m1, m3
    psrlw                m8, 4
    psrlw                m2, 4
    packuswb             m8, m2
    mova             [dstq], m8
    add                dstq, dsq
    dec                  hd
    jg .hv_w32_loop
    add                  r4, 32
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<16
    jg .hv_w32_loop0
%if WIN64
    movaps             xmm8, r4m
%endif
    RET

cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    lea                  r6, [prep%+SUFFIX]
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    movd                xm0, [srcq+strideq*0]
    pinsrd              xm0, [srcq+strideq*1], 1
    pinsrd              xm0, [srcq+strideq*2], 2
    pinsrd              xm0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pmovzxbw             m0, xm0
    psllw                m0, 4
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    movq                xm1, [srcq+strideq*2]
    movhps              xm1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pmovzxbw             m0, xm0
    pmovzxbw             m1, xm1
    psllw                m0, 4
    psllw                m1, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    add                tmpq, 32*2
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    pmovzxbw             m0, [srcq+strideq*0]
    pmovzxbw             m1, [srcq+strideq*1]
    pmovzxbw             m2, [srcq+strideq*2]
    pmovzxbw             m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 4
    jg .prep_w16
    RET
.prep_w32:
    pmovzxbw             m0, [srcq+strideq*0+16*0]
    pmovzxbw             m1, [srcq+strideq*0+16*1]
    pmovzxbw             m2, [srcq+strideq*1+16*0]
    pmovzxbw             m3, [srcq+strideq*1+16*1]
    lea                srcq, [srcq+strideq*2]
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 2
    jg .prep_w32
    RET
.prep_w64:
    pmovzxbw             m0, [srcq+16*0]
    pmovzxbw             m1, [srcq+16*1]
    pmovzxbw             m2, [srcq+16*2]
    pmovzxbw             m3, [srcq+16*3]
    add                srcq, strideq
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    dec                  hd
    jg .prep_w64
    RET
.prep_w128:
    pmovzxbw             m0, [srcq+16*0]
    pmovzxbw             m1, [srcq+16*1]
    pmovzxbw             m2, [srcq+16*2]
    pmovzxbw             m3, [srcq+16*3]
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    pmovzxbw             m0, [srcq+16*4]
    pmovzxbw             m1, [srcq+16*5]
    pmovzxbw             m2, [srcq+16*6]
    pmovzxbw             m3, [srcq+16*7]
    add                tmpq, 32*8
    add                srcq, strideq
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq-32*4], m0
    mova        [tmpq-32*3], m1
    mova        [tmpq-32*2], m2
    mova        [tmpq-32*1], m3
    dec                  hd
    jg .prep_w128
    RET
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
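    ; unlike put, no rounding or shift here: prep stores the 16-bit
    ; intermediate at 16x pel scale for the compound second pass (the plain
    ; copy path above uses psllw by 4 to match)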
    imul               mxyd, 255
    vbroadcasti128       m4, [z_filter_s+2]
    add                mxyd, 16
    movd                xm5, mxyd
    mov                mxyd, r6m ; my
    vpbroadcastw         m5, xm5
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.h_w4:
    vbroadcasti128       m4, [bilin_h_shuf4]
.h_w4_loop:
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    movq                xm1, [srcq+strideq*2]
    movhps              xm1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vinserti128          m0, xm1, 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
.h_w8_loop:
    movu                xm0, [srcq+strideq*0]
    vinserti128          m0, [srcq+strideq*1], 1
    movu                xm1, [srcq+strideq*2]
    vinserti128          m1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    add                tmpq, 32*2
    sub                  hd, 4
    jg .h_w8_loop
    RET
.h_w16:
.h_w16_loop:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    movu                xm1, [srcq+strideq*1+8*0]
    vinserti128          m1, [srcq+strideq*1+8*1], 1
    movu                xm2, [srcq+strideq*2+8*0]
    vinserti128          m2, [srcq+strideq*2+8*1], 1
    movu                xm3, [srcq+stride3q +8*0]
    vinserti128          m3, [srcq+stride3q +8*1], 1
    lea                srcq, [srcq+strideq*4]
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 4
    jg .h_w16_loop
    RET
.h_w32:
.h_w32_loop:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    movu                xm1, [srcq+strideq*0+8*2]
    vinserti128          m1, [srcq+strideq*0+8*3], 1
    movu                xm2, [srcq+strideq*1+8*0]
    vinserti128          m2, [srcq+strideq*1+8*1], 1
    movu                xm3, [srcq+strideq*1+8*2]
    vinserti128          m3, [srcq+strideq*1+8*3], 1
    lea                srcq, [srcq+strideq*2]
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 2
    jg .h_w32_loop
    RET
.h_w64:
    movu                xm0, [srcq+8*0]
    vinserti128          m0, [srcq+8*1], 1
    movu                xm1, [srcq+8*2]
    vinserti128          m1, [srcq+8*3], 1
    movu                xm2, [srcq+8*4]
    vinserti128          m2, [srcq+8*5], 1
    movu                xm3, [srcq+8*6]
    vinserti128          m3, [srcq+8*7], 1
    add                srcq, strideq
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    movu                xm0, [srcq+8*0]
    vinserti128          m0, [srcq+8*1], 1
    movu                xm1, [srcq+8*2]
    vinserti128          m1, [srcq+8*3], 1
    movu                xm2, [srcq+8*4]
    vinserti128          m2, [srcq+8*5], 1
    movu                xm3, [srcq+8*6]
    vinserti128          m3, [srcq+8*7], 1
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    movu                xm0, [srcq+8* 8]
    vinserti128          m0, [srcq+8* 9], 1
    movu                xm1, [srcq+8*10]
    vinserti128          m1, [srcq+8*11], 1
    movu                xm2, [srcq+8*12]
    vinserti128          m2, [srcq+8*13], 1
    movu                xm3, [srcq+8*14]
    vinserti128          m3, [srcq+8*15], 1
    add                tmpq, 32*8
    add                srcq, strideq
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq-32*4], m0
    mova        [tmpq-32*3], m1
    mova        [tmpq-32*2], m2
    mova        [tmpq-32*1], m3
    dec                  hd
    jg .h_w128
    RET
.v:
    WIN64_SPILL_XMM       7
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    imul               mxyd, 255
    add                mxyd, 16
    add                  wq, r6
    lea            stride3q, [strideq*3]
    movd                xm6, mxyd
    vpbroadcastw         m6, xm6
    jmp                  wq
.v_w4:
    movd                xm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastd         m1, [srcq+strideq*2]
    vpbroadcastd        xm2, [srcq+strideq*1]
    vpbroadcastd         m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd             m1, m0, 0x05     ; 0 2 2 2
    vpbroadcastd         m0, [srcq+strideq*0]
    vpblendd             m3, m2, 0x0f     ; 1 1 3 3
    vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
    vpblendd             m1, m3, 0xaa     ; 0 1 2 3
    vpblendd             m2, m3, 0x55     ; 1 2 3 4
    punpcklbw            m1, m2
    pmaddubsw            m1, m6
    mova             [tmpq], m1
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm0, [srcq+strideq*0]
.v_w8_loop:
    vpbroadcastq         m1, [srcq+strideq*2]
    vpbroadcastq         m2, [srcq+strideq*1]
    vpbroadcastq         m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd             m1, m0, 0x03     ; 0 2 2 2
    vpbroadcastq         m0, [srcq+strideq*0]
    vpblendd             m2, m3, 0xcc     ; 1 3 1 3
    vpblendd             m3, m2, m1, 0xf0 ; 1 3 2 2
    vpblendd             m2, m1, 0x0f     ; 0 2 1 3
    vpblendd             m3, m0, 0xc0     ; 1 3 2 4
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m1, m6
    pmaddubsw            m2, m6
    mova        [tmpq+32*0], m1
    mova        [tmpq+32*1], m2
    add                tmpq, 32*2
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti128       m0, [srcq+strideq*0]
.v_w16_loop:
    vbroadcasti128       m1, [srcq+strideq*1]
    vbroadcasti128       m2, [srcq+strideq*2]
    vbroadcasti128       m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    shufpd               m4, m0, m2, 0x0c ; 0 2
    vbroadcasti128       m0, [srcq+strideq*0]
    shufpd               m1, m3, 0x0c     ; 1 3
    shufpd               m2, m0, 0x0c     ; 2 4
    punpcklbw            m3, m4, m1
    punpcklbw            m5, m1, m2
    punpckhbw            m4, m1
    punpckhbw            m1, m2
    pmaddubsw            m3, m6
    pmaddubsw            m5, m6
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    mova        [tmpq+32*0], m3
    mova        [tmpq+32*1], m5
    mova        [tmpq+32*2], m4
    mova        [tmpq+32*3], m1
    add                tmpq, 32*4
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    vpermq               m0, [srcq+strideq*0], q3120
.v_w32_loop:
    vpermq               m1, [srcq+strideq*1], q3120
    vpermq               m2, [srcq+strideq*2], q3120
    vpermq               m3, [srcq+stride3q ], q3120
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m4, m0, m1
    punpckhbw            m5, m0, m1
    vpermq               m0, [srcq+strideq*0], q3120
    pmaddubsw            m4, m6
    pmaddubsw            m5, m6
    mova        [tmpq+32*0], m4
    mova        [tmpq+32*1], m5
    punpcklbw            m4, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    punpcklbw            m5, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m5, m6
    pmaddubsw            m2, m6
    mova        [tmpq+32*2], m4
    mova        [tmpq+32*3], m1
    add                tmpq, 32*8
    punpcklbw            m1, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m1, m6
    pmaddubsw            m3, m6
    mova        [tmpq-32*4], m5
    mova        [tmpq-32*3], m2
    mova        [tmpq-32*2], m1
    mova        [tmpq-32*1], m3
    sub                  hd, 4
    jg .v_w32_loop
    RET
.v_w64:
    vpermq               m0, [srcq+strideq*0+32*0], q3120
    vpermq               m1, [srcq+strideq*0+32*1], q3120
.v_w64_loop:
    vpermq               m2, [srcq+strideq*1+32*0], q3120
    vpermq               m3, [srcq+strideq*1+32*1], q3120
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m4, m0, m2
    punpckhbw            m0, m2
    pmaddubsw            m4, m6
    pmaddubsw            m0, m6
    mova        [tmpq+32*0], m4
    mova        [tmpq+32*1], m0
    punpcklbw            m4, m1, m3
    punpckhbw            m5, m1, m3
    vpermq               m0, [srcq+strideq*0+32*0], q3120
    vpermq               m1, [srcq+strideq*0+32*1], q3120
    pmaddubsw            m4, m6
    pmaddubsw            m5, m6
    mova        [tmpq+32*2], m4
    mova        [tmpq+32*3], m5
    add                tmpq, 32*8
    punpcklbw            m4, m2, m0
    punpckhbw            m2, m0
    punpcklbw            m5, m3, m1
    punpckhbw            m3, m1
    pmaddubsw            m4, m6
    pmaddubsw            m2, m6
    pmaddubsw            m5, m6
    pmaddubsw            m3, m6
    mova        [tmpq-32*4], m4
    mova        [tmpq-32*3], m2
    mova        [tmpq-32*2], m5
    mova        [tmpq-32*1], m3
    sub                  hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    lea                 r6d, [hq+(3<<8)]
    mov                  r3, srcq
    mov                  r5, tmpq
.v_w128_loop0:
    vpermq               m0, [srcq+strideq*0], q3120
.v_w128_loop:
    vpermq               m1, [srcq+strideq*1], q3120
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m2, m0, m1
    punpckhbw            m3, m0, m1
    vpermq               m0, [srcq+strideq*0], q3120
    pmaddubsw            m2, m6
    pmaddubsw            m3, m6
    punpcklbw            m4, m1, m0
    punpckhbw            m1, m0
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    mova        [tmpq+32*0], m2
    mova        [tmpq+32*1], m3
    mova        [tmpq+32*8], m4
    mova        [tmpq+32*9], m1
    add                tmpq, 32*16
    sub                  hd, 2
    jg .v_w128_loop
    add                  r3, 32
    add                  r5, 64
    movzx                hd, r6b
    mov                srcq, r3
    mov                tmpq, r5
    sub                 r6d, 1<<8
    jg .v_w128_loop0
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
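    ; with my packed as my << 11, pmulhrsw ((x*y + 0x4000) >> 15) computes
    ; ((my * d) + 8) >> 4 of the 16-bit difference d in a single instruction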
    WIN64_SPILL_XMM       7
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl                mxyd, 11
    movd                xm6, mxyd
    vpbroadcastw         m6, xm6
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.hv_w4:
    vbroadcasti128       m4, [bilin_h_shuf4]
    vpbroadcastq         m0, [srcq+strideq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w4_loop:
    movq                xm1, [srcq+strideq*1]
    movhps              xm1, [srcq+strideq*2]
    movq                xm2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    movhps              xm2, [srcq+strideq*0]
    vinserti128          m1, xm2, 1
    pshufb               m1, m4
    pmaddubsw            m1, m5        ; 1 2 3 4
    vpblendd             m2, m1, m0, 0xc0
    vpermq               m2, m2, q2103 ; 0 1 2 3
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m6
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128       m0, [srcq+strideq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu                xm1, [srcq+strideq*1]
    vinserti128          m1, [srcq+strideq*2], 1
    movu                xm2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vinserti128          m2, [srcq+strideq*0], 1
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5           ; 1 2
    vperm2i128           m3, m0, m1, 0x21 ; 0 1
    pmaddubsw            m0, m2, m5       ; 3 4
    vperm2i128           m2, m1, m0, 0x21 ; 2 3
    psubw                m1, m3
    pmulhrsw             m1, m6
    paddw                m1, m3
    psubw                m3, m0, m2
    pmulhrsw             m3, m6
    paddw                m3, m2
    mova        [tmpq+32*0], m1
    mova        [tmpq+32*1], m3
    add                tmpq, 32*2
    sub                  hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w16_loop:
    movu                xm1, [srcq+strideq*1+8*0]
    vinserti128          m1, [srcq+strideq*1+8*1], 1
    lea                srcq, [srcq+strideq*2]
    movu                xm2, [srcq+strideq*0+8*0]
    vinserti128          m2, [srcq+strideq*0+8*1], 1
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    psubw                m3, m1, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    pmaddubsw            m0, m2, m5
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+32*0], m3
    mova        [tmpq+32*1], m2
    add                tmpq, 32*2
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    movu                xm0, [srcq+8*0]
    vinserti128          m0, [srcq+8*1], 1
    movu                xm1, [srcq+8*2]
    vinserti128          m1, [srcq+8*3], 1
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w32_loop:
    add                srcq, strideq
    movu                xm2, [srcq+8*0]
    vinserti128          m2, [srcq+8*1], 1
    pshufb               m2, m4
    pmaddubsw            m2, m5
    psubw                m3, m2, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    mova                 m0, m2
    movu                xm2, [srcq+8*2]
    vinserti128          m2, [srcq+8*3], 1
    pshufb               m2, m4
    pmaddubsw            m2, m5
    mova        [tmpq+32*0], m3
    psubw                m3, m2, m1
    pmulhrsw             m3, m6
    paddw                m3, m1
    mova                 m1, m2
    mova        [tmpq+32*1], m3
    add                tmpq, 32*2
    dec                  hd
    jg .hv_w32_loop
    RET
.hv_w128:
    lea                 r3d, [hq+(7<<8)]
    mov                 r6d, 256
    jmp .hv_w64_start
.hv_w64:
    lea                 r3d, [hq+(3<<8)]
    mov                 r6d, 128
.hv_w64_start:
%if WIN64
    PUSH                 r7
%endif
    mov                  r5, srcq
    mov                  r7, tmpq
.hv_w64_loop0:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w64_loop:
    movu                xm1, [srcq+strideq*1+8*0]
    vinserti128          m1, [srcq+strideq*1+8*1], 1
    lea                srcq, [srcq+strideq*2]
    movu                xm2, [srcq+strideq*0+8*0]
    vinserti128          m2, [srcq+strideq*0+8*1], 1
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    psubw                m3, m1, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    pmaddubsw            m0, m2, m5
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+r6*0], m3
    mova        [tmpq+r6*1], m2
    lea                tmpq, [tmpq+r6*2]
    sub                  hd, 2
    jg .hv_w64_loop
    add                  r5, 16
    add                  r7, 32
    movzx                hd, r3b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r3d, 1<<8
    jg .hv_w64_loop0
%if WIN64
    POP                  r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
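; each constant packs two offsets into subpel_filters, in units of 15-entry
; filter sets: the 8-tap set in the high word and the 4-tap set used for
; small block sizes in the low word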

%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR
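; the REGULAR and SMOOTH 8-tap filters have zero outermost taps, so these
; entry points can be serviced by the 6-tap implementation below (the last
; FN omits the jump target and falls through)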

cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r8, [put_avx2]
    mov                  wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
.put:
    tzcnt                wd, wd
    movzx                wd, word [r8+wq*2+table_offset(put,)]
    add                  wq, r8
    lea                  r6, [ssq*3]
    lea                  r7, [dsq*3]
%if WIN64
    pop                  r8
%endif
    jmp                  wq
.h_w2:
    movzx               mxd, mxb
    lea                srcq, [srcq-1]
    vpbroadcastd        xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
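    ; ZF is still set from the "cmp wd, 4" at .h (mov/lea/broadcast don't
    ; touch flags), so this takes the 4-tap w4 path when w == 4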
    je .h_w4
    mova                xm3, [subpel_h_shuf4]
.h_w2_loop:
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm3
    pmaddubsw           xm0, xm4
    phaddw              xm0, xm0
    paddw               xm0, xm5
    psraw               xm0, 6
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    mova                xm3, [subpel_h_shufA]
.h_w4_loop:
    movq                xm0, [srcq+ssq*0]
    movq                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm3
    pshufb              xm1, xm3
    pmaddubsw           xm0, xm4
    pmaddubsw           xm1, xm4
    phaddw              xm0, xm1
    paddw               xm0, xm5
    psraw               xm0, 6
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h:
    test                myd, 0xf00
    jnz .hv
    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
    cmp                  wd, 4
    jle .h_w2
    WIN64_SPILL_XMM      11
    tzcnt                wd, wd
    vbroadcasti128       m4, [z_filter_s+ 2] ; 01
    shr                 mxd, 16
    vbroadcasti128       m6, [z_filter_s+ 6] ; 23
    sub                srcq, 2
    vbroadcasti128       m7, [z_filter_s+10] ; 45
    lea                 mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
    movzx                wd, word [r8+wq*2+table_offset(put, _6tap_h)]
    vpbroadcastw         m8, [mxq+0]
    vpbroadcastw         m9, [mxq+2]
    add                  wq, r8
    vpbroadcastw        m10, [mxq+4]
    jmp                  wq
.h_w8:
%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
1552    pshufb              m%2, m%1, m4
1553    pmaddubsw           m%2, m8
1554    pshufb              m%3, m%1, m6
1555    pmaddubsw           m%3, m9
1556    pshufb              m%1, m7
1557    pmaddubsw           m%1, m10
1558    paddw               m%2, m5
1559    paddw               m%1, m%3
1560    paddw               m%1, m%2
1561    psraw               m%1, 6
1562%endmacro
1563    movu                xm0, [srcq+ssq*0]
1564    vinserti128          m0, [srcq+ssq*1], 1
1565    lea                srcq, [srcq+ssq*2]
1566    PUT_6TAP_H            0, 1, 2
1567    vextracti128        xm1, m0, 1
1568    packuswb            xm0, xm1
1569    movq       [dstq+dsq*0], xm0
1570    movhps     [dstq+dsq*1], xm0
1571    lea                dstq, [dstq+dsq*2]
1572    sub                  hd, 2
1573    jg .h_w8
1574    RET
1575.h_w16:
1576    movu                xm0, [srcq+ssq*0+8*0]
1577    vinserti128          m0, [srcq+ssq*1+8*0], 1
1578    movu                xm1, [srcq+ssq*0+8*1]
1579    vinserti128          m1, [srcq+ssq*1+8*1], 1
1580    PUT_6TAP_H            0, 2, 3
1581    lea                srcq, [srcq+ssq*2]
1582    PUT_6TAP_H            1, 2, 3
1583    packuswb             m0, m1
1584    mova         [dstq+dsq*0], xm0
1585    vextracti128 [dstq+dsq*1], m0, 1
1586    lea                dstq, [dstq+dsq*2]
1587    sub                  hd, 2
1588    jg .h_w16
1589    RET
1590.h_w32:
1591    xor                 r6d, r6d
1592    jmp .h_start
1593.h_w64:
1594    mov                  r6, -32*1
1595    jmp .h_start
1596.h_w128:
1597    mov                  r6, -32*3
1598.h_start:
1599    sub                srcq, r6
1600    sub                dstq, r6
1601    mov                  r4, r6
1602.h_loop:
1603    movu                 m0, [srcq+r6+8*0]
1604    movu                 m1, [srcq+r6+8*1]
1605    PUT_6TAP_H            0, 2, 3
1606    PUT_6TAP_H            1, 2, 3
1607    packuswb             m0, m1
1608    mova          [dstq+r6], m0
1609    add                  r6, 32
1610    jle .h_loop
1611    add                srcq, ssq
1612    add                dstq, dsq
1613    mov                  r6, r4
1614    dec                  hd
1615    jg .h_loop
1616    RET
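; Vertical 6-tap: source rows are kept interleaved in sliding register
; pairs so each pmaddubsw applies one coefficient pair to two output rows
; at once; pmulhrsw with pw_512 computes (x + 32) >> 6, the rounded
; downshift matching the 64-scaled filter sums.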
.v:
    WIN64_SPILL_XMM       9, 12
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    tzcnt               r6d, wd
    movzx               r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
    vpbroadcastd         m8, [pw_512]
    lea                 myq, [r8+myq*8+subpel_filters+1-put_avx2]
    vpbroadcastw         m5, [myq+0]
    vpbroadcastw         m6, [myq+2]
    vpbroadcastw         m7, [myq+4]
    add                  r6, r8
    mov                 nsq, ssq
    neg                 nsq
    jmp                  r6
.v_w2:
    movd                xm2, [srcq+nsq*2]
    pinsrw              xm2, [srcq+nsq*1], 2
    pinsrw              xm2, [srcq+ssq*0], 4
    pinsrw              xm2, [srcq+ssq*1], 6 ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    vpbroadcastd        xm0, [srcq+ssq*0]
    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
    punpcklbw           xm1, xm2, xm3        ; 01 12
    punpckhbw           xm2, xm3             ; 23 34
.v_w2_loop:
    vpbroadcastd        xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           xm3, xm1, xm5        ; a0 b0
    mova                xm1, xm2
    pmaddubsw           xm2, xm6             ; a1 b1
    paddw               xm3, xm2
    vpblendd            xm2, xm0, xm4, 0x02  ; 4 5
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm4, xm0, 0x02       ; 5 6
    punpcklbw           xm2, xm4             ; 45 56
    pmaddubsw           xm4, xm2, xm7        ; a2 b2
    paddw               xm3, xm4
    pmulhrsw            xm3, xm8
    packuswb            xm3, xm3
    pextrw     [dstq+dsq*0], xm3, 0
    pextrw     [dstq+dsq*1], xm3, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                xm2, [srcq+nsq*2]
    pinsrd              xm2, [srcq+nsq*1], 1
    pinsrd              xm2, [srcq+ssq*0], 2
    pinsrd              xm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    vpbroadcastd        xm0, [srcq+ssq*0]
    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
    punpcklbw           xm1, xm2, xm3        ; 01 12
    punpckhbw           xm2, xm3             ; 23 34
.v_w4_loop:
    vpbroadcastd        xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           xm3, xm1, xm5        ; a0 b0
    mova                xm1, xm2
    pmaddubsw           xm2, xm6             ; a1 b1
    paddw               xm3, xm2
    vpblendd            xm2, xm0, xm4, 0x02  ; 4 5
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm4, xm0, 0x02       ; 5 6
    punpcklbw           xm2, xm4             ; 45 56
    pmaddubsw           xm4, xm2, xm7        ; a2 b2
    paddw               xm3, xm4
    pmulhrsw            xm3, xm8
    packuswb            xm3, xm3
    movd       [dstq+dsq*0], xm3
    pextrd     [dstq+dsq*1], xm3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm1, [srcq+nsq*2]
    vpbroadcastq         m3, [srcq+nsq*1]
    vpbroadcastq         m2, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m1, m3, 0x30
    vpblendd             m3, m2, 0x30
    punpcklbw            m1, m3      ; 01 12
    vpblendd             m2, m4, 0x30
    vpblendd             m4, m0, 0x30
    punpcklbw            m2, m4      ; 23 34
.v_w8_loop:
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m3, m1, m5  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m6      ; a1 b1
    paddw                m3, m2
    vpblendd             m2, m0, m4, 0x30
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m4, m0, 0x30
    punpcklbw            m2, m4      ; 45 56
    pmaddubsw            m4, m2, m7  ; a2 b2
    paddw                m3, m4
    pmulhrsw             m3, m8
    vextracti128        xm4, m3, 1
    packuswb            xm3, xm4
    movq       [dstq+dsq*0], xm3
    movhps     [dstq+dsq*1], xm3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    lea                 r6d, [wq*8-128]
    WIN64_PUSH_XMM       12
    lea                 r6d, [hq+r6*2]
.v_w16_loop0:
    vbroadcasti128       m3, [srcq+nsq*2]
    vbroadcasti128       m4, [srcq+nsq*1]
    lea                  r4, [srcq+ssq*2]
    vbroadcasti128       m0, [srcq+ssq*0]
    vbroadcasti128       m1, [srcq+ssq*1]
    mov                  r7, dstq
    vbroadcasti128       m2, [r4+ssq*0]
    shufpd               m3, m0, 0x0c
    shufpd               m4, m1, 0x0c
    punpcklbw            m1, m3, m4 ; 01
    punpckhbw            m3, m4     ; 23
    shufpd               m0, m2, 0x0c
    punpcklbw            m2, m4, m0 ; 12
    punpckhbw            m4, m0     ; 34
.v_w16_loop:
    vbroadcasti128       m9, [r4+ssq*1]
    pmaddubsw           m10, m1, m5  ; a0
    lea                  r4, [r4+ssq*2]
    pmaddubsw           m11, m2, m5  ; b0
    mova                 m1, m3
    pmaddubsw            m3, m6      ; a1
    mova                 m2, m4
    pmaddubsw            m4, m6      ; b1
    paddw               m10, m3
    vbroadcasti128       m3, [r4+ssq*0]
    paddw               m11, m4
    shufpd               m4, m0, m9, 0x0d
    shufpd               m0, m9, m3, 0x0c
    punpcklbw            m3, m4, m0  ; 45
    punpckhbw            m4, m0      ; 56
    pmaddubsw            m9, m3, m7  ; a2
    paddw               m10, m9
    pmaddubsw            m9, m4, m7  ; b2
    paddw               m11, m9
    pmulhrsw            m10, m8
    pmulhrsw            m11, m8
    packuswb            m10, m11
    vpermq              m10, m10, q3120
    mova         [r7+dsq*0], xm10
    vextracti128 [r7+dsq*1], m10, 1
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    add                srcq, 16
    add                dstq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w16_loop0
    RET
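; Combined h+v 6-tap: the horizontal stage is prescaled with pw_8192,
; i.e. (x + 2) >> 2, to keep the 16-bit intermediates in range; the
; vertical stage accumulates pmaddwd products in 32 bits. Total scale is
; 16 * 64 = 1024, hence the final (x + 512) >> 10 via pd_512 and psrad.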
.hv:
    WIN64_SPILL_XMM      12, 16
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m6, [r8+mxq*8+subpel_filters-put_avx2+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r8+myq*8+subpel_filters+1-put_avx2]
    vpbroadcastd         m7, [pw_8192]
    punpcklbw            m0, m0
    vpbroadcastd         m8, [pd_512]
    psraw                m0, 8 ; sign-extend
    mov                 nsq, ssq
    pshufd               m9, m0, q0000
    neg                 nsq
    pshufd              m10, m0, q1111
    pshufd              m11, m0, q2222
    cmp                  wd, 4
    je .hv_w4
    vbroadcasti128       m5, [subpel_h_shuf4]
    movq                xm2, [srcq+nsq*2]
    movhps              xm2, [srcq+nsq*1]
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq         m1, [srcq+ssq*0]
    vpblendd             m2, m1, 0x30
    pshufb               m2, m5
    pshufb              xm0, xm5
    pmaddubsw            m2, m6
    pmaddubsw           xm0, xm6
    phaddw               m2, m0
    pmulhrsw             m2, m7
    vextracti128        xm0, m2, 1
    palignr             xm0, xm2, 4
    punpcklwd           xm1, xm2, xm0  ; 01 12
    punpckhwd           xm2, xm0       ; 23 34
.hv_w2_loop:
    movq                xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps              xm4, [srcq+ssq*0]
    pshufb              xm4, xm5
    pmaddubsw           xm4, xm6
    pmaddwd             xm3, xm9, xm1  ; a0 b0
    mova                xm1, xm2
    pmaddwd             xm2, xm10      ; a1 b1
    phaddw              xm4, xm4
    paddd               xm3, xm2
    pmulhrsw            xm4, xm7
    palignr             xm2, xm4, xm0, 12
    mova                xm0, xm4
    punpcklwd           xm2, xm4       ; 45 56
    pmaddwd             xm4, xm11, xm2 ; a2 b2
    paddd               xm3, xm8
    paddd               xm3, xm4
    psrad               xm3, 10
    packssdw            xm3, xm3
    packuswb            xm3, xm3
    pextrw     [dstq+dsq*0], xm3, 0
    pextrw     [dstq+dsq*1], xm3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                 m5, [subpel_h_shuf4]
    vpbroadcastq         m2, [srcq+nsq*2]
    vpbroadcastq         m4, [srcq+nsq*1]
    vpbroadcastq         m1, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m2, m4, 0xcc ; 0 1
    vpblendd             m1, m3, 0xcc ; 2 3
    pshufb               m2, m5
    pshufb               m1, m5
    pshufb               m0, m5
    pmaddubsw            m2, m6
    pmaddubsw            m1, m6
    pmaddubsw            m0, m6
    phaddw               m2, m1
    phaddw               m0, m0
    pmulhrsw             m2, m7
    pmulhrsw             m0, m7
    palignr              m3, m0, m2, 4
    punpcklwd            m1, m2, m3   ; 01 12
    punpckhwd            m2, m3       ; 23 34
.hv_w4_loop:
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m3, m9, m1   ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m10      ; a1 b1
    paddd                m3, m2
    vpbroadcastq         m2, [srcq+ssq*0]
    vpblendd             m4, m2, 0xcc ; 5 6
    pshufb               m4, m5
    pmaddubsw            m4, m6
    phaddw               m4, m4
    pmulhrsw             m4, m7
    palignr              m2, m4, m0, 12
    mova                 m0, m4
    punpcklwd            m2, m4       ; 45 56
    pmaddwd              m4, m11, m2  ; a2 b2
    paddd                m3, m8
    paddd                m3, m4
    psrad                m3, 10
    vextracti128        xm4, m3, 1
    packssdw            xm3, xm4
    packuswb            xm3, xm3
    pshuflw             xm3, xm3, q3120
    movd       [dstq+dsq*0], xm3
    pextrd     [dstq+dsq*1], xm3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr                 mxd, 16
    sub                srcq, 2
    lea                 mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
    WIN64_PUSH_XMM       16
    vpbroadcastw        m10, [mxq+0]
    vpbroadcastw        m11, [mxq+2]
    vpbroadcastw        m12, [mxq+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r8+myq*8+subpel_filters+1-put_avx2]
    lea                 r6d, [wq*8-64]
    vbroadcasti128       m8, [z_filter_s+ 6]
    punpcklbw            m0, m0
    vbroadcasti128       m9, [z_filter_s+10]
    psraw                m0, 8 ; sign-extend
    mov                 nsq, ssq
    pshufd              m13, m0, q0000
    neg                 nsq
    pshufd              m14, m0, q1111
    lea                 r6d, [hq+r6*4]
    pshufd              m15, m0, q2222
.hv_w8_loop0:
    vbroadcasti128       m7, [z_filter_s+2]
    movu                xm3, [srcq+nsq*2]
    lea                  r4, [srcq+ssq*2]
    movu                xm4, [srcq+nsq*1]
    vbroadcasti128       m0, [srcq+ssq*0]
    mov                  r7, dstq
    vinserti128          m4, [srcq+ssq*1], 1 ; 1 3
    vpblendd             m3, m0, 0xf0        ; 0 2
    vinserti128          m0, [r4+ssq*0], 1   ; 2 4
    vpbroadcastd         m5, [pw_8192]
%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3]
    pshufb               %2, %1, %4
    pmaddubsw            %2, m10
    pshufb               %3, %1, %5
    pmaddubsw            %3, m11
    pshufb               %1, %6
    pmaddubsw            %1, m12
    paddw                %2, %3
    paddw                %1, %2
%endmacro
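; HV_H_6TAP_W8 runs the horizontal stage on two rows packed into one ymm
; (one row per 128-bit lane); results are still in raw filter-sum scale
; and get the pw_8192 rounding multiply before entering the vertical stage.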
    HV_H_6TAP_W8         m3, m1, m2, m7, m8, m9
    HV_H_6TAP_W8         m4, m1, m2, m7, m8, m9
    HV_H_6TAP_W8         m0, m1, m2, m7, m8, m9
    vpermq               m3, m3, q3120
    vpermq               m4, m4, q3120
    vpermq               m0, m0, q3120
    pmulhrsw             m3, m5
    pmulhrsw             m4, m5
    pmulhrsw             m0, m5
    punpcklwd            m1, m3, m4  ; 01
    punpckhwd            m3, m4      ; 23
    punpcklwd            m2, m4, m0  ; 12
    punpckhwd            m4, m0      ; 34
.hv_w8_loop:
    movu                xm7, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    vinserti128          m7, [r4+ssq*0], 1 ; 5 6
    pmaddwd              m5, m13, m1 ; a0
    mova                 m1, m3
    pmaddwd              m6, m13, m2 ; b0
    mova                 m2, m4
    pmaddwd              m3, m14     ; a1
    pmaddwd              m4, m14     ; b1
    paddd                m5, m3
    vbroadcasti128       m3, [z_filter_s+2]
    paddd                m6, m4
    HV_H_6TAP_W8         m7, m3, m4, m3, m8, m9
    vpbroadcastd         m3, [pw_8192]
    vpbroadcastd         m4, [pd_512]
    pmulhrsw             m7, m3
    paddd                m5, m4
    paddd                m6, m4
    mova                 m4, m0
    vpermq               m0, m7, q3120
    shufpd               m4, m0, 0x05
    punpcklwd            m3, m4, m0  ; 45
    pmaddwd              m7, m15, m3 ; a2
    punpckhwd            m4, m0      ; 56
    paddd                m5, m7
    pmaddwd              m7, m15, m4 ; b2
    paddd                m6, m7
    psrad                m5, 10
    psrad                m6, 10
    packssdw             m5, m6
    vextracti128        xm6, m5, 1
    packuswb            xm5, xm6
    pshufd              xm5, xm5, q3120
    movq         [r7+dsq*0], xm5
    movhps       [r7+dsq*1], xm5
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    add                srcq, 8
    add                dstq, 8
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w8_loop0
    RET

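; 8-tap variants: only filter combinations involving SHARP need the full
; 8-tap kernel; the regular and smooth kernels have zero outer taps, so
; the non-sharp combinations alias put_6tap instead, and the unfiltered
; case below tail-jumps into the shared put_6tap .put code.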
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_8bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_8bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP

cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r8, [put_avx2]
    movsxd               wq, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
.v:
    WIN64_SPILL_XMM      12, 15
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    tzcnt               r6d, wd
    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
    vpbroadcastd         m7, [pw_512]
    lea                 myq, [r8+myq*8+subpel_filters-put_avx2]
    vpbroadcastw         m8, [myq+0]
    vpbroadcastw         m9, [myq+2]
    vpbroadcastw        m10, [myq+4]
    vpbroadcastw        m11, [myq+6]
    add                  r6, r8
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
    jmp                  r6
.v_w2:
    movd                xm2, [srcq+ssq*0]
    pinsrw              xm2, [srcq+ssq*1], 2
    pinsrw              xm2, [srcq+ssq*2], 4
    add                srcq, ss3q
    pinsrw              xm2, [srcq+ssq*0], 6 ; 0 1 2 3
    movd                xm3, [srcq+ssq*1]
    vpbroadcastd        xm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm3, xm1, 0x02       ; 4 5
    vpblendd            xm1, xm0, 0x02       ; 5 6
    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
    punpcklbw           xm3, xm1             ; 45 56
    punpcklbw           xm1, xm2, xm4        ; 01 12
    punpckhbw           xm2, xm4             ; 23 34
.v_w2_loop:
    pmaddubsw           xm5, xm1, xm8        ; a0 b0
    mova                xm1, xm2
    pmaddubsw           xm2, xm9             ; a1 b1
    paddw               xm5, xm2
    mova                xm2, xm3
    pmaddubsw           xm3, xm10            ; a2 b2
    paddw               xm5, xm3
    vpbroadcastd        xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm4, xm0, 0x02       ; 7 8
    punpcklbw           xm3, xm4             ; 67 78
    pmaddubsw           xm4, xm3, xm11       ; a3 b3
    paddw               xm5, xm4
    pmulhrsw            xm5, xm7
    packuswb            xm5, xm5
    pextrw     [dstq+dsq*0], xm5, 0
    pextrw     [dstq+dsq*1], xm5, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                xm2, [srcq+ssq*0]
    pinsrd              xm2, [srcq+ssq*1], 1
    pinsrd              xm2, [srcq+ssq*2], 2
    add                srcq, ss3q
    pinsrd              xm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd                xm3, [srcq+ssq*1]
    vpbroadcastd        xm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm3, xm1, 0x02       ; 4 5
    vpblendd            xm1, xm0, 0x02       ; 5 6
    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
    punpcklbw           xm3, xm1             ; 45 56
    punpcklbw           xm1, xm2, xm4        ; 01 12
    punpckhbw           xm2, xm4             ; 23 34
.v_w4_loop:
    pmaddubsw           xm5, xm1, xm8        ; a0 b0
    mova                xm1, xm2
    pmaddubsw           xm2, xm9             ; a1 b1
    paddw               xm5, xm2
    mova                xm2, xm3
    pmaddubsw           xm3, xm10            ; a2 b2
    paddw               xm5, xm3
    vpbroadcastd        xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm4, xm0, 0x02       ; 7 8
    punpcklbw           xm3, xm4             ; 67 78
    pmaddubsw           xm4, xm3, xm11       ; a3 b3
    paddw               xm5, xm4
    pmulhrsw            xm5, xm7
    packuswb            xm5, xm5
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm1, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    vpbroadcastq         m2, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m5, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpbroadcastq         m6, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m1, m4, 0x30
    vpblendd             m4, m2, 0x30
    punpcklbw            m1, m4      ; 01 12
    vpblendd             m2, m5, 0x30
    vpblendd             m5, m3, 0x30
    punpcklbw            m2, m5      ; 23 34
    vpblendd             m3, m6, 0x30
    vpblendd             m6, m0, 0x30
    punpcklbw            m3, m6      ; 45 56
.v_w8_loop:
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, m8  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m9      ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, m10     ; a2 b2
    paddw                m5, m3
    vpblendd             m3, m0, m4, 0x30
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m4, m0, 0x30
    punpcklbw            m3, m4      ; 67 78
    pmaddubsw            m4, m3, m11 ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    vextracti128        xm4, m5, 1
    packuswb            xm5, xm4
    movq       [dstq+dsq*0], xm5
    movhps     [dstq+dsq*1], xm5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    lea                 r6d, [wq*8-128]
    WIN64_PUSH_XMM       15
    lea                 r6d, [hq+r6*2]
.v_w16_loop0:
    vbroadcasti128       m4, [srcq+ssq*0]
    vbroadcasti128       m5, [srcq+ssq*1]
    lea                  r4, [srcq+ss3q]
    vbroadcasti128       m6, [srcq+ssq*2]
    vbroadcasti128       m0, [r4+ssq*0]
    mov                  r7, dstq
    vbroadcasti128       m1, [r4+ssq*1]
    vbroadcasti128       m2, [r4+ssq*2]
    add                  r4, ss3q
    vbroadcasti128       m3, [r4+ssq*0]
    shufpd               m4, m0, 0x0c
    shufpd               m5, m1, 0x0c
    punpcklbw            m1, m4, m5 ; 01
    punpckhbw            m4, m5     ; 34
    shufpd               m6, m2, 0x0c
    punpcklbw            m2, m5, m6 ; 12
    punpckhbw            m5, m6     ; 45
    shufpd               m0, m3, 0x0c
    punpcklbw            m3, m6, m0 ; 23
    punpckhbw            m6, m0     ; 56
.v_w16_loop:
    vbroadcasti128      m12, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    pmaddubsw           m13, m1, m8  ; a0
    pmaddubsw           m14, m2, m8  ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddubsw            m3, m9      ; a1
    pmaddubsw            m4, m9      ; b1
    paddw               m13, m3
    paddw               m14, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddubsw            m5, m10     ; a2
    pmaddubsw            m6, m10     ; b2
    paddw               m13, m5
    vbroadcasti128       m5, [r4+ssq*0]
    paddw               m14, m6
    shufpd               m6, m0, m12, 0x0d
    shufpd               m0, m12, m5, 0x0c
    punpcklbw            m5, m6, m0  ; 67
    punpckhbw            m6, m0      ; 78
    pmaddubsw           m12, m5, m11 ; a3
    paddw               m13, m12
    pmaddubsw           m12, m6, m11 ; b3
    paddw               m14, m12
    pmulhrsw            m13, m7
    pmulhrsw            m14, m7
    packuswb            m13, m14
    vpermq              m13, m13, q3120
    mova         [r7+dsq*0], xm13
    vextracti128 [r7+dsq*1], m13, 1
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    add                srcq, 16
    add                dstq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w16_loop0
    RET
.h:
.h_w2:
.h_w4:
    test                myd, 0xf00
    jnz .hv
    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
    cmp                  wd, 4
    jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2
    WIN64_SPILL_XMM      11
    tzcnt                wd, wd
    vbroadcasti128       m6, [subpel_h_shufA]
    shr                 mxd, 16
    vbroadcasti128       m7, [subpel_h_shufB]
    sub                srcq, 3
    vbroadcasti128       m8, [subpel_h_shufC]
    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)]
    vpbroadcastd         m9, [r8+mxq*8+subpel_filters-put_avx2+0]
    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+4]
    add                  wq, r8
    jmp                  wq
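; 8-tap horizontal core: subpel_h_shufA/B/C provide the source alignments
; needed for two dword-broadcast coefficient halves (m9 = taps 0-3,
; m10 = taps 4-7); partial sums are merged with phaddw, then biased by
; pw_34 and shifted down with psraw 6 as in the 6-tap path.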
.h_w8:
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
    pshufb              m%2, m%1, m7
    pshufb              m%3, m%1, m8
    pshufb              m%1, m6
    pmaddubsw           m%4, m%2, m9
    pmaddubsw           m%2, m10
    pmaddubsw           m%3, m10
    pmaddubsw           m%1, m9
    paddw               m%3, m%4
    paddw               m%1, m%2
    phaddw              m%1, m%3
    paddw               m%1, m5
    psraw               m%1, 6
%endmacro
    movu                xm0, [srcq+ssq*0]
    vinserti128          m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            0, 1, 2, 3
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                xm0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*1+8*0], 1
    movu                xm1, [srcq+ssq*0+8*1]
    vinserti128          m1, [srcq+ssq*1+8*1], 1
    PUT_8TAP_H            0, 2, 3, 4
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            1, 2, 3, 4
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    xor                 r6d, r6d
    jmp .h_start
.h_w64:
    mov                  r6, -32*1
    jmp .h_start
.h_w128:
    mov                  r6, -32*3
.h_start:
    sub                srcq, r6
    sub                dstq, r6
    mov                  r4, r6
.h_loop:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    PUT_8TAP_H            0, 2, 3, 4
    PUT_8TAP_H            1, 2, 3, 4
    packuswb             m0, m1
    mova          [dstq+r6], m0
    add                  r6, 32
    jle .h_loop
    add                srcq, ssq
    add                dstq, dsq
    mov                  r6, r4
    dec                  hd
    jg .h_loop
    RET
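; Combined h+v 8-tap: same staging as the 6-tap .hv path ((x + 2) >> 2
; horizontal prescale, 32-bit vertical accumulation, (x + 512) >> 10
; final rounding), but with four coefficient pairs per direction.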
.hv:
    WIN64_SPILL_XMM      14, 16
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m7, [r8+mxq*8+subpel_filters-put_avx2+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    vpbroadcastd         m8, [pw_8192]
    vpbroadcastd         m9, [pd_512]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
    cmp                  wd, 4
    je .hv_w4
    vbroadcasti128       m6, [subpel_h_shuf4]
    movq                xm2, [srcq+ssq*0]
    movhps              xm2, [srcq+ssq*1]
    movq                xm0, [srcq+ssq*2]
    add                srcq, ss3q
    movhps              xm0, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpbroadcastq         m4, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m1, [srcq+ssq*0]
    vpblendd             m2, m3, 0x30
    vpblendd             m0, m1, 0x30
    vpblendd             m2, m4, 0xc0
    pshufb               m2, m6
    pshufb               m0, m6
    pmaddubsw            m2, m7
    pmaddubsw            m0, m7
    phaddw               m2, m0
    pmulhrsw             m2, m8
    vextracti128        xm3, m2, 1
    palignr             xm4, xm3, xm2, 4
    punpcklwd           xm1, xm2, xm4  ; 01 12
    punpckhwd           xm2, xm4       ; 23 34
    pshufd              xm0, xm3, q2121
    punpcklwd           xm3, xm0       ; 45 56
.hv_w2_loop:
    movq                xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps              xm4, [srcq+ssq*0]
    pshufb              xm4, xm6
    pmaddubsw           xm4, xm7
    pmaddwd             xm5, xm1, xm10 ; a0 b0
    mova                xm1, xm2
    pmaddwd             xm2, xm11      ; a1 b1
    paddd               xm5, xm2
    mova                xm2, xm3
    pmaddwd             xm3, xm12      ; a2 b2
    phaddw              xm4, xm4
    pmulhrsw            xm4, xm8
    paddd               xm5, xm3
    palignr             xm3, xm4, xm0, 12
    mova                xm0, xm4
    punpcklwd           xm3, xm0       ; 67 78
    pmaddwd             xm4, xm3, xm13 ; a3 b3
    paddd               xm5, xm9
    paddd               xm5, xm4
    psrad               xm5, 10
    packssdw            xm5, xm5
    packuswb            xm5, xm5
    pextrw     [dstq+dsq*0], xm5, 0
    pextrw     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                 m6, [subpel_h_shuf4]
    vpbroadcastq         m2, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    vpbroadcastq         m0, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m5, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpblendd             m2, m4, 0xcc ; 0 1
    vpbroadcastq         m4, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m1, [srcq+ssq*0]
    vpblendd             m0, m5, 0xcc ; 2 3
    vpblendd             m3, m4, 0xcc ; 4 5
    pshufb               m2, m6
    pshufb               m0, m6
    pshufb               m3, m6
    pshufb               m1, m6
    pmaddubsw            m2, m7
    pmaddubsw            m0, m7
    pmaddubsw            m3, m7
    pmaddubsw            m1, m7
    phaddw               m2, m0
    phaddw               m3, m1
    pmulhrsw             m2, m8
    pmulhrsw             m3, m8
    palignr              m4, m3, m2, 4
    punpcklwd            m1, m2, m4   ; 01 12
    punpckhwd            m2, m4       ; 23 34
    pshufd               m0, m3, q2121
    punpcklwd            m3, m0       ; 45 56
.hv_w4_loop:
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m1, m10  ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m11      ; a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, m12      ; a2 b2
    paddd                m5, m3
    vpbroadcastq         m3, [srcq+ssq*0]
    vpblendd             m4, m3, 0xcc ; 7 8
    pshufb               m4, m6
    pmaddubsw            m4, m7
    phaddw               m4, m4
    pmulhrsw             m4, m8
    palignr              m3, m4, m0, 12
    mova                 m0, m4
    punpcklwd            m3, m0       ; 67 78
    pmaddwd              m4, m3, m13  ; a3 b3
    paddd                m5, m9
    paddd                m5, m4
    psrad                m5, 10
    vextracti128        xm4, m5, 1
    packssdw            xm5, xm4
    packuswb            xm5, xm5
    pshuflw             xm5, xm5, q3120
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    WIN64_PUSH_XMM       16
    shr                 mxd, 16
    sub                srcq, 3
    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+0]
    vpbroadcastd        m11, [r8+mxq*8+subpel_filters-put_avx2+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    pshufd              m12, m0, q0000
    pshufd              m13, m0, q1111
    pshufd              m14, m0, q2222
    pshufd              m15, m0, q3333
    lea                 r6d, [wq*8-64]
    lea                 r6d, [hq+r6*4]
.hv_w8_loop0:
    vbroadcasti128       m7, [subpel_h_shufA]
    movu                xm4, [srcq+ssq*0]
    lea                  r4, [srcq+ss3q]
    vbroadcasti128       m8, [subpel_h_shufB]
    movu                xm5, [srcq+ssq*1]
    mov                  r7, dstq
    vbroadcasti128       m9, [subpel_h_shufC]
    movu                xm6, [srcq+ssq*2]
    vbroadcasti128       m0, [r4+ssq*0]
    vpblendd             m4, m0, 0xf0      ; 0 3
    vinserti128          m5, [r4+ssq*1], 1 ; 1 4
    vinserti128          m6, [r4+ssq*2], 1 ; 2 5
    add                  r4, ss3q
    vinserti128          m0, [r4+ssq*0], 1 ; 3 6
%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
    pshufb               %3, %1, %6
    pshufb               %4, %1, %7
    pshufb               %1, %5
    pmaddubsw            %2, %3, m10
    pmaddubsw            %4, m11
    pmaddubsw            %3, m11
    pmaddubsw            %1, m10
    paddw                %2, %4
    paddw                %1, %3
    phaddw               %1, %2
%endmacro
    HV_H_8TAP_W8         m4, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8         m5, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8         m6, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8         m0, m1, m2, m3, m7, m8, m9
    vpbroadcastd         m7, [pw_8192]
    vpermq               m4, m4, q3120
    vpermq               m5, m5, q3120
    vpermq               m6, m6, q3120
    pmulhrsw             m0, m7
    pmulhrsw             m4, m7
    pmulhrsw             m5, m7
    pmulhrsw             m6, m7
    vpermq               m7, m0, q3120
    punpcklwd            m1, m4, m5  ; 01
    punpckhwd            m4, m5      ; 34
    punpcklwd            m2, m5, m6  ; 12
    punpckhwd            m5, m6      ; 45
    punpcklwd            m3, m6, m7  ; 23
    punpckhwd            m6, m7      ; 56
.hv_w8_loop:
    vextracti128        r6m, m0, 1 ; not enough registers
    movu                xm0, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    vinserti128          m0, [r4+ssq*0], 1 ; 7 8
    pmaddwd              m8, m1, m12 ; a0
    pmaddwd              m9, m2, m12 ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddwd              m3, m13     ; a1
    pmaddwd              m4, m13     ; b1
    paddd                m8, m3
    paddd                m9, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m14     ; a2
    pmaddwd              m6, m14     ; b2
    paddd                m8, m5
    paddd                m9, m6
    vbroadcasti128       m6, [subpel_h_shufB]
    vbroadcasti128       m7, [subpel_h_shufC]
    vbroadcasti128       m5, [subpel_h_shufA]
    HV_H_8TAP_W8         m0, m5, m6, m7, m5, m6, m7
    vpbroadcastd         m5, [pw_8192]
    vpbroadcastd         m7, [pd_512]
    vbroadcasti128       m6, r6m
    pmulhrsw             m0, m5
    paddd                m8, m7
    paddd                m9, m7
    vpermq               m7, m0, q3120 ; 7 8
    shufpd               m6, m7, 0x04  ; 6 7
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, m15 ; a3
    paddd                m8, m7
    pmaddwd              m7, m6, m15 ; b3
    paddd                m7, m9
    psrad                m8, 10
    psrad                m7, 10
    packssdw             m8, m7
    vextracti128        xm7, m8, 1
    packuswb            xm8, xm7
    pshufd              xm7, xm8, q3120
    movq         [r7+dsq*0], xm7
    movhps       [r7+dsq*1], xm7
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    add                srcq, 8
    add                dstq, 8
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w8_loop0
    RET

%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

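; prep stores 16-bit intermediates for compound prediction instead of
; 8-bit pixels: results stay scaled by 16 relative to the source (the
; pw_8192 multiply is (x + 2) >> 2 applied to 64-scaled filter sums), and
; output rows are packed contiguously into tmp at w*2 bytes per row.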
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR

cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r7, [prep%+SUFFIX]
    mov                  wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
.prep:
    tzcnt                wd, wd
    movzx                wd, word [r7+wq*2+table_offset(prep,)]
    add                  wq, r7
    lea                  r6, [ssq*3]
%if WIN64
    pop                  r7
%endif
    jmp                  wq
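; Vertical 6-tap prep: same sliding row-pair scheme as put, processing
; four rows per iteration for w <= 8 and two for w >= 16, with 16-bit
; output via pmulhrsw pw_8192.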
.v:
    WIN64_SPILL_XMM      10, 12
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    lea                 myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
    vpbroadcastd         m9, [pw_8192]
    vpbroadcastw         m6, [myq+0]
    mov                 nsq, ssq
    vpbroadcastw         m7, [myq+2]
    neg                 nsq
    vpbroadcastw         m8, [myq+4]
    cmp                  wd, 8
    jg .v_w16
    je .v_w8
.v_w4:
    movd                xm2, [srcq+nsq*2]
    pinsrd              xm2, [srcq+nsq*1], 1
    vpbroadcastd         m1, [srcq+ssq*0]
    vpbroadcastd         m3, [srcq+ssq*1]
    vpbroadcastd         m0, [srcq+ssq*2]
    vbroadcasti128       m5, [deint_shuf4]
    vpblendd             m1, m2, 0xeb
    punpcklqdq           m3, m0
    vpblendd             m1, m3, 0x60 ; 0 1 2 _   2 3 4 _
    pshufb               m1, m5       ; 01  12    23  34
.v_w4_loop:
    lea                srcq, [srcq+ssq*4]
    pinsrd              xm0, [srcq+nsq*1], 1
    vpbroadcastd         m2, [srcq+ssq*0]
    vpbroadcastd         m3, [srcq+ssq*1]
    vpblendd             m2, m0, 0xeb
    vpbroadcastd         m0, [srcq+ssq*2]
    punpcklqdq           m3, m0
    vpblendd             m2, m3, 0x60 ; 4 5 6 _   6 7 8 _
    pshufb               m2, m5       ; 45  56    67  78
    pmaddubsw            m3, m1, m6   ; a0  b0    c0  d0
    vperm2i128           m1, m2, 0x21 ; 23  34    45  56
    pmaddubsw            m4, m2, m8   ; a2  b2    c2  d2
    pmaddubsw            m1, m7       ; a1  b1    c1  d1
    paddw                m3, m4
    paddw                m3, m1
    pmulhrsw             m3, m9
    mova                 m1, m2
    mova             [tmpq], m3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm1, [srcq+nsq*2]
    vpbroadcastq         m3, [srcq+nsq*1]
    vpbroadcastq         m2, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    vpbroadcastq         m0, [srcq+ssq*2]
    vpblendd             m1, m3, 0x30
    vpblendd             m3, m2, 0x30
    punpcklbw            m1, m3     ; 01 12
    vpblendd             m2, m4, 0x30
    vpblendd             m4, m0, 0x30
    punpcklbw            m2, m4     ; 23 34
.v_w8_loop:
    lea                srcq, [srcq+ssq*4]
    pmaddubsw            m1, m6     ; a0
    vpbroadcastq         m3, [srcq+nsq*1]
    pmaddubsw            m4, m2, m7 ; a1
    pmaddubsw            m5, m2, m6 ; b0
    vpbroadcastq         m2, [srcq+ssq*0]
    vpblendd             m0, m3, 0x30
    vpblendd             m3, m2, 0x30
    paddw                m4, m1
    punpcklbw            m1, m0, m3 ; 45 56
    vpbroadcastq         m3, [srcq+ssq*1]
    vpbroadcastq         m0, [srcq+ssq*2]
    vpblendd             m2, m3, 0x30
    vpblendd             m3, m0, 0x30
    punpcklbw            m2, m3     ; 67 78
    pmaddubsw            m3, m1, m7 ; b1
    paddw                m5, m3
    pmaddubsw            m3, m1, m8 ; a2
    paddw                m4, m3
    pmaddubsw            m3, m2, m8 ; b2
    paddw                m5, m3
    pmulhrsw             m4, m9
    pmulhrsw             m5, m9
    mova        [tmpq+32*0], m4
    mova        [tmpq+32*1], m5
    add                tmpq, 32*2
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    lea                 r6d, [wq*2-32]
    lea                srcq, [srcq+nsq*2]
    WIN64_PUSH_XMM       12
    lea                 r6d, [hq+r6*8]
.v_w16_loop0:
    vbroadcasti128       m3, [srcq+ssq*0]
    lea                  r5, [srcq+ssq*2]
    vbroadcasti128       m4, [srcq+ssq*1]
    mov                  r7, tmpq
    vbroadcasti128       m0, [r5+ssq*0]
    vbroadcasti128       m1, [r5+ssq*1]
    lea                  r5, [r5+ssq*2]
    vbroadcasti128       m2, [r5+ssq*0]
    shufpd               m3, m0, 0x0c
    shufpd               m4, m1, 0x0c
    punpcklbw            m1, m3, m4 ; 01
    punpckhbw            m3, m4     ; 23
    shufpd               m0, m2, 0x0c
    punpcklbw            m2, m4, m0 ; 12
    punpckhbw            m4, m0     ; 34
.v_w16_loop:
    vbroadcasti128       m5, [r5+ssq*1]
    pmaddubsw           m10, m1, m6 ; a0
    lea                  r5, [r5+ssq*2]
    pmaddubsw           m11, m2, m6 ; b0
    mova                 m1, m3
    pmaddubsw            m3, m7     ; a1
    mova                 m2, m4
    pmaddubsw            m4, m7     ; b1
    paddw               m10, m3
    vbroadcasti128       m3, [r5+ssq*0]
    paddw               m11, m4
    shufpd               m4, m0, m5, 0x0d
    shufpd               m0, m5, m3, 0x0c
    punpcklbw            m3, m4, m0 ; 45
    punpckhbw            m4, m0     ; 56
    pmaddubsw            m5, m3, m8 ; a2
    paddw               m10, m5
    pmaddubsw            m5, m4, m8 ; b2
    paddw               m11, m5
    pmulhrsw            m10, m9
    pmulhrsw            m11, m9
    mova          [r7+wq*0], m10
    mova          [r7+wq*2], m11
    lea                  r7, [r7+wq*4]
    sub                  hd, 2
    jg .v_w16_loop
    add                srcq, 16
    add                tmpq, 32
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w16_loop0
    RET
.h_w4:
    RESET_STACK_STATE
    movzx               mxd, mxb
    vbroadcasti128       m3, [subpel_h_shufA]
    dec                srcq
    vpbroadcastd         m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    lea                  r3, [ssq*3]
.h_w4_loop:
    movq                xm0, [srcq+ssq*0]
    vpbroadcastq         m2, [srcq+ssq*2]
    movq                xm1, [srcq+ssq*1]
    vpblendd             m0, m2, 0x30
    vpbroadcastq         m2, [srcq+r3   ]
    lea                srcq, [srcq+ssq*4]
    vpblendd             m1, m2, 0x30
    pshufb               m0, m3
    pshufb               m1, m3
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    phaddw               m0, m1
    pmulhrsw             m0, m4
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h:
    test                myd, 0xf00
    jnz .hv
    vpbroadcastd         m4, [pw_8192]
    cmp                  wd, 4
    je .h_w4
    WIN64_SPILL_XMM      10
    tzcnt                wd, wd
    vbroadcasti128       m3, [z_filter_s+ 2]
    shr                 mxd, 16
    vbroadcasti128       m5, [z_filter_s+ 6]
    sub                srcq, 2
    vbroadcasti128       m6, [z_filter_s+10]
    lea                 mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX]
    movzx                wd, word [r7+wq*2+table_offset(prep, _6tap_h)]
    vpbroadcastw         m7, [mxq+0]
    vpbroadcastw         m8, [mxq+2]
    add                  wq, r7
    vpbroadcastw         m9, [mxq+4]
    jmp                  wq
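; PREP_6TAP_H mirrors PUT_6TAP_H but keeps the 16-bit result, rounding
; with pw_8192 ((x + 2) >> 2) instead of biasing and narrowing to pixels.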
.h_w8:
    movu                xm0, [srcq+ssq*0]
    vinserti128          m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
%macro PREP_6TAP_H 0
    pshufb               m1, m0, m3
    pmaddubsw            m1, m7
    pshufb               m2, m0, m5
    pmaddubsw            m2, m8
    pshufb               m0, m6
    pmaddubsw            m0, m9
    paddw                m1, m2
    paddw                m0, m1
    pmulhrsw             m0, m4
%endmacro
    PREP_6TAP_H
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                xm0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*0+8*1], 1
    PREP_6TAP_H
    mova        [tmpq+32*0], m0
    movu                xm0, [srcq+ssq*1+8*0]
    vinserti128          m0, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    PREP_6TAP_H
    mova        [tmpq+32*1], m0
    add                tmpq, 32*2
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    xor                 r6d, r6d
    jmp .h_start
.h_w64:
    mov                  r6, -32*1
    jmp .h_start
.h_w128:
    mov                  r6, -32*3
.h_start:
    sub                srcq, r6
    mov                  r5, r6
.h_loop:
    movu                xm0, [srcq+r6+8*0]
    vinserti128          m0, [srcq+r6+8*1], 1
    PREP_6TAP_H
    mova        [tmpq+32*0], m0
    movu                xm0, [srcq+r6+8*2]
    vinserti128          m0, [srcq+r6+8*3], 1
    PREP_6TAP_H
    mova        [tmpq+32*1], m0
    add                tmpq, 32*2
    add                  r6, 32
    jle .h_loop
    add                srcq, ssq
    mov                  r6, r5
    dec                  hd
    jg .h_loop
    RET
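; Combined h+v prep: horizontal prescale as in put ((x + 2) >> 2); the
; final rounding is (x + 32) >> 6 via pd_32, which keeps the 16-scaled
; intermediate range expected by the compound (avg/w_avg/mask) functions.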
.hv:
    WIN64_SPILL_XMM      14, 16
    cmp                  wd, 4
    jne .hv_w8
.hv_w4:
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    mova                 m6, [subpel_h_shuf4]
    vpbroadcastq         m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
    mov                 nsq, ssq
    pmovzxbd            m13, [deint_shuf4]
    neg                 nsq
    vpbroadcastd         m8, [pw_8192]
    vpbroadcastd         m9, [pd_32]
    punpcklbw            m0, m0
    vpbroadcastq         m2, [srcq+nsq*2]
    psraw                m0, 8 ; sign-extend
    vpbroadcastq         m4, [srcq+nsq*1]
    pshufd              m10, m0, q0000
    vpbroadcastq         m1, [srcq+ssq*0]
    pshufd              m11, m0, q1111
    vpbroadcastq         m3, [srcq+ssq*1]
    pshufd              m12, m0, q2222
    vpbroadcastq         m0, [srcq+ssq*2]
    vpblendd             m2, m4, 0xcc ; 0 1
    vpblendd             m1, m3, 0xcc ; 2 3
    pshufb               m2, m6
    pshufb               m1, m6
    pshufb               m0, m6
    pmaddubsw            m2, m7
    pmaddubsw            m1, m7
    pmaddubsw            m0, m7
    phaddw               m2, m1       ; 0 1 2 3
    phaddw               m0, m0       ; 4
    pmulhrsw             m2, m8
    pmulhrsw             m0, m8
    palignr              m0, m2, 4
    punpcklwd            m1, m2, m0   ; 01 12
    punpckhwd            m2, m0       ; 23 34
.hv_w4_loop:
    pmaddwd              m4, m10, m1  ; a0 b0
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m5, m2, m10  ; c0 d0
    vpbroadcastq         m1, [srcq+nsq*1]
    pmaddwd              m2, m11      ; a1 b1
    vpbroadcastq         m3, [srcq+ssq*0]
    paddd                m4, m2
    vpbroadcastq         m2, [srcq+ssq*1]
    vpblendd             m1, m3, 0xcc ; 5 6
    vpbroadcastq         m3, [srcq+ssq*2]
    vpblendd             m2, m3, 0xcc ; 7 8
    pshufb               m1, m6
    pshufb               m2, m6
    pmaddubsw            m1, m7
    pmaddubsw            m2, m7
    phaddw               m1, m2       ; 5 6 7 8
    pmulhrsw             m1, m8
    paddd                m5, m9
    paddd                m4, m9
    palignr              m2, m1, m0, 12
    mova                 m0, m1
    punpcklwd            m1, m2, m0   ; 45 56
    punpckhwd            m2, m0       ; 67 78
    pmaddwd              m3, m11, m1  ; c1 d1
    paddd                m5, m3
    pmaddwd              m3, m12, m1  ; a2 b2
    paddd                m4, m3
    pmaddwd              m3, m12, m2  ; c2 d2
    paddd                m5, m3
    psrad                m4, 6
    psrad                m5, 6
    packssdw             m4, m5
    vpermd               m4, m13, m4
    mova             [tmpq], m4
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    shr                 mxd, 16
    lea                 mxq, [r7+mxq*8+subpel_filters+1-prep_avx2]
    WIN64_PUSH_XMM       16
    vpbroadcastw        m10, [mxq+0]
    vpbroadcastw        m11, [mxq+2]
    vpbroadcastw        m12, [mxq+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r7+myq*8+subpel_filters+1-prep_avx2]
    lea                  r7, [ssq*2+2]
    vbroadcasti128       m8, [z_filter_s+ 6]
    punpcklbw            m0, m0
    vbroadcasti128       m9, [z_filter_s+10]
    psraw                m0, 8 ; sign-extend
    lea                 r6d, [wq*8-64]
    pshufd              m13, m0, q0000
    sub                srcq, r7
    pshufd              m14, m0, q1111
    lea                 r6d, [hq+r6*4]
    pshufd              m15, m0, q2222
.hv_w8_loop0:
    vbroadcasti128       m7, [z_filter_s+2]
    movu                xm3, [srcq+ssq*0]
    lea                  r5, [srcq+ssq*2]
    movu                xm4, [srcq+ssq*1]
    vbroadcasti128       m0, [r5+ssq*0]
    mov                  r7, tmpq
    vinserti128          m4, [r5+ssq*1], 1 ; 1 3
    lea                  r5, [r5+ssq*2]
    vpblendd             m3, m0, 0xf0      ; 0 2
    vinserti128          m0, [r5+ssq*0], 1 ; 2 4
    vpbroadcastd         m5, [pw_8192]
    HV_H_6TAP_W8         m3, m1, m2, m7, m8, m9
    HV_H_6TAP_W8         m4, m1, m2, m7, m8, m9
    HV_H_6TAP_W8         m0, m1, m2, m7, m8, m9
    vpermq               m3, m3, q3120
    vpermq               m4, m4, q3120
    vpermq               m0, m0, q3120
    pmulhrsw             m3, m5
    pmulhrsw             m4, m5
    pmulhrsw             m0, m5
    punpcklwd            m1, m3, m4  ; 01
    punpckhwd            m3, m4      ; 23
    punpcklwd            m2, m4, m0  ; 12
    punpckhwd            m4, m0      ; 34
.hv_w8_loop:
    movu                xm7, [r5+ssq*1]
    lea                  r5, [r5+ssq*2]
    vinserti128          m7, [r5+ssq*0], 1 ; 5 6
    pmaddwd              m5, m13, m1 ; a0
    mova                 m1, m3
    pmaddwd              m6, m13, m2 ; b0
    mova                 m2, m4
    pmaddwd              m3, m14     ; a1
    pmaddwd              m4, m14     ; b1
    paddd                m5, m3
    vbroadcasti128       m3, [z_filter_s+2]
    paddd                m6, m4
    HV_H_6TAP_W8         m7, m3, m4, m3, m8, m9
    vpbroadcastd         m3, [pw_8192]
    vpbroadcastd         m4, [pd_32]
    pmulhrsw             m7, m3
    paddd                m5, m4
    paddd                m6, m4
    mova                 m4, m0
    vpermq               m0, m7, q3120
    shufpd               m4, m0, 0x05
    punpcklwd            m3, m4, m0  ; 45
    pmaddwd              m7, m15, m3 ; a2
3039    punpckhwd            m4, m0      ; 67
3040    paddd                m5, m7
3041    pmaddwd              m7, m15, m4 ; b2
3042    paddd                m6, m7
3043    psrad                m5, 6
3044    psrad                m6, 6
3045    packssdw             m5, m6
3046    vpermq               m5, m5, q3120
3047    mova          [r7+wq*0], xm5
3048    vextracti128  [r7+wq*2], m5, 1
3049    lea                  r7, [r7+wq*4]
3050    sub                  hd, 2
3051    jg .hv_w8_loop
3052    add                srcq, 8
3053    add                tmpq, 16
3054    movzx                hd, r6b    ; restore height from the low byte
3055    sub                 r6d, 1<<8   ; one fewer 8-pixel column strip left
3056    jg .hv_w8_loop0
3057    RET
3058
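; Entry stubs for the remaining filter-type combinations; each sets the
; horizontal/vertical type pair (consumed via t0d/t1d below) and jumps
; into the shared prep_8tap code.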
3059PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
3060PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_8bpc
3061PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
3062PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_8bpc
3063PREP_8TAP_FN sharp,          SHARP,   SHARP ; no jump target: falls through into prep_8tap_8bpc below
3064
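; The subpel position arrives in the low byte of mxm/mym; multiplying by
; 0x010101 replicates it into three bytes so that one add of the type
; offsets in t0d/t1d produces the three packed byte fields noted inline
; (8tap_h, mx, 4tap_h), i.e. both filter-table indices plus the raw
; position in a single register.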
3065cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
3066    imul                mxd, mxm, 0x010101
3067    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3068    imul                myd, mym, 0x010101
3069    add                 myd, t1d ; 8tap_v, my, 4tap_v
3070    lea                  r7, [prep%+SUFFIX]
3071    mov                  wd, wm
3072    movifnidn            hd, hm
3073    test                mxd, 0xf00
3074    jnz .h
3075    test                myd, 0xf00
3076    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep
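; No fractional offset in either direction: jump to the block-copy .prep
; path shared with the 6-tap function; otherwise fall through to the
; vertical-only filter.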
3077.v:
3078    WIN64_SPILL_XMM      12, 15
3079    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
3080    shr                 myd, 16  ; Note that the code is 8-tap only, having
3081    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
3082    cmove               myd, mxd ; had a negligible effect on performance.
3083    lea                 myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
3084    lea            stride3q, [strideq*3]
3085    sub                srcq, stride3q
3086    vpbroadcastd         m7, [pw_8192]
3087    vpbroadcastw         m8, [myq+0]
3088    vpbroadcastw         m9, [myq+2]
3089    vpbroadcastw        m10, [myq+4]
3090    vpbroadcastw        m11, [myq+6]
3091    cmp                  wd, 8
3092    jg .v_w16
3093    je .v_w8
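; .v_w4: vpbroadcastd+vpblendd gather eight 4-pixel rows into one ymm,
; then punpck{l,h}bw interleaves adjacent rows into the byte pairs that
; pmaddubsw needs:
;   out[i] = u8[2i]*coef[2i] + u8[2i+1]*coef[2i+1]  (saturating)
; Four output rows are produced per iteration.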
3094.v_w4:
3095    movd                xm0, [srcq+strideq*0]
3096    vpbroadcastd         m1, [srcq+strideq*2]
3097    vpbroadcastd        xm2, [srcq+strideq*1]
3098    add                srcq, stride3q
3099    vpbroadcastd         m3, [srcq+strideq*0]
3100    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
3101    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
3102    vpbroadcastd         m0, [srcq+strideq*1]
3103    vpbroadcastd         m2, [srcq+strideq*2]
3104    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
3105    vpbroadcastd         m0, [srcq+stride3q ]
3106    vbroadcasti128       m5, [deint_shuf4]
3107    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
3108    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
3109    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
3110    punpcklbw            m1, m2, m3       ; 01  12    23  34
3111    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
3112    punpckhbw            m2, m3           ; 23  34    45  56
3113.v_w4_loop:
3114    lea                srcq, [srcq+strideq*4]
3115    pinsrd              xm0, [srcq+strideq*0], 1
3116    vpbroadcastd         m3, [srcq+strideq*1]
3117    vpbroadcastd         m4, [srcq+strideq*2]
3118    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 _ _ _
3119    vpbroadcastd         m0, [srcq+stride3q ]
3120    vpblendd             m3, m4, 0x20     ; 6 7 8 _   8 9 _ _
3121    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
3122    pshufb               m3, m5           ; 67  78    89  9a
3123    pmaddubsw            m4, m1, m8
3124    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
3125    pmaddubsw            m2, m9
3126    paddw                m4, m2
3127    mova                 m2, m3
3128    pmaddubsw            m3, m11
3129    paddw                m3, m4
3130    pmaddubsw            m4, m1, m10
3131    paddw                m3, m4
3132    pmulhrsw             m3, m7
3133    mova             [tmpq], m3
3134    add                tmpq, 32
3135    sub                  hd, 4
3136    jg .v_w4_loop
3137    RET
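; .v_w8: a rolling window of row pairs (01/12, 23/34, 45/56; two rows per
; 128-bit lane) is updated in place, producing four output rows per
; iteration from the four tap-pair products a0-a3/b0-b3.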
3138.v_w8:
3139    movq                xm1, [srcq+strideq*0]
3140    vpbroadcastq         m4, [srcq+strideq*1]
3141    vpbroadcastq         m2, [srcq+strideq*2]
3142    vpbroadcastq         m5, [srcq+stride3q ]
3143    lea                srcq, [srcq+strideq*4]
3144    vpbroadcastq         m3, [srcq+strideq*0]
3145    vpbroadcastq         m6, [srcq+strideq*1]
3146    vpbroadcastq         m0, [srcq+strideq*2]
3147    vpblendd             m1, m4, 0x30
3148    vpblendd             m4, m2, 0x30
3149    punpcklbw            m1, m4 ; 01 12
3150    vpblendd             m2, m5, 0x30
3151    vpblendd             m5, m3, 0x30
3152    punpcklbw            m2, m5 ; 23 34
3153    vpblendd             m3, m6, 0x30
3154    vpblendd             m6, m0, 0x30
3155    punpcklbw            m3, m6 ; 45 56
3156.v_w8_loop:
3157    vpbroadcastq         m4, [srcq+stride3q ]
3158    lea                srcq, [srcq+strideq*4]
3159    pmaddubsw            m5, m2, m9  ; a1
3160    pmaddubsw            m6, m2, m8  ; b0
3161    vpblendd             m2, m0, m4, 0x30
3162    vpbroadcastq         m0, [srcq+strideq*0]
3163    vpblendd             m4, m0, 0x30
3164    punpcklbw            m2, m4      ; 67 78
3165    pmaddubsw            m1, m8      ; a0
3166    pmaddubsw            m4, m3, m9  ; b1
3167    paddw                m5, m1
3168    mova                 m1, m3
3169    pmaddubsw            m3, m10     ; a2
3170    paddw                m6, m4
3171    paddw                m5, m3
3172    vpbroadcastq         m4, [srcq+strideq*1]
3173    vpblendd             m3, m0, m4, 0x30
3174    vpbroadcastq         m0, [srcq+strideq*2]
3175    vpblendd             m4, m0, 0x30
3176    punpcklbw            m3, m4      ; 89 9a
3177    pmaddubsw            m4, m2, m11 ; a3
3178    paddw                m5, m4
3179    pmaddubsw            m4, m2, m10 ; b2
3180    paddw                m6, m4
3181    pmaddubsw            m4, m3, m11 ; b3
3182    paddw                m6, m4
3183    pmulhrsw             m5, m7
3184    pmulhrsw             m6, m7
3185    mova        [tmpq+32*0], m5
3186    mova        [tmpq+32*1], m6
3187    add                tmpq, 32*2
3188    sub                  hd, 4
3189    jg .v_w8_loop
3190    RET
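; .v_w16: 16-pixel column strips, two rows per iteration; shufpd on
; vbroadcasti128-loaded rows builds the row pairs without crossing lanes.
; r6d packs the strip counter in its high bits and h in the low byte
; (see the loop tail).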
3191.v_w16:
3192    lea                 r6d, [wq*2-32]
3193    WIN64_PUSH_XMM       15
3194    lea                 r6d, [hq+r6*8]     ; packed counter: (w/16-1)<<8 | h
3195.v_w16_loop0:
3196    vbroadcasti128       m4, [srcq+strideq*0]
3197    vbroadcasti128       m5, [srcq+strideq*1]
3198    lea                  r5, [srcq+strideq*2]
3199    vbroadcasti128       m0, [r5+strideq*1]
3200    vbroadcasti128       m6, [r5+strideq*0]
3201    lea                  r5, [r5+strideq*2]
3202    vbroadcasti128       m1, [r5+strideq*0]
3203    vbroadcasti128       m2, [r5+strideq*1]
3204    lea                  r5, [r5+strideq*2]
3205    vbroadcasti128       m3, [r5+strideq*0]
3206    mov                  r7, tmpq
3207    shufpd               m4, m0, 0x0c
3208    shufpd               m5, m1, 0x0c
3209    punpcklbw            m1, m4, m5 ; 01
3210    punpckhbw            m4, m5     ; 34
3211    shufpd               m6, m2, 0x0c
3212    punpcklbw            m2, m5, m6 ; 12
3213    punpckhbw            m5, m6     ; 45
3214    shufpd               m0, m3, 0x0c
3215    punpcklbw            m3, m6, m0 ; 23
3216    punpckhbw            m6, m0     ; 56
3217.v_w16_loop:
3218    vbroadcasti128      m12, [r5+strideq*1]
3219    lea                  r5, [r5+strideq*2]
3220    pmaddubsw           m13, m1, m8  ; a0
3221    pmaddubsw           m14, m2, m8  ; b0
3222    mova                 m1, m3
3223    mova                 m2, m4
3224    pmaddubsw            m3, m9      ; a1
3225    pmaddubsw            m4, m9      ; b1
3226    paddw               m13, m3
3227    paddw               m14, m4
3228    mova                 m3, m5
3229    mova                 m4, m6
3230    pmaddubsw            m5, m10     ; a2
3231    pmaddubsw            m6, m10     ; b2
3232    paddw               m13, m5
3233    vbroadcasti128       m5, [r5+strideq*0]
3234    paddw               m14, m6
3235    shufpd               m6, m0, m12, 0x0d
3236    shufpd               m0, m12, m5, 0x0c
3237    punpcklbw            m5, m6, m0  ; 67
3238    punpckhbw            m6, m0      ; 78
3239    pmaddubsw           m12, m5, m11 ; a3
3240    paddw               m13, m12
3241    pmaddubsw           m12, m6, m11 ; b3
3242    paddw               m14, m12
3243    pmulhrsw            m13, m7
3244    pmulhrsw            m14, m7
3245    mova          [r7+wq*0], m13
3246    mova          [r7+wq*2], m14
3247    lea                  r7, [r7+wq*4]
3248    sub                  hd, 2
3249    jg .v_w16_loop
3250    add                srcq, 16
3251    add                tmpq, 32
3252    movzx                hd, r6b    ; restore height from the low byte
3253    sub                 r6d, 1<<8   ; one fewer 16-pixel column strip left
3254    jg .v_w16_loop0
3255    RET
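; Horizontal-only path. w == 4 is shared with the 6-tap function; wider
; blocks dispatch through the prep_8tap_h width table to the unrolled
; .h_w8/.h_w16/.h_loop code below.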
3256.h:
3257.h_w4:
3258    test                myd, 0xf00
3259    jnz .hv
3260    vpbroadcastd         m4, [pw_8192]
3261    cmp                  wd, 4
3262    je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4
3263    WIN64_SPILL_XMM      10
3264    vbroadcasti128       m5, [subpel_h_shufA]
3265    tzcnt                wd, wd
3266    vbroadcasti128       m6, [subpel_h_shufB]
3267    vbroadcasti128       m7, [subpel_h_shufC]
3268    shr                 mxd, 16
3269    sub                srcq, 3
3270    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
3271    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
3272    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
3273    add                  wq, r7
3274    jmp                  wq
3275.h_w8:
3276    movu                xm0, [srcq+strideq*0]
3277    vinserti128          m0, [srcq+strideq*1], 1
3278    lea                srcq, [srcq+strideq*2]
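; PREP_8TAP_H filters the 16 pixels in m0: the A/B/C shuffles expose the
; overlapping 4-byte source windows, m8/m9 hold taps 0-3/4-7, and each
; output word is the sum of two pmaddubsw products; phaddw merges the
; halves and pw_8192 rescales via
;   pmulhrsw(a, b) = (a*b + 0x4000) >> 15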
3279%macro PREP_8TAP_H 0
3280    pshufb               m1, m0, m5
3281    pshufb               m2, m0, m6
3282    pshufb               m3, m0, m7
3283    pmaddubsw            m1, m8
3284    pmaddubsw            m0, m2, m8
3285    pmaddubsw            m2, m9
3286    pmaddubsw            m3, m9
3287    paddw                m1, m2
3288    paddw                m0, m3
3289    phaddw               m0, m1, m0
3290    pmulhrsw             m0, m4
3291%endmacro
3292    PREP_8TAP_H
3293    mova             [tmpq], m0
3294    add                tmpq, 32
3295    sub                  hd, 2
3296    jg .h_w8
3297    RET
3298.h_w16:
3299    movu                xm0, [srcq+strideq*0+8*0]
3300    vinserti128          m0, [srcq+strideq*0+8*1], 1
3301    PREP_8TAP_H
3302    mova        [tmpq+32*0], m0
3303    movu                xm0, [srcq+strideq*1+8*0]
3304    vinserti128          m0, [srcq+strideq*1+8*1], 1
3305    lea                srcq, [srcq+strideq*2]
3306    PREP_8TAP_H
3307    mova        [tmpq+32*1], m0
3308    add                tmpq, 32*2
3309    sub                  hd, 2
3310    jg .h_w16
3311    RET
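; w >= 32: r6 is a negative column offset running from -(w-32) up to 0 in
; 32-byte steps, so a single .h_loop serves w32/w64/w128; srcq is
; pre-biased by -r6 so [srcq+r6] starts at the row's left edge.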
3312.h_w32:
3313    xor                 r6d, r6d
3314    jmp .h_start
3315.h_w64:
3316    mov                  r6, -32*1
3317    jmp .h_start
3318.h_w128:
3319    mov                  r6, -32*3
3320.h_start:
3321    sub                srcq, r6
3322    mov                  r5, r6
3323.h_loop:
3324    movu                xm0, [srcq+r6+8*0]
3325    vinserti128          m0, [srcq+r6+8*1], 1
3326    PREP_8TAP_H
3327    mova        [tmpq+32*0], m0
3328    movu                xm0, [srcq+r6+8*2]
3329    vinserti128          m0, [srcq+r6+8*3], 1
3330    PREP_8TAP_H
3331    mova        [tmpq+32*1], m0
3332    add                tmpq, 32*2
3333    add                  r6, 32
3334    jle .h_loop
3335    add                srcq, strideq
3336    mov                  r6, r5
3337    dec                  hd
3338    jg .h_loop
3339    RET
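; HV 8-tap: horizontal pass at 16-bit precision (pmulhrsw by pw_8192),
; vertical pass as four pmaddwd pair-products per output at 32-bit,
; biased with pd_32 and shifted right by 6.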
3340.hv:
3341    WIN64_SPILL_XMM      16
3342    cmp                  wd, 4
3343    je .hv_w4
3344    shr                 mxd, 16
3345    sub                srcq, 3
3346    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
3347    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
3348    movzx               mxd, myb
3349    shr                 myd, 16
3350    cmp                  hd, 4
3351    cmove               myd, mxd
3352    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
3353    lea            stride3q, [strideq*3]
3354    sub                srcq, stride3q
3355    punpcklbw            m0, m0
3356    psraw                m0, 8 ; sign-extend
3357    pshufd              m12, m0, q0000
3358    pshufd              m13, m0, q1111
3359    pshufd              m14, m0, q2222
3360    pshufd              m15, m0, q3333
3361    jmp .hv_w8
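; .hv_w4: subpel_h_shuf4 packs two 4-pixel rows per lane so one phaddw
; completes the horizontal filter for two rows; the vertical loop then
; emits four rows per iteration from three rolling row-pair registers.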
3362.hv_w4:
3363    movzx               mxd, mxb
3364    dec                srcq
3365    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
3366    movzx               mxd, myb
3367    shr                 myd, 16
3368    cmp                  hd, 4
3369    cmove               myd, mxd
3370    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
3371    lea            stride3q, [strideq*3]
3372    sub                srcq, stride3q
3373    mova                 m7, [subpel_h_shuf4]
3374    pmovzxbd             m9, [deint_shuf4]
3375    vpbroadcastd        m10, [pw_8192]
3376    punpcklbw            m0, m0
3377    psraw                m0, 8 ; sign-extend
3378    vpbroadcastd        m11, [pd_32]
3379    pshufd              m12, m0, q0000
3380    pshufd              m13, m0, q1111
3381    pshufd              m14, m0, q2222
3382    pshufd              m15, m0, q3333
3383    vpbroadcastq         m2, [srcq+strideq*0]
3384    vpbroadcastq         m4, [srcq+strideq*1]
3385    vpbroadcastq         m0, [srcq+strideq*2]
3386    vpbroadcastq         m5, [srcq+stride3q ]
3387    lea                srcq, [srcq+strideq*4]
3388    vpbroadcastq         m3, [srcq+strideq*0]
3389    vpbroadcastq         m6, [srcq+strideq*1]
3390    vpbroadcastq         m1, [srcq+strideq*2]
3391    vpblendd             m2, m4, 0xcc ; 0 1
3392    vpblendd             m0, m5, 0xcc ; 2 3
3393    vpblendd             m3, m6, 0xcc ; 4 5
3394    pshufb               m2, m7 ; 00 01 10 11  02 03 12 13
3395    pshufb               m0, m7 ; 20 21 30 31  22 23 32 33
3396    pshufb               m3, m7 ; 40 41 50 51  42 43 52 53
3397    pshufb               m1, m7 ; 60 61 60 61  62 63 62 63
3398    pmaddubsw            m2, m8
3399    pmaddubsw            m0, m8
3400    pmaddubsw            m3, m8
3401    pmaddubsw            m1, m8
3402    phaddw               m2, m0 ; 0a 1a 2a 3a  0b 1b 2b 3b
3403    phaddw               m3, m1 ; 4a 5a 6a __  4b 5b 6b __
3404    pmulhrsw             m2, m10
3405    pmulhrsw             m3, m10
3406    palignr              m4, m3, m2, 4 ; 1a 2a 3a 4a  1b 2b 3b 4b
3407    punpcklwd            m1, m2, m4  ; 01 12
3408    punpckhwd            m2, m4      ; 23 34
3409    pshufd               m0, m3, q2121
3410    punpcklwd            m3, m0      ; 45 56
3411.hv_w4_loop:
3412    pmaddwd              m5, m1, m12 ; a0 b0
3413    pmaddwd              m6, m2, m12 ; c0 d0
3414    pmaddwd              m2, m13     ; a1 b1
3415    pmaddwd              m4, m3, m13 ; c1 d1
3416    mova                 m1, m3
3417    pmaddwd              m3, m14     ; a2 b2
3418    paddd                m5, m2
3419    vpbroadcastq         m2, [srcq+stride3q ]
3420    lea                srcq, [srcq+strideq*4]
3421    paddd                m6, m4
3422    vpbroadcastq         m4, [srcq+strideq*0]
3423    paddd                m5, m3
3424    vpbroadcastq         m3, [srcq+strideq*1]
3425    vpblendd             m2, m4, 0xcc
3426    vpbroadcastq         m4, [srcq+strideq*2]
3427    vpblendd             m3, m4, 0xcc
3428    pshufb               m2, m7
3429    pshufb               m3, m7
3430    pmaddubsw            m2, m8
3431    pmaddubsw            m3, m8
3432    phaddw               m2, m3
3433    pmulhrsw             m2, m10
3434    palignr              m3, m2, m0, 12
3435    mova                 m0, m2
3436    punpcklwd            m2, m3, m0  ; 67 78
3437    punpckhwd            m3, m0      ; 89 9a
3438    pmaddwd              m4, m2, m14 ; c2 d2
3439    paddd                m6, m11
3440    paddd                m5, m11
3441    paddd                m6, m4
3442    pmaddwd              m4, m2, m15 ; a3 b3
3443    paddd                m5, m4
3444    pmaddwd              m4, m3, m15 ; c3 d3
3445    paddd                m6, m4
3446    psrad                m5, 6
3447    psrad                m6, 6
3448    packssdw             m5, m6
3449    vpermd               m5, m9, m5
3450    mova             [tmpq], m5
3451    add                tmpq, 32
3452    sub                  hd, 4
3453    jg .hv_w4_loop
3454    RET
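; .hv_w8: six word-interleaved row pairs (01..56) stay live across the
; loop; the filtered row 6 is spilled to [r7] each iteration ("not enough
; registers") and reloaded once rows 7-8 have been filtered.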
3455.hv_w8:
3456    lea                 r6d, [wq*8-64]
3457    lea                 r6d, [hq+r6*4]     ; packed counter: (w/8-1)<<8 | h
3458.hv_w8_loop0:
3459    vbroadcasti128       m7, [subpel_h_shufA]
3460    movu                xm4, [srcq+strideq*0]
3461    lea                  r5, [srcq+strideq*2]
3462    vbroadcasti128       m8, [subpel_h_shufB]
3463    movu                xm5, [srcq+strideq*1]
3464    mov                  r7, tmpq
3465    vbroadcasti128       m9, [subpel_h_shufC]
3466    movu                xm6, [r5+strideq*0]
3467    vbroadcasti128       m0, [r5+strideq*1]
3468    lea                  r5, [r5+strideq*2]
3469    vpblendd             m4, m0, 0xf0          ; 0 3
3470    vinserti128          m5, [r5+strideq*0], 1 ; 1 4
3471    vinserti128          m6, [r5+strideq*1], 1 ; 2 5
3472    lea                  r5, [r5+strideq*2]
3473    vinserti128          m0, [r5+strideq*0], 1 ; 3 6
3474    HV_H_8TAP_W8         m4, m1, m2, m3, m7, m8, m9
3475    HV_H_8TAP_W8         m5, m1, m2, m3, m7, m8, m9
3476    HV_H_8TAP_W8         m6, m1, m2, m3, m7, m8, m9
3477    HV_H_8TAP_W8         m0, m1, m2, m3, m7, m8, m9
3478    vpbroadcastd         m7, [pw_8192]
3479    vpermq               m4, m4, q3120
3480    vpermq               m5, m5, q3120
3481    vpermq               m6, m6, q3120
3482    pmulhrsw             m0, m7
3483    pmulhrsw             m4, m7
3484    pmulhrsw             m5, m7
3485    pmulhrsw             m6, m7
3486    vpermq               m7, m0, q3120
3487    punpcklwd            m1, m4, m5  ; 01
3488    punpckhwd            m4, m5      ; 34
3489    punpcklwd            m2, m5, m6  ; 12
3490    punpckhwd            m5, m6      ; 45
3491    punpcklwd            m3, m6, m7  ; 23
3492    punpckhwd            m6, m7      ; 56
3493.hv_w8_loop:
3494    vextracti128       [r7], m0, 1 ; not enough registers
3495    movu                xm0, [r5+strideq*1]
3496    lea                  r5, [r5+strideq*2]
3497    vinserti128          m0, [r5+strideq*0], 1 ; 7 8
3498    pmaddwd              m8, m1, m12 ; a0
3499    pmaddwd              m9, m2, m12 ; b0
3500    mova                 m1, m3
3501    mova                 m2, m4
3502    pmaddwd              m3, m13     ; a1
3503    pmaddwd              m4, m13     ; b1
3504    paddd                m8, m3
3505    paddd                m9, m4
3506    mova                 m3, m5
3507    mova                 m4, m6
3508    pmaddwd              m5, m14     ; a2
3509    pmaddwd              m6, m14     ; b2
3510    paddd                m8, m5
3511    paddd                m9, m6
3512    vbroadcasti128       m6, [subpel_h_shufB]
3513    vbroadcasti128       m7, [subpel_h_shufC]
3514    vbroadcasti128       m5, [subpel_h_shufA]
3515    HV_H_8TAP_W8         m0, m5, m6, m7, m5, m6, m7
3516    vpbroadcastd         m5, [pw_8192]
3517    vpbroadcastd         m7, [pd_32]
3518    vbroadcasti128       m6, [r7]
3519    pmulhrsw             m0, m5
3520    paddd                m8, m7
3521    paddd                m9, m7
3522    vpermq               m7, m0, q3120 ; 7 8
3523    shufpd               m6, m7, 0x04  ; 6 7
3524    punpcklwd            m5, m6, m7  ; 67
3525    punpckhwd            m6, m7      ; 78
3526    pmaddwd              m7, m5, m15 ; a3
3527    paddd                m8, m7
3528    pmaddwd              m7, m6, m15 ; b3
3529    paddd                m7, m9
3530    psrad                m8, 6
3531    psrad                m7, 6
3532    packssdw             m8, m7
3533    vpermq               m7, m8, q3120
3534    mova          [r7+wq*0], xm7
3535    vextracti128  [r7+wq*2], m7, 1
3536    lea                  r7, [r7+wq*4]
3537    sub                  hd, 2
3538    jg .hv_w8_loop
3539    add                srcq, 8
3540    add                tmpq, 16
3541    movzx                hd, r6b    ; restore height from the low byte
3542    sub                 r6d, 1<<8   ; one fewer 8-pixel column strip left
3543    jg .hv_w8_loop0
3544    RET
3545
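; Scaled 8-tap MC, shared between put and prep. When emitting prep,
; REMAP_REG shifts every rN alias down by one physical register (saving
; r14 and restoring it afterwards) so the shared body can use the same rN
; names in both variants; MC_8TAP_SCALED_RET temporarily undoes the
; mapping around RET.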
3546%macro movifprep 2
3547 %if isprep
3548    mov %1, %2
3549 %endif
3550%endmacro
3551
3552%macro REMAP_REG 2
3553 %xdefine r%1  r%2
3554 %xdefine r%1q r%2q
3555 %xdefine r%1d r%2d
3556%endmacro
3557
3558%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
3559 %if isprep
3560  %xdefine r14_save r14
3561  %assign %%i 14
3562  %rep 14
3563   %assign %%j %%i-1
3564   REMAP_REG %%i, %%j
3565   %assign %%i %%i-1
3566  %endrep
3567 %endif
3568%endmacro
3569
3570%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
3571 %if isprep
3572  %assign %%i 1
3573  %rep 13
3574   %assign %%j %%i+1
3575   REMAP_REG %%i, %%j
3576   %assign %%i %%i+1
3577  %endrep
3578  %xdefine r14 r14_save
3579  %undef r14_save
3580 %endif
3581%endmacro
3582
3583%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
3584    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
3585    RET
3586 %if %1
3587    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3588 %endif
3589%endmacro
3590
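; MC_8TAP_SCALED_H loads two rows of eight pixels whose per-column start
; offsets sit in r4/r6/r7/r9/r10/r11/r13/rX, applies the per-column
; filters held in m15/m10, and phaddw-reduces both rows into dst as
; interleaved a/b halves (see the call-site comments), scaled by m12.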
3591%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
3592    movq               xm%1, [srcq+ r4]
3593    movq               xm%2, [srcq+ r6]
3594    movhps             xm%1, [srcq+ r7]
3595    movhps             xm%2, [srcq+ r9]
3596    vinserti128         m%1, [srcq+r10], 1
3597    vinserti128         m%2, [srcq+r11], 1
3598    vpbroadcastq        m%5, [srcq+r13]
3599    vpbroadcastq        m%6, [srcq+ rX]
3600    add                srcq, ssq
3601    movq               xm%3, [srcq+ r4]
3602    movq               xm%4, [srcq+ r6]
3603    movhps             xm%3, [srcq+ r7]
3604    movhps             xm%4, [srcq+ r9]
3605    vinserti128         m%3, [srcq+r10], 1
3606    vinserti128         m%4, [srcq+r11], 1
3607    vpbroadcastq        m%7, [srcq+r13]
3608    vpbroadcastq        m%8, [srcq+ rX]
3609    add                srcq, ssq
3610    vpblendd            m%1, m%5, 0xc0
3611    vpblendd            m%2, m%6, 0xc0
3612    vpblendd            m%3, m%7, 0xc0
3613    vpblendd            m%4, m%8, 0xc0
3614    pmaddubsw           m%1, m15
3615    pmaddubsw           m%2, m10
3616    pmaddubsw           m%3, m15
3617    pmaddubsw           m%4, m10
3618    phaddw              m%1, m%2
3619    phaddw              m%3, m%4
3620    phaddw              m%1, m%3
3621    pmulhrsw            m%1, m12
3622%endmacro
3623
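; MC_8TAP_SCALED %1: emits put_ or prep_8tap_scaled_8bpc. Positions are
; 10-bit fixed point; for column i,
;   x = (mx + i*dx) >> 10,  filter index = ((mx + i*dx) & 0x3ff) >> 6
; Zero-fraction columns are switched to an identity filter (a single tap
; of 64) by pblendvb with the pd_0x4000/pq_0x40000000 patterns.
; dy == 1024 (.dy1) and dy == 2048 (.dy2) take dedicated paths; put
; rounds with pd_512 and rndshift 10, prep with pd_32 and rndshift 6.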
3624%macro MC_8TAP_SCALED 1
3625%ifidn %1, put
3626 %assign isprep 0
3627cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
3628 %xdefine base_reg r12
3629 %define rndshift 10
3630%else
3631 %assign isprep 1
3632cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
3633 %define tmp_stridem qword [rsp+120]
3634 %xdefine base_reg r11
3635 %define rndshift 6
3636%endif
3637    lea            base_reg, [%1_8tap_scaled_8bpc_avx2]
3638%define base base_reg-%1_8tap_scaled_8bpc_avx2
3639    tzcnt                wd, wm
3640    vpbroadcastd         m8, dxm
3641%if isprep && UNIX64
3642    movd               xm14, mxd
3643    vpbroadcastd        m14, xm14
3644    mov                 r5d, t0d
3645 DECLARE_REG_TMP 5, 7
3646%else
3647    vpbroadcastd        m14, mxm
3648%endif
3649    mov                 dyd, dym
3650%ifidn %1, put
3651 %if WIN64
3652    mov                 r8d, hm
3653  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
3654  %define hm r5m
3655  %define dxm r8m
3656 %else
3657  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
3658  %define hm r6m
3659 %endif
3660 %define dsm [rsp+112]
3661 %define rX r1
3662 %define rXd r1d
3663%else ; prep
3664 %if WIN64
3665    mov                 r7d, hm
3666  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
3667  %define hm r4m
3668  %define dxm r7m
3669 %else
3670  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
3671  %define hm [rsp+112]
3672 %endif
3673 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3674 %define rX r14
3675 %define rXd r14d
3676%endif
3677    vpbroadcastd        m10, [base+pd_0x3ff]
3678    vpbroadcastd        m12, [base+pw_8192]
3679%ifidn %1, put
3680    vpbroadcastd        m13, [base+pd_512]
3681%else
3682    vpbroadcastd        m13, [base+pd_32]
3683%endif
3684    pxor                 m9, m9
3685    lea                ss3q, [ssq*3]
3686    movzx               r7d, t1b
3687    shr                 t1d, 16
3688    cmp                  hd, 6
3689    cmovs               t1d, r7d
3690    sub                srcq, ss3q
3691    cmp                 dyd, 1024
3692    je .dy1
3693    cmp                 dyd, 2048
3694    je .dy2
3695    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
3696    add                  wq, base_reg
3697    jmp                  wq
3698%ifidn %1, put
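; .w2 (put only): the horizontal filter drops to 4 taps (subpel_filters+2
; picks the middle coefficients). The vertical filter is re-derived from
; myd every output row (64<<24 being the identity filter for a zero
; fraction); the tail fetches one new source row when my's integer part
; advanced by one, two rows when it advanced by two.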
3699.w2:
3700    mov                 myd, mym
3701    movzx               t0d, t0b
3702    dec                srcq
3703    movd               xm15, t0d
3704    punpckldq            m8, m9, m8
3705    paddd               m14, m8 ; mx+dx*[0,1]
3706    vpbroadcastd        m11, [base+pd_0x4000]
3707    vpbroadcastd       xm15, xm15
3708    pand                 m8, m14, m10
3709    psrld                m8, 6
3710    paddd              xm15, xm8
3711    movd                r4d, xm15
3712    pextrd              r6d, xm15, 1
3713    vbroadcasti128       m5, [base+bdct_lb_dw]
3714    vbroadcasti128       m6, [base+subpel_s_shuf2]
3715    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
3716    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
3717    pcmpeqd              m8, m9
3718    psrld               m14, 10
3719    movq                xm0, [srcq+ssq*0]
3720    movq                xm1, [srcq+ssq*2]
3721    movhps              xm0, [srcq+ssq*1]
3722    movhps              xm1, [srcq+ss3q ]
3723    lea                srcq, [srcq+ssq*4]
3724    pshufb              m14, m5
3725    paddb               m14, m6
3726    vinserti128          m0, [srcq+ssq*0], 1
3727    vinserti128          m1, [srcq+ssq*2], 1
3728    vpbroadcastq         m2, [srcq+ssq*1]
3729    vpbroadcastq         m3, [srcq+ss3q ]
3730    lea                srcq, [srcq+ssq*4]
3731    vpblendd            m15, m7, 0xaa
3732    vpblendd             m0, m2, 0xc0       ; 0 1  4 5
3733    vpblendd             m1, m3, 0xc0       ; 2 3  6 7
3734    pblendvb            m15, m11, m8
3735    pshufb               m0, m14
3736    pshufb               m1, m14
3737    pmaddubsw            m0, m15
3738    pmaddubsw            m1, m15
3739    phaddw               m0, m1
3740    pmulhrsw             m0, m12            ; 0 1 2 3  4 5 6 7
3741    vextracti128        xm1, m0, 1          ; 4 5 6 7
3742    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
3743    punpcklwd           xm3, xm0, xm2       ; 01 12
3744    punpckhwd           xm0, xm2            ; 23 34
3745    pshufd              xm4, xm1, q0321     ; 5 6 7 _
3746    punpcklwd           xm2, xm1, xm4       ; 45 56
3747    punpckhwd           xm4, xm1, xm4       ; 67 __
3748.w2_loop:
3749    and                 myd, 0x3ff
3750    mov                 r6d, 64 << 24
3751    mov                 r4d, myd
3752    shr                 r4d, 6
3753    lea                 r4d, [t1+r4]
3754    cmovnz              r6q, [base+subpel_filters+r4*8]
3755    movq               xm11, r6q
3756    pmovsxbw           xm11, xm11
3757    pshufd              xm8, xm11, q0000
3758    pshufd              xm9, xm11, q1111
3759    pshufd             xm10, xm11, q2222
3760    pshufd             xm11, xm11, q3333
3761    pmaddwd             xm5, xm3, xm8
3762    pmaddwd             xm6, xm0, xm9
3763    pmaddwd             xm7, xm2, xm10
3764    pmaddwd             xm8, xm4, xm11
3765    paddd               xm5, xm6
3766    paddd               xm7, xm8
3767    paddd               xm5, xm13
3768    paddd               xm5, xm7
3769    psrad               xm5, 10
3770    packssdw            xm5, xm5
3771    packuswb            xm5, xm5
3772    pextrw           [dstq], xm5, 0
3773    add                dstq, dsq
3774    dec                  hd
3775    jz .ret
3776    add                 myd, dyd
3777    test                myd, ~0x3ff
3778    jz .w2_loop
3779    movq                xm5, [srcq]
3780    test                myd, 0x400
3781    jz .w2_skip_line
3782    add                srcq, ssq
3783    shufps              xm3, xm0, q1032     ; 01 12
3784    shufps              xm0, xm2, q1032     ; 23 34
3785    shufps              xm2, xm4, q1032     ; 45 56
3786    pshufb              xm5, xm14
3787    pmaddubsw           xm5, xm15
3788    phaddw              xm5, xm5
3789    pmulhrsw            xm5, xm12
3790    palignr             xm1, xm5, xm1, 12
3791    punpcklqdq          xm1, xm1            ; 6 7 6 7
3792    punpcklwd           xm4, xm1, xm5       ; 67 __
3793    jmp .w2_loop
3794.w2_skip_line:
3795    movhps              xm5, [srcq+ssq*1]
3796    lea                srcq, [srcq+ssq*2]
3797    mova                xm3, xm0            ; 01 12
3798    mova                xm0, xm2            ; 23 34
3799    pshufb              xm5, xm14
3800    pmaddubsw           xm5, xm15
3801    phaddw              xm5, xm5
3802    pmulhrsw            xm5, xm12           ; 6 7 6 7
3803    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
3804    pshufd              xm5, xm1, q0321     ; 5 6 7 _
3805    punpcklwd           xm2, xm1, xm5       ; 45 56
3806    punpckhwd           xm4, xm1, xm5       ; 67 __
3807    jmp .w2_loop
3808%endif
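; .w4: per-column filters are gathered individually via pextrd of the
; packed indices; the row-pair history that does not fit in registers is
; spilled to [rsp+0x00..0x2f].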
3809.w4:
3810    mov                 myd, mym
3811    vbroadcasti128       m7, [base+rescale_mul]
3812    movzx               t0d, t0b
3813    dec                srcq
3814    movd               xm15, t0d
3815    pmaddwd              m8, m7
3816    vpbroadcastd        m11, [base+pd_0x4000]
3817    vpbroadcastd       xm15, xm15
3818    paddd               m14, m8 ; mx+dx*[0-3]
3819    pand                 m0, m14, m10
3820    psrld                m0, 6
3821    paddd              xm15, xm0
3822    movd                r4d, xm15
3823    pextrd              r6d, xm15, 1
3824    pextrd             r11d, xm15, 2
3825    pextrd             r13d, xm15, 3
3826    movd               xm15, [base+subpel_filters+r4*8+2]
3827    vbroadcasti128       m5, [base+bdct_lb_dw]
3828    vpbroadcastq         m6, [base+subpel_s_shuf2]
3829    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
3830    pcmpeqd              m0, m9
3831    psrld               m14, 10
3832    movu                xm7, [srcq+ssq*0]
3833    movu                xm9, [srcq+ssq*1]
3834    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
3835    movu                xm8, [srcq+ssq*2]
3836    movu               xm10, [srcq+ss3q ]
3837    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
3838    lea                srcq, [srcq+ssq*4]
3839    pshufb              m14, m5
3840    paddb               m14, m6
3841    vinserti128          m7, [srcq+ssq*0], 1
3842    vinserti128          m9, [srcq+ssq*1], 1
3843    vinserti128         m15, xm15, 1
3844    vinserti128          m8, [srcq+ssq*2], 1
3845    vinserti128         m10, [srcq+ss3q ], 1
3846    lea                srcq, [srcq+ssq*4]
3847    pblendvb            m15, m11, m0
3848    pshufb               m7, m14
3849    pshufb               m9, m14
3850    pshufb               m8, m14
3851    pshufb              m10, m14
3852    pmaddubsw            m7, m15
3853    pmaddubsw            m9, m15
3854    pmaddubsw            m8, m15
3855    pmaddubsw           m10, m15
3856    phaddw               m7, m9
3857    phaddw               m8, m10
3858    pmulhrsw             m7, m12                ; 0 1  4 5
3859    pmulhrsw             m8, m12                ; 2 3  6 7
3860    vextracti128        xm9, m7, 1              ; 4 5
3861    vextracti128        xm3, m8, 1              ; 6 7
3862    shufps              xm4, xm7, xm8, q1032    ; 1 2
3863    shufps              xm5, xm8, xm9, q1032    ; 3 4
3864    shufps              xm6, xm9, xm3, q1032    ; 5 6
3865    psrldq             xm11, xm3, 8             ; 7 _
3866    punpcklwd           xm0, xm7, xm4   ; 01
3867    punpckhwd           xm7, xm4        ; 12
3868    punpcklwd           xm1, xm8, xm5   ; 23
3869    punpckhwd           xm8, xm5        ; 34
3870    punpcklwd           xm2, xm9, xm6   ; 45
3871    punpckhwd           xm9, xm6        ; 56
3872    punpcklwd           xm3, xm11       ; 67
3873    mova         [rsp+0x00], xm7
3874    mova         [rsp+0x10], xm8
3875    mova         [rsp+0x20], xm9
3876.w4_loop:
3877    and                 myd, 0x3ff
3878    mov                 r6d, 64 << 24
3879    mov                 r4d, myd
3880    shr                 r4d, 6
3881    lea                 r4d, [t1+r4]
3882    cmovnz              r6q, [base+subpel_filters+r4*8]
3883    movq               xm10, r6q
3884    pmovsxbw           xm10, xm10
3885    pshufd              xm7, xm10, q0000
3886    pshufd              xm8, xm10, q1111
3887    pshufd              xm9, xm10, q2222
3888    pshufd             xm10, xm10, q3333
3889    pmaddwd             xm4, xm0, xm7
3890    pmaddwd             xm5, xm1, xm8
3891    pmaddwd             xm6, xm2, xm9
3892    pmaddwd             xm7, xm3, xm10
3893    paddd               xm4, xm5
3894    paddd               xm6, xm7
3895    paddd               xm4, xm13
3896    paddd               xm4, xm6
3897    psrad               xm4, rndshift
3898    packssdw            xm4, xm4
3899%ifidn %1, put
3900    packuswb            xm4, xm4
3901    movd             [dstq], xm4
3902    add                dstq, dsq
3903%else
3904    movq             [tmpq], xm4
3905    add                tmpq, 8
3906%endif
3907    dec                  hd
3908    jz .ret
3909    add                 myd, dyd
3910    test                myd, ~0x3ff
3911    jz .w4_loop
3912    movu                xm4, [srcq]
3913    test                myd, 0x400
3914    jz .w4_skip_line
3915    mova                xm0, [rsp+0x00]
3916    mova         [rsp+0x00], xm1
3917    mova                xm1, [rsp+0x10]
3918    mova         [rsp+0x10], xm2
3919    mova                xm2, [rsp+0x20]
3920    mova         [rsp+0x20], xm3
3921    pshufb              xm4, xm14
3922    pmaddubsw           xm4, xm15
3923    phaddw              xm4, xm4
3924    pmulhrsw            xm4, xm12
3925    punpcklwd           xm3, xm11, xm4
3926    mova               xm11, xm4
3927    add                srcq, ssq
3928    jmp .w4_loop
3929.w4_skip_line:
3930    movu                xm5, [srcq+ssq*1]
3931    movu                 m6, [rsp+0x10]
3932    pshufb              xm4, xm14
3933    pshufb              xm5, xm14
3934    pmaddubsw           xm4, xm15
3935    pmaddubsw           xm5, xm15
3936    movu         [rsp+0x00], m6
3937    phaddw              xm4, xm5
3938    pmulhrsw            xm4, xm12
3939    punpcklwd           xm9, xm11, xm4
3940    mova         [rsp+0x20], xm9
3941    psrldq             xm11, xm4, 8
3942    mova                xm0, xm1
3943    mova                xm1, xm2
3944    mova                xm2, xm3
3945    punpcklwd           xm3, xm4, xm11
3946    lea                srcq, [srcq+ssq*2]
3947    jmp .w4_loop
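; w >= 8: processed as 8-pixel column strips; [rsp+48] counts strips and
; tmp_stridem is the output stride. .hloop derives the eight per-column
; filters and x offsets from mx+dx*[0-7]; .vloop walks my down the strip,
; re-deriving the vertical filter per row, with wswap+pblendw rolling the
; row history when new rows are fetched.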
3948.w8:
3949    mov      dword [rsp+48], 1
3950    movifprep   tmp_stridem, 16
3951    jmp .w_start
3952.w16:
3953    mov      dword [rsp+48], 2
3954    movifprep   tmp_stridem, 32
3955    jmp .w_start
3956.w32:
3957    mov      dword [rsp+48], 4
3958    movifprep   tmp_stridem, 64
3959    jmp .w_start
3960.w64:
3961    mov      dword [rsp+48], 8
3962    movifprep   tmp_stridem, 128
3963    jmp .w_start
3964.w128:
3965    mov      dword [rsp+48], 16
3966    movifprep   tmp_stridem, 256
3967.w_start:
3968%ifidn %1, put
3969    movifnidn           dsm, dsq
3970%endif
3971    shr                 t0d, 16
3972    sub                srcq, 3
3973    pmaddwd              m8, [base+rescale_mul]
3974    movd               xm15, t0d
3975    mov            [rsp+72], t0d
3976    mov            [rsp+56], srcq
3977    mov            [rsp+64], r0q ; dstq / tmpq
3978%if UNIX64
3979    mov                  hm, hd
3980%endif
3981    shl           dword dxm, 3 ; dx*8
3982    vpbroadcastd        m15, xm15
3983    paddd               m14, m8 ; mx+dx*[0-7]
3984    jmp .hloop
3985.hloop_prep:
3986    dec      dword [rsp+48]
3987    jz .ret
3988    add      qword [rsp+64], 8*(isprep+1)
3989    mov                  hd, hm
3990    vpbroadcastd         m8, dxm
3991    vpbroadcastd        m10, [base+pd_0x3ff]
3992    paddd               m14, m8, [rsp+16]
3993    vpbroadcastd        m15, [rsp+72]
3994    pxor                 m9, m9
3995    mov                srcq, [rsp+56]
3996    mov                 r0q, [rsp+64] ; dstq / tmpq
3997.hloop:
3998    vpbroadcastq        m11, [base+pq_0x40000000]
3999    pand                 m6, m14, m10
4000    psrld                m6, 6
4001    paddd               m15, m6
4002    pcmpeqd              m6, m9
4003    vextracti128        xm7, m15, 1
4004    movd                r4d, xm15
4005    pextrd              r6d, xm15, 2
4006    pextrd              r7d, xm15, 1
4007    pextrd              r9d, xm15, 3
4008    movd               r10d, xm7
4009    pextrd             r11d, xm7, 2
4010    pextrd             r13d, xm7, 1
4011    pextrd              rXd, xm7, 3
4012    movu           [rsp+16], m14
4013    movq               xm15, [base+subpel_filters+ r4*8]
4014    movq               xm10, [base+subpel_filters+ r6*8]
4015    movhps             xm15, [base+subpel_filters+ r7*8]
4016    movhps             xm10, [base+subpel_filters+ r9*8]
4017    vinserti128         m15, [base+subpel_filters+r10*8], 1
4018    vinserti128         m10, [base+subpel_filters+r11*8], 1
4019    vpbroadcastq         m9, [base+subpel_filters+r13*8]
4020    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
4021    psrld               m14, 10
4022    vextracti128        xm7, m14, 1
4023    mova              [rsp], xm14
4024    movd                r4d, xm14
4025    pextrd              r6d, xm14, 2
4026    pextrd              r7d, xm14, 1
4027    pextrd              r9d, xm14, 3
4028    movd               r10d, xm7
4029    pextrd             r11d, xm7, 2
4030    pextrd             r13d, xm7, 1
4031    pextrd              rXd, xm7, 3
4032    pshufd               m5, m6, q1100
4033    pshufd               m6, m6, q3322
4034    vpblendd            m15, m9, 0xc0
4035    vpblendd            m10, m8, 0xc0
4036    pblendvb            m15, m11, m5
4037    pblendvb            m10, m11, m6
4038    vbroadcasti128      m14, [base+subpel_s_shuf8]
4039    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
4040    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
4041    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
4042    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
4043    mov                 myd, mym
4044    mov                 dyd, dym
4045    pshufb               m0, m14    ; 01a 01b
4046    pshufb               m1, m14    ; 23a 23b
4047    pshufb               m2, m14    ; 45a 45b
4048    pshufb               m3, m14    ; 67a 67b
4049    vbroadcasti128      m14, [base+wswap]
4050.vloop:
4051    and                 myd, 0x3ff
4052    mov                 r6d, 64 << 24
4053    mov                 r4d, myd
4054    shr                 r4d, 6
4055    lea                 r4d, [t1+r4]
4056    cmovnz              r6q, [base+subpel_filters+r4*8]
4057    movq               xm11, r6q
4058    punpcklqdq         xm11, xm11
4059    pmovsxbw            m11, xm11
4060    pshufd               m8, m11, q0000
4061    pshufd               m9, m11, q1111
4062    pmaddwd              m4, m0, m8
4063    pmaddwd              m5, m1, m9
4064    pshufd               m8, m11, q2222
4065    pshufd              m11, m11, q3333
4066    pmaddwd              m6, m2, m8
4067    pmaddwd              m7, m3, m11
4068    paddd                m4, m5
4069    paddd                m6, m7
4070    paddd                m4, m13
4071    paddd                m4, m6
4072    psrad                m4, rndshift
4073    vextracti128        xm5, m4, 1
4074    packssdw            xm4, xm5
4075%ifidn %1, put
4076    packuswb            xm4, xm4
4077    movq             [dstq], xm4
4078    add                dstq, dsm
4079%else
4080    mova             [tmpq], xm4
4081    add                tmpq, tmp_stridem
4082%endif
4083    dec                  hd
4084    jz .hloop_prep
4085    add                 myd, dyd
4086    test                myd, ~0x3ff
4087    jz .vloop
4088    test                myd, 0x400
4089    mov            [rsp+52], myd
4090    mov                 r4d, [rsp+ 0]
4091    mov                 r6d, [rsp+ 8]
4092    mov                 r7d, [rsp+ 4]
4093    mov                 r9d, [rsp+12]
4094    jz .skip_line
4095    vpbroadcastq         m6, [srcq+r13]
4096    vpbroadcastq         m7, [srcq+ rX]
4097    movq                xm4, [srcq+ r4]
4098    movq                xm5, [srcq+ r6]
4099    movhps              xm4, [srcq+ r7]
4100    movhps              xm5, [srcq+ r9]
4101    vinserti128          m4, [srcq+r10], 1
4102    vinserti128          m5, [srcq+r11], 1
4103    add                srcq, ssq
4104    mov                 myd, [rsp+52]
4105    mov                 dyd, dym
4106    pshufb               m0, m14
4107    pshufb               m1, m14
4108    pshufb               m2, m14
4109    pshufb               m3, m14
4110    vpblendd             m4, m6, 0xc0
4111    vpblendd             m5, m7, 0xc0
4112    pmaddubsw            m4, m15
4113    pmaddubsw            m5, m10
4114    phaddw               m4, m5
4115    pslld                m5, m4, 16
4116    paddw                m4, m5
4117    pmulhrsw             m4, m12
4118    pblendw              m0, m1, 0xaa
4119    pblendw              m1, m2, 0xaa
4120    pblendw              m2, m3, 0xaa
4121    pblendw              m3, m4, 0xaa
4122    jmp .vloop
4123.skip_line:
4124    mova                 m0, m1
4125    mova                 m1, m2
4126    mova                 m2, m3
4127    vpbroadcastq         m7, [srcq+r13]
4128    vpbroadcastq         m8, [srcq+ rX]
4129    movq                xm3, [srcq+ r4]
4130    movq                xm4, [srcq+ r6]
4131    movhps              xm3, [srcq+ r7]
4132    movhps              xm4, [srcq+ r9]
4133    vinserti128          m3, [srcq+r10], 1
4134    vinserti128          m4, [srcq+r11], 1
4135    add                srcq, ssq
4136    movq                xm5, [srcq+ r4]
4137    movq                xm6, [srcq+ r6]
4138    movhps              xm5, [srcq+ r7]
4139    movhps              xm6, [srcq+ r9]
4140    vinserti128          m5, [srcq+r10], 1
4141    vinserti128          m6, [srcq+r11], 1
4142    vpbroadcastq         m9, [srcq+r13]
4143    vpbroadcastq        m11, [srcq+ rX]
4144    add                srcq, ssq
4145    mov                 myd, [rsp+52]
4146    mov                 dyd, dym
4147    vpblendd             m3, m7, 0xc0
4148    vpblendd             m4, m8, 0xc0
4149    vpblendd             m5, m9, 0xc0
4150    vpblendd             m6, m11, 0xc0
4151    pmaddubsw            m3, m15
4152    pmaddubsw            m4, m10
4153    pmaddubsw            m5, m15
4154    pmaddubsw            m6, m10
4155    phaddw               m3, m4
4156    phaddw               m5, m6
4157    psrld                m4, m3, 16
4158    pslld                m6, m5, 16
4159    paddw                m3, m4
4160    paddw                m5, m6
4161    pblendw              m3, m5, 0xaa
4162    pmulhrsw             m3, m12
4163    jmp .vloop
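; dy == 1024: my advances exactly one source row per output row, so the
; vertical filter is constant and each output consumes exactly one new
; row; no skip-line logic is needed.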
4164.dy1:
4165    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
4166    add                  wq, base_reg
4167    jmp                  wq
4168%ifidn %1, put
4169.dy1_w2:
4170    mov                 myd, mym
4171    movzx               t0d, t0b
4172    dec                srcq
4173    movd               xm15, t0d
4174    punpckldq            m8, m9, m8
4175    paddd               m14, m8 ; mx+dx*[0-1]
4176    vpbroadcastd        m11, [base+pd_0x4000]
4177    vpbroadcastd       xm15, xm15
4178    pand                 m8, m14, m10
4179    psrld                m8, 6
4180    paddd              xm15, xm8
4181    movd                r4d, xm15
4182    pextrd              r6d, xm15, 1
4183    vbroadcasti128       m5, [base+bdct_lb_dw]
4184    vbroadcasti128       m6, [base+subpel_s_shuf2]
4185    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
4186    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
4187    pcmpeqd              m8, m9
4188    psrld               m14, 10
4189    movq                xm0, [srcq+ssq*0]
4190    movq                xm1, [srcq+ssq*2]
4191    movhps              xm0, [srcq+ssq*1]
4192    movhps              xm1, [srcq+ss3q ]
4193    lea                srcq, [srcq+ssq*4]
4194    shr                 myd, 6
4195    mov                 r4d, 64 << 24
4196    lea                 myd, [t1+myq]
4197    cmovnz              r4q, [base+subpel_filters+myq*8]
4198    pshufb              m14, m5
4199    paddb               m14, m6
4200    vinserti128          m0, [srcq+ssq*0], 1
4201    vinserti128          m1, [srcq+ssq*2], 1
4202    vpbroadcastq         m2, [srcq+ssq*1]
4203    add                srcq, ss3q
4204    movq               xm10, r4q
4205    pmovsxbw           xm10, xm10
4206    vpblendd            m15, m7, 0xaa
4207    pblendvb            m15, m11, m8
4208    pshufd              xm8, xm10, q0000
4209    pshufd              xm9, xm10, q1111
4210    pshufd             xm11, xm10, q3333
4211    pshufd             xm10, xm10, q2222
4212    vpblendd             m0, m2, 0xc0
4213    pshufb               m1, m14
4214    pshufb               m0, m14
4215    pmaddubsw            m1, m15
4216    pmaddubsw            m0, m15
4217    phaddw               m0, m1
4218    pmulhrsw             m0, m12
4219    vextracti128        xm1, m0, 1
4220    palignr             xm2, xm1, xm0, 4
4221    pshufd              xm4, xm1, q2121
4222    punpcklwd           xm3, xm0, xm2       ; 01 12
4223    punpckhwd           xm0, xm2            ; 23 34
4224    punpcklwd           xm2, xm1, xm4       ; 45 56
4225.dy1_w2_loop:
4226    movq                xm1, [srcq+ssq*0]
4227    movhps              xm1, [srcq+ssq*1]
4228    lea                srcq, [srcq+ssq*2]
4229    pmaddwd             xm5, xm3, xm8
4230    pmaddwd             xm6, xm0, xm9
4231    pmaddwd             xm7, xm2, xm10
4232    mova                xm3, xm0
4233    mova                xm0, xm2
4234    paddd               xm5, xm13
4235    paddd               xm6, xm7
4236    pshufb              xm1, xm14
4237    pmaddubsw           xm1, xm15
4238    phaddw              xm1, xm1
4239    pmulhrsw            xm1, xm12
4240    palignr             xm7, xm1, xm4, 12
4241    punpcklwd           xm2, xm7, xm1     ; 67 78
4242    pmaddwd             xm7, xm2, xm11
4243    mova                xm4, xm1
4244    paddd               xm5, xm6
4245    paddd               xm5, xm7
4246    psrad               xm5, rndshift
4247    packssdw            xm5, xm5
4248    packuswb            xm5, xm5
4249    pextrw     [dstq+dsq*0], xm5, 0
4250    pextrw     [dstq+dsq*1], xm5, 1
4251    lea                dstq, [dstq+dsq*2]
4252    sub                  hd, 2
4253    jg .dy1_w2_loop
4254    RET
4255%endif
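; .dy1_w4: as .w4, but the constant vertical filter is broadcast once up
; front (r4q -> m7..m10) instead of per output row.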
4256.dy1_w4:
4257    mov                 myd, mym
4258    vbroadcasti128       m7, [base+rescale_mul]
4259    movzx               t0d, t0b
4260    dec                srcq
4261    movd               xm15, t0d
4262    pmaddwd              m8, m7
4263    vpbroadcastd        m11, [base+pd_0x4000]
4264    vpbroadcastd       xm15, xm15
4265    paddd               m14, m8 ; mx+dx*[0-3]
4266    pand                 m8, m14, m10
4267    psrld                m8, 6
4268    paddd              xm15, xm8
4269    vpermq               m8, m8, q3120
4270    movd                r4d, xm15
4271    pextrd              r6d, xm15, 2
4272    pextrd             r11d, xm15, 1
4273    pextrd             r13d, xm15, 3
4274    movd               xm15, [base+subpel_filters+r4*8+2]
4275    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
4276    movu                xm2, [srcq+ssq*0]
4277    movu                xm3, [srcq+ssq*2]
4278    vbroadcasti128       m5, [base+bdct_lb_dw]
4279    vpbroadcastq         m6, [base+subpel_s_shuf2]
4280    pcmpeqd              m8, m9
4281    psrld               m14, 10
4282    pinsrd             xm15, [base+subpel_filters+r11*8+2], 1
4283    vpblendd             m7, [base+subpel_filters+r13*8+2-20], 0x20
4284    vinserti128          m2, [srcq+ssq*1], 1
4285    vinserti128          m3, [srcq+ss3q ], 1
4286    lea                srcq, [srcq+ssq*4]
4287    shr                 myd, 6
4288    mov                 r4d, 64 << 24
4289    lea                 myd, [t1+myq]
4290    cmovnz              r4q, [base+subpel_filters+myq*8]
4291    pshufb              m14, m5
4292    paddb               m14, m6
4293    movu                xm4, [srcq+ssq*0]
4294    movu                xm5, [srcq+ssq*2]
4295    vinserti128          m4, [srcq+ssq*1], 1
4296    add                srcq, ss3q
4297    vpblendd            m15, m7, 0x30
4298    punpcklqdq          m15, m15
4299    pblendvb            m15, m11, m8
4300    movq               xm10, r4q
4301    punpcklqdq         xm10, xm10
4302    pmovsxbw            m10, xm10
4303    pshufb               m2, m14
4304    pshufb               m3, m14
4305    pshufb               m4, m14
4306    pshufb              xm5, xm14
4307    vpermq               m2, m2, q3120
4308    vpermq               m3, m3, q3120
4309    vpermq               m4, m4, q3120
4310    vpermq               m5, m5, q3120
4311    pshufd               m7, m10, q0000
4312    pshufd               m8, m10, q1111
4313    pshufd               m9, m10, q2222
4314    pshufd              m10, m10, q3333
4315    pmaddubsw            m2, m15
4316    pmaddubsw            m3, m15
4317    pmaddubsw            m4, m15
4318    pmaddubsw            m5, m15
4319    phaddw               m2, m3
4320    phaddw               m4, m5
4321    pmulhrsw             m2, m12
4322    pmulhrsw             m4, m12
4323    palignr              m5, m4, m2, 4
4324    pshufd               m3, m4, q2121
4325    punpcklwd            m0, m2, m5     ; 01 12
4326    punpckhwd            m1, m2, m5     ; 23 34
4327    punpcklwd            m2, m4, m3     ; 45 56
4328.dy1_w4_loop:
4329    movu               xm11, [srcq+ssq*0]
4330    vinserti128         m11, [srcq+ssq*1], 1
4331    lea                srcq, [srcq+ssq*2]
4332    pmaddwd              m4, m0, m7
4333    pmaddwd              m5, m1, m8
4334    pmaddwd              m6, m2, m9
4335    mova                 m0, m1
4336    mova                 m1, m2
4337    paddd                m4, m13
4338    paddd                m5, m6
4339    pshufb              m11, m14
4340    vpermq              m11, m11, q3120
4341    pmaddubsw           m11, m15
4342    phaddw              m11, m11
4343    pmulhrsw            m11, m12
4344    palignr              m6, m11, m3, 12
4345    punpcklwd            m2, m6, m11    ; 67 78
4346    mova                 m3, m11
4347    pmaddwd              m6, m2, m10
4348    paddd                m4, m5
4349    paddd                m4, m6
4350    psrad                m4, rndshift
4351    vextracti128        xm5, m4, 1
4352    packssdw            xm4, xm5
4353%ifidn %1, put
4354    packuswb            xm4, xm4
4355    pshuflw             xm4, xm4, q3120
4356    movd       [dstq+dsq*0], xm4
4357    pextrd     [dstq+dsq*1], xm4, 1
4358    lea                dstq, [dstq+dsq*2]
4359%else
4360    pshufd              xm4, xm4, q3120
4361    mova             [tmpq], xm4
4362    add                tmpq, 16
4363%endif
4364    sub                  hd, 2
4365    jg .dy1_w4_loop
4366    MC_8TAP_SCALED_RET
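; .dy1_w8 and up reuse the column-strip machinery of .w_start; the
; constant vertical filter is computed once in .dy1_w_start, parked at
; [rsp+96] and broadcast per strip in .dy1_hloop ([rsp+0x60..0x6c]).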
4367.dy1_w8:
4368    mov      dword [rsp+72], 1
4369    movifprep   tmp_stridem, 16
4370    jmp .dy1_w_start
4371.dy1_w16:
4372    mov      dword [rsp+72], 2
4373    movifprep   tmp_stridem, 32
4374    jmp .dy1_w_start
4375.dy1_w32:
4376    mov      dword [rsp+72], 4
4377    movifprep   tmp_stridem, 64
4378    jmp .dy1_w_start
4379.dy1_w64:
4380    mov      dword [rsp+72], 8
4381    movifprep   tmp_stridem, 128
4382    jmp .dy1_w_start
4383.dy1_w128:
4384    mov      dword [rsp+72], 16
4385    movifprep   tmp_stridem, 256
4386.dy1_w_start:
4387    mov                 myd, mym
4388%ifidn %1, put
4389    movifnidn           dsm, dsq
4390%endif
4391    shr                 t0d, 16
4392    sub                srcq, 3
4393    shr                 myd, 6
4394    mov                 r4d, 64 << 24
4395    lea                 myd, [t1+myq]
4396    cmovnz              r4q, [base+subpel_filters+myq*8]
4397    pmaddwd              m8, [base+rescale_mul]
4398    movd               xm15, t0d
4399    mov            [rsp+76], t0d
4400    mov            [rsp+80], srcq
4401    mov            [rsp+88], r0q ; dstq / tmpq
4402%if UNIX64
4403    mov                  hm, hd
4404%endif
4405    shl           dword dxm, 3 ; dx*8
4406    vpbroadcastd        m15, xm15
4407    paddd               m14, m8 ; mx+dx*[0-7]
4408    movq                xm0, r4q
4409    pmovsxbw            xm0, xm0
4410    mova           [rsp+96], xm0
4411    jmp .dy1_hloop
.dy1_hloop_prep:
    dec      dword [rsp+72]
    jz .ret
    add      qword [rsp+88], 8*(isprep+1)
    mov                  hd, hm
    vpbroadcastd         m8, dxm
    vpbroadcastd        m10, [base+pd_0x3ff]
    paddd               m14, m8, [rsp+32]
    vpbroadcastd        m15, [rsp+76]
    pxor                 m9, m9
    mov                srcq, [rsp+80]
    mov                 r0q, [rsp+88] ; dstq / tmpq
.dy1_hloop:
    vpbroadcastq        m11, [base+pq_0x40000000]
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd               m15, m6
    pcmpeqd              m6, m9
    vextracti128        xm7, m15, 1
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu           [rsp+32], m14
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    movq           [rsp+64], xm14
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    movu              [rsp], m10
    vpbroadcastd         m8, [rsp+0x60]
    vpbroadcastd         m9, [rsp+0x64]
    vpbroadcastd        m10, [rsp+0x68]
    vpbroadcastd        m11, [rsp+0x6c]
    pshufb               m0, m14    ; 01a 01b
    pshufb               m1, m14    ; 23a 23b
    pshufb               m2, m14    ; 45a 45b
    pshufb               m3, m14    ; 67a 67b
    vbroadcasti128      m14, [base+wswap]
.dy1_vloop:
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pmaddwd              m6, m2, m10
    pmaddwd              m7, m3, m11
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy1_hloop_prep
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    add                srcq, ssq
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, [rsp]
    phaddw               m4, m5
    pslld                m5, m4, 16
    paddw                m4, m5
    pmulhrsw             m4, m12
    pblendw              m0, m1, 0xaa
    pblendw              m1, m2, 0xaa
    pblendw              m2, m3, 0xaa
    pblendw              m3, m4, 0xaa
    jmp .dy1_vloop
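    ; .dy2: fixed vertical step of two source rows per output row
    ; (2x vertical downscale), so each output row consumes two freshly
    ; filtered input rows instead of one.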
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
.dy2_w2:
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m14, m8 ; mx+dx*[0-1]
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    movq                xm0, [srcq+ssq*0]
    vpbroadcastq         m2, [srcq+ssq*1]
    movhps              xm0, [srcq+ssq*2]
    vpbroadcastq         m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb              m14, m5
    paddb               m14, m6
    vpblendd            m15, m7, 0xaa
    pblendvb            m15, m11, m8
    movhps              xm1, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    vpblendd             m0, m2, 0x30
    vpblendd             m1, m4, 0xc0
    vpblendd             m0, m3, 0xc0
    pshufb               m0, m14
    pshufb               m1, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    movq               xm11, r4q
    pmovsxbw           xm11, xm11
    phaddw               m0, m1
    pmulhrsw             m0, m12            ; 0 2 _ 4  1 3 _ 5
    pshufd              xm8, xm11, q0000
    pshufd              xm9, xm11, q1111
    pshufd             xm10, xm11, q2222
    pshufd             xm11, xm11, q3333
    pshufd               m2, m0, q3110      ; 0 2 2 4  1 3 3 5
    vextracti128        xm1, m2, 1
    punpcklwd           xm3, xm2, xm1       ; 01 23
    punpckhwd           xm2, xm1            ; 23 45
.dy2_w2_loop:
    movq                xm6, [srcq+ssq*0]
    vpbroadcastq         m7, [srcq+ssq*1]
    movhps              xm6, [srcq+ssq*2]
    vpbroadcastq         m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd             xm4, xm3, xm8
    pmaddwd             xm5, xm2, xm9
    vpblendd             m6, m7, 0x30
    vpblendd             m6, m1, 0xc0
    pshufb               m6, m14
    pmaddubsw            m6, m15
    phaddw               m6, m6
    pmulhrsw             m6, m12
    palignr              m0, m6, m0, 8
    pshufd               m2, m0, q3221
    vextracti128        xm1, m2, 1
    punpcklwd           xm3, xm2, xm1       ; 45 67
    punpckhwd           xm2, xm1            ; 67 89
    pmaddwd             xm6, xm3, xm10
    pmaddwd             xm7, xm2, xm11
    paddd               xm4, xm5
    paddd               xm4, xm13
    paddd               xm6, xm7
    paddd               xm4, xm6
    psrad               xm4, rndshift
    packssdw            xm4, xm4
    packuswb            xm4, xm4
    pextrw     [dstq+dsq*0], xm4, 0
    pextrw     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
.dy2_w4:
    mov                 myd, mym
    vbroadcasti128       m7, [base+rescale_mul]
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    pmaddwd              m8, m7
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    pextrd             r11d, xm15, 2
    pextrd             r13d, xm15, 3
    movd               xm15, [base+subpel_filters+r4*8+2]
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vpbroadcastq         m6, [base+subpel_s_shuf2]
    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
    pcmpeqd              m8, m9
    psrld               m14, 10
    movu                xm0, [srcq+ssq*0]
    movu                xm2, [srcq+ssq*2]
    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
    movu                xm1, [srcq+ssq*1]
    movu                xm3, [srcq+ss3q ]
    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
    lea                srcq, [srcq+ssq*4]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    vinserti128         m15, xm15, 1
    pshufb              m14, m5
    paddb               m14, m6
    vinserti128          m2, [srcq+ssq*0], 1
    vinserti128          m3, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pblendvb            m15, m11, m8
    pshufb              xm0, xm14
    pshufb               m2, m14
    pshufb              xm1, xm14
    pshufb               m3, m14
    pmaddubsw           xm0, xm15
    pmaddubsw            m2, m15
    pmaddubsw           xm1, xm15
    pmaddubsw            m3, m15
    movq               xm11, r4q
    punpcklqdq         xm11, xm11
    pmovsxbw            m11, xm11
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12    ; 0 2  _ 4
    pmulhrsw             m1, m12    ; 1 3  _ 5
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
    punpcklwd           xm2, xm0, xm1
    punpckhwd            m1, m0, m1     ; 23 45
    vinserti128          m0, m2, xm1, 1 ; 01 23
.dy2_w4_loop:
    movu                xm6, [srcq+ssq*0]
    movu                xm7, [srcq+ssq*1]
    vinserti128          m6, [srcq+ssq*2], 1
    vinserti128          m7, [srcq+ss3q ], 1
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pshufb               m6, m14
    pshufb               m7, m14
    pmaddubsw            m6, m15
    pmaddubsw            m7, m15
    psrld                m2, m6, 16
    pslld                m3, m7, 16
    paddw                m6, m2
    paddw                m7, m3
    pblendw              m6, m7, 0xaa   ; 67 89
    pmulhrsw             m6, m12
    paddd                m4, m5
    vperm2i128           m0, m1, m6, 0x21 ; 45 67
    mova                 m1, m6
    pmaddwd              m6, m0, m10
    pmaddwd              m7, m1, m11
    paddd                m4, m13
    paddd                m6, m7
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movd       [dstq+dsq*0], xm4
    pextrd     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], xm4
    add                tmpq, 16
%endif
    sub                  hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET
.dy2_w8:
    mov      dword [rsp+40], 1
    movifprep   tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov      dword [rsp+40], 2
    movifprep   tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov      dword [rsp+40], 4
    movifprep   tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov      dword [rsp+40], 8
    movifprep   tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov      dword [rsp+40], 16
    movifprep   tmp_stridem, 256
.dy2_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pmaddwd              m8, [base+rescale_mul]
    movd               xm15, t0d
    mov            [rsp+64], t0d
    mov            [rsp+48], srcq
    mov            [rsp+56], r0q ; dstq / tmpq
%if UNIX64
    mov                  hm, hd
%endif
    shl           dword dxm, 3 ; dx*8
    vpbroadcastd        m15, xm15
    paddd               m14, m8 ; mx+dx*[0-7]
    movq                xm0, r4q
    pmovsxbw            xm0, xm0
    mova         [rsp+0x50], xm0
    jmp .dy2_hloop
.dy2_hloop_prep:
    dec      dword [rsp+40]
    jz .ret
    add      qword [rsp+56], 8*(isprep+1)
    mov                  hd, hm
    vpbroadcastd         m8, dxm
    vpbroadcastd        m10, [base+pd_0x3ff]
    paddd               m14, m8, [rsp]
    vpbroadcastd        m15, [rsp+64]
    pxor                 m9, m9
    mov                srcq, [rsp+48]
    mov                 r0q, [rsp+56] ; dstq / tmpq
.dy2_hloop:
    vpbroadcastq        m11, [base+pq_0x40000000]
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd               m15, m6
    pcmpeqd              m6, m9
    vextracti128        xm7, m15, 1
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu              [rsp], m14
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    vpbroadcastd         m8, [rsp+0x50]
    vpbroadcastd         m9, [rsp+0x54]
    vpbroadcastd        m11, [rsp+0x58]
    vpbroadcastd         m4, [rsp+0x5c]
    pshufb               m0, m14    ; 01a 01b
    pshufb               m1, m14    ; 23a 23b
    pshufb               m2, m14    ; 45a 45b
    pshufb               m3, m14    ; 67a 67b
    SWAP                m14, m4
.dy2_vloop:
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pmaddwd              m6, m2, m11
    pmaddwd              m7, m3, m14
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy2_hloop_prep
    mova                 m0, m1
    mova                 m1, m2
    mova                 m2, m3
    movq                xm3, [srcq+ r4]
    movq                xm4, [srcq+ r6]
    movhps              xm3, [srcq+ r7]
    movhps              xm4, [srcq+ r9]
    vinserti128          m3, [srcq+r10], 1
    vinserti128          m4, [srcq+r11], 1
    vpbroadcastq         m5, [srcq+r13]
    vpbroadcastq         m6, [srcq+ rX]
    add                srcq, ssq
    vpblendd             m3, m5, 0xc0
    vpblendd             m4, m6, 0xc0
    pmaddubsw            m3, m15
    pmaddubsw            m4, m10
    phaddw               m3, m4
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    add                srcq, ssq
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, m10
    phaddw               m4, m5
    psrld                m5, m3, 16
    pslld                m6, m4, 16
    paddw                m3, m5
    paddw                m4, m6
    pblendw              m3, m4, 0xaa
    pmulhrsw             m3, m12
    jmp .dy2_vloop
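    ; Each .dy2_vloop iteration above shifts the 01/23/45/67 row pairs
    ; down by one pair and horizontally filters two new source rows to
    ; refill the tail.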
.ret:
    MC_8TAP_SCALED_RET 0
%undef isprep
%endmacro

%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    mov                 t0d, (5*15 << 16) | 5*15
    mov                 t1d, t0d
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro
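; Scaled bilinear tail-calls the corresponding 8-tap scaled function
; with t0d/t1d preloaded; the constant presumably selects a filter
; that reduces to the bilinear kernel within the shared path.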

%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,

BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

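; WARP_V gathers the eight per-column vertical 8-tap filters (a-h;
; my advances by delta per column and by gamma per row), interleaves
; their taps, and accumulates the dot products with the interleaved
; source rows held in the 02/46/13/57 register arguments.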
%macro WARP_V 5 ; dst, 02, 46, 13, 57
    ; Can be done using gathers, but that's terribly slow on many CPUs
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                xm8, [filterq+myq  *8]
    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+deltaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                xm9, [filterq+myq  *8]
    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+gammaq]       ; my += gamma
    shr               tmp2d, 10
    shr               tmp1d, 10
    punpcklwd            m8, m0
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd            m0, m9, m0
    punpckldq            m9, m8, m0
    punpckhdq            m0, m8, m0
    punpcklbw            m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw            m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    pmaddwd             m%2, m8
    pmaddwd              m9, m%3
    punpcklbw            m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw            m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd              m8, m%4
    pmaddwd              m0, m%5
    paddd               m%2, m9
    paddd                m0, m8
    paddd               m%1, m0, m%2
%endmacro

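; warp_affine_8x8t stores rounded 16-bit intermediates (tmp/prep
; layout, two rows per iteration) and shares .main/.main2 with
; warp_affine_8x8 below; only the final rounding and store differ.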
cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
%if WIN64
    sub                 rsp, 0xa0
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
.loop:
    psrad                m7, 13
    psrad                m0, 13
    packssdw             m7, m0
    pmulhrsw             m7, m14 ; (x + (1 << 6)) >> 7
    vpermq               m7, m7, q3120
    mova         [tmpq+tsq*0], xm7
    vextracti128 [tmpq+tsq*2], m7, 1
    dec                 r4d
    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
    lea                tmpq, [tmpq+tsq*4]
    jmp .loop

cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
                                        beta, filter, tmp1, delta, my, gamma
%if WIN64
    %assign xmm_regs_used 16
    %assign stack_size_padded 0xa0
    SUB                 rsp, stack_size_padded
%endif
    call .main
    jmp .start
.loop:
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
    psrad                m7, 18
    psrad                m0, 18
    packusdw             m7, m0
    pavgw                m7, m11 ; (x + (1 << 10)) >> 11
    vextracti128        xm0, m7, 1
    packuswb            xm7, xm0
    pshufd              xm7, xm7, q3120
    movq       [dstq+dsq*0], xm7
    movhps     [dstq+dsq*1], xm7
    dec                 r4d
    jg .loop
.end:
    RET
ALIGN function_align
.main:
    ; Stack is offset due to call
    %assign stack_offset stack_offset + gprsize
    %assign stack_size stack_size + gprsize
    %assign stack_size_padded stack_size_padded + gprsize
    movifnidn         abcdq, abcdmp
    movifnidn           mxd, mxm
    WIN64_PUSH_XMM
    movsx            alphad, word [abcdq+2*0]
    movsx             betad, word [abcdq+2*1]
    mova                m12, [warp_8x8_shufA]
    mova                m13, [warp_8x8_shufB]
    vpbroadcastd        m14, [pw_8192]
    vpbroadcastd        m15, [pd_32768]
    pxor                m11, m11
    lea             filterq, [mc_warp_filter2]
    lea               tmp1q, [ssq*3+3]
    add                 mxd, 512+(64<<10)
    lea               tmp2d, [alphaq*3]
    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
    sub               betad, tmp2d    ; beta -= alpha*3
    mov                 myd, r6m
    call .h
    psrld                m1, m0, 16
    call .h
    psrld                m4, m0, 16
    call .h
    pblendw              m1, m0, 0xaa ; 02
    call .h
    pblendw              m4, m0, 0xaa ; 13
    call .h
    psrld                m2, m1, 16
    pblendw              m2, m0, 0xaa ; 24
    call .h
    psrld                m5, m4, 16
    pblendw              m5, m0, 0xaa ; 35
    call .h
    psrld                m3, m2, 16
    pblendw              m3, m0, 0xaa ; 46
    movsx            deltad, word [abcdq+2*2]
    movsx            gammad, word [abcdq+2*3]
    add                 myd, 512+(64<<10)
    mov                 r4d, 4
    lea               tmp1d, [deltaq*3]
    sub              gammad, tmp1d    ; gamma -= delta*3
.main2:
    call .h
    psrld                m6, m5, 16
    pblendw              m6, m0, 0xaa ; 57
    WARP_V                7, 1, 3, 4, 6
    call .h
    mova                 m1, m2
    mova                 m2, m3
    psrld                m3, 16
    pblendw              m3, m0, 0xaa ; 68
    WARP_V                0, 4, 6, 1, 3
    mova                 m4, m5
    mova                 m5, m6
    ret
ALIGN function_align
.h:
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    vbroadcasti128      m10, [srcq]
    shr                 mxd, 10
    shr               tmp1d, 10
    movq                xm8, [filterq+mxq  *8]
    vinserti128          m8, [filterq+tmp1q*8], 1
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+alphaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    shr                 mxd, 10
    shr               tmp1d, 10
    movq                xm9, [filterq+mxq  *8]
    vinserti128          m9, [filterq+tmp1q*8], 1
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+betaq] ; mx += beta
    shr               tmp2d, 10
    shr               tmp1d, 10
    punpcklqdq           m8, m0  ; 0 1   4 5
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1
    punpcklqdq           m9, m0  ; 2 3   6 7
    pshufb               m0, m10, m12
    pmaddubsw            m0, m8
    pshufb              m10, m13
    pmaddubsw           m10, m9
    add                srcq, ssq
    phaddw               m0, m10
    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
    paddd                m0, m15 ; rounded 14-bit result in upper 16 bits of dword
    ret
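; .h above: pmaddwd with pw_8192 sums adjacent word pairs while
; upshifting by 13, and adding pd_32768 rounds bit 15, so the callers
; can take the rounded 14-bit result straight from the high word of
; each dword (via the psrld/pblendw interleaving).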

%macro BIDIR_FN 1 ; op
    %1                    0
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    cmp                  hd, 8
    je .ret
    %1                    2
    lea                dstq, [dstq+strideq*4]
    vextracti128        xm1, m0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.ret:
    RET
.w8_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq*4]
.w8:
    vextracti128        xm1, m0, 1
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR            4
    %1                    0
    lea                dstq, [dstq+strideq*4]
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq          ], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    %1                    2
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR            4
    %1                    0
    lea                dstq, [dstq+strideq*2]
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    %1                    2
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR            4
    %1                    0
    add                dstq, strideq
.w64:
    vpermq               m0, m0, q3120
    mova             [dstq], m0
    %1                    2
    vpermq               m0, m0, q3120
    mova          [dstq+32], m0
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    %1                    0
    add                dstq, strideq
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+0*32], m0
    %1                    2
    vpermq               m0, m0, q3120
    mova        [dstq+1*32], m0
    %1_INC_PTR            8
    %1                   -4
    vpermq               m0, m0, q3120
    mova        [dstq+2*32], m0
    %1                   -2
    vpermq               m0, m0, q3120
    mova        [dstq+3*32], m0
    dec                  hd
    jg .w128_loop
    RET
%endmacro

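; (a + b + 16) >> 5 via pmulhrsw with pw_1024:
; ((a + b) * 1024 + (1 << 14)) >> 15 == (a + b + 16) >> 5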
%macro AVG 1 ; src_offset
    mova                 m0, [tmp1q+(%1+0)*32]
    paddw                m0, [tmp2q+(%1+0)*32]
    mova                 m1, [tmp1q+(%1+1)*32]
    paddw                m1, [tmp2q+(%1+1)*32]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    packuswb             m0, m1
%endmacro

%macro AVG_INC_PTR 1
    add               tmp1q, %1*32
    add               tmp2q, %1*32
%endmacro

cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg %+ SUFFIX %+ _table
    lea                  r6, [avg %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m2, [base+pw_1024]
    add                  wq, r6
    BIDIR_FN            AVG

%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
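    ; worked example (weight=10, >7 path): a=500, b=260
    ;   m4 = (10-16) << 12 = -24576
    ;   pmulhw: ((500-260) * -24576) >> 16 = -90
    ;   pmulhrsw(500-90, 2048) = (410*2048 + (1 << 14)) >> 15 = 26
    ;   == (500*10 + 260*6 + 128) >> 8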
    mova                 m0,     [tmp1q+(%1+0)*32]
    psubw                m2, m0, [tmp2q+(%1+0)*32]
    mova                 m1,     [tmp1q+(%1+1)*32]
    psubw                m3, m1, [tmp2q+(%1+1)*32]
    pmulhw               m2, m4
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg %+ SUFFIX %+ _table
    lea                  r6, [w_avg %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m4, r6m ; weight
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    mov                  r6, tmp1q
    pxor                 m0, m0
    mov               tmp1q, tmp2q
    psubw                m4, m0, m4 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN          W_AVG

%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
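    ; the -m << 10 factor is built bytewise: m3 = -2*m, and
    ; interleaving a zero low byte (punpck?bw with m4 = 0) yields
    ; -m << 9 words; doubling (b - a) supplies the remaining bit.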
    vpermq               m3,     [maskq+%1*16], q3120
    mova                 m0,     [tmp2q+(%1+0)*32]
    psubw                m1, m0, [tmp1q+(%1+0)*32]
    psubb                m3, m4, m3
    paddw                m1, m1     ; (b - a) << 1
    paddb                m3, m3
    punpcklbw            m2, m4, m3 ; -m << 9
    pmulhw               m1, m2
    paddw                m0, m1
    mova                 m1,     [tmp2q+(%1+1)*32]
    psubw                m2, m1, [tmp1q+(%1+1)*32]
    paddw                m2, m2
    punpckhbw            m3, m4, m3
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%macro MASK_INC_PTR 1
    add               maskq, %1*16
    add               tmp2q, %1*32
    add               tmp1q, %1*32
%endmacro

cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask %+ SUFFIX %+ _table
    lea                  r7, [mask %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    pxor                 m4, m4
    add                  wq, r7
    BIDIR_FN           MASK

%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
    mova                m%1, [tmp1q+32*%3]
    mova                 m1, [tmp2q+32*%3]
    psubw                m1, m%1
    pabsw               m%2, m1
    psubusw             m%2, m6, m%2
    psrlw               m%2, 8 ; 64 - m
    psllw                m2, m%2, 10
    pmulhw               m1, m2
    paddw               m%1, m1
    mova                 m1, [tmp1q+32*%4]
    mova                 m2, [tmp2q+32*%4]
    psubw                m2, m1
    pabsw                m3, m2
    psubusw              m3, m6, m3
    psrlw                m3, 8
%if %5
    packuswb            m%2, m3
    psubb               m%2, m5, m%2
    vpermq              m%2, m%2, q3120
%else
    phaddw              m%2, m3
%endif
    psllw                m3, 10
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw            m%1, m7
    pmulhrsw             m1, m7
    packuswb            m%1, m1
%endmacro

cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
    lea                  r6, [blend_avx2_table]
    tzcnt                wd, wm
    movifnidn         maskq, maskmp
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m4, [base+pb_64]
    vpbroadcastd         m5, [base+pw_512]
    sub                tmpq, maskq
    add                  wq, r6
    lea                  r6, [dsq*3]
    jmp                  wq
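    ; dst = (dst*(64-m) + tmp*m + 32) >> 6: the (64-m, m) and
    ; (dst, tmp) byte pairs are interleaved so one pmaddubsw blends
    ; each pixel, and pmulhrsw with pw_512 performs the +32 and >> 6;
    ; tmpq is biased by -maskq above so [maskq+tmpq] walks tmp with a
    ; single incrementing pointer.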
.w4:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    vpbroadcastd        xm1, [dstq+dsq*2]
    pinsrd              xm1, [dstq+r6   ], 3
    mova                xm6, [maskq]
    psubb               xm3, xm4, xm6
    punpcklbw           xm2, xm3, xm6
    punpckhbw           xm3, xm6
    mova                xm6, [maskq+tmpq]
    add               maskq, 4*4
    punpcklbw           xm0, xm6
    punpckhbw           xm1, xm6
    pmaddubsw           xm0, xm2
    pmaddubsw           xm1, xm3
    pmulhrsw            xm0, xm5
    pmulhrsw            xm1, xm5
    packuswb            xm0, xm1
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    pextrd     [dstq+dsq*2], xm0, 2
    pextrd     [dstq+r6   ], xm0, 3
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    movq                xm1, [dstq+dsq*0]
    movhps              xm1, [dstq+dsq*1]
    vpbroadcastq         m2, [dstq+dsq*2]
    vpbroadcastq         m3, [dstq+r6   ]
    mova                 m0, [maskq]
    mova                 m6, [maskq+tmpq]
    add               maskq, 8*4
    vpblendd             m1, m2, 0x30
    vpblendd             m1, m3, 0xc0
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    movq       [dstq+dsq*2], xm1
    movhps     [dstq+r6   ], xm1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    mova                 m0, [maskq]
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    mova                 m6, [maskq+tmpq]
    add               maskq, 16*2
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16
    RET
ALIGN function_align
.w32:
    mova                 m0, [maskq]
    mova                 m1, [dstq]
    mova                 m6, [maskq+tmpq]
    add               maskq, 32
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .w32
    RET

cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_avx2_table
    lea                  r5, [blend_v_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    add               maskq, obmc_masks-blend_v_avx2_table
    jmp                  wq
.w2:
    vpbroadcastd        xm2, [maskq+2*2]
.w2_s0_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrw              xm0, [dstq+dsq*1], 1
    movd                xm1, [tmpq]
    add                tmpq, 2*2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_s0_loop
    RET
ALIGN function_align
.w4:
    vpbroadcastq        xm2, [maskq+4*2]
.w4_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    movq                xm1, [tmpq]
    add                tmpq, 4*2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova                xm3, [maskq+8*2]
.w8_loop:
    movq                xm0, [dstq+dsq*0]
    vpbroadcastq        xm1, [dstq+dsq*1]
    mova                xm2, [tmpq]
    add                tmpq, 8*2
    punpcklbw           xm0, xm2
    punpckhbw           xm1, xm2
    pmaddubsw           xm0, xm3
    pmaddubsw           xm1, xm3
    pmulhrsw            xm0, xm5
    pmulhrsw            xm1, xm5
    packuswb            xm0, xm1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128       m3, [maskq+16*2]
    vbroadcasti128       m4, [maskq+16*3]
.w16_loop:
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    mova                 m2, [tmpq]
    add                tmpq, 16*2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    mova                xm3, [maskq+16*4]
    vinserti128          m3, [maskq+16*6], 1
    mova                xm4, [maskq+16*5]
    vinserti128          m4, [maskq+16*7], 1
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .w32_loop
    RET

cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea                  r5, [blend_h_avx2_table]
    mov                 r6d, wd
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    lea               maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]
    neg                  hq
    jmp                  wq
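    ; Only the top h*3/4 rows are blended: hq counts up from -(h*3/4)
    ; to zero while [maskq+hq*2] walks the per-row obmc weights, so no
    ; separate row counter is needed.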
.w2:
    movd                xm0, [dstq+dsq*0]
    pinsrw              xm0, [dstq+dsq*1], 1
    movd                xm2, [maskq+hq*2]
    movd                xm1, [tmpq]
    add                tmpq, 2*2
    punpcklwd           xm2, xm2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
ALIGN function_align
.w4:
    mova                xm3, [blend_shuf]
.w4_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    movd                xm2, [maskq+hq*2]
    movq                xm1, [tmpq]
    add                tmpq, 4*2
    pshufb              xm2, xm3
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x03
.w8_loop:
    vpbroadcastq         m1, [dstq+dsq*0]
    movq                xm0, [dstq+dsq*1]
    vpblendd             m0, m1, 0x30
    vpbroadcastd         m3, [maskq+hq*2]
    movq                xm1, [tmpq+8*1]
    vinserti128          m1, [tmpq+8*0], 1
    add                tmpq, 8*2
    pshufb               m3, m4
    punpcklbw            m0, m1
    pmaddubsw            m0, m3
    pmulhrsw             m0, m5
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movhps     [dstq+dsq*0], xm0
    movq       [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x0c
.w16_loop:
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    vpbroadcastd         m3, [maskq+hq*2]
    mova                 m2, [tmpq]
    add                tmpq, 16*2
    pshufb               m3, m4
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w16_loop
    RET
ALIGN function_align
.w32: ; w32/w64/w128
    sub                 dsq, r6
.w32_loop0:
    vpbroadcastw         m3, [maskq+hq*2]
    mov                  wd, r6d
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 32
    sub                  wd, 32
    jg .w32_loop
    add                dstq, dsq
    inc                  hq
    jl .w32_loop0
    RET

cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                             bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor                r12d, r12d
    lea                 r10, [ihq-1]
    cmp                  yq, ihq
    cmovs               r10, yq
    test                 yq, yq
    cmovs               r10, r12
    imul                r10, sstrideq
    add                srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea                 r10, [iwq-1]
    cmp                  xq, iwq
    cmovs               r10, xq
    test                 xq, xq
    cmovs               r10, r12
    add                srcq, r10

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea          bottomextq, [yq+bhq]
    sub          bottomextq, ihq
    lea                  r3, [bhq-1]
    cmovs        bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, r12
    cmp          bottomextq, bhq
    cmovns       bottomextq, r3
    cmp             topextq, bhq
    cmovg           topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea           rightextq, [xq+bwq]
    sub           rightextq, iwq
    lea                  r2, [bwq-1]
    cmovs         rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg            leftextq
    cmovs          leftextq, r12
    cmp           rightextq, bwq
    cmovns        rightextq, r2
    cmp            leftextq, bwq
    cmovns         leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea                  r3, [bottomextq+topextq]
    sub            centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov                  r2, topextq
    imul                 r2, dstrideq
    add                dstq, r2
    mov                 r9m, dstq

    ; center_w = bw - left_ext - right_ext
    mov            centerwq, bwq
    lea                  r3, [rightextq+leftextq]
    sub            centerwq, r3

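; The row loop is instantiated in four variants below so that the
; left/right-extension branches are resolved once per block rather
; than once per row.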
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    xor                  r3, r3
    vpbroadcastb         m0, [srcq]
.left_loop_%3:
    mova          [dstq+r3], m0
    add                  r3, 32
    cmp                  r3, leftextq
    jl .left_loop_%3

    ; body
    lea                 r12, [dstq+leftextq]
%endif
    xor                  r3, r3
.body_loop_%3:
    movu                 m0, [srcq+r3]
%if %1
    movu           [r12+r3], m0
%else
    movu          [dstq+r3], m0
%endif
    add                  r3, 32
    cmp                  r3, centerwq
    jl .body_loop_%3

%if %2
    ; right extension
%if %1
    add                 r12, centerwq
%else
    lea                 r12, [dstq+centerwq]
%endif
    xor                  r3, r3
    vpbroadcastb         m0, [srcq+centerwq-1]
.right_loop_%3:
    movu           [r12+r3], m0
    add                  r3, 32
    cmp                  r3, rightextq
    jl .right_loop_%3

%endif
    add                dstq, dstrideq
    add                srcq, sstrideq
    dec            centerhq
    jg .v_loop_%3
%endmacro

    test           leftextq, leftextq
    jnz .need_left_ext
    test          rightextq, rightextq
    jnz .need_right_ext
    v_loop                0, 0, 0
    jmp .body_done

.need_left_ext:
    test          rightextq, rightextq
    jnz .need_left_right_ext
    v_loop                1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop                1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop                0, 1, 3

.body_done:
    ; bottom edge extension
    test         bottomextq, bottomextq
    jz .top
    mov                srcq, dstq
    sub                srcq, dstrideq
    xor                  r1, r1
.bottom_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, bottomextq
.bottom_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .bottom_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    test            topextq, topextq
    jz .end
    mov                srcq, r9m
    mov                dstq, dstm
    xor                  r1, r1
.top_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, topextq
.top_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .top_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl .top_x_loop

.end:
    RET

cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
    sub          dword mx0m, 4<<14
    sub        dword src_wm, 8
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm

    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
%define base r7-$$

    vpbroadcastd        xm3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti128      m15, [base+pb_8x0_8x8]
    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld                m5, 3                      ; dx*8
    pslld                m6, 14
    paddd                m8, m2                     ; mx+[0..7]*dx
    pxor                 m2, m2

    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
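    ; mx/dx use 14-bit fixed point: psrad by 14 yields the integer
    ; source column, and bits 8-13 ((x >> 8) & 63) select one of the
    ; 64 phases in resize_filter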
5943
5944.loop_y:
5945    xor                  xd, xd
5946    mova                 m4, m8                     ; per-line working version of mx
5947
5948.loop_x:
5949    pmaxsd               m0, m4, m2
5950    psrad                m9, m4, 8                  ; filter offset (unmasked)
5951    pminsd               m0, m6                     ; iclip(mx, 0, src_w-8)
5952    psubd                m1, m4, m0                 ; pshufb offset
5953    psrad                m0, 14                     ; clipped src_x offset
5954    psrad                m1, 14                     ; pshufb edge_emu offset
5955    pand                 m9, m7                     ; filter offset (masked)
5956
5957    ; load source pixels - this ugly code is vpgatherdq emulation since
5958    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vextracti128        xm0, m0, 1
    movq               xm12, [srcq+r8]
    movq               xm13, [srcq+r10]
    movhps             xm12, [srcq+r9]
    movhps             xm13, [srcq+r11]
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vinserti128         m12, [srcq+r8], 1
    vinserti128         m13, [srcq+r10], 1
    vpbroadcastq        m10, [srcq+r9]
    vpbroadcastq        m11, [srcq+r11]
    vpblendd            m12, m10, 11000000b
    vpblendd            m13, m11, 11000000b

    ; if no edge emulation is required, the pshufb fixup below can be
    ; skipped entirely - this also saves 2 quasi-vpgatherdqs
    vptest               m1, m1
    jz .filter
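    ; edge emulation: m1 holds the signed distance each base pointer was
    ; clamped by, and resize_shuf (indexed by that distance, with a +4 bias
    ; that appears to center the index in the table) yields pshufb controls
    ; that replicate the edge pixel into the out-of-range tap positions.
    ; pb_8x0_8x8 then rebases the high 8 bytes of each lane onto the second
    ; gathered qword.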

    movq                 r9, xm1
    pextrq              r11, xm1, 1
    movsxd               r8, r9d
    sar                  r9, 32
    movsxd              r10, r11d
    sar                 r11, 32
    vextracti128        xm1, m1, 1
    movq               xm14, [base+resize_shuf+4+r8]
    movq                xm0, [base+resize_shuf+4+r10]
    movhps             xm14, [base+resize_shuf+4+r9]
    movhps              xm0, [base+resize_shuf+4+r11]
    movq                 r9, xm1
    pextrq              r11, xm1, 1
    movsxd               r8, r9d
    sar                  r9, 32
    movsxd              r10, r11d
    sar                 r11, 32
    vinserti128         m14, [base+resize_shuf+4+r8], 1
    vinserti128          m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq        m10, [base+resize_shuf+4+r9]
    vpbroadcastq        m11, [base+resize_shuf+4+r11]
    vpblendd            m14, m10, 11000000b
    vpblendd             m0, m11, 11000000b

    paddb               m14, m15
    paddb                m0, m15
    pshufb              m12, m14
    pshufb              m13, m0

.filter:
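    ; gather the 8-byte coefficient row for each pixel's filter phase using
    ; the same scalar-load scheme as above (hence the *8 scaling of the
    ; masked offsets in m9), then multiply-accumulate: pmaddubsw forms
    ; per-pair products and the two phadd steps fold each pixel's eight
    ; taps into a single word before the rounded downshift below.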
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vextracti128        xm9, m9, 1
    movq               xm10, [base+resize_filter+r8*8]
    movq               xm11, [base+resize_filter+r10*8]
    movhps             xm10, [base+resize_filter+r9*8]
    movhps             xm11, [base+resize_filter+r11*8]
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vinserti128         m10, [base+resize_filter+r8*8], 1
    vinserti128         m11, [base+resize_filter+r10*8], 1
    vpbroadcastq        m14, [base+resize_filter+r9*8]
    vpbroadcastq         m1, [base+resize_filter+r11*8]
    vpblendd            m10, m14, 11000000b
    vpblendd            m11, m1, 11000000b

    pmaddubsw           m12, m10
    pmaddubsw           m13, m11
    phaddw              m12, m13
    vextracti128       xm13, m12, 1
    phaddsw            xm12, xm13
    pmulhrsw           xm12, xm3                    ; x=(x+64)>>7
    packuswb           xm12, xm12
    movq          [dstq+xq], xm12

    paddd                m4, m5
    add                  xd, 8
    cmp                  xd, dst_wd
    jl .loop_x

    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET

cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea                  r7, [w_mask_420_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd             m9, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_420_sign+r6*4] ; 258 - sign
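    ; for 4:2:0, each 2x2 block of per-pixel 6-bit masks collapses to one
    ; byte: mask = (m00 + m01 + m10 + m11 + 2 - sign) >> 2. the W_MASK sums
    ; apparently hold 128 - (m_left + m_right) per word for a row each, so
    ; subtracting two of them from 258 - sign leaves the 2x2 total plus the
    ; 2 - sign rounding term, and psrlw by 2 finishes the average.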
    add                  wq, r7
    W_MASK                0, 4, 0, 1
    mov               maskq, maskmp
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_h16
.w4_end:
    vextracti128        xm0, m4, 1
    vpblendd            xm1, xm4, xm0, 0x05
    vpblendd            xm4, xm0, 0x0a
    pshufd              xm1, xm1, q2301
    psubw               xm4, xm8, xm4
    psubw               xm4, xm1
    psrlw               xm4, 2
    packuswb            xm4, xm4
    movq            [maskq], xm4
    RET
.w4_h16:
    W_MASK                0, 5, 2, 3
    lea                dstq, [dstq+strideq*4]
    phaddd               m4, m5
    vextracti128        xm1, m0, 1
    psubw                m4, m8, m4
    psrlw                m4, 2
    vpermd               m4, m9, m4
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova            [maskq], xm4
    RET
.w8_loop:
    add               tmp1q, 2*32
    add               tmp2q, 2*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 8
.w8:
    vextracti128        xm2, m4, 1
    vextracti128        xm1, m0, 1
    psubw               xm4, xm8, xm4
    psubw               xm4, xm2
    psrlw               xm4, 2
    packuswb            xm4, xm4
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    movq            [maskq], xm4
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    W_MASK                0, 5, 2, 3
    punpckhqdq           m1, m4, m5
    punpcklqdq           m4, m5
    psubw                m1, m8, m1
    psubw                m1, m4
    psrlw                m1, 2
    vpermq               m0, m0, q3120
    packuswb             m1, m1
    vpermd               m1, m9, m1
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    mova            [maskq], xm1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    W_MASK                0, 5, 2, 3
    psubw                m4, m8, m4
    psubw                m4, m5
    psrlw                m4, 2
    vpermq               m0, m0, q3120
    packuswb             m4, m4
    vpermd               m4, m9, m4
    mova   [dstq+strideq*1], m0
    mova            [maskq], xm4
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop_even:
    psubw               m10, m8, m4
    psubw               m11, m8, m5
    dec                  hd
.w64_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    test                 hd, 1
    jz .w64_loop_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova            [maskq], m4
    add               maskq, 32
    dec                  hd
    jg .w64_loop
    RET
.w128_loop_even:
    psubw               m12, m8, m4
    psubw               m13, m8, m5
    dec                  hd
.w128_loop:
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    add               tmp1q, 8*32
    add               tmp2q, 8*32
    test                 hd, 1
    jz .w128_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova       [maskq+32*0], m4
    jmp .w128_odd
.w128_even:
    psubw               m10, m8, m4
    psubw               m11, m8, m5
.w128_odd:
    W_MASK                0, 4, -4, -3
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    W_MASK                0, 5, -2, -1
    vpermq               m0, m0, q3120
    mova        [dstq+32*3], m0
    test                 hd, 1
    jz .w128_loop_even
    psubw                m4, m12, m4
    psubw                m5, m13, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova       [maskq+32*1], m4
    add               maskq, 64
    dec                  hd
    jg .w128_loop
    RET

cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
    lea                  r7, [w_mask_422_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    pxor                 m9, m9
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd            m10, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
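    ; for 4:2:2, each horizontal pair collapses to one byte:
    ; mask = (m0 + m1 + 1 - sign) >> 1. assuming the W_MASK sums hold
    ; 128 - (m0 + m1) per word, packuswb narrows them to bytes (the values
    ; fit in 0..128), psubb from 128 - sign recovers m0 + m1 - sign, and
    ; pavgb against zero (m9) performs the +1-rounded halving.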
    add                  wq, r7
    mov               maskq, maskmp
    W_MASK                0, 4, 0, 1
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_h16
.w4_end:
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    psubb               xm5, xm8, xm4
    pavgb               xm5, xm9
    pshufd              xm5, xm5, q3120
    mova            [maskq], xm5
    RET
.w4_h16:
    W_MASK                0, 5, 2, 3
    lea                dstq, [dstq+strideq*4]
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermd               m5, m10, m5
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova            [maskq], m5
    RET
.w8_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w8:
    vextracti128        xm5, m4, 1
    vextracti128        xm1, m0, 1
    packuswb            xm4, xm5
    psubb               xm5, xm8, xm4
    pavgb               xm5, xm9
    pshufd              xm5, xm5, q3120
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    mova            [maskq], xm5
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    mova            [maskq], m5
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova   [dstq+strideq*1], m0
    mova            [maskq], m5
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
    add               maskq, 32
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*1], m0
    mova            [maskq], m5
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
    add               maskq, 32*2
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*1], m0
    mova       [maskq+32*0], m5
    W_MASK                0, 4, 4, 5
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    W_MASK                0, 5, 6, 7
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*3], m0
    mova       [maskq+32*1], m5
    dec                  hd
    jg .w128_loop
    RET

cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
    lea                  r7, [w_mask_444_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m5, [base+pb_64]
    vpbroadcastd         m7, [base+pw_2048]
    add                  wq, r7
    W_MASK                0, 4, 0, 1, 1
    lea            stride3q, [strideq*3]
    jmp                  wq
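    ; for 4:4:4 no combining is needed: each W_MASK invocation leaves one
    ; mask byte per pixel in m4 (the pb_64 constant in m5 is presumably
    ; consumed inside the macro to form them), so the width cases below
    ; simply store m4 alongside the blended pixels.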
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    mova       [maskq+32*0], m4
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    je .w4_end
    W_MASK                0, 4, 2, 3, 1
    lea                dstq, [dstq+strideq*4]
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova       [maskq+32*1], m4
.w4_end:
    RET
.w8_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w8:
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    mova            [maskq], m4
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova            [maskq], m4
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova             [dstq], m0
    mova            [maskq], m4
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32*2
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    mova       [maskq+32*0], m4
    W_MASK                0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    mova       [maskq+32*1], m4
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32*4
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    mova       [maskq+32*0], m4
    W_MASK                0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    mova       [maskq+32*1], m4
    W_MASK                0, 4, 4, 5, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    mova       [maskq+32*2], m4
    W_MASK                0, 4, 6, 7, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*3], m0
    mova       [maskq+32*3], m4
    dec                  hd
    jg .w128_loop
    RET

%endif ; ARCH_X86_64