; xref: /aosp_15_r20/external/libdav1d/src/x86/mc16_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Project build configuration followed by the x86inc macro layer.
; NOTE(review): the macros used further down (SECTION_RODATA, const, cextern,
; mangle, cglobal, INIT_XMM, ...) are presumably supplied by x86inc.asm.
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Everything from here on is assembled for x86-64 targets only.
; NOTE(review): the matching %endif lies outside the visible part of the file.
%if ARCH_X86_64

; Read-only data section for the constant tables below.
; NOTE(review): the argument is presumably the requested alignment in bytes.
SECTION_RODATA 64

; dav1d_obmc_masks[] * -512
;
; 64 signed words, stored as eight rows of eight. Every entry is either zero
; or a negative multiple of 512, i.e. a mask weight pre-scaled by -512.
; NOTE(review): the zero entries look like padding between the per-size
; sub-tables; confirm against the C definition of dav1d_obmc_masks[].
const obmc_masks_avx2
            dw      0,      0,  -9728,      0, -12800,  -7168,  -2560,      0
            dw -14336, -11264,  -8192,  -5632,  -3584,  -1536,      0,      0
            dw -15360, -13824, -12288, -10752,  -9216,  -7680,  -6144,  -5120
            dw  -4096,  -3072,  -2048,  -1536,      0,      0,      0,      0
            dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
            dw  -9728,  -8704,  -8192,  -7168,  -6656,  -6144,  -5632,  -4608
            dw  -4096,  -3584,  -3072,  -2560,  -2048,  -2048,  -1536,  -1024
            dw      0,      0,      0,      0,      0,      0,      0,      0

; Index tables. The `db` tables hold byte indices, the `dd` tables dword
; indices.
; NOTE(review): presumably consumed as pshufb / vpermd control vectors by the
; functions in this file; the users are outside the visible excerpt.

; dword indices 0,4,1,5,2,6,3,7: alternates elements of the low and high half.
deint_shuf:     dd 0,  4,  1,  5,  2,  6,  3,  7
; Overlapping word pairs (w0,w1),(w1,w2),(w2,w3),(w3,w4), as byte indices.
subpel_h_shufA: db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
; Same pattern shifted by two words: (w2,w3),(w3,w4),(w4,w5),(w5,w6).
subpel_h_shufB: db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
; Words 0-3 in the low qword, words 1-4 in the high qword.
subpel_h_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
; Low qword replicated into both qwords.
subpel_s_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
; Interleaves the words of the low qword with the words of the high qword.
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
; Ascending dword ramp 0..7.
rescale_mul:    dd 0,  1,  2,  3,  4,  5,  6,  7
; The same ramp with the middle element pairs (2,3) and (4,5) exchanged.
rescale_mul2:   dd 0,  1,  4,  5,  2,  3,  6,  7
; 32 byte indices: word 0 four times, words 0..7 in order, word 7 four times.
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
; Word 0 replicated four times, then word 1 replicated four times.
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
; Exchanges the two 16-bit halves of every dword.
wswap:          db 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
; Four runs of eight identical bytes: 0, 4, 8, 12.
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

; Rounding, shift and multiplier constants stored as two-entry tables.
;
; put_bilin_16bpc's .h path loads put_bilin_h_rnd with a dword broadcast at
; index (bitdepth_max >> 11), so dword 0 ({8, 8}) and dword 1 ({10, 10}) are
; the two alternatives selected by bit depth.
; NOTE(review): the other two-entry tables here presumably follow the same
; "one entry per bit depth" layout; their users are outside this excerpt.
prep_mul:         dw 16, 16, 4, 4
put_bilin_h_rnd:  dw 8, 8, 10, 10
put_8tap_h_rnd:   dd 34, 40
s_8tap_h_rnd:     dd 2, 8
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_rnd: dd 512, 128
put_s_8tap_v_sh:  dd 10, 8
; Rounding term minus a bias of 8192 scaled by the matching shift.
prep_8tap_1d_rnd: dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: dd    32 - (8192 <<  5)
warp8x8t_rnd:     dd 16384 - (8192 << 15)
warp8x8_shift:    dd  5,  3
warp8x8_rnd:      dw   4096,   4096,  16384,  16384
bidir_rnd:        dw -16400, -16400, -16388, -16388 ; -(16384 + 16), -(16384 + 4)
bidir_mul:        dw   2048,   2048,   8192,   8192

; Aliases that reuse the first dword of an existing table rather than emitting
; a duplicate constant: prep_mul starts with {16, 16} and put_s_8tap_v_rnd
; starts with 512.
%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

; Scalar constants, each one dword wide so it can be loaded with a dword
; broadcast.
;   pw_* : the same 16-bit value in both words
;   pd_* : a single 32-bit value
;   pq_* : a 64-bit value
pw_2:          times 2 dw 2
pw_64:         times 2 dw 64
pw_2048:       times 2 dw 2048
pw_8192:       times 2 dw 8192
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
pw_m512:       times 2 dw -512
pd_32:         dd 32
pd_63:         dd 63
pd_64:         dd 64
pd_32768:      dd 32768
pd_65538:      dd 65538
pd_m524256:    dd -524256 ; (-8192 << 6) + 32
pd_0x3ff:      dd 0x3ff
pq_0x40000000: dq 0x40000000
               ; NOTE(review): one trailing zero dword follows the qword; this is
               ; presumably padding for a load that reads past it - confirm
               ; against the users of pq_0x40000000.
               dd 0

; BIDIR_JMP_TABLE fn, suffix, w0, w1, ...
;
; Emits one dword per listed width, holding the distance from the biased
; table symbol to the `.w<N>` local label of the function
; mangle(private_prefix %+ _<fn>_16bpc_<suffix>).
;
; The public symbol <fn>_<suffix>_table is defined as (table - 2*w0), and
; every entry is relative to that biased address rather than to the first
; entry.
; NOTE(review): callers presumably index the table with tzcnt(w)*4; the bias
; cancels in that case because 2*w == 4*log2(w) for w0 = 2 and w0 = 4.
;
; At least one width is required because %3 is used unconditionally, hence
; the minimum of three parameters.
%macro BIDIR_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2                         ; one entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1                       ; next width becomes %3
    %endrep
%endmacro

; Jump tables for the bidirectional / blend functions. The first listed width
; is the smallest one supported by each function: blend_v and blend_h start
; at 2, the others at 4; blend and blend_v stop at 32.
BIDIR_JMP_TABLE avg,        avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx2,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx2, 2, 4, 8, 16, 32, 64, 128

; BASE_JMP_TABLE fn, suffix, w0, w1, ...
;
; Emits one word per listed width, holding the distance from the anchor label
; <fn>_<suffix> to the label formed by appending `_w<N>` to that anchor.
; <fn>_<suffix> must already be defined as the mangled anchor (see put_avx2 /
; prep_avx2).
;
; The public symbol <fn>_<suffix>_table is (table - w0). put_bilin_16bpc
; indexes it with tzcnt(w)*2, and 2*log2(w0) == w0 for w0 = 2 and w0 = 4, so
; the smallest width lands on the first entry.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2                         ; one entry per width argument
        dw %%base %+ _w%3 - %%base
        %rotate 1                       ; next width becomes %3
    %endrep
%endmacro

; Anchor labels for the plain copy paths: the `.put` / `.prep` local labels
; inside the bilinear functions. put_bilin_16bpc loads put_avx2 into r7 and
; adds the table entry to it to form the jump target.
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

; Per-width offsets from the anchors above to `.put_w<N>` / `.prep_w<N>`.
BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128

; HV_JMP_TABLE fn, filter, suffix, types, w0, w1, ...
;
; Emits up to three word-sized jump tables for the function
; mangle(private_prefix %+ _<fn>_<filter>_16bpc_<suffix>), chosen by the
; `types` bitmask:
;   bit 0 (1): <fn>_<filter>_h_<suffix>_table  -> `.h_w<N>`  labels
;   bit 1 (2): <fn>_<filter>_v_<suffix>_table  -> `.v_w<N>`  labels
;   bit 2 (4): <fn>_<filter>_hv_<suffix>_table -> `.hv_w<N>` labels
;
; Entries are offsets from the anchor <fn>_<suffix>, not from the function
; entry, and each public table symbol is biased by -w0 in the same way as
; BASE_JMP_TABLE.
;
; Each %rep rotates the parameter list once per width; the following
; `%rotate 4` steps over the four leading parameters so that %5 is w0 again
; before the next table is emitted.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4                       ; restore original parameter order
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4                       ; restore original parameter order
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; Bilinear put/prep: types = 7 requests all three tables (h, v and hv).
HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128

; SCALED_JMP_TABLE fn, suffix, w0, w1, ...
;
; Emits three consecutive word-sized jump tables for the function
; mangle(private_prefix %+ _<fn>_16bpc_<suffix>); entries are offsets from the
; function's own entry label:
;   <fn>_<suffix>_table      -> `.w<N>`     labels
;   <fn>_<suffix>_dy1_table  -> `.dy1_w<N>` labels
;   <fn>_<suffix>_dy2_table  -> `.dy2_w<N>` labels
; NOTE(review): going by the internal label names, the dy1/dy2 tables
; presumably serve the dy == 1024 and dy == 2048 special cases.
;
; Each public symbol is biased by -w0. After every %rep the parameter list has
; been rotated once per width; `%rotate 2` steps over the two leading
; parameters so that %3 is w0 again.
;
; At least one width is required because %3 is used unconditionally, hence
; the minimum of three parameters.
%macro SCALED_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2                           ; restore original parameter order
    %%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2                           ; restore original parameter order
    %%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Scaled 8-tap put/prep: a generic, a dy1 and a dy2 table for each function.
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,   4, 8, 16, 32, 64, 128

; table_offset(type, fn): distance from the anchor <type><SUFFIX> to the jump
; table <type><fn><SUFFIX>_table. With SUFFIX = _avx2, table_offset(put,)
; yields put_avx2_table - put_avx2, and table_offset(put, _bilin_h) yields
; put_bilin_h_avx2_table - put_avx2.
; NOTE(review): SUFFIX is presumably set by the INIT_XMM / INIT_YMM macros.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

; Externally defined filter coefficient tables.
cextern mc_subpel_filters
; Biased by -8 bytes.
; NOTE(review): presumably so that a 1-based filter index times 8 can be
; added directly; confirm against the users of subpel_filters.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

205*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
206*c0909341SAndroid Build Coastguard Workercglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
207*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; mx
208*c0909341SAndroid Build Coastguard Worker    lea                  r7, [put_avx2]
209*c0909341SAndroid Build Coastguard Worker%if UNIX64
210*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8
211*c0909341SAndroid Build Coastguard Worker    %define org_w r8d
212*c0909341SAndroid Build Coastguard Worker    mov                 r8d, wd
213*c0909341SAndroid Build Coastguard Worker%else
214*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 7
215*c0909341SAndroid Build Coastguard Worker    %define org_w wm
216*c0909341SAndroid Build Coastguard Worker%endif
217*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
218*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
219*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
220*c0909341SAndroid Build Coastguard Worker    jnz .h
221*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
222*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
223*c0909341SAndroid Build Coastguard Worker    jnz .v
224*c0909341SAndroid Build Coastguard Worker.put:
225*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put,)]
226*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
227*c0909341SAndroid Build Coastguard Worker    jmp                  wq
228*c0909341SAndroid Build Coastguard Worker.put_w2:
229*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [srcq+ssq*0]
230*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [srcq+ssq*1]
231*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
232*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6d
233*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7d
234*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
235*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
236*c0909341SAndroid Build Coastguard Worker    jg .put_w2
237*c0909341SAndroid Build Coastguard Worker    RET
238*c0909341SAndroid Build Coastguard Worker.put_w4:
239*c0909341SAndroid Build Coastguard Worker    mov                  r6, [srcq+ssq*0]
240*c0909341SAndroid Build Coastguard Worker    mov                  r7, [srcq+ssq*1]
241*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
242*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6
243*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7
244*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
245*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
246*c0909341SAndroid Build Coastguard Worker    jg .put_w4
247*c0909341SAndroid Build Coastguard Worker    RET
248*c0909341SAndroid Build Coastguard Worker.put_w8:
249*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
250*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
251*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
252*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
253*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
254*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
255*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
256*c0909341SAndroid Build Coastguard Worker    jg .put_w8
257*c0909341SAndroid Build Coastguard Worker    RET
258*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
259*c0909341SAndroid Build Coastguard Worker.put_w16:
260*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
261*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
262*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
263*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
264*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
265*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
266*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
267*c0909341SAndroid Build Coastguard Worker    jg .put_w16
268*c0909341SAndroid Build Coastguard Worker    RET
269*c0909341SAndroid Build Coastguard Worker.put_w32:
270*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+32*0]
271*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+32*1]
272*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+32*0]
273*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+32*1]
274*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
275*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+32*0], m0
276*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+32*1], m1
277*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+32*0], m2
278*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+32*1], m3
279*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
280*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
281*c0909341SAndroid Build Coastguard Worker    jg .put_w32
282*c0909341SAndroid Build Coastguard Worker    RET
283*c0909341SAndroid Build Coastguard Worker.put_w64:
284*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+32*0]
285*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+32*1]
286*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+32*2]
287*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+32*3]
288*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
289*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
290*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
291*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
292*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
293*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
294*c0909341SAndroid Build Coastguard Worker    dec                  hd
295*c0909341SAndroid Build Coastguard Worker    jg .put_w64
296*c0909341SAndroid Build Coastguard Worker    RET
297*c0909341SAndroid Build Coastguard Worker.put_w128:
298*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+32*0]
299*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+32*1]
300*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+32*2]
301*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+32*3]
302*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
303*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
304*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
305*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
306*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+32*4]
307*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+32*5]
308*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+32*6]
309*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+32*7]
310*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
311*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*4], m0
312*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*5], m1
313*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*6], m2
314*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*7], m3
315*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
316*c0909341SAndroid Build Coastguard Worker    dec                  hd
317*c0909341SAndroid Build Coastguard Worker    jg .put_w128
318*c0909341SAndroid Build Coastguard Worker    RET
319*c0909341SAndroid Build Coastguard Worker.h:
320*c0909341SAndroid Build Coastguard Worker    movd                xm5, mxyd
321*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
322*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_16]
323*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
324*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
325*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
326*c0909341SAndroid Build Coastguard Worker    jnz .hv
327*c0909341SAndroid Build Coastguard Worker    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
328*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
329*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; bitdepth_max
330*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
331*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
332*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
333*c0909341SAndroid Build Coastguard Worker    jmp                  wq
334*c0909341SAndroid Build Coastguard Worker.h_w2:
335*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*0]
336*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ssq*1]
337*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
338*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, xm1
339*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, 16
340*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5
341*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
342*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
343*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 4
344*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
345*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 2
346*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
347*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
348*c0909341SAndroid Build Coastguard Worker    jg .h_w2
349*c0909341SAndroid Build Coastguard Worker    RET
350*c0909341SAndroid Build Coastguard Worker.h_w4:
351*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
352*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*1]
353*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*0+2]
354*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ssq*1+2]
355*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
356*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4
357*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5
358*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
359*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
360*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 4
361*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
362*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
363*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
364*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
365*c0909341SAndroid Build Coastguard Worker    jg .h_w4
366*c0909341SAndroid Build Coastguard Worker    RET
367*c0909341SAndroid Build Coastguard Worker.h_w8:
368*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
369*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1
370*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0+2]
371*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1+2], 1
372*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
373*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
374*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
375*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
376*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
377*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
378*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
379*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
380*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
381*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
382*c0909341SAndroid Build Coastguard Worker    jg .h_w8
383*c0909341SAndroid Build Coastguard Worker    RET
384*c0909341SAndroid Build Coastguard Worker.h_w16:
385*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ssq*0]
386*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+ssq*0+2]
387*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
388*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
389*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+ssq*1]
390*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ssq*1+2]
391*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
392*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
393*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
394*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
395*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
396*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
397*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
398*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
399*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
400*c0909341SAndroid Build Coastguard Worker    jg .h_w16
401*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32: horizontal-only filter for 32-sample-wide blocks, one row per iteration.
; Each row is handled as two 32-byte vectors; for every 16-bit lane:
;   dst[x] = (src[x]*m4 + src[x+1]*m5 + m3) >> 4
; (the "+2" byte displacement selects the right-hand neighbour sample).
; NOTE(review): m3/m4/m5 are set up before this chunk - presumably the rounding
; term and the two horizontal weights; confirm against the .h setup code.
402*c0909341SAndroid Build Coastguard Worker.h_w32:
403*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+32*0]
404*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+32*0+2]
405*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
406*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
; second vector of the row (samples 16..31)
407*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+32*1]
408*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+32*1+2]
; srcq is advanced early; all loads for this row have already been issued
409*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
410*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
411*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
412*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
413*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
414*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
415*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
416*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
; hd = rows remaining; loop while it stays above zero
417*c0909341SAndroid Build Coastguard Worker    dec                  hd
418*c0909341SAndroid Build Coastguard Worker    jg .h_w32
419*c0909341SAndroid Build Coastguard Worker    RET
; .h_w64 / .h_w128: horizontal-only filter for wide blocks (shared code).
; Outer loop (.h_w64_loop0) walks rows; inner loop (.h_w64_loop) walks one row
; from its right end towards the left, 32 samples (two 32-byte vectors) per
; iteration, using r6 as a descending sample index that starts at the width.
; Per 16-bit lane: dst[x] = (src[x]*m4 + src[x+1]*m5 + m3) >> 4.
; NOTE(review): org_w is defined before this chunk - presumably the saved
; block width in samples; m3/m4/m5 are likewise set up earlier. Confirm there.
420*c0909341SAndroid Build Coastguard Worker.h_w64:
421*c0909341SAndroid Build Coastguard Worker.h_w128:
422*c0909341SAndroid Build Coastguard Worker    movifnidn           t0d, org_w
423*c0909341SAndroid Build Coastguard Worker.h_w64_loop0:
; restart the column index at the full width for every row
424*c0909341SAndroid Build Coastguard Worker    mov                 r6d, t0d
425*c0909341SAndroid Build Coastguard Worker.h_w64_loop:
; r6*2 converts the sample index to a byte offset (2 bytes per sample)
426*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+r6*2-32*1]
427*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+r6*2-32*1+2]
428*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
429*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
430*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+r6*2-32*2]
431*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+r6*2-32*2+2]
432*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
433*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
434*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 4
435*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
436*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r6*2-32*1], m0
437*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r6*2-32*2], m1
; 32 samples consumed; continue while samples remain to the left
438*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 32
439*c0909341SAndroid Build Coastguard Worker    jg .h_w64_loop
; row finished: step both pointers down one row
440*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
441*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
442*c0909341SAndroid Build Coastguard Worker    dec                  hd
443*c0909341SAndroid Build Coastguard Worker    jg .h_w64_loop0
444*c0909341SAndroid Build Coastguard Worker    RET
; .v: set-up and dispatch for the vertical-only filter.
; Loads the 16-bit jump-table entry for this width from the put/_bilin_v table
; (wq is the table index, r7 the base it is relative to), broadcasts the
; weight (mxy << 11) to every word of m5, then jumps to the width-specific
; .v_wN code.
; With a multiplier of (mxy << 11), pmulhrsw(d, m5) = (d*mxy + 8) >> 4, which
; is how the .v_wN loops apply the weight.
; NOTE(review): mxyd presumably holds the vertical fraction "my" here and is
; small enough that mxy << 11 fits a signed word; confirm against the entry code.
445*c0909341SAndroid Build Coastguard Worker.v:
446*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
447*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
448*c0909341SAndroid Build Coastguard Worker    movd                xm5, mxyd
; turn the table-relative offset into an absolute jump target
449*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
450*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
451*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .v_w2: vertical-only filter, 2 samples (4 bytes) per row, two output rows per
; iteration. xm0 always carries the most recently loaded row into the next
; iteration, so every source row is read only once.
; Per lane: out = a + pmulhrsw(b - a, m5), with a = upper row, b = lower row.
452*c0909341SAndroid Build Coastguard Worker.v_w2:
453*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+ssq*0]
454*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
455*c0909341SAndroid Build Coastguard Worker    movd                xm1, [srcq+ssq*1]
456*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; xm2 = { row n, row n+1 } in dwords 0 and 1 (the upper rows "a")
457*c0909341SAndroid Build Coastguard Worker    punpckldq           xm2, xm0, xm1
458*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+ssq*0]
; xm1 = { row n+1, row n+2 } in dwords 0 and 1 (the lower rows "b")
459*c0909341SAndroid Build Coastguard Worker    punpckldq           xm1, xm0
460*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm2
461*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
462*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
; dword 0 -> first output row, dword 1 -> second output row
463*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm1
464*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm1, 1
465*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
466*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
467*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
468*c0909341SAndroid Build Coastguard Worker    RET
; .v_w4: vertical-only filter, 4 samples (8 bytes) per row, two output rows per
; iteration. Same scheme as .v_w2 but with qword-sized rows: xm0 carries the
; last loaded row into the next iteration.
; Per lane: out = a + pmulhrsw(b - a, m5), with a = upper row, b = lower row.
469*c0909341SAndroid Build Coastguard Worker.v_w4:
470*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
471*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
472*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*1]
473*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; xm2 = { row n | row n+1 } (low qword | high qword)
474*c0909341SAndroid Build Coastguard Worker    punpcklqdq          xm2, xm0, xm1
475*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
; xm1 = { row n+1 | row n+2 }
476*c0909341SAndroid Build Coastguard Worker    punpcklqdq          xm1, xm0
477*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm2
478*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
479*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
; low qword -> first output row, high qword -> second output row
480*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm1
481*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm1
482*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
483*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
484*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
485*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only filter, 8 samples (16 bytes) per row, two output rows
; per iteration. Two rows are packed into one 256-bit register, one row per
; 128-bit lane. The low lane of m0 carries the last loaded row into the next
; iteration.
; Per lane: out = a + pmulhrsw(b - a, m5), with a = upper row, b = lower row.
486*c0909341SAndroid Build Coastguard Worker.v_w8:
487*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
488*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
489*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [srcq+ssq*1]
490*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; 0xf0 takes the upper four dwords from the second source:
; m2 = { row n | row n+1 } (low lane | high lane)
491*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, m1, 0xf0
492*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
; m1 = { row n+1 | row n+2 }
493*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0xf0
494*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
495*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
496*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
; low lane -> first output row, high lane -> second output row
497*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm1
498*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m1, 1
499*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
500*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
501*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
502*c0909341SAndroid Build Coastguard Worker    RET
; .v_w32: vertical-only filter, 32 samples (two 32-byte vectors) per row, two
; output rows per iteration.
; Register roles inside the loop:
;   m0/m1 = left/right half of the even source row (carried across iterations)
;   m2/m3 = left/right half of the odd source row
;   m4    = scratch for the interpolated result
;   m5    = vertical weight, applied via pmulhrsw
; Per lane: out = a + pmulhrsw(b - a, m5), with a = upper row, b = lower row.
503*c0909341SAndroid Build Coastguard Worker.v_w32:
504*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+32*0]
505*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+32*1]
506*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
507*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+32*0]
508*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+32*1]
509*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; first output row, left half
510*c0909341SAndroid Build Coastguard Worker    psubw                m4, m2, m0
511*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
512*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
; m0 is no longer needed: reload it with the left half of the next even row
513*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+32*0]
514*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+32*0], m4
; first output row, right half
515*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3, m1
516*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
517*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
518*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+32*1]
519*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+32*1], m4
; second output row, left half (between the odd row and the new even row)
520*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0, m2
521*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
522*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
523*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+32*0], m4
; second output row, right half
524*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1, m3
525*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
526*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
527*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+32*1], m4
528*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
529*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
530*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
531*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16 / .v_w64 / .v_w128: vertical-only filter processed in 16-sample
; (32-byte) wide columns. The inner loop (.v_w16_loop) runs down one column,
; two output rows per iteration; the outer loop (.v_w16_loop0) moves to the
; next column.
; r6d packs both loop counters: r6d = h + 16*w - 256, so the low byte holds h
; and the bits above it hold (w/16 - 1), the number of columns still to do
; after the current one.
; r4/r7 keep the top-of-column src/dst pointers.
; Per lane: out = a + pmulhrsw(b - a, m5), with a = upper row, b = lower row.
; NOTE(review): org_w is defined before this chunk - presumably the block
; width in samples; the packing also relies on h fitting in 8 bits. Confirm.
532*c0909341SAndroid Build Coastguard Worker.v_w16:
533*c0909341SAndroid Build Coastguard Worker.v_w64:
534*c0909341SAndroid Build Coastguard Worker.v_w128:
535*c0909341SAndroid Build Coastguard Worker    movifnidn           t0d, org_w
536*c0909341SAndroid Build Coastguard Worker    add                 t0d, t0d
537*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
538*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+t0*8-256]
539*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
540*c0909341SAndroid Build Coastguard Worker.v_w16_loop0:
; m0 = first row of this column; afterwards it carries the last loaded row
541*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
542*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
543*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
544*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; m1 = first output row (between old m0 and m3)
545*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m0
546*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
547*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
548*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
; m2 = second output row (between m3 and the new m0)
549*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m3
550*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
551*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
552*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
553*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
554*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
555*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
556*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
; column done: step the saved column pointers 32 bytes (16 samples) right
557*c0909341SAndroid Build Coastguard Worker    add                  r4, 32
558*c0909341SAndroid Build Coastguard Worker    add                  r7, 32
; restore the row counter from the low byte of r6
559*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
560*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
561*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
; one column consumed; the result drops to zero or below after the last one
562*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
563*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop0
564*c0909341SAndroid Build Coastguard Worker    RET
; .hv: set-up and dispatch for the combined horizontal+vertical filter.
; Prepares:
;   m3 = pw_2    (rounding term added before the ">> 2" of the horizontal pass)
;   m6 = mxy << 11 broadcast to every word (vertical weight, used with pmulhw)
;   m7 = final rounding multiplier for pmulhrsw: pw_8192 when bit 11 of r8m is
;        set (the .hv_12bpc case), otherwise pw_2048
; In the non-12bpc case the horizontal weights in m4/m5 are also multiplied by
; 4 (psllw 2), so the intermediate keeps two more bits and the final
; pmulhrsw scales them back out.
; WIN64_SPILL_XMM is an x86inc macro; 8 covers the use of m6/m7 below.
; NOTE(review): r8m is presumably bitdepth_max (bit 11 set only for 12-bit
; content) and pw_N presumably holds words of value N, making pmulhrsw by m7
; equal to (x + 2) >> 2 or (x + 8) >> 4. mxyd presumably holds "my" here.
; m4/m5 come from code before this chunk. Confirm all of these.
565*c0909341SAndroid Build Coastguard Worker.hv:
566*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
567*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
568*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
569*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_2]
570*c0909341SAndroid Build Coastguard Worker    movd                xm6, mxyd
571*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]
; turn the table-relative offset into an absolute jump target
572*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
573*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
574*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
575*c0909341SAndroid Build Coastguard Worker    jnz .hv_12bpc
; bit 11 clear: use the higher-precision intermediate
576*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
577*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
578*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_2048]
579*c0909341SAndroid Build Coastguard Worker.hv_12bpc:
580*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .hv_w2: horizontal+vertical filter, 2 samples per row, two output rows per
; iteration.
; Horizontal pass per row: t = (src[x]*m4 + src[x+1]*m5 + m3) >> 2, where the
; right-hand neighbour is obtained by shifting each qword right by one word.
; Vertical pass: out = pmulhrsw(a + pmulhw(2*(b - a), m6), m7), with a = upper
; filtered row and b = lower filtered row.
; Row 0 is broadcast to both qwords of xm0 so that its high qword can be
; selected by the first shufpd; afterwards xm0 carries the previous pair.
; Only words 0-1 of each qword end up in the output (the "_" in the lane
; comments marks the unused half).
581*c0909341SAndroid Build Coastguard Worker.hv_w2:
582*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm1, [srcq+ssq*0]
583*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, xm1
584*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, 16
585*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5
586*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
587*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
588*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2
589*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
; low qword = next row, high qword = the row after it
590*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+ssq*1]
591*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
592*c0909341SAndroid Build Coastguard Worker    movhps              xm2, [srcq+ssq*0]
593*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm4, xm2
594*c0909341SAndroid Build Coastguard Worker    psrlq               xm2, 16
595*c0909341SAndroid Build Coastguard Worker    pmullw              xm2, xm5
596*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
597*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
598*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 2              ; 1 _ 2 _  (filtered rows n+1, n+2)
599*c0909341SAndroid Build Coastguard Worker    shufpd              xm2, xm0, xm1, 0x01 ; 0 _ 1 _  (high qword of xm0, low qword of xm1)
; keep the new pair for the next iteration before xm1 is overwritten
600*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm1
601*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm2
; doubling the difference first makes the pmulhw (>> 16) net out to
; (diff * mxy) >> 4 for a multiplier of mxy << 11
602*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm1
603*c0909341SAndroid Build Coastguard Worker    pmulhw              xm1, xm6
604*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
605*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm7
; dword 0 -> first output row, dword 2 (start of high qword) -> second row
606*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm1
607*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm1, 2
608*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
609*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
610*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
611*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: horizontal+vertical filter, 4 samples (8 bytes) per row, two output
; rows per iteration.
; Horizontal pass per row: t = (src[x]*m4 + src[x+1]*m5 + m3) >> 2.
; Vertical pass: out = pmulhrsw(a + pmulhw(2*(b - a), m6), m7), with a = upper
; filtered row and b = lower filtered row.
; The row-0 loads start 8 (resp. 6) bytes before srcq so that the row's four
; samples (and their right-hand neighbours) land in the HIGH qword of xm0,
; which is the half the first shufpd selects. The low qword is never used.
; This means the set-up reads 8 bytes in front of srcq.
612*c0909341SAndroid Build Coastguard Worker.hv_w4:
613*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, [srcq+ssq*0-8]
614*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5, [srcq+ssq*0-6]
615*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
616*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
617*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2
618*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
; xm1 = samples, xm2 = right-hand neighbours; low qword = next row,
; high qword = the row after it
619*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*1]
620*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+ssq*1+2]
621*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
622*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ssq*0]
623*c0909341SAndroid Build Coastguard Worker    movhps              xm2, [srcq+ssq*0+2]
624*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm4
625*c0909341SAndroid Build Coastguard Worker    pmullw              xm2, xm5
626*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
627*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
628*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 2              ; 1 2  (filtered rows n+1 | n+2)
629*c0909341SAndroid Build Coastguard Worker    shufpd              xm2, xm0, xm1, 0x01 ; 0 1  (high qword of xm0 | low qword of xm1)
; keep the new pair for the next iteration before xm1 is overwritten
630*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm1
631*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm2
632*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm1
633*c0909341SAndroid Build Coastguard Worker    pmulhw              xm1, xm6
634*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
635*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm7
; low qword -> first output row, high qword -> second output row
636*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm1
637*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm1
638*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
639*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
640*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
641*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: horizontal+vertical filter, 8 samples (16 bytes) per row, two output
; rows per iteration, one row per 128-bit lane.
; Horizontal pass per row: t = (src[x]*m4 + src[x+1]*m5 + m3) >> 2.
; Vertical pass: out = pmulhrsw(a + pmulhw(2*(b - a), m6), m7), with a = upper
; filtered row and b = lower filtered row.
; The filtered row 0 is copied into the high lane of m0 because the
; vperm2i128 below takes the previous row from m0's high lane.
642*c0909341SAndroid Build Coastguard Worker.hv_w8:
643*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, [srcq+ssq*0]
644*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5, [srcq+ssq*0+2]
645*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
646*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
647*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2
648*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm0, 1
649*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
; m1 = samples, m2 = right-hand neighbours; low lane = next row,
; high lane = the row after it
650*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
651*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*1+2]
652*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
653*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*0], 1
654*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*0+2], 1
655*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
656*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
657*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
658*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
659*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2            ; 1 2  (filtered rows n+1 | n+2)
660*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m1, 0x21 ; 0 1  (high lane of m0 | low lane of m1)
; keep the new pair for the next iteration before m1 is overwritten
661*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
662*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
663*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
664*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
665*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
666*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
; low lane -> first output row, high lane -> second output row
667*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm1
668*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m1, 1
669*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
670*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
671*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
672*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w16 / .hv_w32 / .hv_w64 / .hv_w128: horizontal+vertical filter processed
; in 16-sample (32-byte) wide columns. The inner loop (.hv_w16_loop) runs down
; one column, two output rows per iteration; the outer loop (.hv_w16_loop0)
; moves to the next column.
; r6d packs both loop counters: r6d = h + 16*w - 256, so the low byte holds h
; and the bits above it hold (w/16 - 1) remaining columns.
; r4/r7 keep the top-of-column src/dst pointers.
; Horizontal pass per row: t = (src[x]*m4 + src[x+1]*m5 + m3) >> 2.
; Vertical pass: out = pmulhrsw(a + pmulhw(2*(b - a), m6), m7).
; m0 and m1 alternate as "previous filtered row" / "current filtered row".
; NOTE(review): on UNIX64 r8 is presumably still holding the block width here
; (the other branch reloads it from wm); the packing also relies on h fitting
; in 8 bits. Confirm against the function entry.
673*c0909341SAndroid Build Coastguard Worker.hv_w16:
674*c0909341SAndroid Build Coastguard Worker.hv_w32:
675*c0909341SAndroid Build Coastguard Worker.hv_w64:
676*c0909341SAndroid Build Coastguard Worker.hv_w128:
677*c0909341SAndroid Build Coastguard Worker%if UNIX64
678*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [r8*2-32]
679*c0909341SAndroid Build Coastguard Worker%else
680*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wm
681*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [r6*2-32]
682*c0909341SAndroid Build Coastguard Worker%endif
683*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
; (2*w - 32)*8 + h = h + 16*w - 256
684*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
685*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
686*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
; m0 = horizontally filtered first row of this column
687*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ssq*0]
688*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+ssq*0+2]
689*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
690*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
691*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
692*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
; m1 = horizontally filtered next row
693*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+ssq*1]
694*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ssq*1+2]
695*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
696*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
697*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
698*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
; first output row: vertical blend between m0 (upper) and m1 (lower)
699*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, m0
700*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
701*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m6
702*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
703*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7
704*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m2
; m0 = horizontally filtered row after that (also the carry into the next
; iteration)
705*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+ssq*0]
706*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+ssq*0+2]
707*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
708*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
709*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
; second output row: vertical blend between m1 (upper) and m0 (lower)
710*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
711*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
712*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m6
713*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
714*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7
715*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
716*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
717*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
718*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
; column done: step the saved column pointers 32 bytes (16 samples) right
719*c0909341SAndroid Build Coastguard Worker    add                  r4, 32
720*c0909341SAndroid Build Coastguard Worker    add                  r7, 32
; restore the row counter from the low byte of r6
721*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
722*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
723*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
; one column consumed; the result drops to zero or below after the last one
724*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
725*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
726*c0909341SAndroid Build Coastguard Worker    RET
727*c0909341SAndroid Build Coastguard Worker
; prep_bilin_16bpc: entry point and unfiltered (.prep) dispatch.
; cglobal arguments (x86inc): 3 arguments loaded into registers, 7 GPRs used,
; 0 XMM registers declared; named registers are tmp, src, stride, w, h, mxy,
; stride3.
; Flow visible here:
;   - mx (r5m) != 0           -> .h   (defined outside this chunk)
;   - else my (r6m) != 0      -> .v   (defined outside this chunk)
;   - otherwise               -> .prep: jump to the width-specific .prep_wN
; The original width is saved in org_w before wd is replaced by tzcnt(w),
; which is then used as the jump-table index.
; .prep set-up: m5 = pw_8192 broadcast, m4 = prep_mul[bitdepth_max >> 11]
; broadcast, stride3q = 3*stride. Both constants are addressed relative to r6,
; which holds the address of prep_avx2.
; NOTE(review): bitdepth_max >> 11 is presumably 0 for 10-bit and 1 for 12-bit
; input; prep_mul and pw_8192 are defined outside this chunk. Confirm.
728*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
729*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r5m ; mx
730*c0909341SAndroid Build Coastguard Worker    lea                  r6, [prep_avx2]
731*c0909341SAndroid Build Coastguard Worker%if UNIX64
732*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 7
733*c0909341SAndroid Build Coastguard Worker    %define org_w r7d
734*c0909341SAndroid Build Coastguard Worker%else
; here org_w aliases the r5m stack slot; its value (mx) was already copied to
; mxyd above
735*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 6
736*c0909341SAndroid Build Coastguard Worker    %define org_w r5m
737*c0909341SAndroid Build Coastguard Worker%endif
; save the width before wd is overwritten with its trailing-zero count
738*c0909341SAndroid Build Coastguard Worker    mov               org_w, wd
739*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
740*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
741*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
742*c0909341SAndroid Build Coastguard Worker    jnz .h
743*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
744*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
745*c0909341SAndroid Build Coastguard Worker    jnz .v
746*c0909341SAndroid Build Coastguard Worker.prep:
; 16-bit jump-table entry for this width, relative to r6
747*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep,)]
748*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m ; bitdepth_max
749*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [r6-prep_avx2+pw_8192]
750*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
; r5 becomes the dword index into prep_mul
751*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
752*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [r6-prep_avx2+prep_mul+r5*4]
753*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
754*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .prep_w4: unfiltered prep for 4-sample-wide blocks, four rows per iteration.
; The four 8-byte rows are packed into one 256-bit register (row k in qword k),
; then tmp = src*m4 - m5 is stored as one contiguous 32-byte vector.
755*c0909341SAndroid Build Coastguard Worker.prep_w4:
; rows 0 and 1 -> qwords 0 and 1
756*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*0]
757*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+strideq*1]
; rows 2 and 3 are broadcast to every qword, then blended into place below
758*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+strideq*2]
759*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+stride3q ]
760*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
; 0x30 = dwords 4-5 (qword 2) from m1; 0xc0 = dwords 6-7 (qword 3) from m2
761*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0x30
762*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0xc0
763*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
764*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
765*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
766*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
767*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
768*c0909341SAndroid Build Coastguard Worker    jg .prep_w4
769*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w8: unfiltered prep for 8-sample-wide blocks, four rows per iteration.
; Two 16-byte rows are packed per 256-bit register (one row per 128-bit lane),
; then tmp = src*m4 - m5 is stored contiguously (64 bytes per iteration).
770*c0909341SAndroid Build Coastguard Worker.prep_w8:
; m0 = { row 0 | row 1 }, m1 = { row 2 | row 3 }
771*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
772*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*1], 1
773*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*2]
774*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+stride3q ], 1
775*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
776*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
777*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
778*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
779*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
780*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
781*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
782*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
783*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
784*c0909341SAndroid Build Coastguard Worker    jg .prep_w8
785*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w16: unfiltered prep for 16-sample-wide blocks, four rows per
; iteration. Each 32-byte row is one vector: tmp = src*m4 - m5, stored
; contiguously (128 bytes per iteration, independent of the source stride).
786*c0909341SAndroid Build Coastguard Worker.prep_w16:
787*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0]
788*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*1]
789*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+strideq*2]
790*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+stride3q ]
791*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
792*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
793*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
794*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
795*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
796*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
797*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
798*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
799*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
800*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
801*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
802*c0909341SAndroid Build Coastguard Worker    jg .prep_w16
803*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w32: unfiltered prep for 32-sample-wide blocks, two rows per iteration.
; Each row is two 32-byte vectors: tmp = src*m4 - m5, stored contiguously
; (128 bytes per iteration).
804*c0909341SAndroid Build Coastguard Worker.prep_w32:
; m0/m1 = left/right half of row 0, m2/m3 = left/right half of row 1
805*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0+32*0]
806*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*0+32*1]
807*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+strideq*1+32*0]
808*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+strideq*1+32*1]
809*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
810*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
811*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
812*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
813*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
814*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
815*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
816*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
817*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
818*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
819*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
820*c0909341SAndroid Build Coastguard Worker    jg .prep_w32
821*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w64: unfiltered prep for 64-sample-wide blocks, one row per iteration.
; The row is four 32-byte vectors: tmp = src*m4 - m5, stored contiguously
; (128 bytes per iteration).
822*c0909341SAndroid Build Coastguard Worker.prep_w64:
823*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+32*0]
824*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+32*1]
825*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+32*2]
826*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+32*3]
; srcq is advanced early; all loads for this row have already been issued
827*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
828*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
829*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
830*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
831*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
832*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
833*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
834*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
835*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
836*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
837*c0909341SAndroid Build Coastguard Worker    dec                  hd
838*c0909341SAndroid Build Coastguard Worker    jg .prep_w64
839*c0909341SAndroid Build Coastguard Worker    RET
840*c0909341SAndroid Build Coastguard Worker.prep_w128:
841*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+32*0]
842*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+32*1]
843*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+32*2]
844*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+32*3]
845*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
846*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
847*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
848*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
849*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
850*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
851*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
852*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
853*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+32*4]
854*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+32*5]
855*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4, [srcq+32*6]
856*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4, [srcq+32*7]
857*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*8
858*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
859*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
860*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
861*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
862*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
863*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*4], m0
864*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*3], m1
865*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*2], m2
866*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*1], m3
867*c0909341SAndroid Build Coastguard Worker    dec                  hd
868*c0909341SAndroid Build Coastguard Worker    jg .prep_w128
869*c0909341SAndroid Build Coastguard Worker    RET
870*c0909341SAndroid Build Coastguard Worker.h:
871*c0909341SAndroid Build Coastguard Worker    movd                xm5, mxyd
872*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
873*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_16]
874*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
875*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_32766]
876*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
877*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
878*c0909341SAndroid Build Coastguard Worker    jnz .h_12bpc
879*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
880*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
881*c0909341SAndroid Build Coastguard Worker.h_12bpc:
882*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
883*c0909341SAndroid Build Coastguard Worker    jnz .hv
884*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
885*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
886*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
887*c0909341SAndroid Build Coastguard Worker    jmp                  wq
888*c0909341SAndroid Build Coastguard Worker.h_w4:
889*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0]
890*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*2], 1
891*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*1]
892*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+stride3q ], 1
893*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
894*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2
895*c0909341SAndroid Build Coastguard Worker    psrldq               m1, 2
896*c0909341SAndroid Build Coastguard Worker    pslldq               m2, 6
897*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
898*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xcc
899*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
900*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
901*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
902*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
903*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
904*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
905*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
906*c0909341SAndroid Build Coastguard Worker    jg .h_w4
907*c0909341SAndroid Build Coastguard Worker    RET
908*c0909341SAndroid Build Coastguard Worker.h_w8:
909*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
910*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*1], 1
911*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0+2]
912*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*1+2], 1
913*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
914*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
915*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
916*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
917*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
918*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
919*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
920*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
921*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
922*c0909341SAndroid Build Coastguard Worker    jg .h_w8
923*c0909341SAndroid Build Coastguard Worker    RET
924*c0909341SAndroid Build Coastguard Worker.h_w16:
925*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0]
926*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+strideq*0+2]
927*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
928*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
929*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*1]
930*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+strideq*1+2]
931*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
932*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
933*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
934*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
935*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
936*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
937*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
938*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
939*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
940*c0909341SAndroid Build Coastguard Worker    jg .h_w16
941*c0909341SAndroid Build Coastguard Worker    RET
942*c0909341SAndroid Build Coastguard Worker.h_w32:
943*c0909341SAndroid Build Coastguard Worker.h_w64:
944*c0909341SAndroid Build Coastguard Worker.h_w128:
945*c0909341SAndroid Build Coastguard Worker    movifnidn           t0d, org_w
946*c0909341SAndroid Build Coastguard Worker.h_w32_loop0:
947*c0909341SAndroid Build Coastguard Worker    mov                 r3d, t0d
948*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
949*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+r3*2-32*1]
950*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+r3*2-32*1+2]
951*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
952*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
953*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+r3*2-32*2]
954*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+r3*2-32*2+2]
955*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
956*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
957*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
958*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
959*c0909341SAndroid Build Coastguard Worker    mova   [tmpq+r3*2-32*1], m0
960*c0909341SAndroid Build Coastguard Worker    mova   [tmpq+r3*2-32*2], m1
961*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 32
962*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop
963*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
964*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+t0*2]
965*c0909341SAndroid Build Coastguard Worker    dec                  hd
966*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop0
967*c0909341SAndroid Build Coastguard Worker    RET
968*c0909341SAndroid Build Coastguard Worker.v:
969*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
970*c0909341SAndroid Build Coastguard Worker    movd                xm5, mxyd
971*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_16]
972*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
973*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_32766]
974*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
975*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
976*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
977*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
978*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
979*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
980*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
981*c0909341SAndroid Build Coastguard Worker.v_12bpc:
982*c0909341SAndroid Build Coastguard Worker    jmp                  wq
983*c0909341SAndroid Build Coastguard Worker.v_w4:
984*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*0]
985*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
986*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+strideq*2]
987*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm1, [srcq+strideq*1]
988*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, 0x03 ; 0 2 2 2
989*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+stride3q ]
990*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
991*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0xf0 ; 1 1 3 3
992*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*0]
993*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0x33 ; 0 1 2 3
994*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x0c ; 4 2 4 4
995*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m1, m0   ; 1 2 3 4
996*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
997*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
998*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
999*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1000*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1001*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1002*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1003*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1004*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1005*c0909341SAndroid Build Coastguard Worker    RET
1006*c0909341SAndroid Build Coastguard Worker.v_w8:
1007*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
1008*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1009*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [srcq+strideq*1]
1010*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1011*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, m2, 0xf0 ; 0 1
1012*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+strideq*0]
1013*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, 0xf0     ; 1 2
1014*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1015*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
1016*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
1017*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1018*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1019*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1020*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1021*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1022*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1023*c0909341SAndroid Build Coastguard Worker    RET
1024*c0909341SAndroid Build Coastguard Worker.v_w16:
1025*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1026*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
1027*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1]
1028*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1029*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
1030*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, m2
1031*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
1032*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
1033*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1034*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1035*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1036*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m1
1037*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, m0
1038*c0909341SAndroid Build Coastguard Worker    psubw                m2, m3
1039*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1040*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1041*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
1042*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
1043*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1044*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1045*c0909341SAndroid Build Coastguard Worker    RET
1046*c0909341SAndroid Build Coastguard Worker.v_w32:
1047*c0909341SAndroid Build Coastguard Worker.v_w64:
1048*c0909341SAndroid Build Coastguard Worker.v_w128:
1049*c0909341SAndroid Build Coastguard Worker%if WIN64
1050*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
1051*c0909341SAndroid Build Coastguard Worker%endif
1052*c0909341SAndroid Build Coastguard Worker    movifnidn           r7d, org_w
1053*c0909341SAndroid Build Coastguard Worker    add                 r7d, r7d
1054*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcq
1055*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r7*8-256]
1056*c0909341SAndroid Build Coastguard Worker    mov                  r5, tmpq
1057*c0909341SAndroid Build Coastguard Worker.v_w32_loop0:
1058*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1059*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
1060*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1]
1061*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1062*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
1063*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, m2
1064*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
1065*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
1066*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1067*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1068*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1069*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+r7*0], m1
1070*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, m0
1071*c0909341SAndroid Build Coastguard Worker    psubw                m2, m3
1072*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1073*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1074*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+r7*1], m1
1075*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r7*2]
1076*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1077*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
1078*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
1079*c0909341SAndroid Build Coastguard Worker    add                  r5, 32
1080*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1081*c0909341SAndroid Build Coastguard Worker    mov                srcq, r3
1082*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r5
1083*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1084*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop0
1085*c0909341SAndroid Build Coastguard Worker%if WIN64
1086*c0909341SAndroid Build Coastguard Worker    POP                  r7
1087*c0909341SAndroid Build Coastguard Worker%endif
1088*c0909341SAndroid Build Coastguard Worker    RET
1089*c0909341SAndroid Build Coastguard Worker.hv:
1090*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
1091*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
1092*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
1093*c0909341SAndroid Build Coastguard Worker    movd                xm6, mxyd
1094*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1095*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1096*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
1097*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1098*c0909341SAndroid Build Coastguard Worker.hv_w4:
1099*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0]
1100*c0909341SAndroid Build Coastguard Worker%if WIN64
1101*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+24], xmm7
1102*c0909341SAndroid Build Coastguard Worker%endif
1103*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, xm1
1104*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, 2
1105*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5
1106*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm3
1107*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
1108*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 2
1109*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, xm0
1110*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1111*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1]
1112*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+stride3q ], 1
1113*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*2]
1114*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1115*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*0], 1
1116*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m1, m2
1117*c0909341SAndroid Build Coastguard Worker    psrldq               m1, 2
1118*c0909341SAndroid Build Coastguard Worker    pslldq               m2, 6
1119*c0909341SAndroid Build Coastguard Worker    pmullw               m7, m4
1120*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xcc
1121*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
1122*c0909341SAndroid Build Coastguard Worker    psubw                m7, m3
1123*c0909341SAndroid Build Coastguard Worker    paddw                m1, m7
1124*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2         ; 1 2 3 4
1125*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0x3f
1126*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m0, q2103 ; 0 1 2 3
1127*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1128*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1129*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1130*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1131*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1132*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1133*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1134*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1135*c0909341SAndroid Build Coastguard Worker%if WIN64
1136*c0909341SAndroid Build Coastguard Worker    movaps             xmm7, [rsp+24]
1137*c0909341SAndroid Build Coastguard Worker%endif
1138*c0909341SAndroid Build Coastguard Worker    RET
1139*c0909341SAndroid Build Coastguard Worker.hv_w8:
1140*c0909341SAndroid Build Coastguard Worker    pmullw              xm0, xm4, [srcq+strideq*0]
1141*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5, [srcq+strideq*0+2]
1142*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm3
1143*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
1144*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 2
1145*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm0, 1
1146*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1147*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1]
1148*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*1+2]
1149*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1150*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*0], 1
1151*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*0+2], 1
1152*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1153*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
1154*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
1155*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1156*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2            ; 1 2
1157*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m1, 0x21 ; 0 1
1158*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1159*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1160*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1161*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1162*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1163*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1164*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1165*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1166*c0909341SAndroid Build Coastguard Worker    RET
1167*c0909341SAndroid Build Coastguard Worker.hv_w16:
1168*c0909341SAndroid Build Coastguard Worker.hv_w32:
1169*c0909341SAndroid Build Coastguard Worker.hv_w64:
1170*c0909341SAndroid Build Coastguard Worker.hv_w128:
1171*c0909341SAndroid Build Coastguard Worker%if WIN64
1172*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
1173*c0909341SAndroid Build Coastguard Worker%endif
1174*c0909341SAndroid Build Coastguard Worker    movifnidn           r7d, org_w
1175*c0909341SAndroid Build Coastguard Worker    add                 r7d, r7d
1176*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcq
1177*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r7*8-256]
1178*c0909341SAndroid Build Coastguard Worker    mov                  r5, tmpq
1179*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
1180*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq]
1181*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5, [srcq+2]
1182*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
1183*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1184*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1185*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
1186*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [srcq+strideq*1]
1187*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+strideq*1+2]
1188*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1189*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
1190*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1191*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1192*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, m0
1193*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1194*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1195*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+r7*0], m2
1196*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4, [srcq+strideq*0]
1197*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5, [srcq+strideq*0+2]
1198*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3
1199*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1200*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1201*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
1202*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1203*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1204*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+r7*1], m2
1205*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r7*2]
1206*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1207*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
1208*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
1209*c0909341SAndroid Build Coastguard Worker    add                  r5, 32
1210*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1211*c0909341SAndroid Build Coastguard Worker    mov                srcq, r3
1212*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r5
1213*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1214*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
1215*c0909341SAndroid Build Coastguard Worker%if WIN64
1216*c0909341SAndroid Build Coastguard Worker    POP                  r7
1217*c0909341SAndroid Build Coastguard Worker%endif
1218*c0909341SAndroid Build Coastguard Worker    RET
1219*c0909341SAndroid Build Coastguard Worker
1220*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8]
; Each FILTER_* constant packs two offsets, both a multiple of 15 (one
; 15-entry filter set of subpel_filters):
;   bits 16 and up : first offset  (set 0, 1 or 2)
;   low bits       : second offset (set 3 or 4; 45 or 60, fits in one byte)
; put_6tap_16bpc adds the constant to mx*0x010101 (resp. my*0x010101), so the
; byte in between keeps the raw mx/my value; its comments label the three
; fields "6tap, mx, 4tap". Presumably the upper field therefore selects the
; full-length filter and the low byte the 4-tap filter used for small
; blocks - TODO confirm against the users of mxd/myd.
; NOTE(review): REGULAR and SHARP use the same low-field set (3*15).
1221*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15
1222*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1223*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP   (2*15 << 16) | 3*15
1224*c0909341SAndroid Build Coastguard Worker
; FN prefix, type, type_h, type_v [, jmp_to]
;
; Emits a thin entry point named <prefix>_<type>_16bpc (declared through
; cglobal) whose only job is to load the filter selectors:
;   t0d = FILTER_<type_h>
;   t1d = FILTER_<type_v>
; When the optional fifth argument is given, the entry point then jumps to
; the shared implementation named by jmp_to. With only four arguments no
; jump is emitted, so execution continues into whatever is assembled
; immediately after the macro invocation.
1225*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
1226*c0909341SAndroid Build Coastguard Workercglobal %1_%2_16bpc
1227*c0909341SAndroid Build Coastguard Worker    mov                 t0d, FILTER_%3
; Identical type_h/type_v: copy t0d rather than loading the same immediate
; a second time.
1228*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4
1229*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d
1230*c0909341SAndroid Build Coastguard Worker%else
1231*c0909341SAndroid Build Coastguard Worker    mov                 t1d, FILTER_%4
1232*c0909341SAndroid Build Coastguard Worker%endif
1233*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; skip the jump in the last filter
; Jump target is the mangled symbol built from private_prefix, jmp_to and
; SUFFIX (all three are defined outside this macro).
1234*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1235*c0909341SAndroid Build Coastguard Worker%endif
1236*c0909341SAndroid Build Coastguard Worker%endmacro
1237*c0909341SAndroid Build Coastguard Worker
; Select the registers behind the t0/t1 temporaries per ABI: arguments 4, 5
; on WIN64 and 7, 8 otherwise. t0d/t1d carry the FILTER_* selectors from the
; FN entry points into put_6tap_16bpc.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; aliases tN to the listed rN registers, chosen so that neither clobbers a
; register argument or needs saving on the respective ABI - confirm in the
; include file that defines it.
1238*c0909341SAndroid Build Coastguard Worker%if WIN64
1239*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5
1240*c0909341SAndroid Build Coastguard Worker%else
1241*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8
1242*c0909341SAndroid Build Coastguard Worker%endif
1243*c0909341SAndroid Build Coastguard Worker
; Instantiate the put_8tap_<type>_16bpc entry points that share the
; put_6tap_16bpc implementation. PUT_8TAP_FN ends in a comma, so the
; arguments written on each line below become FN's type, type_h, type_v and
; optional jmp_to.
; The first three entries jump to put_6tap_16bpc explicitly. The last one
; (regular) passes no jmp_to, so FN emits no jump for it: it must stay last
; and be followed directly by the put_6tap_16bpc definition.
1244*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap,
1245*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
1246*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
1247*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
1248*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular,        REGULAR, REGULAR
1249*c0909341SAndroid Build Coastguard Worker
1250*c0909341SAndroid Build Coastguard Workercglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
1251*c0909341SAndroid Build Coastguard Worker%define base r8-put_avx2
1252*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1253*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1254*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1255*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
1256*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx2]
1257*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
1258*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1259*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1260*c0909341SAndroid Build Coastguard Worker    jnz .h
1261*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1262*c0909341SAndroid Build Coastguard Worker    jnz .v
1263*c0909341SAndroid Build Coastguard Worker.put:
1264*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1265*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put,)]
1266*c0909341SAndroid Build Coastguard Worker    add                  wq, r8
1267*c0909341SAndroid Build Coastguard Worker%if WIN64
1268*c0909341SAndroid Build Coastguard Worker    pop                  r8
1269*c0909341SAndroid Build Coastguard Worker%endif
1270*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1271*c0909341SAndroid Build Coastguard Worker.h_w2:
1272*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1273*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
1274*c0909341SAndroid Build Coastguard Worker    mova                xm2, [subpel_h_shuf2]
1275*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+subpel_filters+mxq*8+2]
1276*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm3, xm3
1277*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
1278*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
1279*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
1280*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1281*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm2
1282*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm2
1283*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3
1284*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm1, xm3
1285*c0909341SAndroid Build Coastguard Worker    phaddd              xm0, xm1
1286*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm4
1287*c0909341SAndroid Build Coastguard Worker    psrad               xm0, 6
1288*c0909341SAndroid Build Coastguard Worker    packusdw            xm0, xm0
1289*c0909341SAndroid Build Coastguard Worker    pminsw              xm0, xm5
1290*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
1291*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 1
1292*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1293*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1294*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop
1295*c0909341SAndroid Build Coastguard Worker    RET
1296*c0909341SAndroid Build Coastguard Worker.h_w4:
1297*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1298*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
1299*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm3, [base+subpel_filters+mxq*8]
1300*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
1301*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufA]
1302*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufB]
1303*c0909341SAndroid Build Coastguard Worker    pshufd              xm3, xm3, q2211
1304*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, xm3
1305*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q1111
1306*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
1307*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0]
1308*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1], 1
1309*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1310*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m6 ; 0 1 1 2 2 3 3 4
1311*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7     ; 2 3 3 4 4 5 5 6
1312*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m2
1313*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3
1314*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
1315*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
1316*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
1317*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1318*c0909341SAndroid Build Coastguard Worker    packusdw            xm0, xm1
1319*c0909341SAndroid Build Coastguard Worker    pminsw              xm0, xm5
1320*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
1321*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
1322*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1323*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1324*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
1325*c0909341SAndroid Build Coastguard Worker    RET
1326*c0909341SAndroid Build Coastguard Worker.h:
1327*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1328*c0909341SAndroid Build Coastguard Worker    jnz .hv
1329*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r8m
1330*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, r8m
1331*c0909341SAndroid Build Coastguard Worker    shr                 r7d, 11
1332*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4]
1333*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1334*c0909341SAndroid Build Coastguard Worker    je .h_w4
1335*c0909341SAndroid Build Coastguard Worker    jl .h_w2
1336*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
1337*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1338*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
1339*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
1340*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_h_shufA]
1341*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1342*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
1343*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m0, q0000
1344*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q1111
1345*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q2222
1346*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
1347*c0909341SAndroid Build Coastguard Worker    jge .h_w16
1348*c0909341SAndroid Build Coastguard Worker.h_w8:
; PUT_6TAP_H dst, src8, src16, tmp1, tmp2
; Horizontal 6-tap filter step, defined inline at its first use in .h_w8 and
; invoked again from .h_w16_loop.
;   %1      in: source words at byte offset +0;  out: filtered, packed and
;           clamped result
;   %2      in: source words at byte offset +8   (overwritten)
;   %3      in: source words at byte offset +16  (overwritten)
;   %4, %5  scratch registers                    (overwritten)
; Fixed registers that are read but not written, as prepared in .h:
;   m4    = rounding term (from put_8tap_h_rnd) added before the >>6
;   m5    = word broadcast of r8m used as the pminsw upper limit
;           (presumably bitdepth_max -- confirm against the C prototype)
;   m6    = pshufb mask (subpel_h_shufA) that forms adjacent word pairs
;   m7-m9 = the three coefficient pairs of the sign-extended filter
; The a* products are summed into m%1 and the b* products into m%2; both are
; shifted right by 6, packed into m%1 with unsigned saturation, then limited
; to m5.
1349*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
1350*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m6        ; 01 12 23 34
1351*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m6        ; 45 56 67 78
1352*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m7, m%1   ; a0
1353*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m6        ; 89 9a ab bc
1354*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%2   ; a2
1355*c0909341SAndroid Build Coastguard Worker    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
1356*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5       ; a0+a2
1357*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m7, m%2   ; b0
1358*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
1359*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m9        ; b2
1360*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m8        ; a1
1361*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m8        ; b1
1362*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5       ; b0+b2
1363*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m4
1364*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m4
1365*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%4
1366*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3
1367*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 6
1368*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 6
1369*c0909341SAndroid Build Coastguard Worker    packusdw            m%1, m%2
1370*c0909341SAndroid Build Coastguard Worker    pminsw              m%1, m5
1371*c0909341SAndroid Build Coastguard Worker%endmacro
1372*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+ 0]
1373*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+ 0], 1
1374*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0+16]
1375*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1+16], 1
1376*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m0, m2, 0x05
1377*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1378*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            0, 1, 2, 3, 10
1379*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
1380*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
1381*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1382*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1383*c0909341SAndroid Build Coastguard Worker    jg .h_w8
1384*c0909341SAndroid Build Coastguard Worker    RET
1385*c0909341SAndroid Build Coastguard Worker.h_w16:
1386*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
1387*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
1388*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*2+ 0]
1389*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+ 8]
1390*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+16]
1391*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            0, 1, 2, 3, 10
1392*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r6*2], m0
1393*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 16
1394*c0909341SAndroid Build Coastguard Worker    jge .h_w16_loop
1395*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
1396*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
1397*c0909341SAndroid Build Coastguard Worker    dec                  hd
1398*c0909341SAndroid Build Coastguard Worker    jg .h_w16
1399*c0909341SAndroid Build Coastguard Worker    RET
1400*c0909341SAndroid Build Coastguard Worker.v:
1401*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1402*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1403*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1404*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1405*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
1406*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10, 12
1407*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pd_32]
1408*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, r8m
1409*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1410*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1411*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
1412*c0909341SAndroid Build Coastguard Worker    neg                  r6
1413*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m0, q0000
1414*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q1111
1415*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q2222
1416*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1417*c0909341SAndroid Build Coastguard Worker    jg .v_w8
1418*c0909341SAndroid Build Coastguard Worker    je .v_w4
1419*c0909341SAndroid Build Coastguard Worker.v_w2:
1420*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+r6 *2]
1421*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+r6 *1], 1
1422*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*0], 2
1423*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*1], 3 ; 0 1 2 3
1424*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1425*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+ssq*0]
1426*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
1427*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm3        ; 01 12
1428*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm3             ; 23 34
1429*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
1430*c0909341SAndroid Build Coastguard Worker    movd                xm3, [srcq+ssq*1]
1431*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm7, xm1        ; a0 b0
1432*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
1433*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm8             ; a1 b1
1434*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1435*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm2
1436*c0909341SAndroid Build Coastguard Worker    punpckldq           xm2, xm0, xm3        ; 4 5
1437*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+ssq*0]
1438*c0909341SAndroid Build Coastguard Worker    punpckldq           xm3, xm0             ; 5 6
1439*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm3             ; 45 56
1440*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm9, xm2        ; a2 b2
1441*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm5
1442*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm3
1443*c0909341SAndroid Build Coastguard Worker    psrad               xm4, 6
1444*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm4
1445*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, xm6
1446*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm4
1447*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm4, 1
1448*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1449*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1450*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1451*c0909341SAndroid Build Coastguard Worker    RET
1452*c0909341SAndroid Build Coastguard Worker.v_w4:
1453*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+r6 *2]
1454*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+r6 *1]
1455*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
1456*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
1457*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1458*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]
1459*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0x30
1460*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x30
1461*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3     ; 01 12
1462*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0x30
1463*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
1464*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4     ; 23 34
1465*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
1466*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
1467*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m1 ; a0 b0
1468*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1469*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8     ; a1 b1
1470*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1471*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
1472*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, m3, 0x30
1473*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]
1474*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0x30
1475*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3     ; 45 56
1476*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9, m2 ; a2 b2
1477*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
1478*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
1479*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
1480*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m4, 1
1481*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm3
1482*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, xm6
1483*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm4
1484*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm4
1485*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1486*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1487*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1488*c0909341SAndroid Build Coastguard Worker    RET
1489*c0909341SAndroid Build Coastguard Worker.v_w8:
1490*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
1491*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       12
1492*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
1493*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
1494*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [srcq+r6 *2]
1495*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+r6 *1]
1496*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2]
1497*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
1498*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [srcq+ssq*1]
1499*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
1500*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r7+ssq*0]
1501*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m0, 0x0c
1502*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m1, 0x0c
1503*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4 ; 01
1504*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; 23
1505*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m2, 0x0c
1506*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0 ; 12
1507*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0     ; 34
1508*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1509*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [r7+ssq*1]
1510*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m7, m1 ; a0
1511*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
1512*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m7, m2 ; b0
1513*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1514*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8     ; a1
1515*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1516*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8     ; b1
1517*c0909341SAndroid Build Coastguard Worker    paddd               m10, m3
1518*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r7+ssq*0]
1519*c0909341SAndroid Build Coastguard Worker    paddd               m11, m4
1520*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m5, 0x0d
1521*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m5, m3, 0x0c
1522*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0 ; 45
1523*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0     ; 56
1524*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m9, m3 ; a2
1525*c0909341SAndroid Build Coastguard Worker    paddd               m10, m5
1526*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m9, m4 ; b2
1527*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11
1528*c0909341SAndroid Build Coastguard Worker    psrad               m10, 5
1529*c0909341SAndroid Build Coastguard Worker    psrad                m5, 5
1530*c0909341SAndroid Build Coastguard Worker    packusdw            m10, m5
1531*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
1532*c0909341SAndroid Build Coastguard Worker    pavgw                m5, m10
1533*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m6
1534*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
1535*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], xm5
1536*c0909341SAndroid Build Coastguard Worker    vextracti128 [r8+dsq*1], m5, 1
1537*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
1538*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1539*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1540*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
1541*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1542*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
1543*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
1544*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
1545*c0909341SAndroid Build Coastguard Worker    RET
1546*c0909341SAndroid Build Coastguard Worker.hv:
1547*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 16
1548*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_512]
1549*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, r8m
1550*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1551*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
1552*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1553*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
1554*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1555*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1556*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1557*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1558*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
1559*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1560*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
1561*c0909341SAndroid Build Coastguard Worker    neg                  r6
1562*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1563*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0
1564*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
1565*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8 ; sign-extend
1566*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
1567*c0909341SAndroid Build Coastguard Worker    jz .hv_10bit
1568*c0909341SAndroid Build Coastguard Worker    psraw                m6, 2
1569*c0909341SAndroid Build Coastguard Worker    psllw                m1, 2
1570*c0909341SAndroid Build Coastguard Worker.hv_10bit:
1571*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m1, q0000
1572*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m1, q1111
1573*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m1, q2222
1574*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1575*c0909341SAndroid Build Coastguard Worker    je .hv_w4
1576*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [subpel_h_shuf2]
1577*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
1578*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m0, [srcq+r6*2], 1 ; 2 0
1579*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
1580*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+r6 *1], 1    ; 3 1
1581*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1582*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0], 0    ; 4 2
1583*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb  x, m5}, m2, m1, m0
1584*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m6}, m2, m1, m0
1585*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m1
1586*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m0
1587*c0909341SAndroid Build Coastguard Worker    paddd                m2, m10
1588*c0909341SAndroid Build Coastguard Worker    paddd                m1, m10
1589*c0909341SAndroid Build Coastguard Worker    psrad                m2, 10
1590*c0909341SAndroid Build Coastguard Worker    psrad                m1, 10
1591*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1       ; 2 3 3 4   0 1 1 2
1592*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m2, m2
1593*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m0       ; 23 34
1594*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m2, 1    ; 01 12
1595*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
1596*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
1597*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1598*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]
1599*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm5
1600*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm5
1601*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm6
1602*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm6
1603*c0909341SAndroid Build Coastguard Worker    phaddd              xm3, xm4
1604*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm7, xm1 ; a0 b0
1605*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
1606*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm8      ; a1 b1
1607*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm2
1608*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm10
1609*c0909341SAndroid Build Coastguard Worker    psrad               xm3, 10
1610*c0909341SAndroid Build Coastguard Worker    packssdw            xm3, xm3
1611*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm3, xm0, 12
1612*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm3
1613*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm0      ; 45 56
1614*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm9, xm2 ; a2 b2
1615*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm10
1616*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm3
1617*c0909341SAndroid Build Coastguard Worker    psrad               xm4, 10
1618*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm4
1619*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, xm11
1620*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm4
1621*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm4, 1
1622*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1623*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1624*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
1625*c0909341SAndroid Build Coastguard Worker    RET
1626*c0909341SAndroid Build Coastguard Worker.hv_w4:
1627*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       14
1628*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [subpel_h_shufA]
1629*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q0000
1630*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m13, [subpel_h_shufB]
1631*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1111
1632*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+r6 *2]
1633*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
1634*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
1635*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
1636*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1637*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*0]    ; 4
1638*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m12
1639*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
1640*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m13
1641*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6
1642*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0, m12
1643*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m5
1644*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m13
1645*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
1646*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1
1647*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm3, xm12
1648*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm1, xm5
1649*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm13
1650*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm6
1651*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
1652*c0909341SAndroid Build Coastguard Worker    paddd                m2, m10
1653*c0909341SAndroid Build Coastguard Worker    paddd               xm1, xm10
1654*c0909341SAndroid Build Coastguard Worker    paddd                m0, m10
1655*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm1
1656*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 10}, m2, m0, xm3
1657*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m0     ; 0 2   1 3
1658*c0909341SAndroid Build Coastguard Worker    packssdw            xm0, xm3    ; 2 4
1659*c0909341SAndroid Build Coastguard Worker    vperm2i128           m0, m2, 0x03
1660*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0 ; 01 12
1661*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0     ; 23 34
; Main loop of the 4-pixel-wide horizontal+vertical path. Each pass reads two
; more source rows, filters them horizontally, combines them vertically with
; the row pairs kept from earlier passes and stores two 8-byte output rows.
; Live registers on entry (set up before this loop, partly outside this view):
;   m1 = row pairs 01 12, m2 = row pairs 23 34, m0 = last packed rows
;   m12/m13 = pshufb controls, m5/m6 = multipliers for the horizontal pass
;   m7/m8/m9 = multipliers for the vertical pass
;   m10 = constant added before each >>10
;   xm11 = upper clamp applied before the store (presumably the pixel max
;          -- TODO confirm against the function prologue)
1662*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
; Low lane = row at srcq+ss, high lane = row at srcq+2*ss (srcq advances by 2).
1663*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
1664*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1665*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*0], 1
; Start the vertical sum from the two oldest row pairs while shifting the
; pair history down (m1 <- old m2).
1666*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m1 ; a0 b0
1667*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1668*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8     ; a1 b1
1669*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
; Horizontal filter of the two new rows: two shuffles, two multiply-adds,
; add m10, arithmetic shift right by 10, pack to signed words.
1670*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m12
1671*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
1672*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m13
1673*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6
1674*c0909341SAndroid Build Coastguard Worker    paddd                m2, m10
1675*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2
1676*c0909341SAndroid Build Coastguard Worker    psrad                m3, 10
1677*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m3     ; 5 5   6 6
; Build the newest interleaved row pairs from the previous packed rows (m0)
; and the new ones (m3); m0 is then replaced by the new rows for the next pass.
1678*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m3, 0x21
1679*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3
1680*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3     ; 45 56
; Finish the vertical sum, add m10, shift right by 10, pack with unsigned
; saturation and clamp to xm11.
1681*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9, m2 ; a2 b2
1682*c0909341SAndroid Build Coastguard Worker    paddd                m4, m10
1683*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
1684*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10
1685*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m4, 1
1686*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm3
1687*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, xm11
; Low 8 bytes go to the first output row, high 8 bytes to the second.
1688*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm4
1689*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm4
1690*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
; Two rows are produced per pass.
1691*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1692*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1693*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal+vertical 6-tap path for widths handled in 8-pixel columns.
; The outer loop (.hv_w8_loop0) walks 16-byte-wide columns; the inner loop
; (.hv_w8_loop) produces two output rows per pass within a column.
; Registers/constants used here but set before this view: m10 (added before
; every >>10) and m11 (upper clamp before the store; presumably the pixel
; max -- TODO confirm against the function prologue).
1694*c0909341SAndroid Build Coastguard Worker.hv_w8:
1695*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16, 12
; mxd >> 16 selects the horizontal filter entry; 8 bytes are read starting at
; byte 1 of that entry (the "+1" offset).
1696*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1697*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [subpel_h_shufA]
1698*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
; Vertical filter index: myd >> 16 normally, but the low byte of my is used
; instead when h < 6 (cmovs acts on the sign of h - 6).
1699*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1700*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1701*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1702*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
; Vertical coefficients are sign-extended from bytes to words.
1703*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
; wd is rebuilt below as w*32 + h - 256: its low byte is read back as h
; (movzx hd, wb) and 1<<8 is subtracted once per 8-pixel column.
1704*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
; r6 = -ss, used to address the rows above srcq.
1705*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
; Step back 4 bytes (two 16-bit pixels) for the left filter taps.
1706*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
1707*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
1708*c0909341SAndroid Build Coastguard Worker    neg                  r6
; Interleaving zero below each filter byte yields words equal to coef << 8.
1709*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
1710*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
; Bit 0x800 of r8m picks the coefficient scaling: when it is set the
; horizontal words are shifted right by 2 and the vertical words left by 2;
; when clear (the .hv_w8_10bit target) both are left as they are.
1711*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
1712*c0909341SAndroid Build Coastguard Worker    jz .hv_w8_10bit
1713*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1714*c0909341SAndroid Build Coastguard Worker    psllw               xm1, 2
1715*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit:
; m7/m8/m9 = horizontal coefficient pairs 0/1, 2/3, 4/5 broadcast per dword.
1716*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m0, q0000
1717*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q1111
; The vertical coefficients are parked in a 16-byte stack slot (v_mul)
; because the inner loop needs m13-m15 as temporaries and reloads the
; coefficient pairs from memory on every pass.
1718*c0909341SAndroid Build Coastguard Worker%if WIN64
1719*c0909341SAndroid Build Coastguard Worker    %define v_mul (rsp+stack_offset+40) ; r4m
1720*c0909341SAndroid Build Coastguard Worker%else
1721*c0909341SAndroid Build Coastguard Worker    %define v_mul (rsp+stack_offset+ 8) ; r6m
1722*c0909341SAndroid Build Coastguard Worker%endif
1723*c0909341SAndroid Build Coastguard Worker    mova            [v_mul], xm1
1724*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q2222
; ---- per-column setup: horizontally filter the first five rows ----------
1725*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
; m3/m1: low lane = row at srcq-2*ss, high lane = row at srcq (bytes 0-15 and
; 16-31). m0/m2: low lane = row at srcq, high lane = row at srcq+2*ss.
1726*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0+ 0]
1727*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
1728*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2]
1729*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [srcq+ssq*0+16]
1730*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m2, [srcq+r6*2+16], 0
; r8 is the per-column destination cursor; dstq itself only moves per column.
1731*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
1732*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r7  +ssq*0+ 0], 1
1733*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r7  +ssq*0+16], 1
; shufpd 0x05 joins the high qword of the first source with the low qword of
; the second in each lane, i.e. bytes 8-23 of the row ("src+8").
1734*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m3, m1, 0x05
; PUT_6TAP_HV_H: horizontal 6-tap filter of 8 pixels per 128-bit lane.
;   %1    in: row bytes 0-15, out: 8 filtered signed words per lane
;   %2    in: row bytes 8-23  (clobbered)
;   %3    in: row bytes 16-31 (clobbered)
;   %4,%5 scratch registers   (clobbered)
; Reads m12 (pshufb control), m7/m8/m9 (coefficient pairs) and m10 (added
; before the >>10). Results are packed with signed saturation.
1735*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
1736*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m12       ; 01 12 23 34
1737*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m12       ; 45 56 67 78
1738*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m7, m%1   ; a0
1739*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m12       ; 89 9a ab bc
1740*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%2   ; a2
1741*c0909341SAndroid Build Coastguard Worker    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
1742*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5       ; a0+a2
1743*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m7, m%2   ; b0
1744*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
1745*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m9        ; b2
1746*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m8        ; a1
1747*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m8        ; b1
1748*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5       ; b0+b2
1749*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m10
1750*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m10
1751*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%4
1752*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3
1753*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 10
1754*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 10
1755*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%2
1756*c0909341SAndroid Build Coastguard Worker%endmacro
; Row numbers in the trailing comments count from the row at srcq-2*ss (= 0).
1757*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_HV_H         3, 4, 1, 5, 6  ; 0 2
1758*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r6 *1+ 0]
1759*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+ssq*1+ 0], 1
1760*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m0, m2, 0x05
1761*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_HV_H         0, 1, 2, 5, 6  ; 2 4
1762*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+r6 *1+16]
1763*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1+16], 1
1764*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m4, m2, 0x05
1765*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_HV_H         4, 1, 2, 5, 6  ; 1 3
; Undo the per-lane interleave left by packssdw, then build the interleaved
; row pairs consumed by the vertical pmaddwd steps.
1766*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
1767*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
1768*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1769*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4      ; 01
1770*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23
1771*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0      ; 12
1772*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0          ; 34
; ---- inner loop: two output rows of 8 pixels per pass -------------------
1773*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
; Vertical coefficient pairs are reloaded every pass: m15 is overwritten by
; the b0 product and m13 doubles as a scratch register for PUT_6TAP_HV_H.
1774*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [v_mul+4*0]
1775*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [v_mul+4*1]
; Next two source rows: r7+ss in the low lanes, r7+2*ss in the high lanes.
1776*c0909341SAndroid Build Coastguard Worker    movu                xm5, [r7+ssq*1+ 0]
1777*c0909341SAndroid Build Coastguard Worker    movu                xm6, [r7+ssq*1+16]
1778*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
1779*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m15, m1     ; a0
1780*c0909341SAndroid Build Coastguard Worker    pmaddwd             m15, m2          ; b0
1781*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r7+ssq*0+ 0], 1
1782*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r7+ssq*0+16], 1
; Shift the row-pair history down by two rows while accumulating.
1783*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1784*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13         ; a1
1785*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1786*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13         ; b1
1787*c0909341SAndroid Build Coastguard Worker    paddd               m14, m3
1788*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m5, m6, 0x05
1789*c0909341SAndroid Build Coastguard Worker    paddd               m15, m4
1790*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_HV_H         5, 3, 6, 4, 13 ; 5 6
1791*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [v_mul+4*2]
1792*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
; m4 = {previous newest row, first new row}; m0 keeps the new rows for the
; next pass.
1793*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m5, 0x05
1794*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
1795*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m5      ; 45
1796*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5          ; 56
1797*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6, m3      ; a2
1798*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m4          ; b2
; Add m10, shift right by 10, pack with unsigned saturation, clamp to m11.
1799*c0909341SAndroid Build Coastguard Worker    paddd               m14, m10
1800*c0909341SAndroid Build Coastguard Worker    paddd               m15, m10
1801*c0909341SAndroid Build Coastguard Worker    paddd                m5, m14
1802*c0909341SAndroid Build Coastguard Worker    paddd                m6, m15
1803*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
1804*c0909341SAndroid Build Coastguard Worker    psrad                m6, 10
1805*c0909341SAndroid Build Coastguard Worker    packusdw             m5, m6
1806*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m11
1807*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
; 16 bytes per output row; mova is the aligned-move form, so the destination
; rows are expected to be 16-byte aligned here (NOTE(review): confirm).
1808*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], xm5
1809*c0909341SAndroid Build Coastguard Worker    vextracti128 [r8+dsq*1], m5, 1
1810*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
1811*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1812*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
; Next column: advance 16 bytes (8 pixels), restore h from the low byte of
; wd and drop one from the column counter kept in the upper bits.
1813*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
1814*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1815*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
1816*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
1817*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
1818*c0909341SAndroid Build Coastguard Worker    RET
1819*c0909341SAndroid Build Coastguard Worker
; Per-filter-combination entry points for every pairing that involves the
; SHARP filter. PUT_8TAP_FN is defined outside this view; the fourth argument
; names the shared implementation. The last invocation omits it, so it
; presumably falls through into the cglobal that follows -- TODO confirm
; against the macro definition.
1820*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
1821*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
1822*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
1823*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
1824*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp,          SHARP,   SHARP
1825*c0909341SAndroid Build Coastguard Worker
; put_8tap_16bpc(dst, ds, src, ss, w, h, mx, my): shared 8-tap implementation.
; r8m is used below as the value the output is clamped to and as the selector
; for bit-depth-dependent constants. t0d/t1d are expected to be set by the
; entry points above before control reaches this code.
; Dispatch: mx != 0 -> .h (which itself diverts to .hv when my != 0);
;           mx == 0 and my != 0 -> .v;
;           both zero -> the .put path of put_6tap_16bpc.
1826*c0909341SAndroid Build Coastguard Workercglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
; Table references in this function are written relative to put_avx2, whose
; address is kept in r8.
1827*c0909341SAndroid Build Coastguard Worker%define base r8-put_avx2
; Multiplying by 0x010101 copies the subpel position into three bytes; adding
; t0d/t1d turns the outer two bytes into filter-table indices and leaves the
; raw position in the middle byte.
1828*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1829*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
1830*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1831*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
1832*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx2]
1833*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
1834*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
; Bits 8-11 are the low nibble of the middle byte, i.e. the subpel position.
1835*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1836*c0909341SAndroid Build Coastguard Worker    jnz .h
1837*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1838*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
; Vertical-only 8-tap path: common setup, then dispatch on width.
; Leaves m8-m11 = coefficient pairs, m6 = pd_32, m7 = clamp value (r8m),
; r6 = 3*ss and srcq moved up by three rows.
1839*c0909341SAndroid Build Coastguard Worker.v:
; Filter index: myd >> 16 normally, the low byte of my when h < 6 (cmovs acts
; on the sign of h - 6).
1840*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1841*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1842*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1843*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1844*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+myq*8]
1845*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 15
; m6 = constant added before the >>6 in the w2/w4 loops.
1846*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [pd_32]
; m7 = r8m broadcast as words; used with pminsw to clamp the output.
1847*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, r8m
; Start three rows above the current position (the taps above the pixel).
1848*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
1849*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
; Duplicate each coefficient byte into a word, then arithmetic-shift to get
; sign-extended 16-bit coefficients.
1850*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1851*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
; m8..m11 = coefficient pairs 0/1, 2/3, 4/5, 6/7 broadcast per dword.
1852*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
1853*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
1854*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
1855*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q3333
; w > 4 -> .v_w8, w == 4 -> .v_w4, otherwise fall through to .v_w2.
1856*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1857*c0909341SAndroid Build Coastguard Worker    jg .v_w8
1858*c0909341SAndroid Build Coastguard Worker    je .v_w4
; Vertical 8-tap filter for 2-pixel-wide blocks (4 bytes per row).
; Setup loads seven rows and interleaves neighbouring rows into word pairs;
; the loop then consumes two new rows and emits two output rows per pass.
; Uses xm8-xm11 (coefficient pairs), xm6 (pd_32) and xm7 (clamp) from .v.
1859*c0909341SAndroid Build Coastguard Worker.v_w2:
1860*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+ssq*0]
1861*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*1], 1
1862*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*2], 2
1863*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+r6   ], 3 ; 0 1 2 3
1864*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
1865*c0909341SAndroid Build Coastguard Worker    movd                xm3, [srcq+ssq*0]
1866*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm1, [srcq+ssq*1]
1867*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*2]
; srcq now points at row 7, the first row read inside the loop.
1868*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
1869*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm1, 0x02       ; 4 5
1870*c0909341SAndroid Build Coastguard Worker    vpblendd            xm1, xm0, 0x02       ; 5 6
1871*c0909341SAndroid Build Coastguard Worker    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
1872*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm1             ; 45 56
1873*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm4        ; 01 12
1874*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm4             ; 23 34
; Loop state: xm1/xm2/xm3 = row pairs 01 12 / 23 34 / 45 56, xm0 = newest row.
1875*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
1876*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [srcq+ssq*0]
; Accumulate taps while sliding the pair history down by two rows.
1877*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm8, xm1        ; a0 b0
1878*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
1879*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm9             ; a1 b1
1880*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm6
1881*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm2
1882*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3
1883*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm10            ; a2 b2
1884*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm3
1885*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
1886*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*1]
1887*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1888*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm0, 0x02       ; 7 8
1889*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm4             ; 67 78
1890*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm11, xm3       ; a3 b3
1891*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm4
; (sum + 32) >> 6, pack with unsigned saturation, clamp to xm7.
1892*c0909341SAndroid Build Coastguard Worker    psrad               xm5, 6
1893*c0909341SAndroid Build Coastguard Worker    packusdw            xm5, xm5
1894*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm7
; 4 bytes (two pixels) per output row.
1895*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm5
1896*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm5, 1
1897*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1898*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1899*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1900*c0909341SAndroid Build Coastguard Worker    RET
; Vertical 8-tap filter for 4-pixel-wide blocks (8 bytes per row).
; Same scheme as the 2-pixel path but in 256-bit registers: the low lane
; produces the first output row of each pair and the high lane the second.
; Uses m8-m11 (coefficient pairs), m6 (pd_32) and xm7 (clamp) from .v.
1901*c0909341SAndroid Build Coastguard Worker.v_w4:
; Load rows 0-6; the 0x30 blends place the following row in the high lane
; before neighbouring rows are interleaved into word pairs.
1902*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*0]
1903*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*1]
1904*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*2]
1905*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+r6   ]
1906*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
1907*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*0]
1908*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+ssq*1]
1909*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x30
1910*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x30
1911*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0      ; 01 12
1912*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*2]
; srcq now points at row 7, the first row read inside the loop.
1913*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
1914*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0x30
1915*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, 0x30
1916*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4      ; 23 34
1917*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m5, 0x30
1918*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m0, 0x30
1919*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5      ; 45 56
; Loop state: m1/m2/m3 = row pairs 01 12 / 23 34 / 45 56, m0 = newest row.
1920*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
1921*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*0]
; Accumulate taps while sliding the pair history down by two rows.
1922*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0 b0
1923*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1924*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9      ; a1 b1
1925*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
1926*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
1927*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
1928*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10     ; a2 b2
1929*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
1930*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, m4, 0x30
1931*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*1]
1932*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1933*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
1934*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
1935*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m11, m3 ; a3 b3
1936*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
; (sum + 32) >> 6, merge both lanes with unsigned saturation, clamp to xm7.
1937*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
1938*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m5, 1
1939*c0909341SAndroid Build Coastguard Worker    packusdw            xm5, xm4
1940*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm7
; 8 bytes (four pixels) per output row.
1941*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm5
1942*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm5
1943*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1944*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1945*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1946*c0909341SAndroid Build Coastguard Worker    RET
; Vertical 8-tap filter for widths handled in 8-pixel (16-byte) columns.
; The outer loop (.v_w8_loop0) walks columns; the inner loop (.v_w8_loop)
; emits two output rows per pass. m6 is reused as a data register here, so
; the pd_32 value loaded in .v is not available; rounding is instead done
; with a >>5 followed by pavgw against zero (see below).
1947*c0909341SAndroid Build Coastguard Worker.v_w8:
; wd becomes w*32 + h - 256: the low byte is read back as h (movzx hd, wb)
; and 1<<8 is subtracted once per 8-pixel column.
1948*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
1949*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       15
1950*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
; ---- per-column setup: load rows 0-6 and interleave neighbouring rows ----
1951*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
1952*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+ssq*0]
1953*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [srcq+ssq*1]
; r7 is the per-column source cursor; srcq itself only moves per column.
1954*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*4]
1955*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+r6   ]
1956*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [srcq+ssq*2]
; r8 is the per-column destination cursor.
1957*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
1958*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [r7+ssq*0]
1959*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r7+ssq*1]
1960*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r7+ssq*2]
; r7 now points at row 7, the first row read inside the inner loop.
1961*c0909341SAndroid Build Coastguard Worker    add                  r7, r6
1962*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, 0x0c
1963*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m1, 0x0c
1964*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5 ; 01
1965*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5     ; 34
1966*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m2, 0x0c
1967*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6 ; 12
1968*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45
1969*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m3, 0x0c
1970*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m0 ; 23
1971*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0     ; 56
; ---- inner loop ----------------------------------------------------------
; State: m1..m6 = row pairs 01 12 23 34 45 56, m0 carries the newest row,
; m12/m13 accumulate output rows a/b, m14 is scratch.
1972*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1973*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [r7+ssq*0]
1974*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m8, m1  ; a0
1975*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m8, m2  ; b0
; Slide the pair history down by two rows while accumulating.
1976*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1977*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1978*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
1979*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
1980*c0909341SAndroid Build Coastguard Worker    paddd               m12, m3
1981*c0909341SAndroid Build Coastguard Worker    paddd               m13, m4
1982*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
1983*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
1984*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
1985*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
1986*c0909341SAndroid Build Coastguard Worker    paddd               m12, m5
1987*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [r7+ssq*1]
1988*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
1989*c0909341SAndroid Build Coastguard Worker    paddd               m13, m6
; Combine the carried row (m0) with the two new rows into pairs 67 and 78.
1990*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m0, m14, 0x0d
1991*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m14, m5, 0x0c
1992*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m0  ; 67
1993*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0      ; 78
1994*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m5 ; a3
1995*c0909341SAndroid Build Coastguard Worker    paddd               m12, m14
1996*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m6 ; b3
1997*c0909341SAndroid Build Coastguard Worker    paddd               m13, m14
; Rounded >>6 without a rounding constant: >>5, pack with unsigned
; saturation, then pavgw with zero computes (x + 1) >> 1.
1998*c0909341SAndroid Build Coastguard Worker    psrad               m12, 5
1999*c0909341SAndroid Build Coastguard Worker    psrad               m13, 5
2000*c0909341SAndroid Build Coastguard Worker    packusdw            m12, m13
2001*c0909341SAndroid Build Coastguard Worker    pxor                m13, m13
2002*c0909341SAndroid Build Coastguard Worker    pavgw               m12, m13
; Clamp to m7, then undo the per-lane interleave left by the pack.
2003*c0909341SAndroid Build Coastguard Worker    pminsw              m12, m7
2004*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m12, q3120
; 16 bytes per output row; mova is the aligned-move form, so the destination
; rows are expected to be 16-byte aligned here (NOTE(review): confirm).
2005*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], xm12
2006*c0909341SAndroid Build Coastguard Worker    vextracti128 [r8+dsq*1], m12, 1
2007*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
2008*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2009*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
; Next column: advance 16 bytes (8 pixels), restore h from the low byte of
; wd and drop one from the column counter kept in the upper bits.
2010*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
2011*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2012*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2013*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2014*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
2015*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal path entry (mx != 0). If my is also non-zero this diverts to
; .hv; widths below 8 are delegated to the put_6tap_16bpc w2/w4 code;
; otherwise the 8-tap coefficients and shuffles are prepared and control goes
; to .h_w16 (w >= 16) or falls through to .h_w8.
; Leaves m4 = rounding constant, m5 = clamp value, m6/m7 = pshufb controls,
; m8-m11 = coefficient pairs.
2016*c0909341SAndroid Build Coastguard Worker.h:
2017*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2018*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2019*c0909341SAndroid Build Coastguard Worker    jnz .hv
; r8m >> 11 indexes put_8tap_h_rnd (4-byte entries) to pick the rounding
; constant; the unshifted r8m is broadcast as the clamp value.
2020*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r8m
2021*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, r8m
2022*c0909341SAndroid Build Coastguard Worker    shr                 r7d, 11
2023*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4]
; w < 4 and w == 4 reuse the corresponding labels inside put_6tap_16bpc.
2024*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2025*c0909341SAndroid Build Coastguard Worker    jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
2026*c0909341SAndroid Build Coastguard Worker    je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
2027*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      13
; mxd >> 16 selects the 8-tap filter entry.
2028*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
; Step back 6 bytes (three 16-bit pixels) for the left filter taps.
2029*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
2030*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
2031*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufA]
2032*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufB]
; Duplicate each coefficient byte into a word, then arithmetic-shift to get
; sign-extended 16-bit coefficients.
2033*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2034*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
; m8..m11 = coefficient pairs 0/1, 2/3, 4/5, 6/7 broadcast per dword.
2035*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
2036*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
2037*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
2038*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q3333
; wd is left biased by -16 on both outgoing paths.
2039*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2040*c0909341SAndroid Build Coastguard Worker    jge .h_w16
2041*c0909341SAndroid Build Coastguard Worker.h_w8:
; PUT_8TAP_H: horizontal 8-tap filter producing 8 output pixels per 128-bit
; lane.
;   %1    in: row bytes 0-15, out: 8 filtered, clamped words per lane
;   %2    in: row bytes 8-23  (clobbered)
;   %3    in: row bytes 16-31 (clobbered)
;   %4,%5 scratch registers   (clobbered)
; Reads m6/m7 (pshufb controls), m8-m11 (coefficient pairs), m4 (constant
; added before the >>6) and m5 (upper clamp applied with pminsw).
; "abcd" in the comments marks output pixels 0-3 of a lane, "efgh" pixels
; 4-7; the trailing digit is the coefficient-pair index.
2042*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
2043*c0909341SAndroid Build Coastguard Worker    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
2044*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
2045*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%4   ; abcd1
2046*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m8        ; abcd0
2047*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
2048*c0909341SAndroid Build Coastguard Worker    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
2049*c0909341SAndroid Build Coastguard Worker    paddd               m%5, m4
2050*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
2051*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m11, m%2  ; abcd3
2052*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
2053*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m10, m%4  ; abcd2
2054*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m7        ; a b b c c d d e
2055*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m8        ; efgh0
2056*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
2057*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%2   ; efgh1
2058*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
2059*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m11       ; efgh3
2060*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m10       ; efgh2
2061*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m4
2062*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5
2063*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%4
2064*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3
; Shift both halves right by 6, pack with unsigned saturation, clamp to m5.
2065*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 6
2066*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 6
2067*c0909341SAndroid Build Coastguard Worker    packusdw            m%1, m%2
2068*c0909341SAndroid Build Coastguard Worker    pminsw              m%1, m5
2069*c0909341SAndroid Build Coastguard Worker%endmacro
; Horizontal-only loop body for the narrow case (taken when w - 16 < 0).
; Two rows are filtered per iteration: row 0 sits in the low 128-bit lane and
; row 1 in the high lane. Each row needs 32 source bytes starting at srcq,
; which the setup code has already moved back by 6 bytes.
2070*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+ 0]
2071*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+ 0], 1
2072*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0+16]
2073*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1+16], 1
2074*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2075*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m0, m2, 0x05  ; per lane: high qword of m0, low qword of m2 = bytes 8..23
2076*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 1, 2, 3, 12
2077*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0         ; row 0: 16 bytes (aligned store)
2078*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1       ; row 1: 16 bytes
2079*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2080*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2081*c0909341SAndroid Build Coastguard Worker    jg .h_w8
2082*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal-only path for w >= 16, one row per outer iteration.
; wd already holds w - 16 (subtracted by the setup code), so r6 starts at the
; right-most 16-pixel column and steps down by 16 until it goes negative;
; columns are therefore processed right to left. r6 counts pixels, hence the
; *2 scale for 16-bit samples.
2083*c0909341SAndroid Build Coastguard Worker.h_w16:
2084*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
2085*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
2086*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*2+ 0]
2087*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+ 8]
2088*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+16]
2089*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 1, 2, 3, 12
2090*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r6*2], m0        ; 16 output pixels (32-byte aligned store)
2091*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 16
2092*c0909341SAndroid Build Coastguard Worker    jge .h_w16_loop
2093*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq       ; next row
2094*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
2095*c0909341SAndroid Build Coastguard Worker    dec                  hd
2096*c0909341SAndroid Build Coastguard Worker    jg .h_w16
2097*c0909341SAndroid Build Coastguard Worker    RET
; Combined horizontal + vertical filtering: common setup for w <= 4
; (wider blocks branch to .hv_w8 straight away).
; Register roles established here:
;   m15      word broadcast of r8m, used later as the pminsw upper bound
;            NOTE(review): presumably the pixel maximum - confirm
;   m6       dword broadcast of pd_512
;   m7       4 horizontal taps (bytes 2..5 of the 8-byte filter row), each
;            placed in the high byte of a word, i.e. coef << 8
;   m11-m14  vertical coefficient pairs 0-3, each broadcast to every dword
;   r6       3 * src stride
; srcq is moved one pixel (2 bytes) left and three rows up.
2098*c0909341SAndroid Build Coastguard Worker.hv:
2099*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      16
2100*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m
2101*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2102*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
2103*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb           ; low byte of mx selects the filter row
2104*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
2105*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb           ; candidate index: low byte of my
2106*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16            ; candidate index: upper bits of my
2107*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2108*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd           ; h < 6: use the low-byte index instead
2109*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+subpel_filters+myq*8]
2110*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [pd_512]
2111*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2112*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2113*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
2114*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2115*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m0            ; words = horizontal coef << 8
2116*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
2117*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8 ; sign-extend
2118*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800         ; bit 11 clear -> 10-bit case (per the label)
2119*c0909341SAndroid Build Coastguard Worker    jz .hv_10bit
2120*c0909341SAndroid Build Coastguard Worker    psraw                m7, 2             ; otherwise: horizontal coefs become coef << 6
2121*c0909341SAndroid Build Coastguard Worker    psllw                m1, 2             ; ... and vertical coefs are scaled by 4
2122*c0909341SAndroid Build Coastguard Worker.hv_10bit:
2123*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m1, q0000
2124*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m1, q1111
2125*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m1, q2222
2126*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m1, q3333
2127*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2128*c0909341SAndroid Build Coastguard Worker    je .hv_w4
; hv path reached when w < 4 (fall-through from the w == 4 test).
; Prologue: load the first seven source rows, run the 4-tap horizontal filter
; on them (pshufb + pmaddwd with m7, phaddd to finish each sum, + m6, >> 10)
; and interleave the results into row pairs 01/12, 23/34 and 45/56 held in
; xm1, xm2 and xm3. xm0 keeps rows 3..6 so the loop can build the next pairs.
; Digits in the comments are row numbers relative to the adjusted srcq.
2129*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shuf2]
2130*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [srcq+r6   ]    ; 3 3
2131*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*2]
2132*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
2133*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*1]
2134*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2135*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*0], 1 ; 2 4
2136*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1 ; 0 5
2137*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*2], 1 ; 1 6
2138*c0909341SAndroid Build Coastguard Worker    add                srcq, r6              ; srcq now points at row 7
2139*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9
2140*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
2141*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m9
2142*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9
2143*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m7
2144*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7
2145*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m7
2146*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7
2147*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m3
2148*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m2
2149*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6
2150*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6
2151*c0909341SAndroid Build Coastguard Worker    psrad                m1, 10
2152*c0909341SAndroid Build Coastguard Worker    psrad                m0, 10
2153*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0         ; 3 2 0 1
2154*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m1, 1      ; 3 4 5 6
2155*c0909341SAndroid Build Coastguard Worker    pshufd              xm2, xm1, q1301 ; 2 3 1 2
2156*c0909341SAndroid Build Coastguard Worker    pshufd              xm3, xm0, q2121 ; 4 5 4 5
2157*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm2        ; 01 12
2158*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm0        ; 23 34
2159*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm3, xm0        ; 45 56
; Main loop: two output rows (a, b) per iteration. The row-pair registers
; slide down by one pair each pass (xm1 <- xm2 <- xm3 <- new 67/78) while the
; vertical 8-tap sum is accumulated in xm5 from the four coefficient pairs.
2160*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
2161*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]
2162*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*1]
2163*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2164*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm9
2165*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm9
2166*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm7
2167*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm7
2168*c0909341SAndroid Build Coastguard Worker    phaddd              xm4, xm5       ; horizontal sums of the two new rows
2169*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm11, xm1 ; a0 b0
2170*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
2171*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm12      ; a1 b1
2172*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm2
2173*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3
2174*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm13      ; a2 b2
2175*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm3
2176*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm6
2177*c0909341SAndroid Build Coastguard Worker    psrad               xm4, 10
2178*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
2179*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm4, xm0, 12
2180*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm4       ; keep the new rows for the next iteration
2181*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0       ; 67 78
2182*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm14, xm3 ; a3 b3
2183*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm6
2184*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm4
2185*c0909341SAndroid Build Coastguard Worker    psrad               xm5, 10
2186*c0909341SAndroid Build Coastguard Worker    packusdw            xm5, xm5
2187*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm15      ; clamp to the bound broadcast from r8m
2188*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm5       ; row a: 4 bytes = 2 pixels
2189*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm5, 1    ; row b: 4 bytes = 2 pixels
2190*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2191*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2192*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
2193*c0909341SAndroid Build Coastguard Worker    RET
; hv path for w == 4.
; The four horizontal taps in m7 are split into two broadcast pairs
; (m7 = first pair, m8 = second pair) and applied through the
; subpel_h_shufA / subpel_h_shufB shuffles held in m9 / m10.
; Prologue: rows 0..6 are loaded two per register, filtered horizontally
; (+ m6 bias), and merged into interleaved row pairs:
;   m1 = 01 12, m2 = 23 34, m3 = 45 56, m0 = rows 5 6 as sign-extended dwords.
; The merge works on the unshifted sums: "psrld 10" leaves sum >> 10 in the
; low word of each dword, "pslld 6" places the low 16 bits of sum >> 10 in the
; high word, and "pblendw 0xaa" takes the high words from the second operand.
2194*c0909341SAndroid Build Coastguard Worker.hv_w4:
2195*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shufA]
2196*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [subpel_h_shufB]
2197*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m7, q1111
2198*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0000
2199*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0]
2200*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1], 1     ; 0 1
2201*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+r6   ]
2202*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m0, [srcq+ssq*2], 0 ; 2 3
2203*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2204*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0], 1     ; 3 4
2205*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
2206*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*2], 1     ; 5 6
2207*c0909341SAndroid Build Coastguard Worker    add                srcq, r6                  ; srcq now points at row 7
2208*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m1, m9
2209*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
2210*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
2211*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8
2212*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m2, m9
2213*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10
2214*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7
2215*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8
2216*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
2217*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4        ; rows 0 1: biased horizontal sums
2218*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0, m9
2219*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10
2220*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
2221*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8
2222*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
2223*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5        ; rows 2 3
2224*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m3, m9
2225*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m10
2226*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7
2227*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8
2228*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
2229*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0        ; rows 3 4
2230*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
2231*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3        ; rows 5 6
2232*c0909341SAndroid Build Coastguard Worker    vperm2i128           m0, m1, m2, 0x21
2233*c0909341SAndroid Build Coastguard Worker    psrld                m1, 10
2234*c0909341SAndroid Build Coastguard Worker    psrld                m2, 10
2235*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m4, m5, 0x21
2236*c0909341SAndroid Build Coastguard Worker    pslld                m4, 6
2237*c0909341SAndroid Build Coastguard Worker    pslld                m5, 6
2238*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m4, 0xaa ; 23 34
2239*c0909341SAndroid Build Coastguard Worker    pslld                m0, 6
2240*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m0, 0xaa ; 01 12
2241*c0909341SAndroid Build Coastguard Worker    psrld                m3, 10
2242*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m5, 0xaa ; 45 56
2243*c0909341SAndroid Build Coastguard Worker    psrad                m0, m5, 16
; Main loop: two output rows (a in the low lane, b in the high lane) per
; iteration; the row-pair registers slide down by one pair each pass.
2244*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2245*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]
2246*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+ssq*1], 1
2247*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2248*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m11, m1   ; a0 b0
2249*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2250*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m12       ; a1 b1
2251*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
2252*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
2253*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2254*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13       ; a2 b2
2255*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
2256*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4, m9
2257*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m10
2258*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7
2259*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8
2260*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6
2261*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
2262*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10        ; horizontally filtered new rows 7 8
2263*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4        ; _ 7 6 8
2264*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m0, q1122 ; _ 6 _ 7
2265*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0        ; 67 78
2266*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4        ; keep the new rows for the next iteration
2267*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m14, m3   ; a3 b3
2268*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
2269*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10
2270*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
2271*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm5
2272*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, xm15      ; clamp to the bound broadcast from r8m
2273*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm4       ; row a: 8 bytes = 4 pixels
2274*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm4       ; row b: 8 bytes = 4 pixels
2275*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2276*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2277*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2278*c0909341SAndroid Build Coastguard Worker    RET
; hv path for w > 4: the block is processed in 8-pixel-wide columns.
; Setup performed here:
;   m11-m14  horizontal coefficient pairs 0-3 (full 8-tap row), stored as
;            coef << 8 words and broadcast to every dword
;   [v_mul]  the eight sign-extended vertical taps as words
;            NOTE(review): v_mul is defined outside this excerpt (presumably
;            a stack slot) - confirm
;   wd       packed loop state: h + w*32 - 256, i.e. h in the low byte and
;            (w/8 - 1) in the bits above it; the column loop subtracts 1<<8
;            per column and reloads hd from the low byte.
;            NOTE(review): this packing relies on h fitting in one byte.
; srcq is moved three pixels (6 bytes) left and three rows up.
2279*c0909341SAndroid Build Coastguard Worker.hv_w8:
2280*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16            ; upper bits of mx select the filter row
2281*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+subpel_filters+mxq*8]
2282*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb           ; candidate index: low byte of my
2283*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16            ; candidate index: upper bits of my
2284*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2285*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd           ; h < 6: use the low-byte index instead
2286*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm1, [base+subpel_filters+myq*8]
2287*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
2288*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2289*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
2290*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2291*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
2292*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2            ; words = horizontal coef << 8
2293*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
2294*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800         ; bit 11 clear -> 10-bit case (per the label)
2295*c0909341SAndroid Build Coastguard Worker    jz .hv_w8_10bit
2296*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2             ; otherwise: horizontal coefs become coef << 6
2297*c0909341SAndroid Build Coastguard Worker    psllw               xm1, 2             ; ... and vertical coefs are scaled by 4
2298*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit:
2299*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q0000
2300*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q1111
2301*c0909341SAndroid Build Coastguard Worker    mova            [v_mul], xm1
2302*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q2222
2303*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q3333
; Head of the per-column loop (one pass per 8-pixel column).
2304*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
; PUT_8TAP_HV_H: horizontal 8-tap pass used by the hv column prologue.
;   %1  in:  source words at byte offset +0
;       out: eight filtered rows values per lane, packed to signed words
;   %2  source words at byte offset +8  (clobbered)
;   %3  source words at byte offset +16 (clobbered)
; m2 and m3 are used as scratch and clobbered, so they must not be passed as
; arguments. Registers expected from the surrounding code:
;   m8/m9    subpel_h_shufA / subpel_h_shufB byte shuffles
;   m10      bias added once to each accumulator (pd_512 at the call sites)
;   m11-m14  horizontal coefficient pairs 0-3, broadcast to every dword
; Unlike PUT_8TAP_H the result is shifted by 10, packed with signed
; saturation and not clamped, since it feeds the vertical pass.
; The macro is expanded under both INIT_XMM and INIT_YMM below.
2305*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
2306*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
2307*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m8        ; 0 1 1 2 2 3 3 4
2308*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m2   ; first four outputs, coef pair 1
2309*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m11       ; first four outputs, coef pair 0
2310*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m9        ; 6 7 7 8 8 9 9 a
2311*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
2312*c0909341SAndroid Build Coastguard Worker    paddd                m3, m10
2313*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m3
2314*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14, m%2  ; first four outputs, coef pair 3
2315*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m3
2316*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13, m2   ; first four outputs, coef pair 2
2317*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m9        ; a b b c c d d e
2318*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11       ; last four outputs, coef pair 0
2319*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m3
2320*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m%2  ; last four outputs, coef pair 1
2321*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
2322*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m14       ; last four outputs, coef pair 3
2323*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m13       ; last four outputs, coef pair 2
2324*c0909341SAndroid Build Coastguard Worker    paddd                m2, m10
2325*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
2326*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m2
2327*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3
2328*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 10
2329*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 10
2330*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%2
2331*c0909341SAndroid Build Coastguard Worker%endmacro
; Per-column prologue: horizontally filter the first seven rows of this
; 8-pixel column and arrange them as interleaved row pairs for the vertical
; pass. Row 3 is filtered alone in XMM registers; the other rows are filtered
; two at a time, one per 128-bit lane (rows 0|4, 2|6, 1|5).
; Also set up here: r7 = source row pointer for the inner loop (srcq + 7 rows
; once r6 is added), r8 = destination row pointer, m8/m9 = shufA/shufB,
; m10 = pd_512.
; On exit: m1 = 01, m2 = 12, m3 = 23, m4 = 34, m5 = 45, m6 = 56, and m0 still
; holds the filtered rows 2|6.
2332*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r6 *1+ 0]
2333*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufA]
2334*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*4]
2335*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+r6 *1+ 8]
2336*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shufB]
2337*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
2338*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+r6 *1+16]
2339*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_512]
2340*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*0+ 0]
2341*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r7  +ssq*0+ 0], 1
2342*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0+16]
2343*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r7  +ssq*0+16], 1
2344*c0909341SAndroid Build Coastguard Worker    shufpd               m7, m5, m1, 0x05  ; bytes 8..23 of rows 0|4
2345*c0909341SAndroid Build Coastguard Worker    INIT_XMM avx2
2346*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         4, 6, 0    ; 3
2347*c0909341SAndroid Build Coastguard Worker    INIT_YMM avx2
2348*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         5, 7, 1    ; 0 4
2349*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*2+ 0]
2350*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+r6 *2+ 0], 1
2351*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*2+16]
2352*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+r6 *2+16], 1
2353*c0909341SAndroid Build Coastguard Worker    shufpd               m7, m0, m1, 0x05  ; bytes 8..23 of rows 2|6
2354*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 7, 1    ; 2 6
2355*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ssq*1+ 0]
2356*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1+16]
2357*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r7  +ssq*1+ 0], 1
2358*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r7  +ssq*1+16], 1
2359*c0909341SAndroid Build Coastguard Worker    add                  r7, r6            ; r7 = srcq + 7 rows
2360*c0909341SAndroid Build Coastguard Worker    shufpd               m7, m6, m1, 0x05  ; bytes 8..23 of rows 1|5
2361*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         6, 7, 1    ; 1 5
2362*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q1100
2363*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
2364*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, q3120
2365*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120
2366*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7, m4  ; 23
2367*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
2368*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m6  ; 01
2369*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6      ; 45
2370*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m7  ; 12
2371*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 56
; Inner loop of the w > 4 hv path: two output rows (a, b) of one 8-pixel
; column per iteration.
;   m1..m6  interleaved row pairs 01 12 23 34 45 56; they slide down by one
;           pair on every pass
;   m8/m9   vertical accumulators for rows a / b
;   m0      horizontally filtered rows from the previous pass (high lane =
;           most recent row)
; All vector registers are in use, so m7 and m10 are reloaded with whatever
; constant the current step needs (vertical taps, shuffle masks, pd_512), and
; the previous last row is parked in the destination line at [r8] as scratch.
; That scratch is overwritten by the real store to [r8+dsq*0] at the end of
; the iteration.
2372*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
2373*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [v_mul+4*0]
2374*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [v_mul+4*1]
2375*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [v_mul+4*2]
2376*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m9, m1  ; a0
2377*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m2      ; b0
2378*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2379*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2380*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; a1
2381*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7      ; b1
2382*c0909341SAndroid Build Coastguard Worker    paddd                m8, m3
2383*c0909341SAndroid Build Coastguard Worker    paddd                m9, m4
2384*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2385*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2386*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
2387*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
2388*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5
2389*c0909341SAndroid Build Coastguard Worker    paddd                m9, m6
2390*c0909341SAndroid Build Coastguard Worker    movu                xm5, [r7+ssq*0]
2391*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r7+ssq*1], 1
2392*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufA]
2393*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [subpel_h_shufB]
2394*c0909341SAndroid Build Coastguard Worker    movu                xm6, [r7+ssq*0+16]
2395*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r7+ssq*1+16], 1
2396*c0909341SAndroid Build Coastguard Worker    vextracti128       [r8], m0, 1   ; park the previous last row in dst (scratch)
2397*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m7  ; 01
2398*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m10     ; 23
2399*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m11
2400*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12
2401*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
2402*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m6, m7  ; 89
2403*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m10     ; ab
2404*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13
2405*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14
2406*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5
2407*c0909341SAndroid Build Coastguard Worker    movu                xm5, [r7+ssq*0+8]
2408*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r7+ssq*1+8], 1
2409*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
2410*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m5, m7
2411*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m10
2412*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m13, m7
2413*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m11
2414*c0909341SAndroid Build Coastguard Worker    paddd                m0, m10
2415*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_512]
2416*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
2417*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m14, m5
2418*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12
2419*c0909341SAndroid Build Coastguard Worker    paddd                m0, m7
2420*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
2421*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [r8]    ; reload the parked row into both lanes
2422*c0909341SAndroid Build Coastguard Worker    paddd                m8, m10     ; bias for the vertical sums
2423*c0909341SAndroid Build Coastguard Worker    paddd                m9, m10
2424*c0909341SAndroid Build Coastguard Worker    paddd                m0, m10     ; bias for the horizontal sums
2425*c0909341SAndroid Build Coastguard Worker    paddd                m5, m10
2426*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [v_mul+4*3]
2427*c0909341SAndroid Build Coastguard Worker    psrad                m0, 10
2428*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
2429*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m5
2430*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120 ; 7 8
2431*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m7, 0x04  ; 6 7
2432*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7    ; 67
2433*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7        ; 78
2434*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m10, m5   ; a3
2435*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m6        ; b3
2436*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8
2437*c0909341SAndroid Build Coastguard Worker    paddd                m9, m10
2438*c0909341SAndroid Build Coastguard Worker    psrad                m7, 10
2439*c0909341SAndroid Build Coastguard Worker    psrad                m9, 10
2440*c0909341SAndroid Build Coastguard Worker    packusdw             m7, m9
2441*c0909341SAndroid Build Coastguard Worker    pminsw               m7, m15       ; clamp to the bound broadcast from r8m
2442*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, q3120
2443*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], xm7       ; row a: 16 bytes = 8 pixels
2444*c0909341SAndroid Build Coastguard Worker    vextracti128 [r8+dsq*1], m7, 1     ; row b: 16 bytes = 8 pixels
2445*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
2446*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2447*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
2448*c0909341SAndroid Build Coastguard Worker    add                srcq, 16        ; advance to the next 8-pixel column
2449*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2450*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb        ; restore h from the low byte of the packed counter
2451*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8      ; one column done
2452*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
2453*c0909341SAndroid Build Coastguard Worker    RET
2454*c0909341SAndroid Build Coastguard Worker
; Temporary-register aliases used by the prep entry points below: t0d and t1d
; are read at the top of prep_6tap_16bpc, where they are added to mxd and myd.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk. Presumably its
; arguments pick the numbered registers that t0 and t1 alias (6 and 4 on WIN64,
; 6 and 7 on other targets) -- confirm against the macro definition.
2455*c0909341SAndroid Build Coastguard Worker%if WIN64
2456*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4
2457*c0909341SAndroid Build Coastguard Worker%else
2458*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
2459*c0909341SAndroid Build Coastguard Worker%endif
2460*c0909341SAndroid Build Coastguard Worker
; prep_8tap entry points for the filter-type combinations built only from
; SMOOTH and REGULAR. PREP_8TAP_FN is shorthand for "FN prep_8tap," and each
; invocation supplies: a name suffix, two filter-type arguments and, for the
; first three variants, the label prep_6tap_16bpc.
; NOTE(review): FN is defined outside this chunk. The two filter types are
; presumably horizontal then vertical, and the label is presumably the jump
; target of the generated entry point. The "regular" variant passes no label
; and sits directly before cglobal prep_6tap_16bpc, so it presumably falls
; through into that function -- confirm against the FN macro.
2461*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap,
2462*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
2463*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
2464*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
2465*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular,        REGULAR, REGULAR
2466*c0909341SAndroid Build Coastguard Worker
2467*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
2468*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx2
2469*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2470*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
2471*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2472*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
2473*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep_avx2]
2474*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2475*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2476*c0909341SAndroid Build Coastguard Worker    jnz .h
2477*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2478*c0909341SAndroid Build Coastguard Worker    jnz .v
2479*c0909341SAndroid Build Coastguard Worker.prep:
2480*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2481*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; bitdepth_max
2482*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep,)]
2483*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [r7-prep_avx2+pw_8192]
2484*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
2485*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
2486*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+prep_mul+r6*4]
2487*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2488*c0909341SAndroid Build Coastguard Worker%if WIN64
2489*c0909341SAndroid Build Coastguard Worker    pop                  r7
2490*c0909341SAndroid Build Coastguard Worker%endif
2491*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2492*c0909341SAndroid Build Coastguard Worker.h_w4:
2493*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2494*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2495*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm0, [base+subpel_filters+mxq*8]
2496*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [subpel_h_shufA]
2497*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2498*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [subpel_h_shufB]
2499*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
2500*c0909341SAndroid Build Coastguard Worker    pshufd              xm0, xm0, q2211
2501*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2502*c0909341SAndroid Build Coastguard Worker    jnz .h_w4_12bpc
2503*c0909341SAndroid Build Coastguard Worker    psllw               xm0, 2
2504*c0909341SAndroid Build Coastguard Worker.h_w4_12bpc:
2505*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, xm0
2506*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q1111
2507*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
2508*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0]
2509*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*2], 1
2510*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*1]
2511*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+r6 *1], 1
2512*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2513*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
2514*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
2515*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
2516*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m7
2517*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
2518*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
2519*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m3
2520*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
2521*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6
2522*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7
2523*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
2524*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
2525*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
2526*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
2527*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
2528*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
2529*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2530*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2531*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
2532*c0909341SAndroid Build Coastguard Worker    RET
2533*c0909341SAndroid Build Coastguard Worker.h:
2534*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2535*c0909341SAndroid Build Coastguard Worker    jnz .hv
2536*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
2537*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2538*c0909341SAndroid Build Coastguard Worker    je .h_w4
2539*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2540*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
2541*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
2542*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10
2543*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufA]
2544*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2545*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2546*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2547*c0909341SAndroid Build Coastguard Worker    jnz .h_12bpc
2548*c0909341SAndroid Build Coastguard Worker    psllw                m0, 2
2549*c0909341SAndroid Build Coastguard Worker.h_12bpc:
2550*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m0, q0000
2551*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q1111
2552*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q2222
2553*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2554*c0909341SAndroid Build Coastguard Worker    jg .h_w16
2555*c0909341SAndroid Build Coastguard Worker.h_w8:
2556*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+ 0]
2557*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+ 0], 1
2558*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0+16]
2559*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1+16], 1
2560*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2561*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m0, m2, 0x05
; PREP_6TAP_H: horizontal 6-tap filter step shared by the .h_w8 and .h_w16
; loops of prep_6tap_16bpc.
;   %1     in: source words at offset +0; out: packed 16-bit results
;   %2     in: source words at offset +8  (clobbered)
;   %3     in: source words at offset +16 (clobbered)
;   %4,%5  scratch registers (clobbered)
; Register state expected from the surrounding code:
;   m5       constant loaded from prep_8tap_1d_rnd, added to every sum
;   m6       pshufb mask loaded from subpel_h_shufA
;   m7,m8,m9 dwords 0, 1 and 2 of the sign-extended filter words in m0,
;            i.e. the three coefficient pairs consumed by pmaddwd
; The digit pairs in the trailing comments are the file's own notation for
; which adjacent source samples each dword holds; "a" and "b" name the two
; groups of outputs that are packed together at the end.
2562*c0909341SAndroid Build Coastguard Worker%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
2563*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m6        ; 01 12 23 34
2564*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m6        ; 45 56 67 78
2565*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m7, m%1   ; a0
2566*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m6        ; 89 9a ab bc
2567*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%2   ; a2
2568*c0909341SAndroid Build Coastguard Worker    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
2569*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5       ; a0+a2
2570*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m7, m%2   ; b0
2571*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
2572*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m9        ; b2
2573*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m8        ; a1
2574*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m8        ; b1
2575*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5       ; b0+b2
2576*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m5        ; a0+a2 + m5 constant
2577*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m5        ; b0+b2 + m5 constant
2578*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%4       ; a = a0+a1+a2 + m5 constant
2579*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3       ; b = b0+b1+b2 + m5 constant
2580*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 4         ; arithmetic shift of the 32-bit sums
2581*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 4
2582*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%2       ; a | b per 128-bit lane, signed-saturated to words
2583*c0909341SAndroid Build Coastguard Worker%endmacro
2584*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H           0, 1, 2, 3, 4
2585*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
2586*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2587*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2588*c0909341SAndroid Build Coastguard Worker    jg .h_w8
2589*c0909341SAndroid Build Coastguard Worker    RET
2590*c0909341SAndroid Build Coastguard Worker.h_w16:
2591*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
2592*c0909341SAndroid Build Coastguard Worker.h_w16_loop0:
2593*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
2594*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
2595*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6-32]
2596*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6-24]
2597*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6-16]
2598*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H           0, 1, 2, 3, 4
2599*c0909341SAndroid Build Coastguard Worker    mova       [tmpq+r6-32], m0
2600*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 32
2601*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
2602*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
2603*c0909341SAndroid Build Coastguard Worker    add                tmpq, wq
2604*c0909341SAndroid Build Coastguard Worker    dec                  hd
2605*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop0
2606*c0909341SAndroid Build Coastguard Worker    RET
2607*c0909341SAndroid Build Coastguard Worker.v:
2608*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2609*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2610*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2611*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
2612*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
2613*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       9, 12
2614*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [prep_8tap_1d_rnd]
2615*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
2616*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2617*c0909341SAndroid Build Coastguard Worker    neg                  r6
2618*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2619*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2620*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
2621*c0909341SAndroid Build Coastguard Worker    psllw                m0, 2
2622*c0909341SAndroid Build Coastguard Worker.v_12bpc:
2623*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q0000
2624*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m0, q1111
2625*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q2222
2626*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2627*c0909341SAndroid Build Coastguard Worker    jg .v_w8
2628*c0909341SAndroid Build Coastguard Worker.v_w4:
2629*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+r6 *2]
2630*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+r6 *1]
2631*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
2632*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
2633*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2634*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]
2635*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0x30
2636*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x30
2637*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3     ; 01 12
2638*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0x30
2639*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
2640*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4     ; 23 34
2641*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2642*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
2643*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2644*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6, m1 ; a0 b0
2645*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2646*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7     ; a1 b1
2647*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
2648*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, m3, 0x30
2649*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]
2650*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0x30
2651*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3     ; 45 56
2652*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8, m2 ; a2 b2
2653*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
2654*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
2655*c0909341SAndroid Build Coastguard Worker    psrad                m4, 4
2656*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m4, 1
2657*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm3
2658*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
2659*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
2660*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2661*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2662*c0909341SAndroid Build Coastguard Worker    RET
2663*c0909341SAndroid Build Coastguard Worker.v_w8:
2664*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       12
2665*c0909341SAndroid Build Coastguard Worker%if WIN64
2666*c0909341SAndroid Build Coastguard Worker    push                 r8
2667*c0909341SAndroid Build Coastguard Worker%endif
2668*c0909341SAndroid Build Coastguard Worker    mov                 r8d, wd
2669*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
2670*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
2671*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
2672*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [srcq+r6 *2]
2673*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+r6 *1]
2674*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
2675*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
2676*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [srcq+ssq*1]
2677*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
2678*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r5+ssq*0]
2679*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m0, 0x0c
2680*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m1, 0x0c
2681*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4 ; 01
2682*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; 23
2683*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m2, 0x0c
2684*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0 ; 12
2685*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0     ; 34
2686*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
2687*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [r5+ssq*1]
2688*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m6, m1 ; a0
2689*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
2690*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m6, m2 ; b0
2691*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2692*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7     ; a1
2693*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2694*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7     ; b1
2695*c0909341SAndroid Build Coastguard Worker    paddd               m10, m5
2696*c0909341SAndroid Build Coastguard Worker    paddd               m11, m5
2697*c0909341SAndroid Build Coastguard Worker    paddd               m10, m3
2698*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r5+ssq*0]
2699*c0909341SAndroid Build Coastguard Worker    paddd               m11, m4
2700*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m9, 0x0d
2701*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m9, m3, 0x0c
2702*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0 ; 45
2703*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0     ; 56
2704*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m8, m3 ; a2
2705*c0909341SAndroid Build Coastguard Worker    paddd               m10, m9
2706*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m8, m4 ; b2
2707*c0909341SAndroid Build Coastguard Worker    paddd               m11, m9
2708*c0909341SAndroid Build Coastguard Worker    psrad               m10, 4
2709*c0909341SAndroid Build Coastguard Worker    psrad               m11, 4
2710*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
2711*c0909341SAndroid Build Coastguard Worker    vpermq              m10, m10, q3120
2712*c0909341SAndroid Build Coastguard Worker    mova          [r7+r8*0], xm10
2713*c0909341SAndroid Build Coastguard Worker    vextracti128  [r7+r8*2], m10, 1
2714*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+r8*4]
2715*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2716*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
2717*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
2718*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
2719*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2720*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2721*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
2722*c0909341SAndroid Build Coastguard Worker%if WIN64
2723*c0909341SAndroid Build Coastguard Worker    pop                  r8
2724*c0909341SAndroid Build Coastguard Worker%endif
2725*c0909341SAndroid Build Coastguard Worker    RET
2726*c0909341SAndroid Build Coastguard Worker.hv:
2727*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      13, 15
2728*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [prep_8tap_2d_rnd]
2729*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufA]
2730*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2731*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
2732*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2733*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
2734*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2735*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2736*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2737*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
2738*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
2739*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
2740*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2741*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
2742*c0909341SAndroid Build Coastguard Worker    neg                  r6
2743*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0
2744*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
2745*c0909341SAndroid Build Coastguard Worker    psraw                m6, 4
2746*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
2747*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2748*c0909341SAndroid Build Coastguard Worker    jz .hv_w4_10bit
2749*c0909341SAndroid Build Coastguard Worker    psraw                m6, 2
2750*c0909341SAndroid Build Coastguard Worker.hv_w4_10bit:
2751*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m1, q0000
2752*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m1, q1111
2753*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m1, q2222
2754*c0909341SAndroid Build Coastguard Worker.hv_w4:
2755*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+r6 *2]
2756*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
2757*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q0000
2758*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [base+subpel_h_shufB]
2759*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
2760*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1111
2761*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
2762*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2763*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*0]    ; 4
2764*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m8
2765*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
2766*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9
2767*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6
2768*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0, m8
2769*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m5
2770*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m9
2771*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
2772*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1
2773*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm3, xm8
2774*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm1, xm5
2775*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm9
2776*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm6
2777*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
2778*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7
2779*c0909341SAndroid Build Coastguard Worker    paddd               xm1, xm7
2780*c0909341SAndroid Build Coastguard Worker    paddd                m0, m7
2781*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm1
2782*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 6}, m2, m0, xm3
2783*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m0      ; 0 2   1 3
2784*c0909341SAndroid Build Coastguard Worker    packssdw            xm0, xm3     ; 2 4
2785*c0909341SAndroid Build Coastguard Worker    vperm2i128           m0, m2, 0x03
2786*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0  ; 01 12
2787*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0      ; 23 34
2788*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2789*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
2790*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2791*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*0], 1
2792*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m10, m1 ; a0 b0
2793*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2794*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11     ; a1 b1
2795*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
2796*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m8
2797*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
2798*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
2799*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6
2800*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7
2801*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2
2802*c0909341SAndroid Build Coastguard Worker    psrad                m3, 6
2803*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m3      ; 5 5   6 6
2804*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m3, 0x21
2805*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3
2806*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3      ; 45 56
2807*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m2 ; a2 b2
2808*c0909341SAndroid Build Coastguard Worker    paddd                m4, m7
2809*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
2810*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
2811*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m4, 1
2812*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm3
2813*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
2814*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
2815*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2816*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2817*c0909341SAndroid Build Coastguard Worker    RET
2818*c0909341SAndroid Build Coastguard Worker.hv_w8:
2819*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2820*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
2821*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2822*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2823*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2824*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
2825*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
2826*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       15
2827*c0909341SAndroid Build Coastguard Worker%if WIN64
2828*c0909341SAndroid Build Coastguard Worker    PUSH                 r8
2829*c0909341SAndroid Build Coastguard Worker%endif
2830*c0909341SAndroid Build Coastguard Worker    mov                 r8d, wd
2831*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
2832*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
2833*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
2834*c0909341SAndroid Build Coastguard Worker    neg                  r6
2835*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
2836*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2837*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
2838*c0909341SAndroid Build Coastguard Worker    psraw                m0, 4
2839*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2840*c0909341SAndroid Build Coastguard Worker    jz .hv_w8_10bit
2841*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
2842*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit:
2843*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000
2844*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111
2845*c0909341SAndroid Build Coastguard Worker    mova            [v_mul], xm1
2846*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222
2847*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
2848*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0+ 0]
2849*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
2850*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
2851*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [srcq+ssq*0+16]
2852*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m2, [srcq+r6*2+16], 0
2853*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
2854*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r5  +ssq*0+ 0], 1
2855*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r5  +ssq*0+16], 1
2856*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m3, m1, 0x05
2857*c0909341SAndroid Build Coastguard Worker%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    ; Horizontal stage of the 6-tap hv path; produces 8 outputs per 128-bit lane.
    ;   %1      in:  source words at byte offset +0    out: packed 16-bit results
    ;   %2      in:  source words at byte offset +8    (clobbered)
    ;   %3      in:  source words at byte offset +16   (clobbered)
    ;   %4, %5  scratch
    ; Registers read but not written here (all set up by the surrounding code):
    ;   m8            pshufb control producing the overlapping word pairs noted
    ;                 on the pshufb lines below (loaded outside this macro)
    ;   m10/m11/m12   coefficient pairs 0, 1 and 2 of the horizontal filter
    ;   m7            constant added to every sum before the shift; presumably
    ;                 the rounding/bias term -- NOTE(review): confirm at its load
    ; "a" sums are the first four outputs of a lane, "b" sums the last four.
    ; Each output is (pair0 + pair1 + pair2 + m7) >> 6, then the a/b dwords are
    ; narrowed to words with signed saturation by packssdw.
2858*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m8        ; 01 12 23 34
2859*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m8        ; 45 56 67 78
2860*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m10, m%1  ; a0
2861*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m8        ; 89 9a ab bc
2862*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m12, m%2  ; a2
2863*c0909341SAndroid Build Coastguard Worker    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
2864*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5       ; a0+a2
2865*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m10, m%2  ; b0
2866*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
2867*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m12       ; b2
2868*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m11       ; a1
2869*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m11       ; b1
2870*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5       ; b0+b2
    ; add the m7 constant to both partial sums, then fold in the middle pair
2871*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m7
2872*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m7
2873*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%4
2874*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3
    ; arithmetic shift keeps the sign of the intermediate
2875*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 6
2876*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 6
2877*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%2
2878*c0909341SAndroid Build Coastguard Worker%endmacro
2879*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_HV_H        3, 4, 1, 5, 6  ; 0 2
2880*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r6 *1+ 0]
2881*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+ssq*1+ 0], 1
2882*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m0, m2, 0x05
2883*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_HV_H        0, 1, 2, 5, 6  ; 2 4
2884*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+r6 *1+16]
2885*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1+16], 1
2886*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m4, m2, 0x05
2887*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_HV_H        4, 1, 2, 5, 6  ; 1 3
2888*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
2889*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
2890*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
2891*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4     ; 01
2892*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4         ; 23
2893*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0     ; 12
2894*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0         ; 34
2895*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
2896*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [v_mul+4*0]
2897*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [v_mul+4*1]
2898*c0909341SAndroid Build Coastguard Worker    movu                xm5, [r5+ssq*1+ 0]
2899*c0909341SAndroid Build Coastguard Worker    movu                xm6, [r5+ssq*1+16]
2900*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
2901*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m14, m1    ; a0
2902*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m2         ; b0
2903*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r5+ssq*0+ 0], 1
2904*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r5+ssq*0+16], 1
2905*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2906*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9         ; a1
2907*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2908*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9         ; b1
2909*c0909341SAndroid Build Coastguard Worker    paddd               m13, m3
2910*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m5, m6, 0x05
2911*c0909341SAndroid Build Coastguard Worker    paddd               m14, m4
2912*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_HV_H        5, 3, 6, 4, 9 ; 5 6
2913*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [v_mul+4*2]
2914*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
2915*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m5, 0x05
2916*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
2917*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m5     ; 45
2918*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5         ; 56
2919*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6, m3     ; a2
2920*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m4         ; b2
2921*c0909341SAndroid Build Coastguard Worker    paddd               m13, m7
2922*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7
2923*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
2924*c0909341SAndroid Build Coastguard Worker    paddd                m6, m14
2925*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
2926*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
2927*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
2928*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
2929*c0909341SAndroid Build Coastguard Worker    mova          [r7+r8*0], xm5
2930*c0909341SAndroid Build Coastguard Worker    vextracti128  [r7+r8*2], m5, 1
2931*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+r8*4]
2932*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2933*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
2934*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
2935*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
2936*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2937*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2938*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
2939*c0909341SAndroid Build Coastguard Worker%if WIN64
2940*c0909341SAndroid Build Coastguard Worker    POP                  r8
2941*c0909341SAndroid Build Coastguard Worker%endif
2942*c0909341SAndroid Build Coastguard Worker    RET
2943*c0909341SAndroid Build Coastguard Worker
; Entry points for the filter-type combinations that involve SHARP.
; Each line passes a name suffix, two filter-type names and, for all but the
; last, the shared implementation prep_8tap_16bpc as an explicit target.
; PREP_8TAP_FN is defined outside this view; presumably the two type names are
; the horizontal and vertical filters (in that order) and end up in t0d/t1d,
; which prep_8tap_16bpc adds to mx/my -- NOTE(review): confirm against the macro.
; The final invocation omits the target and is placed directly before the
; cglobal below; presumably it falls through into it -- confirm.
2944*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
2945*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
2946*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
2947*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
2948*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp,          SHARP,   SHARP
2949*c0909341SAndroid Build Coastguard Worker
2950*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
; Entry and dispatch for the 8-tap "prep" filter on 16-bit pixels.
; Named arguments (from the cglobal line): tmp, src, stride, w, h, mx, my.
; t0d/t1d are expected to be set before control arrives here; presumably by
; the PREP_8TAP_FN stubs -- NOTE(review): confirm, the macro is not in view.
;
; 'base' lets later loads address data relative to r7, which is pointed at
; prep_avx2 by the lea below.
2951*c0909341SAndroid Build Coastguard Worker%define base r7-prep_avx2
    ; mx * 0x010101 replicates mx into three bytes; adding t0d turns the top
    ; and bottom bytes into filter-table indices while the middle byte keeps
    ; the raw mx (see the layout note on the add line). Same for my/t1d.
2952*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2953*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2954*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2955*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
2956*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep_avx2]
2957*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; bits 8..11 are the low nibble of the middle (raw mx / my) byte:
    ; non-zero means a fractional offset in that direction.
2958*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2959*c0909341SAndroid Build Coastguard Worker    jnz .h
2960*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
    ; neither mx nor my is fractional: reuse the .prep path that lives in
    ; prep_6tap_16bpc (presumably the unfiltered copy -- confirm there).
2961*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
2962*c0909341SAndroid Build Coastguard Worker.v:
    ; Vertical-only filtering: common setup, then dispatch on width.
    ; myd layout on entry (see the add at function entry): bits 16+ hold the
    ; 8-tap table index, the low byte holds the 4-tap table index.
    ; Pick the 4-tap entry when h == 4, otherwise the 8-tap entry.
2963*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2964*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2965*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2966*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
    ; 8 signed byte coefficients, broadcast to every qword
2967*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+myq*8]
2968*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 15
    ; m7 = constant added to every sum before the final >>4 (the same symbol
    ; is annotated as "8 - (8192 << 4)" where the .h path loads it)
2969*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [prep_8tap_1d_rnd]
    ; r6 = 3*stride; src is moved up three rows so the first row read by the
    ; width-specific code is the topmost filter tap
2970*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
2971*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2972*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
2973*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
    ; Bit 0x800 of r7m selects the .v_12bpc path, which skips the x4 scaling
    ; of the coefficients. r7m is presumably bitdepth_max (0xfff for 12-bit)
    ; -- NOTE(review): confirm against the C prototype.
2974*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2975*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
2976*c0909341SAndroid Build Coastguard Worker    psllw                m0, 2
2977*c0909341SAndroid Build Coastguard Worker.v_12bpc:
    ; m8..m11 = coefficient pairs (0,1) (2,3) (4,5) (6,7), one pair per dword,
    ; ready to be used as pmaddwd multipliers
2978*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
2979*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
2980*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
2981*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q3333
    ; w > 4 goes to the column loop; otherwise fall through to .v_w4
2982*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2983*c0909341SAndroid Build Coastguard Worker    jg .v_w8
2984*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Vertical 8-tap filter for the narrow case (reached when w <= 4).
    ; Each row contributes 4 pixels = 8 bytes. Two output rows are produced
    ; per iteration: the low 128-bit lane computes one row, the high lane
    ; the row below it.
    ; Expects from the code before this label: m7 = pre-shift constant,
    ; m8..m11 = coefficient pairs, r6 = 3*stride, src already moved up 3 rows.
    ;
    ; Prologue: load rows 0..6 and build the interleaved row pairs
    ; m1 = 01|12, m2 = 23|34, m3 = 45|56 (low lane | high lane).
    ; vpblendd ..., 0x30 replaces dwords 4-5 (first qword of the high lane),
    ; so each blended register holds row n in the low lane and row n+1 in
    ; the high lane before punpcklwd interleaves them.
2985*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*0]
2986*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*1]
2987*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+strideq*2]
2988*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+r6       ]
2989*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
2990*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+strideq*0]
2991*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+strideq*1]
2992*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x30
2993*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x30
2994*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0      ; 01 12
    ; m0 = row 6; it stays live into the loop as "most recent row"
2995*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*2]
    ; src now points at row 7, the first row the loop loads
2996*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2997*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0x30
2998*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, 0x30
2999*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4      ; 23 34
3000*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m5, 0x30
3001*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m0, 0x30
3002*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5      ; 45 56
3003*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
    ; Accumulate in m5 while sliding the row-pair window down by two rows:
    ; m1 <- m2, m2 <- m3, m3 <- newly built 67|78.
3004*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+strideq*0]
3005*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0 b0
3006*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
3007*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9      ; a1 b1
3008*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
3009*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
3010*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
3011*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10     ; a2 b2
3012*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
    ; m0 holds the previous newest row, m4 the row just loaded
3013*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, m4, 0x30
3014*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*1]
3015*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3016*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
3017*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
3018*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m11, m3 ; a3 b3
3019*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
3020*c0909341SAndroid Build Coastguard Worker    psrad                m5, 4
    ; narrow both lanes to words (signed saturation) and store the two
    ; 4-pixel rows back to back: 16 bytes per iteration
3021*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m5, 1
3022*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm4
3023*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm5
3024*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3025*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3026*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
3027*c0909341SAndroid Build Coastguard Worker    RET
3028*c0909341SAndroid Build Coastguard Worker.v_w8:
    ; Vertical 8-tap filter for w > 4, processed as 8-pixel-wide columns,
    ; two output rows per inner iteration.
    ; Expects from the code before this label: m7 = pre-shift constant,
    ; m8..m11 = coefficient pairs, r6 = 3*stride, src already moved up 3 rows.
    ;
    ; r8 is not among the 8 GPRs declared on the cglobal line, so on Win64 it
    ; is saved by hand here and restored before RET (presumably it maps to a
    ; callee-saved register there -- NOTE(review): confirm in x86inc).
3029*c0909341SAndroid Build Coastguard Worker%if WIN64
3030*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       15
3031*c0909341SAndroid Build Coastguard Worker    push                 r8
3032*c0909341SAndroid Build Coastguard Worker%endif
    ; r8 = w (used as the tmp row pitch: r8*2 bytes per row).
    ; wd is repacked as a combined counter: (w*32 - 256) + h, i.e. the low
    ; byte keeps h and the bits above it count the remaining columns
    ; (w/8 - 1). This relies on h fitting in one byte.
3033*c0909341SAndroid Build Coastguard Worker    mov                 r8d, wd
3034*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
3035*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
3036*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
    ; Per-column prologue: load rows 0..6 (r5 walks the source, r7 the
    ; destination, so srcq/tmpq keep pointing at the top of the column).
    ; shufpd ..., 0x0c leaves row n in the low qword and row n+3 in the high
    ; qword of each lane (low lane = pixels 0-3, high lane = pixels 4-7), so
    ; one punpcklwd/punpckhwd pair yields two interleaved row pairs at once.
3037*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+strideq*0]
3038*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [srcq+strideq*1]
3039*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+strideq*4]
3040*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+r6       ]
3041*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [srcq+strideq*2]
3042*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3043*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [r5+strideq*0]
3044*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r5+strideq*1]
3045*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r5+strideq*2]
    ; r5 -> row 7, the first row the inner loop loads
3046*c0909341SAndroid Build Coastguard Worker    add                  r5, r6
3047*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, 0x0c
3048*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m1, 0x0c
3049*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5 ; 01
3050*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5     ; 34
3051*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m2, 0x0c
3052*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6 ; 12
3053*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45
    ; m0 ends up with row 6 in its high qwords; the loop reads it from there
3054*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m3, 0x0c
3055*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m0 ; 23
3056*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0     ; 56
3057*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
    ; m12 accumulates output row "a", m13 output row "b" (one row lower).
    ; Row-pair window slides by two rows per iteration:
    ;   m1 <- m3, m2 <- m4, m3 <- m5, m4 <- m6, m5/m6 <- new 67/78.
3058*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [r5+strideq*0]
3059*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m8, m1  ; a0
3060*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m8, m2  ; b0
3061*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3062*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3063*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
3064*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
3065*c0909341SAndroid Build Coastguard Worker    paddd               m12, m7
3066*c0909341SAndroid Build Coastguard Worker    paddd               m13, m7
3067*c0909341SAndroid Build Coastguard Worker    paddd               m12, m3
3068*c0909341SAndroid Build Coastguard Worker    paddd               m13, m4
3069*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3070*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3071*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
3072*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
3073*c0909341SAndroid Build Coastguard Worker    paddd               m12, m5
3074*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [r5+strideq*1]
3075*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3076*c0909341SAndroid Build Coastguard Worker    paddd               m13, m6
    ; 0x0d: take the previous newest row (high qwords of m0) and the row
    ; just loaded in m14; 0x0c: pair m14 with the row after it (m5).
    ; m0 again ends with the newest row in its high qwords.
3077*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m0, m14, 0x0d
3078*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m14, m5, 0x0c
3079*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m0  ; 67
3080*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0      ; 78
3081*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m5 ; a3
3082*c0909341SAndroid Build Coastguard Worker    paddd               m12, m14
3083*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m6 ; b3
3084*c0909341SAndroid Build Coastguard Worker    paddd               m13, m14
3085*c0909341SAndroid Build Coastguard Worker    psrad               m12, 4
3086*c0909341SAndroid Build Coastguard Worker    psrad               m13, 4
    ; packssdw works per lane, giving a0-3 b0-3 | a4-7 b4-7; vpermq q3120
    ; regroups that to a0-7 | b0-7 so each half is one complete row
3087*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
3088*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m12, q3120
    ; rows are r8*2 bytes apart in tmp (w 16-bit values per row)
3089*c0909341SAndroid Build Coastguard Worker    mova          [r7+r8*0], xm12
3090*c0909341SAndroid Build Coastguard Worker    vextracti128  [r7+r8*2], m12, 1
3091*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+r8*4]
3092*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3093*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
    ; next column: 8 pixels = 16 bytes to the right in both buffers,
    ; reload h from the low byte of the packed counter, drop one column
3094*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
3095*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3096*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
3097*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
3098*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
3099*c0909341SAndroid Build Coastguard Worker%if WIN64
3100*c0909341SAndroid Build Coastguard Worker    pop                  r8
3101*c0909341SAndroid Build Coastguard Worker%endif
3102*c0909341SAndroid Build Coastguard Worker    RET
3103*c0909341SAndroid Build Coastguard Worker.h:
    ; Reached when mx is fractional. If my is fractional too, both
    ; directions are needed and .hv takes over; otherwise this is the
    ; horizontal-only setup.
3104*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3105*c0909341SAndroid Build Coastguard Worker    jnz .hv
3106*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
    ; w == 4 is delegated to the .h_w4 path inside prep_6tap_16bpc; note
    ; that mxd is still in its packed three-byte form at that point
3107*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3108*c0909341SAndroid Build Coastguard Worker    je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4
    ; bits 16+ of mxd hold the 8-tap table index (layout set at entry)
3109*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
    ; back up 6 bytes = 3 pixels at 2 bytes each, to the leftmost tap
3110*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
3111*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
3112*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12
    ; m6/m7 = pshufb controls used by PREP_8TAP_H to form adjacent-pixel
    ; word pairs
3113*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufA]
3114*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufB]
3115*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3116*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
    ; Bit 0x800 of r7m selects the .h_12bpc path, which skips the x4
    ; scaling of the coefficients. r7m is presumably bitdepth_max
    ; -- NOTE(review): confirm against the C prototype.
3117*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3118*c0909341SAndroid Build Coastguard Worker    jnz .h_12bpc
3119*c0909341SAndroid Build Coastguard Worker    psllw                m0, 2
3120*c0909341SAndroid Build Coastguard Worker.h_12bpc:
    ; m8..m11 = coefficient pairs (0,1) (2,3) (4,5) (6,7) as pmaddwd
    ; multipliers
3121*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
3122*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
3123*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
3124*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q3333
    ; w > 8 uses the 16-pixel chunk loop; w == 8 falls through to .h_w8
3125*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3126*c0909341SAndroid Build Coastguard Worker    jg .h_w16
3127*c0909341SAndroid Build Coastguard Worker.h_w8:
    ; Horizontal-only 8-tap filter, w == 8, two rows per iteration.
    ; The %macro definition below emits no instructions, so this label is
    ; effectively the address of the first movu and doubles as loop head.
3128*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    ; Horizontal 8-tap filter producing 8 outputs per 128-bit lane.
    ;   %1      in:  source words at byte offset +0    out: packed 16-bit results
    ;   %2      in:  source words at byte offset +8    (clobbered)
    ;   %3      in:  source words at byte offset +16   (clobbered)
    ;   %4, %5  scratch
    ; Reads m5 (pre-shift constant), m6/m7 (pshufb controls A/B) and
    ; m8..m11 (coefficient pairs 0..3), all set up in .h.
    ; "abcd" are the first four outputs of a lane, "efgh" the last four;
    ; each is (pair0 + pair1 + pair2 + pair3 + m5) >> 4, then narrowed to
    ; words with signed saturation.
3129*c0909341SAndroid Build Coastguard Worker    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
3130*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
3131*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%4   ; abcd1
3132*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m8        ; abcd0
3133*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
3134*c0909341SAndroid Build Coastguard Worker    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
3135*c0909341SAndroid Build Coastguard Worker    paddd               m%5, m5
3136*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
3137*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m11, m%2  ; abcd3
3138*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
3139*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m10, m%4  ; abcd2
3140*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m7        ; a b b c c d d e
3141*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m8        ; efgh0
3142*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
3143*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m9, m%2   ; efgh1
3144*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
3145*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m11       ; efgh3
3146*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m10       ; efgh2
3147*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m5
3148*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5
3149*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%4
3150*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3
3151*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 4
3152*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 4
3153*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%2
3154*c0909341SAndroid Build Coastguard Worker%endmacro
    ; low lane = current row, high lane = next row;
    ; m0 = bytes 0-15, m2 = bytes 16-31 of each row
3155*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+ 0]
3156*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*1+ 0], 1
3157*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*0+16]
3158*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*1+16], 1
3159*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
    ; m1 = bytes 8-23 of each row, stitched from the two loads above
3160*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m0, m2, 0x05
3161*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H           0, 1, 2, 3, 4
    ; 32 bytes = two rows of eight 16-bit results, stored contiguously
3162*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3163*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3164*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3165*c0909341SAndroid Build Coastguard Worker    jg .h_w8
3166*c0909341SAndroid Build Coastguard Worker    RET
3167*c0909341SAndroid Build Coastguard Worker.h_w16:
    ; Horizontal-only 8-tap filter for w > 8: one row per outer iteration,
    ; 16 pixels (32 bytes) per inner iteration, walking each row from its
    ; right end towards the left.
    ; Expects m5..m11 as set up in .h (they are consumed by PREP_8TAP_H).
    ;
    ; wd = row size in bytes (2 bytes per pixel); also the tmp row pitch
3168*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
3169*c0909341SAndroid Build Coastguard Worker.h_w16_loop0:
    ; r6 = byte offset just past the chunk to process; starts at row end
3170*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
3171*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
    ; three overlapping unaligned loads at +0, +8 and +16 bytes from the
    ; chunk start (r6-32), matching the macro's src+0 / src+8 / src+16 inputs
3172*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6-32]
3173*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6-24]
3174*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6-16]
3175*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H           0, 1, 2, 3, 4
3176*c0909341SAndroid Build Coastguard Worker    mova       [tmpq+r6-32], m0
3177*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 32
3178*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
    ; advance to the next row in both buffers
3179*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
3180*c0909341SAndroid Build Coastguard Worker    add                tmpq, wq
3181*c0909341SAndroid Build Coastguard Worker    dec                  hd
3182*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop0
3183*c0909341SAndroid Build Coastguard Worker    RET
3184*c0909341SAndroid Build Coastguard Worker.hv:
3185*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      16
3186*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [prep_8tap_2d_rnd]
3187*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3188*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
3189*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
3190*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
3191*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3192*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3193*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3194*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3195*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+subpel_filters+myq*8]
3196*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
3197*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
3198*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3199*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
3200*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m0
3201*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
3202*c0909341SAndroid Build Coastguard Worker    psraw                m7, 4
3203*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
3204*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3205*c0909341SAndroid Build Coastguard Worker    jz .hv_w4_10bit
3206*c0909341SAndroid Build Coastguard Worker    psraw                m7, 2
3207*c0909341SAndroid Build Coastguard Worker.hv_w4_10bit:
3208*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m1, q0000
3209*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m1, q1111
3210*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m1, q2222
3211*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m1, q3333
3212*c0909341SAndroid Build Coastguard Worker.hv_w4:
3213*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shufA]
3214*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [subpel_h_shufB]
3215*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m7, q1111
3216*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0000
3217*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0]
3218*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*1], 1     ; 0 1
3219*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+r6       ]
3220*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m0, [srcq+strideq*2], 0 ; 2 3
3221*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3222*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*0], 1     ; 3 4
3223*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+strideq*1]
3224*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+strideq*2], 1     ; 5 6
3225*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
3226*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m1, m9
3227*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
3228*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
3229*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8
3230*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m2, m9
3231*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10
3232*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7
3233*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8
3234*c0909341SAndroid Build Coastguard Worker    paddd                m4, m15
3235*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
3236*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0, m9
3237*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10
3238*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
3239*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8
3240*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15
3241*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
3242*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m3, m9
3243*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m10
3244*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7
3245*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8
3246*c0909341SAndroid Build Coastguard Worker    paddd                m4, m15
3247*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3248*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15
3249*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3250*c0909341SAndroid Build Coastguard Worker    vperm2i128           m0, m1, m2, 0x21
3251*c0909341SAndroid Build Coastguard Worker    psrld                m1, 6
3252*c0909341SAndroid Build Coastguard Worker    psrld                m2, 6
3253*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m4, m5, 0x21
3254*c0909341SAndroid Build Coastguard Worker    pslld                m4, 10
3255*c0909341SAndroid Build Coastguard Worker    pslld                m5, 10
3256*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m4, 0xaa ; 23 34
3257*c0909341SAndroid Build Coastguard Worker    pslld                m0, 10
3258*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m0, 0xaa ; 01 12
3259*c0909341SAndroid Build Coastguard Worker    psrld                m3, 6
3260*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m5, 0xaa ; 45 56
3261*c0909341SAndroid Build Coastguard Worker    psrad                m0, m5, 16
3262*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3263*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+strideq*0]
3264*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+strideq*1], 1
3265*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3266*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m11, m1   ; a0 b0
3267*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
3268*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m12       ; a1 b1
3269*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15
3270*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
3271*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
3272*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13       ; a2 b2
3273*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3274*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4, m9
3275*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m10
3276*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7
3277*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8
3278*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
3279*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
3280*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
3281*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4        ; _ 7 6 8
3282*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m0, q1122 ; _ 6 _ 7
3283*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0        ; 67 78
3284*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
3285*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m14, m3   ; a3 b3
3286*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
3287*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
3288*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
3289*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
3290*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
3291*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3292*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3293*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
3294*c0909341SAndroid Build Coastguard Worker    RET
3295*c0909341SAndroid Build Coastguard Worker.hv_w8:
3296*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3297*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+subpel_filters+mxq*8]
3298*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3299*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3300*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3301*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3302*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm1, [base+subpel_filters+myq*8]
3303*c0909341SAndroid Build Coastguard Worker%if WIN64
3304*c0909341SAndroid Build Coastguard Worker    PUSH                 r8
3305*c0909341SAndroid Build Coastguard Worker%endif
3306*c0909341SAndroid Build Coastguard Worker    mov                 r8d, wd
3307*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
3308*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
3309*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
3310*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
3311*c0909341SAndroid Build Coastguard Worker    lea                  wd, [hq+wq-256]
3312*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3313*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
3314*c0909341SAndroid Build Coastguard Worker    psraw                m0, 4
3315*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3316*c0909341SAndroid Build Coastguard Worker    jz .hv_w8_10bit
3317*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3318*c0909341SAndroid Build Coastguard Worker.hv_w8_10bit:
3319*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q0000
3320*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q1111
3321*c0909341SAndroid Build Coastguard Worker    mova            [v_mul], xm1
3322*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q2222
3323*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q3333
3324*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
3325*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 -- 8-tap filter along one row; result packed into m%1; clobbers m2, m3, m%2, m%3; reads m8/m9 (shuffles), m11-m14 (coefficient pairs), m15 (bias)
3326*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
3327*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m8        ; 0 1 1 2 2 3 3 4
3328*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m2   ; outputs 0-3: 2nd coefficient pair
3329*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m11       ; outputs 0-3: 1st coefficient pair
3330*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m9        ; 6 7 7 8 8 9 9 a
3331*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
3332*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15       ; add the m15 bias once (presumably the rounding offset; it is set outside this macro)
3333*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m3
3334*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14, m%2  ; outputs 0-3: 4th coefficient pair
3335*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m3
3336*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13, m2   ; outputs 0-3: 3rd coefficient pair
3337*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m9        ; a b b c c d d e
3338*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11       ; outputs 4-7: 1st coefficient pair (m2 becomes their accumulator)
3339*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m3        ; m%1 = complete 32-bit sums for outputs 0-3
3340*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m%2  ; outputs 4-7: 2nd coefficient pair
3341*c0909341SAndroid Build Coastguard Worker    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
3342*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m14       ; outputs 4-7: 4th coefficient pair
3343*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m13       ; outputs 4-7: 3rd coefficient pair
3344*c0909341SAndroid Build Coastguard Worker    paddd                m2, m15       ; same bias for the outputs 4-7 accumulator
3345*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
3346*c0909341SAndroid Build Coastguard Worker    paddd                m2, m%3
3347*c0909341SAndroid Build Coastguard Worker    paddd                m2, m%2       ; m2 = complete 32-bit sums for outputs 4-7
3348*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 6         ; scale both halves down by 6 bits
3349*c0909341SAndroid Build Coastguard Worker    psrad                m2, 6
3350*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m2        ; saturate to words: outputs 0-3 | 4-7 within each 128-bit lane
3351*c0909341SAndroid Build Coastguard Worker%endmacro
3352*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r6       + 0]
3353*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufA]
3354*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+strideq*4]
3355*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+r6       + 8]
3356*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shufB]
3357*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3358*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+r6       +16]
3359*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+strideq*0+ 0]
3360*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r5  +strideq*0+ 0], 1
3361*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0+16]
3362*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r5  +strideq*0+16], 1
3363*c0909341SAndroid Build Coastguard Worker    shufpd               m7, m5, m1, 0x05
3364*c0909341SAndroid Build Coastguard Worker    INIT_XMM avx2
3365*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV_H        4, 6, 0    ; 3
3366*c0909341SAndroid Build Coastguard Worker    INIT_YMM avx2
3367*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV_H        5, 7, 1    ; 0 4
3368*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*2+ 0]
3369*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+r6     *2+ 0], 1
3370*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*2+16]
3371*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+r6     *2+16], 1
3372*c0909341SAndroid Build Coastguard Worker    shufpd               m7, m0, m1, 0x05
3373*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV_H        0, 7, 1    ; 2 6
3374*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+strideq*1+ 0]
3375*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1+16]
3376*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r5  +strideq*1+ 0], 1
3377*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r5  +strideq*1+16], 1
3378*c0909341SAndroid Build Coastguard Worker    add                  r5, r6
3379*c0909341SAndroid Build Coastguard Worker    shufpd               m7, m6, m1, 0x05
3380*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV_H        6, 7, 1    ; 1 5
3381*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q1100
3382*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
3383*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, q3120
3384*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120
3385*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7, m4  ; 23
3386*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
3387*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m6  ; 01
3388*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6      ; 45
3389*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m7  ; 12
3390*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 56
3391*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3392*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [v_mul+4*0]
3393*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [v_mul+4*1]
3394*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [v_mul+4*2]
3395*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m9, m1  ; a0
3396*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m2      ; b0
3397*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3398*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3399*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; a1
3400*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7      ; b1
3401*c0909341SAndroid Build Coastguard Worker    paddd                m8, m15
3402*c0909341SAndroid Build Coastguard Worker    paddd                m9, m15
3403*c0909341SAndroid Build Coastguard Worker    paddd                m8, m3
3404*c0909341SAndroid Build Coastguard Worker    paddd                m9, m4
3405*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3406*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3407*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
3408*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
3409*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5
3410*c0909341SAndroid Build Coastguard Worker    paddd                m9, m6
3411*c0909341SAndroid Build Coastguard Worker    movu                xm5, [r5+strideq*0]
3412*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r5+strideq*1], 1
3413*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufA]
3414*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [subpel_h_shufB]
3415*c0909341SAndroid Build Coastguard Worker    movu                xm6, [r5+strideq*0+16]
3416*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r5+strideq*1+16], 1
3417*c0909341SAndroid Build Coastguard Worker    vextracti128       [r7], m0, 1
3418*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5, m7  ; 01
3419*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m10     ; 23
3420*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m11
3421*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12
3422*c0909341SAndroid Build Coastguard Worker    paddd                m0, m15
3423*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
3424*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m6, m7  ; 89
3425*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m10     ; ab
3426*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13
3427*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14
3428*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15
3429*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5
3430*c0909341SAndroid Build Coastguard Worker    movu                xm5, [r5+strideq*0+8]
3431*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r5+strideq*1+8], 1
3432*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3433*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m5, m7
3434*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m10
3435*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m13, m7
3436*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m11
3437*c0909341SAndroid Build Coastguard Worker    paddd                m0, m10
3438*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
3439*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m14, m5
3440*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12
3441*c0909341SAndroid Build Coastguard Worker    paddd                m0, m7
3442*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
3443*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [r7]
3444*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [v_mul+4*3]
3445*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
3446*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3447*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m5
3448*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120 ; 7 8
3449*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m7, 0x04  ; 6 7
3450*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7    ; 67
3451*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7        ; 78
3452*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m10, m5   ; a3
3453*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m6        ; b3
3454*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8
3455*c0909341SAndroid Build Coastguard Worker    paddd                m9, m10
3456*c0909341SAndroid Build Coastguard Worker    psrad                m7, 6
3457*c0909341SAndroid Build Coastguard Worker    psrad                m9, 6
3458*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9
3459*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, q3120
3460*c0909341SAndroid Build Coastguard Worker    mova          [r7+r8*0], xm7
3461*c0909341SAndroid Build Coastguard Worker    vextracti128  [r7+r8*2], m7, 1
3462*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+r8*4]
3463*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3464*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
3465*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
3466*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3467*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
3468*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
3469*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
3470*c0909341SAndroid Build Coastguard Worker%if WIN64
3471*c0909341SAndroid Build Coastguard Worker    POP                  r8
3472*c0909341SAndroid Build Coastguard Worker%endif
3473*c0909341SAndroid Build Coastguard Worker    RET
3474*c0909341SAndroid Build Coastguard Worker
3475*c0909341SAndroid Build Coastguard Worker%macro movifprep 2 ; dst, src -- emit `mov dst, src` only when isprep is nonzero; expands to nothing otherwise
3476*c0909341SAndroid Build Coastguard Worker %if isprep
3477*c0909341SAndroid Build Coastguard Worker    mov %1, %2
3478*c0909341SAndroid Build Coastguard Worker %endif
3479*c0909341SAndroid Build Coastguard Worker%endmacro
3480*c0909341SAndroid Build Coastguard Worker
3481*c0909341SAndroid Build Coastguard Worker%macro REMAP_REG 2 ; to, from (register numbers) -- make r<to>, r<to>q and r<to>d expand to the current meaning of r<from>, r<from>q and r<from>d
3482*c0909341SAndroid Build Coastguard Worker %xdefine r%1  r%2
3483*c0909341SAndroid Build Coastguard Worker %xdefine r%1q r%2q
3484*c0909341SAndroid Build Coastguard Worker %xdefine r%1d r%2d
3485*c0909341SAndroid Build Coastguard Worker%endmacro
3486*c0909341SAndroid Build Coastguard Worker
3487*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 ; prep only: shift the register names down by one so that rN refers to r(N-1), for N = 14..1
3488*c0909341SAndroid Build Coastguard Worker %if isprep ; no-op for put; presumably prep needs the shift because its argument list has one fewer leading argument than put's -- confirm against the cglobal declarations
3489*c0909341SAndroid Build Coastguard Worker  %xdefine r14_save r14 ; remember the current r14 so MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT can restore it
3490*c0909341SAndroid Build Coastguard Worker  %assign %%i 14
3491*c0909341SAndroid Build Coastguard Worker  %rep 14 ; descending order: each r(N-1) is read before it is itself redefined
3492*c0909341SAndroid Build Coastguard Worker   %assign %%j %%i-1
3493*c0909341SAndroid Build Coastguard Worker   REMAP_REG %%i, %%j
3494*c0909341SAndroid Build Coastguard Worker   %assign %%i %%i-1
3495*c0909341SAndroid Build Coastguard Worker  %endrep
3496*c0909341SAndroid Build Coastguard Worker %endif
3497*c0909341SAndroid Build Coastguard Worker%endmacro
3498*c0909341SAndroid Build Coastguard Worker
3499*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 ; prep only: undo MCT_8TAP_SCALED_REMAP_REGS_TO_PREV so that rN names register N again
3500*c0909341SAndroid Build Coastguard Worker %if isprep ; no-op for put, where no remapping was applied
3501*c0909341SAndroid Build Coastguard Worker  %assign %%i 1
3502*c0909341SAndroid Build Coastguard Worker  %rep 13 ; ascending order, N = 1..13: r(N+1) still holds the shifted name (original rN) when it is read
3503*c0909341SAndroid Build Coastguard Worker   %assign %%j %%i+1
3504*c0909341SAndroid Build Coastguard Worker   REMAP_REG %%i, %%j
3505*c0909341SAndroid Build Coastguard Worker   %assign %%i %%i+1
3506*c0909341SAndroid Build Coastguard Worker  %endrep
3507*c0909341SAndroid Build Coastguard Worker  %xdefine r14 r14_save ; r14 cannot be recovered from the chain, so take the copy saved by the TO_PREV macro (only the plain r14 name is restored here)
3508*c0909341SAndroid Build Coastguard Worker  %undef r14_save
3509*c0909341SAndroid Build Coastguard Worker %endif
3510*c0909341SAndroid Build Coastguard Worker%endmacro
3511*c0909341SAndroid Build Coastguard Worker
3512*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged (default 1) -- return from the function, then optionally re-apply the prep register remapping for the code that follows
3513*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT ; RET is emitted under the default register names (presumably its epilogue refers to registers by number -- confirm in x86inc)
3514*c0909341SAndroid Build Coastguard Worker    RET
3515*c0909341SAndroid Build Coastguard Worker %if %1 ; nonzero: code after this return still sees the shifted (prep) mapping
3516*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3517*c0909341SAndroid Build Coastguard Worker %endif
3518*c0909341SAndroid Build Coastguard Worker%endmacro
3519*c0909341SAndroid Build Coastguard Worker
3520*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd -- filter 8 gathered columns of 2 consecutive rows; packed result in m%1; clobbers m%2-m%8 (and m10 if load_hrnd); advances srcq by 2*ssq
3521*c0909341SAndroid Build Coastguard Worker    movu               xm%1, [srcq+ r4*2] ; row 0, low lanes: 16 bytes at offsets r4, r6, r7, r9 (scaled by 2, i.e. 2-byte elements)
3522*c0909341SAndroid Build Coastguard Worker    movu               xm%2, [srcq+ r6*2]
3523*c0909341SAndroid Build Coastguard Worker    movu               xm%3, [srcq+ r7*2]
3524*c0909341SAndroid Build Coastguard Worker    movu               xm%4, [srcq+ r9*2]
3525*c0909341SAndroid Build Coastguard Worker    vinserti128         m%1, [srcq+r10*2], 1 ; row 0, high lanes: offsets r10, r11, r13, rX
3526*c0909341SAndroid Build Coastguard Worker    vinserti128         m%2, [srcq+r11*2], 1
3527*c0909341SAndroid Build Coastguard Worker    vinserti128         m%3, [srcq+r13*2], 1
3528*c0909341SAndroid Build Coastguard Worker    vinserti128         m%4, [srcq+ rX*2], 1
3529*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; step to the next source row
3530*c0909341SAndroid Build Coastguard Worker    movu               xm%5, [srcq+ r4*2] ; row 1: same eight offsets as row 0
3531*c0909341SAndroid Build Coastguard Worker    movu               xm%6, [srcq+ r6*2]
3532*c0909341SAndroid Build Coastguard Worker    movu               xm%7, [srcq+ r7*2]
3533*c0909341SAndroid Build Coastguard Worker    movu               xm%8, [srcq+ r9*2]
3534*c0909341SAndroid Build Coastguard Worker    vinserti128         m%5, [srcq+r10*2], 1
3535*c0909341SAndroid Build Coastguard Worker    vinserti128         m%6, [srcq+r11*2], 1
3536*c0909341SAndroid Build Coastguard Worker    vinserti128         m%7, [srcq+r13*2], 1
3537*c0909341SAndroid Build Coastguard Worker    vinserti128         m%8, [srcq+ rX*2], 1
3538*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; srcq now points two rows below where it started
3539*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m12 ; multiply each loaded chunk by its multiplier register (m12-m15, presumably the per-column filter taps -- set by the caller)
3540*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m13
3541*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m14
3542*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m15
3543*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m12 ; row 1 reuses the same multipliers
3544*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%6, m13
3545*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%7, m14
3546*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%8, m15
3547*c0909341SAndroid Build Coastguard Worker    phaddd              m%1, m%2 ; first of two horizontal-add levels that reduce each chunk's 4 partial sums to one dword
3548*c0909341SAndroid Build Coastguard Worker %if %9 ; load_hrnd: fetch the value to add from the stack (presumably the horizontal rounding constant spilled at [rsp+0x00] -- confirm at call sites)
3549*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x00]
3550*c0909341SAndroid Build Coastguard Worker %endif
3551*c0909341SAndroid Build Coastguard Worker    phaddd              m%3, m%4
3552*c0909341SAndroid Build Coastguard Worker    phaddd              m%5, m%6
3553*c0909341SAndroid Build Coastguard Worker    phaddd              m%7, m%8
3554*c0909341SAndroid Build Coastguard Worker    phaddd              m%1, m%3 ; m%1 = row 0: one 32-bit sum per column (4 per lane)
3555*c0909341SAndroid Build Coastguard Worker    phaddd              m%5, m%7 ; m%5 = row 1: one 32-bit sum per column
3556*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m10 ; add m10 to every sum (the caller must have set m10 when load_hrnd is 0)
3557*c0909341SAndroid Build Coastguard Worker    paddd               m%5, m10
3558*c0909341SAndroid Build Coastguard Worker    psrad               m%1, xm11 ; arithmetic right shift by the count held in xm11
3559*c0909341SAndroid Build Coastguard Worker    psrad               m%5, xm11
3560*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%5 ; saturate to words; each lane holds row 0's four columns followed by row 1's four columns
3561*c0909341SAndroid Build Coastguard Worker%endmacro
3562*c0909341SAndroid Build Coastguard Worker
3563*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED 1
3564*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
3565*c0909341SAndroid Build Coastguard Worker %assign isput  1
3566*c0909341SAndroid Build Coastguard Worker %assign isprep 0
3567*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
3568*c0909341SAndroid Build Coastguard Worker %xdefine base_reg r12
3569*c0909341SAndroid Build Coastguard Worker    mov                 r7d, pxmaxm
3570*c0909341SAndroid Build Coastguard Worker%else
3571*c0909341SAndroid Build Coastguard Worker %assign isput  0
3572*c0909341SAndroid Build Coastguard Worker %assign isprep 1
3573*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
3574*c0909341SAndroid Build Coastguard Worker  %define tmp_stridem qword [rsp+0xd0]
3575*c0909341SAndroid Build Coastguard Worker %xdefine base_reg r11
3576*c0909341SAndroid Build Coastguard Worker%endif
3577*c0909341SAndroid Build Coastguard Worker    lea            base_reg, [%1_8tap_scaled_16bpc_avx2]
3578*c0909341SAndroid Build Coastguard Worker%define base base_reg-%1_8tap_scaled_16bpc_avx2
3579*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3580*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
3581*c0909341SAndroid Build Coastguard Worker%if isprep && UNIX64
3582*c0909341SAndroid Build Coastguard Worker    movd               xm10, mxd
3583*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, xm10
3584*c0909341SAndroid Build Coastguard Worker    mov                 r5d, t0d
3585*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 5, 7
3586*c0909341SAndroid Build Coastguard Worker    mov                 r6d, pxmaxm
3587*c0909341SAndroid Build Coastguard Worker%else
3588*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, mxm
3589*c0909341SAndroid Build Coastguard Worker %if isput
3590*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, pxmaxm
3591*c0909341SAndroid Build Coastguard Worker %else
3592*c0909341SAndroid Build Coastguard Worker    mov                 r6d, pxmaxm
3593*c0909341SAndroid Build Coastguard Worker %endif
3594*c0909341SAndroid Build Coastguard Worker%endif
3595*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
3596*c0909341SAndroid Build Coastguard Worker%if isput
3597*c0909341SAndroid Build Coastguard Worker %if WIN64
3598*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hm
3599*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
3600*c0909341SAndroid Build Coastguard Worker  %define hm r5m
3601*c0909341SAndroid Build Coastguard Worker  %define dxm r8m
3602*c0909341SAndroid Build Coastguard Worker %else
3603*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
3604*c0909341SAndroid Build Coastguard Worker  %define hm r6m
3605*c0909341SAndroid Build Coastguard Worker %endif
3606*c0909341SAndroid Build Coastguard Worker %define dsm [rsp+0x98]
3607*c0909341SAndroid Build Coastguard Worker %define rX r1
3608*c0909341SAndroid Build Coastguard Worker %define rXd r1d
3609*c0909341SAndroid Build Coastguard Worker%else ; prep
3610*c0909341SAndroid Build Coastguard Worker %if WIN64
3611*c0909341SAndroid Build Coastguard Worker    mov                 r7d, hm
3612*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
3613*c0909341SAndroid Build Coastguard Worker  %define hm r4m
3614*c0909341SAndroid Build Coastguard Worker  %define dxm r7m
3615*c0909341SAndroid Build Coastguard Worker %else
3616*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
3617*c0909341SAndroid Build Coastguard Worker  %define hm [rsp+0x98]
3618*c0909341SAndroid Build Coastguard Worker %endif
3619*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3620*c0909341SAndroid Build Coastguard Worker %define rX r14
3621*c0909341SAndroid Build Coastguard Worker %define rXd r14d
3622*c0909341SAndroid Build Coastguard Worker%endif
3623*c0909341SAndroid Build Coastguard Worker    shr                 r7d, 11
3624*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pd_0x3ff]
3625*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+s_8tap_h_rnd+r7*4]
3626*c0909341SAndroid Build Coastguard Worker    movd                xm7, [base+s_8tap_h_sh+r7*4]
3627*c0909341SAndroid Build Coastguard Worker%if isput
3628*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+put_s_8tap_v_rnd+r7*4]
3629*c0909341SAndroid Build Coastguard Worker    pinsrd              xm7, [base+put_s_8tap_v_sh+r7*4], 2
3630*c0909341SAndroid Build Coastguard Worker%else
3631*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+pd_m524256]
3632*c0909341SAndroid Build Coastguard Worker%endif
3633*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
3634*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
3635*c0909341SAndroid Build Coastguard Worker    movzx               r7d, t1b
3636*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 16
3637*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3638*c0909341SAndroid Build Coastguard Worker    cmovs               t1d, r7d
3639*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
3640*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 1024
3641*c0909341SAndroid Build Coastguard Worker    je .dy1
3642*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 2048
3643*c0909341SAndroid Build Coastguard Worker    je .dy2
3644*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
3645*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
3646*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3647*c0909341SAndroid Build Coastguard Worker%if isput
; .w2: set-up for the 2-pixel-wide case (assembled only when isput).
; Filters the first 8 source rows horizontally with 4 taps per column and
; leaves them as interleaved row pairs for .w2_loop:
;   xm3 = 01 12, xm0 = 23 34, xm2 = 45 56, xm4 = 67 __, xm1 = rows 4-7.
; Register roles are inferred from their use here and from the comment in
; .w_start: m10 = mx, m12 = horizontal rounding, xm7 = horizontal shift.
; NOTE(review): m6 is presumably the 0x3ff mask and m9 all-zero on entry
; (.hloop_prep sets them up that way) - confirm against the prologue.
3648*c0909341SAndroid Build Coastguard Worker.w2:
3649*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
3650*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
3651*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
3652*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
3653*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m8
3654*c0909341SAndroid Build Coastguard Worker    paddd               m10, m8 ; mx+dx*[0,1]
    ; xm14 = taps substituted for columns whose filter index is zero
    ; (applied by the pblendvb further down)
3655*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm14, [base+pq_0x40000000+2]
3656*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
    ; per-column filter index = (position & m6) >> 6, offset by the low byte of t0
3657*c0909341SAndroid Build Coastguard Worker    pand                xm8, xm10, xm6
3658*c0909341SAndroid Build Coastguard Worker    psrld               xm8, 6
3659*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8
3660*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
3661*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
3662*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_q]
3663*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_s_shuf2]
    ; fetch 4 bytes starting at byte 2 of each 8-byte filter entry
3664*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, [base+subpel_filters+r4*8+2]
3665*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+subpel_filters+r6*8+2]
    ; xm8 = lanes whose (position & m6) >> 6 equals m9
3666*c0909341SAndroid Build Coastguard Worker    pcmpeqd             xm8, xm9
    ; m10 = (position >> 10) * 2
3667*c0909341SAndroid Build Coastguard Worker    psrld               m10, 10
3668*c0909341SAndroid Build Coastguard Worker    paddd               m10, m10
3669*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
3670*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
3671*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*2]
3672*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ss3q ]
3673*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; turn the offsets into the pshufb control used to gather each column's pixels
3674*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m5
3675*c0909341SAndroid Build Coastguard Worker    paddb               m10, m6
    ; merge the two columns' taps (dwords 1 and 3 come from xm4), apply the
    ; xm8-masked substitution, then sign-extend the taps to words
3676*c0909341SAndroid Build Coastguard Worker    vpblendd           xm15, xm4, 0xa
3677*c0909341SAndroid Build Coastguard Worker    pblendvb           xm15, xm14, xm8
3678*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
3679*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0], 1 ; 0 4
3680*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1], 1 ; 1 5
3681*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*2], 1 ; 2 6
3682*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ss3q ], 1 ; 3 7
3683*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; horizontal filter of rows 0-7: gather, multiply-add, reduce, round, shift
3684*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m10}, m0, m1, m2, m3
3685*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
3686*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
3687*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3
3688*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
3689*c0909341SAndroid Build Coastguard Worker    paddd                m2, m12
3690*c0909341SAndroid Build Coastguard Worker    psrad                m0, xm7
3691*c0909341SAndroid Build Coastguard Worker    psrad                m2, xm7
3692*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2             ; 0 1 2 3  4 5 6 7
    ; interleave neighbouring rows so one pmaddwd applies two vertical taps
3693*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3694*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
3695*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0, xm2       ; 01 12
3696*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm2            ; 23 34
3697*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm1, q0321     ; 5 6 7 _
3698*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm4       ; 45 56
3699*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm4, xm1, xm4       ; 67 __
; .w2_loop: produce one 2-pixel output row per iteration.
; Applies the vertical 8-tap filter to the interleaved row pairs in
; xm3/xm0/xm2/xm4, stores 4 bytes at dstq, then advances my by dy. When the
; position crosses into a new source row, the window is shifted by one row
; here or by two rows in .w2_skip_line.
3700*c0909341SAndroid Build Coastguard Worker.w2_loop:
3701*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
    ; default taps: a qword whose only non-zero byte is byte 3 (= 64)
3702*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24
3703*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
3704*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
    ; lea leaves the flags from shr intact, so the cmovnz keeps the default
    ; taps when (my & 0x3ff) >> 6 is zero and loads the table entry otherwise
3705*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]
3706*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]
3707*c0909341SAndroid Build Coastguard Worker    movq               xm14, r6q
3708*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xm14, xm14
    ; broadcast each pair of taps and multiply-accumulate against the row pairs
3709*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm14, q0000
3710*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm14, q1111
3711*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm3, xm8
3712*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm0, xm9
3713*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm14, q2222
3714*c0909341SAndroid Build Coastguard Worker    pshufd             xm14, xm14, q3333
3715*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm6
3716*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm2, xm8
3717*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm8, xm4, xm14
    ; the shift count for this stage is taken from the upper qword of xm7
3718*c0909341SAndroid Build Coastguard Worker    psrldq              xm9, xm7, 8
3719*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm6
    ; xm13 is added before the shift (presumably the vertical rounding term)
3720*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm13
3721*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm8
3722*c0909341SAndroid Build Coastguard Worker    psrad               xm5, xm9
    ; pack with unsigned saturation, then clamp against xm11 (pxmax per the
    ; .w_start comment)
3723*c0909341SAndroid Build Coastguard Worker    packusdw            xm5, xm5
3724*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm11
3725*c0909341SAndroid Build Coastguard Worker    movd             [dstq], xm5
3726*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
3727*c0909341SAndroid Build Coastguard Worker    dec                  hd
3728*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; step the vertical position; if no bit above bit 9 is set, the same
    ; source rows are reused
3729*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
3730*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
3731*c0909341SAndroid Build Coastguard Worker    jz .w2_loop
3732*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq]
    ; bit 10 set -> advance one row (below); clear -> .w2_skip_line
3733*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
3734*c0909341SAndroid Build Coastguard Worker    jz .w2_skip_line
3735*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; slide the pair window down by one row
3736*c0909341SAndroid Build Coastguard Worker    shufps              xm3, xm0, q1032     ; 01 12
3737*c0909341SAndroid Build Coastguard Worker    shufps              xm0, xm2, q1032     ; 23 34
3738*c0909341SAndroid Build Coastguard Worker    shufps              xm2, xm4, q1032     ; 45 56
    ; horizontally filter the newly loaded row
3739*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm10
3740*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm15
3741*c0909341SAndroid Build Coastguard Worker    phaddd              xm5, xm5
3742*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm12
3743*c0909341SAndroid Build Coastguard Worker    psrad               xm5, xm7
3744*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5
    ; append the new row to the row history kept in xm1 and rebuild the last pair
3745*c0909341SAndroid Build Coastguard Worker    palignr             xm1, xm5, xm1, 12
3746*c0909341SAndroid Build Coastguard Worker    punpcklqdq          xm1, xm1            ; 6 7 6 7
3747*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xm1, xm5       ; 67 __
3748*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
; .w2_skip_line: advance the 2-wide row window by two source rows.
; Entered from .w2_loop with xm5 already holding the row at [srcq]. Loads
; the following row into xm6, filters both horizontally, and rebuilds the
; row pairs before returning to .w2_loop.
3749*c0909341SAndroid Build Coastguard Worker.w2_skip_line:
3750*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ssq*1]
3751*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
    ; each pair register takes over the contents of the next one
3752*c0909341SAndroid Build Coastguard Worker    mova                xm3, xm0            ; 01 12
3753*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm2            ; 23 34
    ; horizontal filter of the two new rows (same steps as the .w2 set-up)
3754*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm10
3755*c0909341SAndroid Build Coastguard Worker    pshufb              xm6, xm10
3756*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm15
3757*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm15
3758*c0909341SAndroid Build Coastguard Worker    phaddd              xm5, xm6
3759*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm12
3760*c0909341SAndroid Build Coastguard Worker    psrad               xm5, xm7
3761*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5            ; 6 7 6 7
    ; keep the two newest old rows from xm1 and append the two new ones
3762*c0909341SAndroid Build Coastguard Worker    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
3763*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm1, q0321     ; 5 6 7 _
3764*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm5       ; 45 56
3765*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm4, xm1, xm5       ; 67 __
3766*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
3767*c0909341SAndroid Build Coastguard Worker%endif
; .w4: set-up for the 4-pixel-wide case (put and prep).
; Spills the rounding/shift values to the stack, builds per-column filters
; and shuffle controls, filters the first 8 source rows horizontally, and
; leaves the row pairs split between registers and stack for .w4_loop:
;   xm0..xm3 = 01 23 45 67, [rsp+0x40/0x50/0x60] = 12 34 56, xm10 = row 7.
; SWAP is the x86inc register-renaming macro; it is understood to emit no
; instructions, only to change which register a name refers to.
3768*c0909341SAndroid Build Coastguard Worker.w4:
3769*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
    ; m12 is reloaded from here as the horizontal rounding term
3770*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m12
3771*c0909341SAndroid Build Coastguard Worker%if isput
    ; put: xm13 is reloaded in .w4_loop and added before the final shift
3772*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], xm13
3773*c0909341SAndroid Build Coastguard Worker%else
    ; prep: keep that value in a register under another name instead
3774*c0909341SAndroid Build Coastguard Worker    SWAP                m11, m13
3775*c0909341SAndroid Build Coastguard Worker%endif
    ; xm7 spill: dword at +0x30 is read back as the horizontal shift, and the
    ; dword at +0x38 as the final shift in the put path of .w4_loop
3776*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], xm7
3777*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+rescale_mul]
3778*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
3779*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
3780*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
3781*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m7
    ; m2 = taps substituted for columns whose filter index is zero
3782*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+pq_0x40000000+1]
3783*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
    ; from here on the position vector is referred to as m13
3784*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m10
3785*c0909341SAndroid Build Coastguard Worker    paddd               m13, m8 ; mx+dx*[0-3]
    ; per-column filter index = (position & m6) >> 6, offset by the low byte of t0
3786*c0909341SAndroid Build Coastguard Worker    pand                 m6, m13
3787*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
3788*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm6
3789*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
3790*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
3791*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm15, 2
3792*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm15, 3
3793*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
3794*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+bdct_lb_q+16]
3795*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [base+subpel_s_shuf2]
    ; fetch 4 bytes starting at byte 2 of each column's 8-byte filter entry
3796*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
3797*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
3798*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
3799*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
    ; mask of lanes equal to m9, with each dword mask duplicated so it spans
    ; the taps once they are widened to words (m10 and m6 are the two halves)
3800*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
3801*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m6, m6
3802*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m6
    ; m13 = (position >> 10) * 2
3803*c0909341SAndroid Build Coastguard Worker    psrld               m13, 10
3804*c0909341SAndroid Build Coastguard Worker    paddd               m13, m13
    ; m14 = taps for columns 0/1, m15 = taps for columns 2/3 (sign-extended),
    ; with the masked substitution from m2 applied
3805*c0909341SAndroid Build Coastguard Worker    vpblendd           xm14, xm7, 0xa
3806*c0909341SAndroid Build Coastguard Worker    vpblendd           xm15, xm8, 0xa
3807*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
3808*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
3809*c0909341SAndroid Build Coastguard Worker    pblendvb            m14, m2, m10
3810*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m2, m6
    ; r4 = offset of column 2; every row is loaded a second time at srcq+r4
    ; (r6/r11/r13 = r4 plus 1/2/3 strides) to feed columns 2 and 3
3811*c0909341SAndroid Build Coastguard Worker    pextrd               r4, xm13, 2
3812*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m13, m5
3813*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m1
3814*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+ssq*1]
3815*c0909341SAndroid Build Coastguard Worker    lea                 r11, [r4+ssq*2]
3816*c0909341SAndroid Build Coastguard Worker    lea                 r13, [r4+ss3q ]
3817*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+ssq*0]
3818*c0909341SAndroid Build Coastguard Worker    movu                xm9, [srcq+ssq*1]
3819*c0909341SAndroid Build Coastguard Worker    movu                xm8, [srcq+ssq*2]
3820*c0909341SAndroid Build Coastguard Worker    movu               xm10, [srcq+ss3q ]
3821*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+r4   ]
3822*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+r6   ]
3823*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+r11  ]
3824*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+r13  ]
3825*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; rows 4-7 go into the upper 128-bit lanes
3826*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+ssq*0], 1
3827*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [srcq+ssq*1], 1
3828*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [srcq+ssq*2], 1
3829*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [srcq+ss3q ], 1
3830*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+r4   ], 1
3831*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+r6   ], 1
3832*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+r11  ], 1
3833*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r13  ], 1
3834*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; subtract m13's lowest byte from all of its bytes (presumably to make the
    ; indices relative to the loads taken at srcq+r4), then add the base
    ; shuffle pattern to both controls
3835*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, xm13
3836*c0909341SAndroid Build Coastguard Worker    psubb               m13, m5
3837*c0909341SAndroid Build Coastguard Worker    paddb               m12, m0
3838*c0909341SAndroid Build Coastguard Worker    paddb               m13, m0
    ; horizontal filter: m12/m14 for columns 0-1, m13/m15 for columns 2-3
3839*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m7, m9, m8, m10
3840*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m14}, m7, m9, m8, m10
3841*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m13}, m1, m2, m3, m4
3842*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
    ; reload the values spilled at the top of this block
3843*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x00]
3844*c0909341SAndroid Build Coastguard Worker    movd                xm6, [rsp+0x30]
3845*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m1
3846*c0909341SAndroid Build Coastguard Worker    phaddd               m9, m3
3847*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m2
3848*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m4
3849*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m7, m9, m8, m10
3850*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m7, m9, m8, m10
3851*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9                 ; 0 1  4 5
3852*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m10                ; 2 3  6 7
3853*c0909341SAndroid Build Coastguard Worker    vextracti128        xm9, m7, 1              ; 4 5
3854*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m8, 1              ; 6 7
3855*c0909341SAndroid Build Coastguard Worker    shufps              xm4, xm7, xm8, q1032    ; 1 2
3856*c0909341SAndroid Build Coastguard Worker    shufps              xm5, xm8, xm9, q1032    ; 3 4
3857*c0909341SAndroid Build Coastguard Worker    shufps              xm6, xm9, xm3, q1032    ; 5 6
3858*c0909341SAndroid Build Coastguard Worker    psrldq             xm10, xm3, 8             ; 7 _
3859*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm7, xm4   ; 01
3860*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm7, xm4        ; 12
3861*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm8, xm5   ; 23
3862*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm8, xm5        ; 34
3863*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm9, xm6   ; 45
3864*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm9, xm6        ; 56
3865*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm10       ; 67
    ; odd pairs (12, 34, 56) are kept on the stack for the one-row advance
3866*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], xm7
3867*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm8
3868*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], xm9
; .w4_loop: produce one 4-pixel output row per iteration.
; Applies the vertical 8-tap filter to the pairs in xm0..xm3 (01 23 45 67).
; put:  add the value saved at [rsp+0x20], shift by the dword at
;       [rsp+0x38], pack with unsigned saturation, clamp against xm11 and
;       store 8 bytes to dstq.
; prep: add the register kept via the SWAPs, shift by 6, pack with signed
;       saturation and store 8 bytes to tmpq.
; Then advances my by dy and, when a new source row is needed, shifts the
; row window by one row here or by two rows in .w4_skip_line.
3869*c0909341SAndroid Build Coastguard Worker.w4_loop:
3870*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
    ; default taps: a qword whose only non-zero byte is byte 3 (= 64)
3871*c0909341SAndroid Build Coastguard Worker    mov                r11d, 64 << 24
3872*c0909341SAndroid Build Coastguard Worker    mov                r13d, myd
3873*c0909341SAndroid Build Coastguard Worker    shr                r13d, 6
    ; lea leaves the flags from shr intact, so the cmovnz keeps the default
    ; taps when (my & 0x3ff) >> 6 is zero and loads the table entry otherwise
3874*c0909341SAndroid Build Coastguard Worker    lea                r13d, [t1+r13]
3875*c0909341SAndroid Build Coastguard Worker    cmovnz             r11q, [base+subpel_filters+r13*8]
3876*c0909341SAndroid Build Coastguard Worker    movq                xm9, r11q
3877*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm9, xm9
    ; broadcast each pair of taps and multiply against the matching row pair
3878*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm9, q0000
3879*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm9, q1111
3880*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm0, xm7
3881*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm1, xm8
3882*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm9, q2222
3883*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm9, q3333
3884*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm2, xm7
3885*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm8, xm3, xm9
3886*c0909341SAndroid Build Coastguard Worker%if isput
    ; put: reload the values spilled by .w4
3887*c0909341SAndroid Build Coastguard Worker    mova                xm7, [rsp+0x20]
3888*c0909341SAndroid Build Coastguard Worker    movd                xm9, [rsp+0x38]
3889*c0909341SAndroid Build Coastguard Worker%else
    ; prep: rename so that "xm7" in the paddd below refers to the register
    ; .w4 set aside with SWAP m11, m13; the rename is undone further down
3890*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m11
3891*c0909341SAndroid Build Coastguard Worker%endif
3892*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm5
3893*c0909341SAndroid Build Coastguard Worker    paddd               xm6, xm8
3894*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm6
3895*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm7
3896*c0909341SAndroid Build Coastguard Worker%if isput
3897*c0909341SAndroid Build Coastguard Worker    psrad               xm4, xm9
3898*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm4
3899*c0909341SAndroid Build Coastguard Worker    pminuw              xm4, xm11
3900*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xm4
3901*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
3902*c0909341SAndroid Build Coastguard Worker%else
3903*c0909341SAndroid Build Coastguard Worker    SWAP                m11, m7
3904*c0909341SAndroid Build Coastguard Worker    psrad               xm4, 6
3905*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
3906*c0909341SAndroid Build Coastguard Worker    movq             [tmpq], xm4
3907*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3908*c0909341SAndroid Build Coastguard Worker%endif
3909*c0909341SAndroid Build Coastguard Worker    dec                  hd
3910*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; step the vertical position; if no bit above bit 9 is set, the same
    ; source rows are reused
3911*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
3912*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
3913*c0909341SAndroid Build Coastguard Worker    jz .w4_loop
    ; reload the horizontal rounding/shift and fetch the next row at both
    ; column offsets; these are also consumed by .w4_skip_line
3914*c0909341SAndroid Build Coastguard Worker    mova                xm8, [rsp+0x00]
3915*c0909341SAndroid Build Coastguard Worker    movd                xm9, [rsp+0x30]
3916*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq]
3917*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+r4]
    ; bit 10 set -> advance one row (below); clear -> .w4_skip_line
3918*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
3919*c0909341SAndroid Build Coastguard Worker    jz .w4_skip_line
    ; one-row advance: exchange the register pairs with the stack pairs, so
    ; registers become 12 34 56 and the stack becomes 23 45 67
3920*c0909341SAndroid Build Coastguard Worker    mova                xm0, [rsp+0x40]
3921*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], xm1
3922*c0909341SAndroid Build Coastguard Worker    mova                xm1, [rsp+0x50]
3923*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm2
3924*c0909341SAndroid Build Coastguard Worker    mova                xm2, [rsp+0x60]
3925*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], xm3
    ; horizontally filter the new row
3926*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm12
3927*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm13
3928*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm14
3929*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm15
3930*c0909341SAndroid Build Coastguard Worker    phaddd              xm4, xm5
3931*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm8
3932*c0909341SAndroid Build Coastguard Worker    psrad               xm4, xm9
3933*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
    ; xm3 = previous newest row (xm10) interleaved with the new row, which
    ; then becomes the newest row
3934*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm10, xm4
3935*c0909341SAndroid Build Coastguard Worker    mova               xm10, xm4
3936*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
3937*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
; .w4_skip_line: advance the 4-wide row window by two source rows.
; Entered from .w4_loop with xm4/xm5 holding the next row (both column
; offsets), xm8 the horizontal rounding and xm9 the horizontal shift.
; Loads one more row, filters both, and shifts the register and stack
; pairs by one pair each before returning to .w4_loop.
3938*c0909341SAndroid Build Coastguard Worker.w4_skip_line:
3939*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ssq*1]
3940*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+r6]
    ; 32-byte load of the two pairs stored at [rsp+0x50] and [rsp+0x60]
3941*c0909341SAndroid Build Coastguard Worker    movu                 m0, [rsp+0x50]
3942*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm12
3943*c0909341SAndroid Build Coastguard Worker    pshufb              xm6, xm12
3944*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm13
3945*c0909341SAndroid Build Coastguard Worker    pshufb              xm7, xm13
3946*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm14
3947*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm14
3948*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm15
3949*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm7, xm15
    ; ...written back one slot lower, to [rsp+0x40] and [rsp+0x50]
3950*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m0
3951*c0909341SAndroid Build Coastguard Worker    phaddd              xm4, xm5
3952*c0909341SAndroid Build Coastguard Worker    phaddd              xm6, xm7
3953*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm8
3954*c0909341SAndroid Build Coastguard Worker    paddd               xm6, xm8
3955*c0909341SAndroid Build Coastguard Worker    psrad               xm4, xm9
3956*c0909341SAndroid Build Coastguard Worker    psrad               xm6, xm9
    ; xm4 = first new row in the low qword, second new row in the high qword
3957*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm6
    ; third stack slot = previous newest row interleaved with the first new row
3958*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm9, xm10, xm4
3959*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], xm9
    ; xm10 = second new row, which is now the newest row
3960*c0909341SAndroid Build Coastguard Worker    psrldq             xm10, xm4, 8
    ; shift the register pairs down by one; xm3 = the two new rows interleaved
3961*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm1
3962*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
3963*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3
3964*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm4, xm10
3965*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3966*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
    ; Never reached at run time (they follow an unconditional jmp). These
    ; SWAPs reverse, in opposite order, the renames made in .w4, so the code
    ; assembled below sees the original register names again.
3967*c0909341SAndroid Build Coastguard Worker    SWAP                m10, m13
3968*c0909341SAndroid Build Coastguard Worker%if isprep
3969*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m11
3970*c0909341SAndroid Build Coastguard Worker%endif
; .w8/.w16/.w32/.w64/.w128: entry points for widths of 8 and above.
; Each stores w/8 in [rsp+0x80] (the counter decremented by .hloop_prep)
; and passes 2*w to movifprep as tmp_stridem (presumably only effective in
; the prep build - the macro is defined outside this view), then continues
; at .w_start. .w128 falls through into .w_start.
3971*c0909341SAndroid Build Coastguard Worker.w8:
3972*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x80], 1
3973*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
3974*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3975*c0909341SAndroid Build Coastguard Worker.w16:
3976*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x80], 2
3977*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
3978*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3979*c0909341SAndroid Build Coastguard Worker.w32:
3980*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x80], 4
3981*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
3982*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3983*c0909341SAndroid Build Coastguard Worker.w64:
3984*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x80], 8
3985*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
3986*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3987*c0909341SAndroid Build Coastguard Worker.w128:
3988*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x80], 16
3989*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; .w_start: shared set-up for widths of 8 and above.
; Renames registers for the 8-column code, saves the values .hloop_prep
; reloads for each strip, and computes the positions of the first 8 columns.
; Stack slots written here:
;   [rsp+0x00] = m10 (h_rnd)     [rsp+0x20] = m13
;   [rsp+0x84] = t0 >> 16        [rsp+0x88] = srcq
;   [rsp+0x90] = r0q (dst/tmp)   [rsp+0xb0] = xm7 (pxmax, put only)
3990*c0909341SAndroid Build Coastguard Worker.w_start:
    ; SWAP only changes which register each name refers to; the resulting
    ; assignment is listed in the comment below
3991*c0909341SAndroid Build Coastguard Worker    SWAP                m10, m12, m1
3992*c0909341SAndroid Build Coastguard Worker    SWAP                m11, m7
3993*c0909341SAndroid Build Coastguard Worker    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
3994*c0909341SAndroid Build Coastguard Worker%if isput
3995*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
3996*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0xb0], xm7
3997*c0909341SAndroid Build Coastguard Worker%endif
3998*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m10
3999*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m13
    ; this path uses the upper 16 bits of t0 (the .w2/.w4 paths use its low byte)
4000*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
4001*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
4002*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul2]
4003*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4004*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0x84], t0d
4005*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0x88], srcq
4006*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0x90], r0q ; dstq / tmpq
4007*c0909341SAndroid Build Coastguard Worker%if UNIX64
    ; save h so .hloop_prep can reload it for each strip
4008*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
4009*c0909341SAndroid Build Coastguard Worker%endif
    ; dxm now holds the position step between two 8-column strips
4010*c0909341SAndroid Build Coastguard Worker    shl           dword dxm, 3 ; dx*8
4011*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xm15
4012*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8 ; mx+dx*[0-7]
4013*c0909341SAndroid Build Coastguard Worker    jmp .hloop
; .hloop_prep: move on to the next strip of 8 columns.
; Decrements the strip counter at [rsp+0x80] and returns when it reaches
; zero. Otherwise advances the saved dst/tmp pointer by 16 bytes, reloads h
; and the source pointer, rebuilds the constants .hloop expects in m6, m9
; and m15, and steps the column positions by dx*8 before falling through
; into .hloop.
4014*c0909341SAndroid Build Coastguard Worker.hloop_prep:
4015*c0909341SAndroid Build Coastguard Worker    dec    dword [rsp+0x80]
4016*c0909341SAndroid Build Coastguard Worker    jz .ret
4017*c0909341SAndroid Build Coastguard Worker    add    qword [rsp+0x90], 16
4018*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
    ; dxm was multiplied by 8 in .w_start
4019*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
4020*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pd_0x3ff]
    ; [rsp+0x40] holds the previous strip's positions, saved by .hloop
4021*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8, [rsp+0x40]
4022*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+0x84]
4023*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
    ; srcq restarts from the pointer saved in .w_start; the strip's horizontal
    ; offset comes from the positions in m1
4024*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+0x88]
4025*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+0x90] ; dstq / tmpq
4026*c0909341SAndroid Build Coastguard Worker.hloop:
4027*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm2, [base+pq_0x40000000]
4028*c0909341SAndroid Build Coastguard Worker    pand                 m5, m1, m6
4029*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
4030*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
4031*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m9
4032*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m15, 1
4033*c0909341SAndroid Build Coastguard Worker    movq                 r6, xm15
4034*c0909341SAndroid Build Coastguard Worker    pextrq               r9, xm15, 1
4035*c0909341SAndroid Build Coastguard Worker    movq                r11, xm7
4036*c0909341SAndroid Build Coastguard Worker    pextrq               rX, xm7, 1
4037*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r6d
4038*c0909341SAndroid Build Coastguard Worker    shr                  r6, 32
4039*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r9d
4040*c0909341SAndroid Build Coastguard Worker    shr                  r9, 32
4041*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
4042*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
4043*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
4044*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
4045*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m1
4046*c0909341SAndroid Build Coastguard Worker    movq               xm12, [base+subpel_filters+ r4*8]
4047*c0909341SAndroid Build Coastguard Worker    movq               xm13, [base+subpel_filters+ r6*8]
4048*c0909341SAndroid Build Coastguard Worker    movhps             xm12, [base+subpel_filters+ r7*8]
4049*c0909341SAndroid Build Coastguard Worker    movhps             xm13, [base+subpel_filters+ r9*8]
4050*c0909341SAndroid Build Coastguard Worker    movq               xm14, [base+subpel_filters+r10*8]
4051*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+subpel_filters+r11*8]
4052*c0909341SAndroid Build Coastguard Worker    movhps             xm14, [base+subpel_filters+r13*8]
4053*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+subpel_filters+ rX*8]
4054*c0909341SAndroid Build Coastguard Worker    psrld                m1, 10
4055*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m1, 1
4056*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m5, 1
4057*c0909341SAndroid Build Coastguard Worker    movq         [rsp+0xa0], xm1
4058*c0909341SAndroid Build Coastguard Worker    movq         [rsp+0xa8], xm7
4059*c0909341SAndroid Build Coastguard Worker    movq                 r6, xm1
4060*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
4061*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm7
4062*c0909341SAndroid Build Coastguard Worker    pextrq               rX, xm7, 1
4063*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r6d
4064*c0909341SAndroid Build Coastguard Worker    shr                  r6, 32
4065*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
4066*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
4067*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r9d
4068*c0909341SAndroid Build Coastguard Worker    shr                  r9, 32
4069*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
4070*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
4071*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm5, q2200
4072*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm5, q3311
4073*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm6, q2200
4074*c0909341SAndroid Build Coastguard Worker    pshufd              xm6, xm6, q3311
4075*c0909341SAndroid Build Coastguard Worker    pblendvb           xm12, xm2, xm4
4076*c0909341SAndroid Build Coastguard Worker    pblendvb           xm13, xm2, xm5
4077*c0909341SAndroid Build Coastguard Worker    pblendvb           xm14, xm2, xm7
4078*c0909341SAndroid Build Coastguard Worker    pblendvb           xm15, xm2, xm6
4079*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m12, xm12
4080*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m13, xm13
4081*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
4082*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
4083*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
4084*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x60], m0
4085*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
4086*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
4087*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
4088*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x60]
4089*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [base+subpel_s_shuf8]
4090*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4091*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
4092*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m9     ; 01a 01b
4093*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9     ; 23a 23b
4094*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9     ; 45a 45b
4095*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9     ; 67a 67b
4096*c0909341SAndroid Build Coastguard Worker.vloop: ; vertical pass: filter the row pairs held in m0-m3 and store one row of 8 words per iteration
4097*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff ; keep only the low 10 bits of my
4098*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24 ; fallback filter (only byte 3 = 64), kept when the cmovnz below is not taken
4099*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
4100*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; filter index from the masked my; ZF is set when it is 0
4101*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4] ; add the table offset in t1 (set up outside this chunk); lea keeps the flags from shr
4102*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8] ; load the 8-byte filter unless the index was 0
4103*c0909341SAndroid Build Coastguard Worker    movq                xm9, r6q
4104*c0909341SAndroid Build Coastguard Worker    punpcklqdq          xm9, xm9 ; duplicate the 8 coefficient bytes into both qwords
4105*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m9, xm9 ; sign-extend to words: the same 8 coefficients in each 128-bit lane
4106*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m9, q0000 ; broadcast coefficient pair 0/1
4107*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m9, q1111 ; broadcast coefficient pair 2/3
4108*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
4109*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m7
4110*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m9, q2222 ; broadcast coefficient pair 4/5
4111*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m9, q3333 ; broadcast coefficient pair 6/7
4112*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m8
4113*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m9
4114*c0909341SAndroid Build Coastguard Worker%if isput
4115*c0909341SAndroid Build Coastguard Worker    psrldq              xm8, xm11, 8 ; final shift count = upper qword of xm11
4116*c0909341SAndroid Build Coastguard Worker%endif
4117*c0909341SAndroid Build Coastguard Worker    paddd                m4, [rsp+0x20] ; add the term stored at [rsp+0x20] outside this chunk (presumably vertical rounding -- TODO confirm)
4118*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
4119*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4120*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6 ; m4 = sum of the four partial products (+ the [rsp+0x20] term)
4121*c0909341SAndroid Build Coastguard Worker%if isput
4122*c0909341SAndroid Build Coastguard Worker    psrad                m4, xm8
4123*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4124*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm5 ; 8 dwords -> 8 words with unsigned saturation
4125*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, [rsp+0xb0] ; clamp against the words kept at [rsp+0xb0] (presumably pixel max -- TODO confirm)
4126*c0909341SAndroid Build Coastguard Worker    mova             [dstq], xm4
4127*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
4128*c0909341SAndroid Build Coastguard Worker%else
4129*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
4130*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4131*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5 ; 8 dwords -> 8 words with signed saturation
4132*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
4133*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
4134*c0909341SAndroid Build Coastguard Worker%endif
4135*c0909341SAndroid Build Coastguard Worker    dec                  hd
4136*c0909341SAndroid Build Coastguard Worker    jz .hloop_prep ; all rows of this column block done (.hloop_prep is outside this chunk)
4137*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd ; step the vertical position
4138*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
4139*c0909341SAndroid Build Coastguard Worker    jz .vloop ; no bit above the low 10 set: reuse m0-m3 unchanged with the new my
4140*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400 ; result is consumed by the jz .skip_line below (mov does not alter flags)
4141*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0x60], myd ; spill my; it is reloaded on both paths below
4142*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [rsp+0xa0] ; reload the four dword offsets stored at [rsp+0xa0..0xaf] during setup
4143*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [rsp+0xa4]
4144*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [rsp+0xa8]
4145*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [rsp+0xac]
4146*c0909341SAndroid Build Coastguard Worker    jz .skip_line ; bit 0x400 clear (so a higher bit is set): take the path that calls MC_8TAP_SCALED_H
4147*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [base+wswap] ; NOTE(review): wswap is defined outside this chunk; presumably swaps the two words of each dword
4148*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ r4*2] ; low lanes: 16 bytes at each of the four reloaded offsets (scaled by 2 bytes per sample)
4149*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ r6*2]
4150*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ r7*2]
4151*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+ r9*2]
4152*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r10*2], 1 ; high lanes: offsets still held in r10/r11/r13/rX from setup
4153*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r11*2], 1
4154*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+r13*2], 1
4155*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+ rX*2], 1
4156*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; advance the source by one row
4157*c0909341SAndroid Build Coastguard Worker    mov                 myd, [rsp+0x60] ; restore my spilled above
4158*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym ; reload dy from its memory home
4159*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m9
4160*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9
4161*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9
4162*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
4163*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m12 ; horizontal filter: m12-m15 hold the word coefficients built during setup
4164*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13
4165*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14
4166*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
4167*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
4168*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
4169*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m6 ; one dword sum per output column
4170*c0909341SAndroid Build Coastguard Worker    paddd                m4, m10 ; presumably the horizontal rounding term -- TODO confirm
4171*c0909341SAndroid Build Coastguard Worker    psrad                m4, xm11 ; shift count = low qword of xm11
4172*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16 ; move the new row's result into the high word of each dword
4173*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m1, 0xaa ; 0xaa: odd-indexed words of the destination are taken from the source
4174*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m2, 0xaa
4175*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m3, 0xaa
4176*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa ; the new row enters the odd words of m3
4177*c0909341SAndroid Build Coastguard Worker    jmp .vloop
4178*c0909341SAndroid Build Coastguard Worker.skip_line: ; shift the row-pair window down by one register and refill m3
4179*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
4180*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4181*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4182*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H      3, 10, 4, 5, 6, 7, 8, 9, 1 ; macro defined outside this chunk; refills m3 (presumably with two new rows -- TODO confirm)
4183*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [base+subpel_s_shuf8] ; same shuffle the setup code applies to m0-m3 before entering .vloop
4184*c0909341SAndroid Build Coastguard Worker    mov                 myd, [rsp+0x60] ; restore my spilled above
4185*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym ; reload dy from its memory home
4186*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
4187*c0909341SAndroid Build Coastguard Worker    jmp .vloop
4188*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m12, m10 ; x86inc SWAP placed after an unconditional jmp; NOTE(review): presumably restores the register naming expected by the .dy1 code -- confirm against the earlier SWAPs
4189*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m11
4190*c0909341SAndroid Build Coastguard Worker.dy1: ; dispatch to a .dy1_w* handler through the 16-bit offset table
4191*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] ; fetch the 16-bit table entry selected by wq (index computed outside this chunk)
4192*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg ; table entries are offsets relative to base_reg
4193*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4194*c0909341SAndroid Build Coastguard Worker%if isput
4195*c0909341SAndroid Build Coastguard Worker.dy1_w2: ; put-only handler: two output columns, two output rows per loop iteration
4196*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym ; load my from its memory home
4197*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b ; keep only the low byte of t0
4198*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2 ; step the source pointer back by 2 bytes
4199*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4200*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m8
4201*c0909341SAndroid Build Coastguard Worker    paddd               m10, m8 ; mx+dx*[0-1]
4202*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm14, [base+pq_0x40000000+2] ; replacement coefficients, blended in below where the xm8 mask is set
4203*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15 ; broadcast the t0 byte to every dword
4204*c0909341SAndroid Build Coastguard Worker    pand                xm8, xm10, xm6 ; mask the positions with xm6 (mask value set up outside this chunk)
4205*c0909341SAndroid Build Coastguard Worker    psrld               xm8, 6
4206*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8 ; per-column index into subpel_filters
4207*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15 ; index for column 0
4208*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1 ; index for column 1
4209*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_q]
4210*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_s_shuf2]
4211*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+subpel_filters+r4*8+2] ; bytes 2..5 of the 8-byte filter entry
4212*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+subpel_filters+r6*8+2]
4213*c0909341SAndroid Build Coastguard Worker    pcmpeqd             xm8, xm9 ; per-column mask where the shifted value equals xm9 (presumably zero -- TODO confirm)
4214*c0909341SAndroid Build Coastguard Worker    psrld               m10, 10
4215*c0909341SAndroid Build Coastguard Worker    paddd               m10, m10 ; (position >> 10) * 2 = byte offset per column
4216*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0] ; rows 0-3 go into the low lanes
4217*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
4218*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*2]
4219*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ss3q ]
4220*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4221*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6 ; vertical filter index; ZF is set when it is 0
4222*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24 ; fallback filter (only byte 3 = 64), kept when the cmovnz below is not taken
4223*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq] ; add the table offset in t1; lea keeps the flags from shr
4224*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8] ; load the 8-byte vertical filter unless the index was 0
4225*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m5
4226*c0909341SAndroid Build Coastguard Worker    paddb               m10, m6 ; m10 = pshufb control used to gather the source words for each column
4227*c0909341SAndroid Build Coastguard Worker    vpblendd           xm15, xm4, 0xa ; dwords 1 and 3 take column 1's coefficients
4228*c0909341SAndroid Build Coastguard Worker    pblendvb           xm15, xm14, xm8 ; masked columns use the xm14 constant instead
4229*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15 ; horizontal coefficients as words
4230*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0], 1 ; rows 4-6 go into the high lanes of m0-m2
4231*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1], 1
4232*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*2], 1
4233*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
4234*c0909341SAndroid Build Coastguard Worker    movq                xm6, r4q
4235*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm6, xm6 ; vertical coefficients as words
4236*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm6, q0000 ; coefficient pair 0/1
4237*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm6, q1111 ; coefficient pair 2/3
4238*c0909341SAndroid Build Coastguard Worker    pshufd             xm14, xm6, q2222 ; coefficient pair 4/5
4239*c0909341SAndroid Build Coastguard Worker    pshufd              xm6, xm6, q3333 ; coefficient pair 6/7
4240*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m10}, m0, m1, m2
4241*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm10
4242*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m1, m2
4243*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm15
4244*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
4245*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3
4246*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12 ; m12 is added before the shift by xm7 (presumably horizontal rounding/shift -- TODO confirm)
4247*c0909341SAndroid Build Coastguard Worker    paddd                m2, m12
4248*c0909341SAndroid Build Coastguard Worker    psrad                m0, xm7
4249*c0909341SAndroid Build Coastguard Worker    psrad                m2, xm7
4250*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
4251*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4252*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm1, xm0, 4
4253*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm1, q2121
4254*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0, xm2       ; 01 12
4255*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm2            ; 23 34
4256*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm4       ; 45 56
4257*c0909341SAndroid Build Coastguard Worker.dy1_w2_loop: ; per iteration: filter two new source rows horizontally, then emit two output rows
4258*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0]
4259*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*1]
4260*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4261*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm10
4262*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm10
4263*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm1, xm15
4264*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm15
4265*c0909341SAndroid Build Coastguard Worker    phaddd              xm1, xm5
4266*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm3, xm8 ; (01 12) * coefficient pair 0/1
4267*c0909341SAndroid Build Coastguard Worker    mova                xm3, xm0 ; slide the window: 23 34 becomes the next 01 12
4268*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm9 ; (23 34) * coefficient pair 2/3
4269*c0909341SAndroid Build Coastguard Worker    paddd               xm1, xm12
4270*c0909341SAndroid Build Coastguard Worker    psrad               xm1, xm7
4271*c0909341SAndroid Build Coastguard Worker    packssdw            xm1, xm1 ; the two new horizontally filtered rows as words
4272*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm0
4273*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm2 ; slide the window: 45 56 becomes the next 23 34
4274*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm14 ; (45 56) * coefficient pair 4/5
4275*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm2
4276*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm1, xm4, 12
4277*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1            ; 67 78
4278*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm2, xm6 ; (67 78) * coefficient pair 6/7
4279*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm13 ; xm13 is added before the final shift (presumably vertical rounding -- TODO confirm)
4280*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm4
4281*c0909341SAndroid Build Coastguard Worker    mova                xm4, xm1 ; keep the newest rows for the next iteration's palignr
4282*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm7, 8 ; final shift count = upper qword of xm7
4283*c0909341SAndroid Build Coastguard Worker    psrad               xm5, xm1
4284*c0909341SAndroid Build Coastguard Worker    packusdw            xm5, xm5
4285*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm11 ; clamp against xm11 (presumably pixel max -- TODO confirm)
4286*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm5 ; 4 bytes (two words) to the first output row
4287*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm5, 1 ; 4 bytes (two words) to the second output row
4288*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4289*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4290*c0909341SAndroid Build Coastguard Worker    jg .dy1_w2_loop
4291*c0909341SAndroid Build Coastguard Worker    RET
4292*c0909341SAndroid Build Coastguard Worker%endif
4293*c0909341SAndroid Build Coastguard Worker.dy1_w4: ; four output columns, two output rows per loop iteration
4294*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym ; load my from its memory home
4295*c0909341SAndroid Build Coastguard Worker%if isput
4296*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm11 ; spilled for the pminsw clamp in the loop
4297*c0909341SAndroid Build Coastguard Worker%endif
4298*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m12 ; spill m12: added before the horizontal shift below
4299*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m13 ; spill m13: added to the vertical sum in the loop
4300*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], xm7 ; spill xm7: its low/high qwords are used as shift counts below
4301*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+rescale_mul]
4302*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b ; keep only the low byte of t0
4303*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2 ; step the source pointer back by 2 bytes
4304*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4305*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m7
4306*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+pq_0x40000000+1] ; replacement coefficients, blended in below where the masks are set
4307*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15 ; broadcast the t0 byte to every dword
4308*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m10
4309*c0909341SAndroid Build Coastguard Worker    paddd               m13, m8 ; mx+dx*[0-3]
4310*c0909341SAndroid Build Coastguard Worker    pand                 m6, m13
4311*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
4312*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm6 ; per-column index into subpel_filters
4313*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15 ; index for column 0
4314*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1 ; index for column 1
4315*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm15, 2 ; index for column 2
4316*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm15, 3 ; index for column 3
4317*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
4318*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+bdct_lb_q+16]
4319*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+subpel_s_shuf2]
4320*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2] ; bytes 2..5 of each 8-byte filter entry
4321*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
4322*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
4323*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
4324*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9 ; per-column mask where the shifted value equals m9 (presumably zero -- TODO confirm)
4325*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m6, m6 ; widen each dword mask to a qword, split across m10 and m6
4326*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m6
4327*c0909341SAndroid Build Coastguard Worker    psrld               m13, 10
4328*c0909341SAndroid Build Coastguard Worker    paddd               m13, m13 ; (position >> 10) * 2 = byte offset per column
4329*c0909341SAndroid Build Coastguard Worker    vpblendd           xm14, xm7, 0xa ; xm14 = coefficients for columns 0/1
4330*c0909341SAndroid Build Coastguard Worker    vpblendd           xm15, xm8, 0xa ; xm15 = coefficients for columns 2/3
4331*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
4332*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
4333*c0909341SAndroid Build Coastguard Worker    pblendvb            m14, m2, m10 ; masked columns use the m2 constant instead
4334*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m2, m6
4335*c0909341SAndroid Build Coastguard Worker    pextrd               r4, xm13, 2 ; byte offset of column 2, used as the base of the second load group
4336*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m13, m5
4337*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m1
4338*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+ssq*2] ; column-2 offset combined with the +2, +1 and +3 row strides
4339*c0909341SAndroid Build Coastguard Worker    lea                 r11, [r4+ssq*1]
4340*c0909341SAndroid Build Coastguard Worker    lea                 r13, [r4+ss3q ]
4341*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0] ; m0-m3 load at srcq; m7-m10 load the same rows at srcq + column-2 offset
4342*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+r4   ]
4343*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*2]
4344*c0909341SAndroid Build Coastguard Worker    movu                xm8, [srcq+r6   ]
4345*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1 ; 0 1
4346*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+r11  ], 1
4347*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ss3q ], 1 ; 2 3
4348*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [srcq+r13  ], 1
4349*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4350*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0]
4351*c0909341SAndroid Build Coastguard Worker    movu                xm9, [srcq+r4   ]
4352*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*2]    ; 6 _
4353*c0909341SAndroid Build Coastguard Worker    movu               xm10, [srcq+r6   ]
4354*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
4355*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [srcq+r11  ], 1
4356*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ss3q ]
4357*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, xm13
4358*c0909341SAndroid Build Coastguard Worker    psubb               m13, m5 ; subtract m13's lowest byte from every byte (loads for m13 already start at the column-2 offset)
4359*c0909341SAndroid Build Coastguard Worker    paddb               m12, m4 ; m12 = pshufb control for the srcq loads
4360*c0909341SAndroid Build Coastguard Worker    paddb               m13, m4 ; m13 = pshufb control for the srcq+r4 loads
4361*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x00] ; reload the m12 value spilled on entry
4362*c0909341SAndroid Build Coastguard Worker    movd                xm6, [rsp+0x40] ; low dword of the spilled xm7 = horizontal shift count
4363*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m12
4364*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
4365*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m14
4366*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m14
4367*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m13
4368*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m13
4369*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
4370*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m15
4371*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12
4372*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm12
4373*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m14
4374*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm14
4375*c0909341SAndroid Build Coastguard Worker    pshufb               m9, m13
4376*c0909341SAndroid Build Coastguard Worker    pshufb             xm10, xm13
4377*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m15
4378*c0909341SAndroid Build Coastguard Worker    pmaddwd            xm10, xm15
4379*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m7 ; combine both load groups: one dword sum per column
4380*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m8
4381*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m9
4382*c0909341SAndroid Build Coastguard Worker    phaddd              xm3, xm10
4383*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
4384*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
4385*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
4386*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm5
4387*c0909341SAndroid Build Coastguard Worker    psrad                m0, xm6
4388*c0909341SAndroid Build Coastguard Worker    psrad                m1, xm6
4389*c0909341SAndroid Build Coastguard Worker    psrad                m2, xm6
4390*c0909341SAndroid Build Coastguard Worker    psrad               xm3, xm6
4391*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m0, m1, 0x21 ; 1 2
4392*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m1, m2, 0x21 ; 3 4
4393*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m2, m3, 0x21 ; 5 6
4394*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6 ; vertical filter index; ZF is set when it is 0
4395*c0909341SAndroid Build Coastguard Worker    mov                r13d, 64 << 24 ; fallback filter (only byte 3 = 64), kept when the cmovnz below is not taken
4396*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq] ; add the table offset in t1; lea keeps the flags from shr
4397*c0909341SAndroid Build Coastguard Worker    cmovnz             r13q, [base+subpel_filters+myq*8] ; load the 8-byte vertical filter unless the index was 0
4398*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16 ; move the later row of each pair into the high words
4399*c0909341SAndroid Build Coastguard Worker    pslld                m5, 16
4400*c0909341SAndroid Build Coastguard Worker    pslld                m6, 16
4401*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m4, 0xaa ; 01 12
4402*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m5, 0xaa ; 23 34
4403*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m6, 0xaa ; 45 56
4404*c0909341SAndroid Build Coastguard Worker    movq               xm10, r13q
4405*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xm10, xm10
4406*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m10, xm10 ; vertical coefficients as words, same 8 in each lane
4407*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000 ; coefficient pair 0/1
4408*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111 ; coefficient pair 2/3
4409*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222 ; coefficient pair 4/5
4410*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333 ; coefficient pair 6/7
4411*c0909341SAndroid Build Coastguard Worker.dy1_w4_loop: ; per iteration: filter two new source rows horizontally, then emit two output rows
4412*c0909341SAndroid Build Coastguard Worker    movu               xm11, [srcq+ssq*0]
4413*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+r4   ]
4414*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [srcq+ssq*1], 1
4415*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+r11  ], 1
4416*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4417*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m7 ; (01 12) * coefficient pair 0/1
4418*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m8 ; (23 34) * coefficient pair 2/3
4419*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m12
4420*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m13
4421*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m14
4422*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m15
4423*c0909341SAndroid Build Coastguard Worker    paddd                m4, [rsp+0x20] ; add the m13 value spilled on entry (presumably vertical rounding -- TODO confirm)
4424*c0909341SAndroid Build Coastguard Worker    phaddd              m11, m6
4425*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m9 ; (45 56) * coefficient pair 4/5
4426*c0909341SAndroid Build Coastguard Worker    paddd               m11, [rsp+0x00] ; add the m12 value spilled on entry
4427*c0909341SAndroid Build Coastguard Worker    psrad               m11, [rsp+0x40] ; shift count = low qword of the spilled xm7
4428*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1 ; slide the row-pair window
4429*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4430*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
4431*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4432*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m3, xm11, 1 ; low lane: row kept in xm3, high lane: first new row
4433*c0909341SAndroid Build Coastguard Worker    pslld                m3, m11, 16
4434*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m3, 0xaa   ; 67 78
4435*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2, m10 ; (67 78) * coefficient pair 6/7
4436*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m11, 1 ; keep the newest row for the next iteration
4437*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4438*c0909341SAndroid Build Coastguard Worker%if isput
4439*c0909341SAndroid Build Coastguard Worker    psrad                m4, [rsp+0x48] ; shift count = upper qword of the spilled xm7
4440*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4441*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm5
4442*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, [rsp+0x50] ; clamp against the spilled xm11 (presumably pixel max -- TODO confirm)
4443*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm4 ; 8 bytes (four words) to the first output row
4444*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm4 ; 8 bytes (four words) to the second output row
4445*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4446*c0909341SAndroid Build Coastguard Worker%else
4447*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
4448*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4449*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
4450*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4 ; both rows (2 x 4 words) stored contiguously
4451*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
4452*c0909341SAndroid Build Coastguard Worker%endif
4453*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4454*c0909341SAndroid Build Coastguard Worker    jg .dy1_w4_loop
4455*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET
4456*c0909341SAndroid Build Coastguard Worker    SWAP                 m10, m13 ; reverses the SWAP m13, m10 near the top of .dy1_w4
; Width entry points of the dy1 path for w = 8/16/32/64/128.
; Each one stores a counter (1/2/4/8/16, i.e. w/8) at [rsp+0xa0] and hands
; 16/32/64/128/256 (i.e. w*2) to movifprep as tmp_stridem, then continues at
; .dy1_w_start (.dy1_w128 falls through). The counter is decremented once per
; pass in .dy1_hloop_prep, which also advances the output pointer by 16 bytes,
; so it counts 16-byte-wide column strips.
; NOTE(review): movifprep is defined outside this view; presumably it only
; emits the store when assembling the prep variant -- confirm.
4457*c0909341SAndroid Build Coastguard Worker.dy1_w8:
4458*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 1
4459*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
4460*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4461*c0909341SAndroid Build Coastguard Worker.dy1_w16:
4462*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 2
4463*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
4464*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4465*c0909341SAndroid Build Coastguard Worker.dy1_w32:
4466*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 4
4467*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
4468*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4469*c0909341SAndroid Build Coastguard Worker.dy1_w64:
4470*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 8
4471*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
4472*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4473*c0909341SAndroid Build Coastguard Worker.dy1_w128:
4474*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 16
4475*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; Shared setup for the dy1 path at w >= 8. Spills the values the strip loop
; reloads later and picks the vertical filter, then enters .dy1_hloop.
; Stack slots written here:
;   [rsp+0x00] m10 (h_rnd per the comment below)   [rsp+0x20] m13
;   [rsp+0x40] xm11 (h_sh per the comment below)   [rsp+0x50] vertical taps (words)
;   [rsp+0xa4] t0d >> 16                           [rsp+0xa8] srcq
;   [rsp+0xb0] r0q (dstq / tmpq)                   [rsp+0xb8] dsm (put) / hm (prep, UNIX64)
;   [rsp+0xc0] xm7 (pxmax per the comment below; put only)
4476*c0909341SAndroid Build Coastguard Worker.dy1_w_start:
4477*c0909341SAndroid Build Coastguard Worker    SWAP                m10, m12, m1
4478*c0909341SAndroid Build Coastguard Worker    SWAP                m11, m7
4479*c0909341SAndroid Build Coastguard Worker    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
4480*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4481*c0909341SAndroid Build Coastguard Worker%if isput
4482*c0909341SAndroid Build Coastguard Worker %define dsm [rsp+0xb8]
4483*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
4484*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0xc0], xm7
4485*c0909341SAndroid Build Coastguard Worker%else
4486*c0909341SAndroid Build Coastguard Worker %if UNIX64
4487*c0909341SAndroid Build Coastguard Worker  %define hm [rsp+0xb8]
4488*c0909341SAndroid Build Coastguard Worker %endif
4489*c0909341SAndroid Build Coastguard Worker%endif
4490*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m10
4491*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m13
4492*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], xm11
4493*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
; NOTE(review): 6 bytes = 3 pixels at 2 bytes each; presumably the left
; context of the 8-tap horizontal filter -- confirm.
4494*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
; The shr below sets ZF; the following mov and lea leave flags untouched, so
; the cmovnz still tests (my >> 6) != 0. When it is zero, r4 keeps 64 << 24
; (a single byte of value 64 at byte index 3) instead of a table entry.
4495*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4496*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
4497*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4498*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
4499*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul2]
4500*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4501*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0xa4], t0d
4502*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0xa8], srcq
4503*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0xb0], r0q ; dstq / tmpq
4504*c0909341SAndroid Build Coastguard Worker%if UNIX64
4505*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
4506*c0909341SAndroid Build Coastguard Worker%endif
; dxm is scaled in place; .dy1_hloop_prep later adds this dx*8 to the saved
; mx vector to step to the next group of eight columns.
4507*c0909341SAndroid Build Coastguard Worker    shl           dword dxm, 3 ; dx*8
4508*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xm15
4509*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8 ; mx+dx*[0-7]
; Sign-extend the eight vertical filter bytes to words and park them at
; [rsp+0x50]; .dy1_hloop broadcasts them from there as four dword tap pairs.
4510*c0909341SAndroid Build Coastguard Worker    movq                xm0, r4q
4511*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm0, xm0
4512*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm0
4513*c0909341SAndroid Build Coastguard Worker    jmp .dy1_hloop
; Reached from .dy1_vloop once all rows of the current 16-byte-wide strip are
; written. Decrements the strip counter at [rsp+0xa0] and leaves through .ret
; when it hits zero; otherwise moves the saved output pointer 16 bytes to the
; right and reloads the state that the previous strip overwrote, then falls
; through into .dy1_hloop.
4514*c0909341SAndroid Build Coastguard Worker.dy1_hloop_prep:
4515*c0909341SAndroid Build Coastguard Worker    dec    dword [rsp+0xa0]
4516*c0909341SAndroid Build Coastguard Worker    jz .ret
4517*c0909341SAndroid Build Coastguard Worker    add    qword [rsp+0xb0], 16
4518*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
; dxm already holds dx*8 (scaled in .dy1_w_start); [rsp+0x60] is the mx vector
; that .dy1_hloop saved for the previous strip, so m1 = mx of the next 8 columns.
4519*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
4520*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pd_0x3ff]
4521*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8, [rsp+0x60]
4522*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+0xa4]
4523*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
; Rewind the source to the saved start pointer and pick up the advanced
; output pointer.
4524*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+0xa8]
4525*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+0xb0] ; dstq / tmpq
4526*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x00]
4527*c0909341SAndroid Build Coastguard Worker    mova               xm11, [rsp+0x40]
; Per-strip setup: for each of the eight columns of the strip, select the
; horizontal filter and the source offset from the mx vector in m1, run the
; horizontal filter macro for the first rows, and load the vertical taps.
; On entry m6 is expected to hold pd_0x3ff and m9 zero (both are set in
; .dy1_hloop_prep; for the first strip they come from code outside this view
; -- NOTE(review): confirm).
4528*c0909341SAndroid Build Coastguard Worker.dy1_hloop:
4529*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm2, [base+pq_0x40000000]
; m5 = (mx & m6) >> 6 per column; added to the broadcast base in m15 to form
; the subpel_filters row index. m5 then becomes an all-ones mask for columns
; where that value equals m9.
4530*c0909341SAndroid Build Coastguard Worker    pand                 m5, m1, m6
4531*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
4532*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
4533*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m9
; Move the eight dword filter indices into general-purpose registers
; (two per 64-bit extract, split with mov/shr).
4534*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m15, 1
4535*c0909341SAndroid Build Coastguard Worker    movq                 r6, xm15
4536*c0909341SAndroid Build Coastguard Worker    pextrq               r9, xm15, 1
4537*c0909341SAndroid Build Coastguard Worker    movq                r11, xm7
4538*c0909341SAndroid Build Coastguard Worker    pextrq               rX, xm7, 1
4539*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r6d
4540*c0909341SAndroid Build Coastguard Worker    shr                  r6, 32
4541*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r9d
4542*c0909341SAndroid Build Coastguard Worker    shr                  r9, 32
4543*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
4544*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
4545*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
4546*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
; Keep the unshifted mx vector; .dy1_hloop_prep adds dx*8 to it for the next strip.
4547*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m1
; Fetch the eight 8-byte filters, two per xmm register (low/high qword).
4548*c0909341SAndroid Build Coastguard Worker    movq               xm12, [base+subpel_filters+ r4*8]
4549*c0909341SAndroid Build Coastguard Worker    movq               xm13, [base+subpel_filters+ r6*8]
4550*c0909341SAndroid Build Coastguard Worker    movhps             xm12, [base+subpel_filters+ r7*8]
4551*c0909341SAndroid Build Coastguard Worker    movhps             xm13, [base+subpel_filters+ r9*8]
4552*c0909341SAndroid Build Coastguard Worker    movq               xm14, [base+subpel_filters+r10*8]
4553*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+subpel_filters+r11*8]
4554*c0909341SAndroid Build Coastguard Worker    movhps             xm14, [base+subpel_filters+r13*8]
4555*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+subpel_filters+ rX*8]
; m1 = mx >> 10 per column. The same GPRs are now reused for these values;
; .dy1_vloop addresses the source with them as [srcq+reg*2].
; Note the destination order differs from the index extraction above:
; low lane -> r4, r6, r10, r11; high lane -> r7, r9, r13, rX.
4556*c0909341SAndroid Build Coastguard Worker    psrld                m1, 10
4557*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m1, 1
4558*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m5, 1
4559*c0909341SAndroid Build Coastguard Worker    movq                 r6, xm1
4560*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
4561*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm7
4562*c0909341SAndroid Build Coastguard Worker    pextrq               rX, xm7, 1
4563*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r6d
4564*c0909341SAndroid Build Coastguard Worker    shr                  r6, 32
4565*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
4566*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
4567*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r9d
4568*c0909341SAndroid Build Coastguard Worker    shr                  r9, 32
4569*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
4570*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
; Widen each per-column dword mask to cover a whole 8-byte filter, then
; replace the filter with the pq_0x40000000 constant in the masked columns.
4571*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm5, q2200
4572*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm5, q3311
4573*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm6, q2200
4574*c0909341SAndroid Build Coastguard Worker    pshufd              xm6, xm6, q3311
4575*c0909341SAndroid Build Coastguard Worker    pblendvb           xm12, xm2, xm4
4576*c0909341SAndroid Build Coastguard Worker    pblendvb           xm13, xm2, xm5
4577*c0909341SAndroid Build Coastguard Worker    pblendvb           xm14, xm2, xm7
4578*c0909341SAndroid Build Coastguard Worker    pblendvb           xm15, xm2, xm6
; Sign-extend the filter bytes to words for pmaddwd.
4579*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m12, xm12
4580*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m13, xm13
4581*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
4582*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
; Four invocations of the horizontal-filter macro (defined outside this view).
; m0 is saved around the later calls because the last invocation lists
; register 0 among its arguments again.
4583*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
4584*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m0
4585*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
4586*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
4587*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
4588*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x80]
; m8..m11 = the four vertical tap pairs stored as words at [rsp+0x50] by
; .dy1_w_start, each pair broadcast to every dword.
4589*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+subpel_s_shuf8]
4590*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [rsp+0x50]
4591*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [rsp+0x54]
4592*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [rsp+0x58]
4593*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [rsp+0x5c]
4594*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7     ; 01a 01b
4595*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7     ; 23a 23b
4596*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7     ; 45a 45b
4597*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7     ; 67a 67b
; Row loop of one strip: applies the vertical taps (m8..m11) to the row pairs
; held in m0..m3, writes one output row of 16 bytes, then horizontally filters
; one more source row and slides it into the m0..m3 window.
4598*c0909341SAndroid Build Coastguard Worker.dy1_vloop:
4599*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
4600*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m9
4601*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m10
4602*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11
; [rsp+0x20] holds m13 as spilled by .dy1_w_start; it is added as a bias
; before the final shift.
4603*c0909341SAndroid Build Coastguard Worker    paddd                m4, [rsp+0x20]
4604*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
4605*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4606*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4607*c0909341SAndroid Build Coastguard Worker%if isput
; put: the shift count is read from [rsp+0x48], i.e. the upper qword of the
; xm11 spill at [rsp+0x40]; the result is packed with unsigned saturation and
; clamped against the xm7 value saved at [rsp+0xc0].
4608*c0909341SAndroid Build Coastguard Worker    psrad                m4, [rsp+0x48]
4609*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4610*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm5
4611*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, [rsp+0xc0]
4612*c0909341SAndroid Build Coastguard Worker    mova             [dstq], xm4
4613*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
4614*c0909341SAndroid Build Coastguard Worker%else
; prep: fixed shift by 6, signed pack, and the row advance uses tmp_stridem.
4615*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
4616*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4617*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
4618*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
4619*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
4620*c0909341SAndroid Build Coastguard Worker%endif
4621*c0909341SAndroid Build Coastguard Worker    dec                  hd
4622*c0909341SAndroid Build Coastguard Worker    jz .dy1_hloop_prep
4623*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+wswap]
4624*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7
4625*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
4626*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
4627*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
; Load 16 bytes of the next source row for each of the eight columns, using
; the per-column offsets left in the GPRs by .dy1_hloop, and step one row down.
4628*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ r4*2]
4629*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ r6*2]
4630*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ r7*2]
4631*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+ r9*2]
4632*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r10*2], 1
4633*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r11*2], 1
4634*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+r13*2], 1
4635*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+ rX*2], 1
4636*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
; Per-column horizontal filter: multiply by the word filters in m12..m15 and
; reduce each column to a single dword with three phaddd.
4637*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m12
4638*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13
4639*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14
4640*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
4641*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
4642*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
4643*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m6
; [rsp+0x00] / [rsp+0x40] are the m10 / xm11 spills from .dy1_w_start
; (h_rnd / h_sh per the comment there).
4644*c0909341SAndroid Build Coastguard Worker    paddd                m4, [rsp+0x00]
4645*c0909341SAndroid Build Coastguard Worker    psrad                m4, [rsp+0x40]
; Move the new row into the odd word lanes, then blend those lanes down the
; chain so every register takes its odd words from the next one and m3
; receives the new row.
4646*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16
4647*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m1, 0xaa
4648*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m2, 0xaa
4649*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m3, 0xaa
4650*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa
4651*c0909341SAndroid Build Coastguard Worker    jmp .dy1_vloop
; Not reached at run time (follows an unconditional jmp).
; NOTE(review): presumably these undo the register renames made by the SWAPs
; at .dy1_w_start for the code assembled after this point -- confirm.
4652*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m12, m10
4653*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m11
; dy2 dispatch: load the 16-bit entry selected by wq from the
; %1_8tap_scaled_avx2_dy2_table jump table, add base_reg to turn it into an
; address, and jump to the width-specific .dy2_w* code.
4654*c0909341SAndroid Build Coastguard Worker.dy2:
4655*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
4656*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
4657*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; dy2 path for w == 2; assembled only for the put variant (%if isput).
; Builds a two-column horizontal shuffle/filter, filters source rows 0-5 up
; front, then each loop pass reads four more source rows and stores two output
; rows of 4 bytes each.
; NOTE(review): on entry m6, m9, m12, m13, xm7 and xm11 hold values set up
; outside this view (used below as mask, zero, rounding terms, shift counts
; and clamp) -- confirm against the function prologue.
4658*c0909341SAndroid Build Coastguard Worker%if isput
4659*c0909341SAndroid Build Coastguard Worker.dy2_w2:
4660*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4661*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
4662*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
4663*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4664*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m8
4665*c0909341SAndroid Build Coastguard Worker    paddd               m10, m8 ; mx+dx*[0-1]
4666*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm14, [base+pq_0x40000000+2]
4667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
; xm8 = (mx & xm6) >> 6 per column, added to the broadcast base to form the
; two subpel_filters row indices (r4d, r6d).
4668*c0909341SAndroid Build Coastguard Worker    pand                xm8, xm10, xm6
4669*c0909341SAndroid Build Coastguard Worker    psrld               xm8, 6
4670*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8
4671*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4672*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
4673*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_q]
4674*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_s_shuf2]
; Only 4 bytes starting at byte offset 2 of each 8-byte filter are loaded.
4675*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, [base+subpel_filters+r4*8+2]
4676*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+subpel_filters+r6*8+2]
; xm8 becomes a mask of the columns where the value computed above equals xm9.
4677*c0909341SAndroid Build Coastguard Worker    pcmpeqd             xm8, xm9
; m10 = (mx >> 10) * 2: per-column byte offsets, later turned into a pshufb
; control by the shuffle with bdct_lb_q and the add of subpel_s_shuf2.
4678*c0909341SAndroid Build Coastguard Worker    psrld               m10, 10
4679*c0909341SAndroid Build Coastguard Worker    paddd               m10, m10
4680*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
4681*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*2]
4682*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*4]
4683*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m5
4684*c0909341SAndroid Build Coastguard Worker    paddb               m10, m6
; Merge the two column filters (odd dwords from xm4), substitute the constant
; in xm14 for masked columns, and widen the taps to words.
4685*c0909341SAndroid Build Coastguard Worker    vpblendd           xm15, xm4, 0xa
4686*c0909341SAndroid Build Coastguard Worker    pblendvb           xm15, xm14, xm8
4687*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
4688*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1 ; 0 1
4689*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ss3q ], 1 ; 2 3
4690*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4691*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
4692*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; Vertical filter choice: shr sets ZF, mov and lea preserve it, so cmovnz
; loads a table entry only when (my >> 6) != 0; otherwise r4 keeps 64 << 24.
4693*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4694*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
4695*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4696*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
; Horizontal filter of rows 0-5.
4697*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10
4698*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
4699*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10
4700*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m15
4701*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m15
4702*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m15
4703*c0909341SAndroid Build Coastguard Worker    movq                xm6, r4q
4704*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm6, xm6
4705*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
4706*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m2
4707*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
4708*c0909341SAndroid Build Coastguard Worker    paddd                m1, m12
4709*c0909341SAndroid Build Coastguard Worker    psrad                m0, xm7
4710*c0909341SAndroid Build Coastguard Worker    psrad                m1, xm7
4711*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1             ; 0 2 2 4  1 3 3 5
4712*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
; Broadcast the four vertical tap pairs: xm8, xm9, xm14, xm6.
4713*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm6, q0000
4714*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm6, q1111
4715*c0909341SAndroid Build Coastguard Worker    pshufd             xm14, xm6, q2222
4716*c0909341SAndroid Build Coastguard Worker    pshufd              xm6, xm6, q3333
4717*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm0, xm1       ; 01 23
4718*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm1       ; 23 45
; Each pass: load source rows 6-9 (relative to the current window), advance
; srcq by four rows, and emit two output rows.
4719*c0909341SAndroid Build Coastguard Worker.dy2_w2_loop:
4720*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*0]
4721*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*2]
4722*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*1], 1 ; 6 7
4723*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+ss3q ], 1 ; 8 9
4724*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; First two vertical tap pairs are applied to the row pairs already held.
4725*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm2, xm8
4726*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm1, xm9
; Horizontal filter of the four new rows.
4727*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m10
4728*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m10
4729*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m15
4730*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
4731*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m5
4732*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm1
4733*c0909341SAndroid Build Coastguard Worker    paddd                m3, m12
4734*c0909341SAndroid Build Coastguard Worker    psrad                m3, xm7
4735*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m3
4736*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q2100
4737*c0909341SAndroid Build Coastguard Worker    palignr              m0, m3, m0, 12     ; 4 6 6 8  5 7 7 9
4738*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4739*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm0, xm1       ; 45 67
4740*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm1       ; 67 89
; Remaining two tap pairs, plus the xm13 bias; the final shift count is taken
; from the upper qword of xm7 (psrldq by 8 bytes).
4741*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm2, xm14
4742*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm1, xm6
4743*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm13
4744*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm3
4745*c0909341SAndroid Build Coastguard Worker    psrldq              xm3, xm7, 8
4746*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm5
4747*c0909341SAndroid Build Coastguard Worker    psrad               xm4, xm3
; Pack with unsigned saturation, clamp against xm11, store 4 bytes per row.
4748*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm4
4749*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, xm11
4750*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm4
4751*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm4, 1
4752*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4753*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4754*c0909341SAndroid Build Coastguard Worker    jg .dy2_w2_loop
4755*c0909341SAndroid Build Coastguard Worker    RET
4756*c0909341SAndroid Build Coastguard Worker%endif
; dy2 path for w == 4: setup executed once before .dy2_w4_loop.
; Spills m12/m13/xm7 (and xm11 for put) to the stack, builds the per-column
; horizontal filters (m14 for columns 0-1, m15 for columns 2-3) and shuffle
; controls (m12, m13), selects the vertical filter, and horizontally filters
; source rows 0-5 into the row-pair registers m0 (01 23) and m2 (23 45).
; NOTE(review): on entry m6, m8, m9 and m10 hold values set up outside this
; view (used below as mask, dx, zero and mx) -- confirm against the prologue.
4757*c0909341SAndroid Build Coastguard Worker.dy2_w4:
4758*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4759*c0909341SAndroid Build Coastguard Worker%if isput
4760*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm11
4761*c0909341SAndroid Build Coastguard Worker%endif
4762*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m12
4763*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m13
4764*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], xm7
4765*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+rescale_mul]
4766*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
4767*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
4768*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4769*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m7
4770*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+pq_0x40000000+1]
4771*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
; From here on the name m13 refers to the register previously called m10
; (assembly-time rename; the old m13 contents were spilled to [rsp+0x20] above).
4772*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m10
4773*c0909341SAndroid Build Coastguard Worker    paddd               m13, m8 ; mx+dx*[0-3]
; m6 = (mx & m6) >> 6 per column -> the four subpel_filters row indices.
4774*c0909341SAndroid Build Coastguard Worker    pand                 m6, m13
4775*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
4776*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm6
4777*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4778*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
4779*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm15, 2
4780*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm15, 3
4781*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
4782*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+bdct_lb_q+16]
4783*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+subpel_s_shuf2]
; Only 4 bytes starting at byte offset 2 of each 8-byte filter are loaded.
4784*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
4785*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
4786*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
4787*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
; Vertical filter choice: shr sets ZF, mov and lea preserve it, so cmovnz
; loads a table entry only when (my >> 6) != 0; otherwise r13 keeps 64 << 24.
4788*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4789*c0909341SAndroid Build Coastguard Worker    mov                r13d, 64 << 24
4790*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4791*c0909341SAndroid Build Coastguard Worker    cmovnz             r13q, [base+subpel_filters+myq*8]
; Mask of columns whose (mx & mask) >> 6 equals m9, widened so each column's
; mask covers its four word taps (m11: columns 0-1, m6: columns 2-3).
4792*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
4793*c0909341SAndroid Build Coastguard Worker    punpckldq           m11, m6, m6
4794*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m6
; m13 = (mx >> 10) * 2: per-column byte offsets into the source row.
4795*c0909341SAndroid Build Coastguard Worker    psrld               m13, 10
4796*c0909341SAndroid Build Coastguard Worker    paddd               m13, m13
4797*c0909341SAndroid Build Coastguard Worker    vpblendd           xm14, xm7, 0xa
4798*c0909341SAndroid Build Coastguard Worker    vpblendd           xm15, xm8, 0xa
4799*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
4800*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
4801*c0909341SAndroid Build Coastguard Worker    movq               xm10, r13q
4802*c0909341SAndroid Build Coastguard Worker    pblendvb            m14, m2, m11
4803*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m2, m6
; r4 = byte offset of column 2; r6/r11/r13 add one/two/three source strides,
; so [srcq+r4|r6|r11|r13] address the column-2 data of rows 0..3.
4804*c0909341SAndroid Build Coastguard Worker    pextrd               r4, xm13, 2
4805*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m13, m5
4806*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m1
4807*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+ssq*1]
4808*c0909341SAndroid Build Coastguard Worker    lea                 r11, [r4+ssq*2]
4809*c0909341SAndroid Build Coastguard Worker    lea                 r13, [r4+ss3q ]
; Rows 0-3: m0/m1 are loaded from srcq, m7/m8 from srcq plus the column-2 offset.
4810*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
4811*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+r4   ]
4812*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
4813*c0909341SAndroid Build Coastguard Worker    movu                xm8, [srcq+r6   ]
4814*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*2], 1 ; 0 2
4815*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+r11  ], 1
4816*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ss3q ], 1 ; 1 3
4817*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [srcq+r13  ], 1
4818*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Rows 4-5, same split.
4819*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0]
4820*c0909341SAndroid Build Coastguard Worker    movu                xm9, [srcq+r4   ]
4821*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
4822*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [srcq+r6   ], 1
4823*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; Make the m13 shuffle offsets relative to its first byte (the loads for
; m7/m8/m9 already include the r4 offset), then add the base shuffle pattern
; to both controls.
4824*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, xm13
4825*c0909341SAndroid Build Coastguard Worker    psubb               m13, m5
4826*c0909341SAndroid Build Coastguard Worker    paddb               m12, m4
4827*c0909341SAndroid Build Coastguard Worker    paddb               m13, m4
; Reload the values spilled at the top: m5 = old m12, xm6 = low dword of old xm7.
4828*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x00]
4829*c0909341SAndroid Build Coastguard Worker    movd                xm6, [rsp+0x40]
; Horizontal filter of rows 0-5.
4830*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m12
4831*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
4832*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12
4833*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m14
4834*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m14
4835*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m14
4836*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m13
4837*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m13
4838*c0909341SAndroid Build Coastguard Worker    pshufb               m9, m13
4839*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
4840*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m15
4841*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m15
; Duplicate the 8 vertical filter bytes into both qwords and widen to words,
; so both 128-bit lanes of m10 carry the same eight taps.
4842*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xm10, xm10
4843*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m10, xm10
4844*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m7
4845*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m8
4846*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m9
4847*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
4848*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
4849*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
4850*c0909341SAndroid Build Coastguard Worker    psrad                m0, xm6
4851*c0909341SAndroid Build Coastguard Worker    psrad                m1, xm6
4852*c0909341SAndroid Build Coastguard Worker    psrad                m2, xm6
4853*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m0, m2, 0x21 ; 2 4
4854*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m1, 0x13     ; 3 5
; m7..m10 = the four vertical tap pairs, each broadcast across its lane.
4855*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000
4856*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111
4857*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222
4858*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
4859*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3 ; 0 2  2 4
4860*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2 ; 1 3  3 5
4861*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1 ; 23 45
4862*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1     ; 01 23
4863*c0909341SAndroid Build Coastguard Worker.dy2_w4_loop:
4864*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0]
4865*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+r4   ]
4866*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*1]
4867*c0909341SAndroid Build Coastguard Worker    movu               xm11, [srcq+r6   ]
4868*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*2], 1 ; 6 8
4869*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+r11  ], 1
4870*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ss3q ], 1 ; 7 9
4871*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [srcq+r13  ], 1
4872*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4873*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m7
4874*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2, m8
4875*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
4876*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m12
4877*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m14
4878*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14
4879*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x00]
4880*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m13
4881*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m13
4882*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m15
4883*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m15
4884*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4885*c0909341SAndroid Build Coastguard Worker    movd                xm5, [rsp+0x40]
4886*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m6
4887*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m11
4888*c0909341SAndroid Build Coastguard Worker    paddd                m1, m0
4889*c0909341SAndroid Build Coastguard Worker    paddd                m3, m0
4890*c0909341SAndroid Build Coastguard Worker    psrad                m1, xm5
4891*c0909341SAndroid Build Coastguard Worker    psrad                m3, xm5
4892*c0909341SAndroid Build Coastguard Worker    pslld                m3, 16
4893*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m3, 0xaa     ; 67 89
4894*c0909341SAndroid Build Coastguard Worker    vperm2i128           m0, m2, m1, 0x21 ; 45 67
4895*c0909341SAndroid Build Coastguard Worker    paddd                m4, [rsp+0x20]
4896*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
4897*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m0, m9
4898*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m10
4899*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4900*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4901*c0909341SAndroid Build Coastguard Worker%if isput
4902*c0909341SAndroid Build Coastguard Worker    psrad                m4, [rsp+0x48]
4903*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4904*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm5
4905*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, [rsp+0x50]
4906*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm4
4907*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm4
4908*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4909*c0909341SAndroid Build Coastguard Worker%else
4910*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
4911*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4912*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
4913*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
4914*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
4915*c0909341SAndroid Build Coastguard Worker%endif
4916*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4917*c0909341SAndroid Build Coastguard Worker    jg .dy2_w4_loop
4918*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET
4919*c0909341SAndroid Build Coastguard Worker    SWAP                m10, m13
4920*c0909341SAndroid Build Coastguard Worker.dy2_w8:
4921*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 1
4922*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
4923*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4924*c0909341SAndroid Build Coastguard Worker.dy2_w16:
4925*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 2
4926*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
4927*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4928*c0909341SAndroid Build Coastguard Worker.dy2_w32:
4929*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 4
4930*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
4931*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4932*c0909341SAndroid Build Coastguard Worker.dy2_w64:
4933*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 8
4934*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
4935*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4936*c0909341SAndroid Build Coastguard Worker.dy2_w128:
4937*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0xa0], 16
4938*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
4939*c0909341SAndroid Build Coastguard Worker.dy2_w_start:
4940*c0909341SAndroid Build Coastguard Worker    SWAP                m10, m12, m1
4941*c0909341SAndroid Build Coastguard Worker    SWAP                m11, m7
4942*c0909341SAndroid Build Coastguard Worker    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
4943*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4944*c0909341SAndroid Build Coastguard Worker%if isput
4945*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
4946*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0xc0], xm7
4947*c0909341SAndroid Build Coastguard Worker%endif
4948*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m10
4949*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m13
4950*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], xm11
4951*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
4952*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
4953*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4954*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
4955*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4956*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
4957*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul2]
4958*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4959*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0xa4], t0d
4960*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0xa8], srcq
4961*c0909341SAndroid Build Coastguard Worker    mov          [rsp+0xb0], r0q ; dstq / tmpq
4962*c0909341SAndroid Build Coastguard Worker%if UNIX64
4963*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
4964*c0909341SAndroid Build Coastguard Worker%endif
4965*c0909341SAndroid Build Coastguard Worker    shl           dword dxm, 3 ; dx*8
4966*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xm15
4967*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8 ; mx+dx*[0-7]
4968*c0909341SAndroid Build Coastguard Worker    movq                xm0, r4q
4969*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm0, xm0
4970*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm0
4971*c0909341SAndroid Build Coastguard Worker    jmp .dy2_hloop
4972*c0909341SAndroid Build Coastguard Worker.dy2_hloop_prep:
4973*c0909341SAndroid Build Coastguard Worker    dec    dword [rsp+0xa0]
4974*c0909341SAndroid Build Coastguard Worker    jz .ret
4975*c0909341SAndroid Build Coastguard Worker    add    qword [rsp+0xb0], 16
4976*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4977*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
4978*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pd_0x3ff]
4979*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8, [rsp+0x60]
4980*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+0xa4]
4981*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4982*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+0xa8]
4983*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+0xb0] ; dstq / tmpq
4984*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x00]
4985*c0909341SAndroid Build Coastguard Worker    mova               xm11, [rsp+0x40]
4986*c0909341SAndroid Build Coastguard Worker.dy2_hloop:
4987*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm2, [base+pq_0x40000000]
4988*c0909341SAndroid Build Coastguard Worker    pand                 m5, m1, m6
4989*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
4990*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
4991*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m9
4992*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m15, 1
4993*c0909341SAndroid Build Coastguard Worker    movq                 r6, xm15
4994*c0909341SAndroid Build Coastguard Worker    pextrq               r9, xm15, 1
4995*c0909341SAndroid Build Coastguard Worker    movq                r11, xm7
4996*c0909341SAndroid Build Coastguard Worker    pextrq               rX, xm7, 1
4997*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r6d
4998*c0909341SAndroid Build Coastguard Worker    shr                  r6, 32
4999*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r9d
5000*c0909341SAndroid Build Coastguard Worker    shr                  r9, 32
5001*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
5002*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
5003*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
5004*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
5005*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m1
5006*c0909341SAndroid Build Coastguard Worker    movq               xm12, [base+subpel_filters+ r4*8]
5007*c0909341SAndroid Build Coastguard Worker    movq               xm13, [base+subpel_filters+ r6*8]
5008*c0909341SAndroid Build Coastguard Worker    movhps             xm12, [base+subpel_filters+ r7*8]
5009*c0909341SAndroid Build Coastguard Worker    movhps             xm13, [base+subpel_filters+ r9*8]
5010*c0909341SAndroid Build Coastguard Worker    movq               xm14, [base+subpel_filters+r10*8]
5011*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+subpel_filters+r11*8]
5012*c0909341SAndroid Build Coastguard Worker    movhps             xm14, [base+subpel_filters+r13*8]
5013*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+subpel_filters+ rX*8]
5014*c0909341SAndroid Build Coastguard Worker    psrld                m1, 10
5015*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m1, 1
5016*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m5, 1
5017*c0909341SAndroid Build Coastguard Worker    movq                 r6, xm1
5018*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
5019*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm7
5020*c0909341SAndroid Build Coastguard Worker    pextrq               rX, xm7, 1
5021*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r6d
5022*c0909341SAndroid Build Coastguard Worker    shr                  r6, 32
5023*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
5024*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
5025*c0909341SAndroid Build Coastguard Worker    mov                 r7d, r9d
5026*c0909341SAndroid Build Coastguard Worker    shr                  r9, 32
5027*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
5028*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
5029*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm5, q2200
5030*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm5, q3311
5031*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm6, q2200
5032*c0909341SAndroid Build Coastguard Worker    pshufd              xm6, xm6, q3311
5033*c0909341SAndroid Build Coastguard Worker    pblendvb           xm12, xm2, xm4
5034*c0909341SAndroid Build Coastguard Worker    pblendvb           xm13, xm2, xm5
5035*c0909341SAndroid Build Coastguard Worker    pblendvb           xm14, xm2, xm7
5036*c0909341SAndroid Build Coastguard Worker    pblendvb           xm15, xm2, xm6
5037*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m12, xm12
5038*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m13, xm13
5039*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
5040*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
5041*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
5042*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m0
5043*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
5044*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
5045*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
5046*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x80]
5047*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+subpel_s_shuf8]
5048*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [rsp+0x50]
5049*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [rsp+0x54]
5050*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [rsp+0x58]
5051*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [rsp+0x5c]
5052*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7     ; 01a 01b
5053*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7     ; 23a 23b
5054*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7     ; 45a 45b
5055*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7     ; 67a 67b
5056*c0909341SAndroid Build Coastguard Worker.dy2_vloop:
5057*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
5058*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m9
5059*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m10
5060*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11
5061*c0909341SAndroid Build Coastguard Worker    paddd                m4, [rsp+0x20]
5062*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
5063*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
5064*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
5065*c0909341SAndroid Build Coastguard Worker%if isput
5066*c0909341SAndroid Build Coastguard Worker    psrad                m4, [rsp+0x48]
5067*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
5068*c0909341SAndroid Build Coastguard Worker    packusdw            xm4, xm5
5069*c0909341SAndroid Build Coastguard Worker    pminsw              xm4, [rsp+0xc0]
5070*c0909341SAndroid Build Coastguard Worker    mova             [dstq], xm4
5071*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
5072*c0909341SAndroid Build Coastguard Worker%else
5073*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
5074*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
5075*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
5076*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
5077*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
5078*c0909341SAndroid Build Coastguard Worker%endif
5079*c0909341SAndroid Build Coastguard Worker    dec                  hd
5080*c0909341SAndroid Build Coastguard Worker    jz .dy2_hloop_prep
5081*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
5082*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
5083*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
5084*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ r4*2]
5085*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ r6*2]
5086*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ r7*2]
5087*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ r9*2]
5088*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+r10*2], 1
5089*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r11*2], 1
5090*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r13*2], 1
5091*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+ rX*2], 1
5092*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
5093*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12
5094*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13
5095*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m14
5096*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m15
5097*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m4
5098*c0909341SAndroid Build Coastguard Worker    phaddd               m5, m6
5099*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m5
5100*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ r4*2]
5101*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ r6*2]
5102*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ r7*2]
5103*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+ r9*2]
5104*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r10*2], 1
5105*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r11*2], 1
5106*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+r13*2], 1
5107*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+ rX*2], 1
5108*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
5109*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m12
5110*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13
5111*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14
5112*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
5113*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
5114*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
5115*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x00]
5116*c0909341SAndroid Build Coastguard Worker    movd                xm7, [rsp+0x40]
5117*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m6
5118*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
5119*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
5120*c0909341SAndroid Build Coastguard Worker    psrad                m3, xm7
5121*c0909341SAndroid Build Coastguard Worker    psrad                m4, xm7
5122*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16
5123*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa
5124*c0909341SAndroid Build Coastguard Worker    jmp .dy2_vloop
5125*c0909341SAndroid Build Coastguard Worker.ret:
5126*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET 0
5127*c0909341SAndroid Build Coastguard Worker%undef isput
5128*c0909341SAndroid Build Coastguard Worker%undef isprep
5129*c0909341SAndroid Build Coastguard Worker%endmacro
5130*c0909341SAndroid Build Coastguard Worker
; BILIN_SCALED_FN <prefix>
;   %1 = function-name prefix (instantiated below in this file with `put`
;        and `prep`).
; Emits the entry point <%1>_bilin_scaled_16bpc. The entry point has no body
; of its own: it loads the same filter selector into t0d and t1d and then
; tail-jumps (jmp, not call) into <%1>_8tap_scaled_16bpc, so the 8-tap scaled
; code does the actual work and returns directly to the original caller.
; NOTE(review): 5*15 is presumably the row offset of the bilinear entries in
; subpel_filters (the 8-tap scaled code adds t0/t1 to a filter index before
; indexing that table) - confirm against the FILTER_* definitions.
5131*c0909341SAndroid Build Coastguard Worker%macro BILIN_SCALED_FN 1
5132*c0909341SAndroid Build Coastguard Workercglobal %1_bilin_scaled_16bpc
5133*c0909341SAndroid Build Coastguard Worker    mov                 t0d, (5*15 << 16) | 5*15 ; 5*15 in both 16-bit halves
5134*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d                 ; t1 uses the same selector as t0
5135*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
5136*c0909341SAndroid Build Coastguard Worker%endmacro
5137*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; put_*_scaled_16bpc entry points.
;
; DECLARE_REG_TMP (x86inc) binds the temporaries t0 and t1 to the listed
; general-purpose register numbers. The WIN64 build picks a different
; register for t1 than the other ABI does.
; ---------------------------------------------------------------------------
5138*c0909341SAndroid Build Coastguard Worker%if WIN64
5139*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 5
5140*c0909341SAndroid Build Coastguard Worker%else
5141*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 8
5142*c0909341SAndroid Build Coastguard Worker%endif
5143*c0909341SAndroid Build Coastguard Worker
; Each PUT_8TAP_SCALED_FN line below expands to `FN put_8tap_scaled, <args>`.
; NOTE(review): FN is defined outside this view. The arguments look like
; <name suffix>, <filter type>, <filter type>, <jump target>; which of the
; two filter types is horizontal and which is vertical must be confirmed
; against FN's definition.
;
; Ordering matters: BILIN_SCALED_FN and every FN line that names
; put_8tap_scaled_16bpc transfer control with a jump, whereas the final
; `regular` entry passes no jump target and is placed directly in front of
; `MC_8TAP_SCALED put`.
; NOTE(review): presumably that last entry falls through into the body
; emitted by MC_8TAP_SCALED - do not insert anything between the two lines
; without checking FN.
5144*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
5145*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN put
5146*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
5147*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
5148*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
5149*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
5150*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
5151*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
5152*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
5153*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
5154*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
5155*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED put
5156*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; prep_*_scaled_16bpc entry points.
;
; DECLARE_REG_TMP (x86inc) rebinds the temporaries t0 and t1 for the prep
; variant; both register numbers differ between the WIN64 build and the
; other ABI.
; ---------------------------------------------------------------------------
5157*c0909341SAndroid Build Coastguard Worker%if WIN64
5158*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5, 4
5159*c0909341SAndroid Build Coastguard Worker%else
5160*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
5161*c0909341SAndroid Build Coastguard Worker%endif
5162*c0909341SAndroid Build Coastguard Worker
; Each PREP_8TAP_SCALED_FN line below expands to `FN prep_8tap_scaled, <args>`.
; NOTE(review): FN is defined outside this view. The arguments look like
; <name suffix>, <filter type>, <filter type>, <jump target>; confirm the
; horizontal/vertical order against FN's definition.
;
; Ordering matters: BILIN_SCALED_FN and every FN line that names
; prep_8tap_scaled_16bpc transfer control with a jump, whereas the final
; `regular` entry passes no jump target and is placed directly in front of
; `MC_8TAP_SCALED prep`.
; NOTE(review): presumably that last entry falls through into the body
; emitted by MC_8TAP_SCALED - do not insert anything between the two lines
; without checking FN.
5163*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
5164*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN prep
5165*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
5166*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
5167*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
5168*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
5169*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
5170*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
5171*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
5172*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
5173*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
5174*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED prep
5175*c0909341SAndroid Build Coastguard Worker
; WARP_V dst, 01, 23, 45, 67
;   %1     = number of the destination register; m%1 receives the 32-bit sums.
;   %2..%5 = numbers of the four input registers (labelled 01, 23, 45, 67 in
;            the parameter list).
;
; What the macro does:
;   1. Computes eight filter indices (my + k*delta) >> 10 for k = 0..7
;      (labelled a..h) and loads the 8-byte entry filterq[index*8] for each.
;      Entries k and k+4 share one ymm register (low lane / high lane). The
;      vinserti128 loads read 16 bytes, but only the low 8 bytes of each
;      lane survive the punpckl* steps that follow.
;   2. Transposes the coefficients so that each word holds one coefficient
;      in its high byte, with the matching byte of m11 in its low byte.
;   3. Multiply-accumulates (pmaddwd) against m%2..m%5 and sums the four
;      partial products into m%1.
;   4. Slides the input window: m%2 <- m%3, m%3 <- m%4, m%4 <- m%5.
;      m%5 is only read.
;
; Side effects: clobbers m0, m8, m9, tmp1 and tmp2. m%2 is destroyed by the
; first pmaddwd before being reloaded from m%3. myd is left at
; (my on entry) + 3*delta + gamma.
; NOTE(review): the existing "my += gamma" comment is only literally true if
; gammaq was pre-biased by -3*delta where it is set up - confirm there.
; NOTE(review): m11 is not written here; the "<< 8" comments suggest it
; holds zero on entry - confirm at the call site.
5176*c0909341SAndroid Build Coastguard Worker%macro WARP_V 5 ; dst, 01, 23, 45, 67
5177*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [myq+deltaq*4]      ; my + 4*delta (e)
5178*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [myq+deltaq*1]      ; my + 1*delta (b)
5179*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
5180*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5181*c0909341SAndroid Build Coastguard Worker    movq                xm8, [filterq+myq  *8]
5182*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
5183*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+deltaq*4]    ; my + 5*delta (f)
5184*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmp2q+deltaq*1]    ; my + 2*delta (c)
5185*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
5186*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5187*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+tmp2q*8]
5188*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
5189*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [myq+deltaq*4]      ; my + 6*delta (g)
5190*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [myq+deltaq*1]      ; my + 3*delta (d)
5191*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
5192*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5193*c0909341SAndroid Build Coastguard Worker    movq                xm9, [filterq+myq  *8]
5194*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
5195*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+deltaq*4]    ; my + 7*delta (h)
    ; tmp2 still holds the unshifted (my + 3*delta) here, so the value written
    ; to myd is (my on entry) + 3*delta + gamma.
5196*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmp2q+gammaq]       ; my += gamma
5197*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m0
5198*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
5199*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5200*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+tmp2q*8]
5201*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
    ; Finish the transpose: the words of a/b (in m8) and c/d (in m0) are
    ; interleaved, then combined as dwords so that each 8-byte group holds
    ; one coefficient pair for a, b, c and d (e..h in the high lane).
5202*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m9, m0
5203*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8, m0
5204*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m8, m0
    ; Interleaving with m11 as the first source puts each coefficient byte in
    ; the high byte of a word.
5205*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
5206*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
5207*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m8      ; m%2 is overwritten with its product
5208*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m%3
5209*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
5210*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
5211*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m%4
5212*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m%5
    ; Sum the four partial products; the mova instructions interleaved with
    ; the adds slide the input window by one register.
5213*c0909341SAndroid Build Coastguard Worker    paddd                m9, m%2
5214*c0909341SAndroid Build Coastguard Worker    mova                m%2, m%3
5215*c0909341SAndroid Build Coastguard Worker    paddd                m0, m8
5216*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%4
5217*c0909341SAndroid Build Coastguard Worker    mova                m%4, m%5
5218*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m0, m9
5219*c0909341SAndroid Build Coastguard Worker%endmacro
5220*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8t_16bpc
; Variant of the affine warp that stores signed 16-bit intermediates to
; tmp with stride ts. The filtering itself is not done here: this function
; calls the .main / .main2 labels of warp_affine_8x8_16bpc (AVX2 build) and
; only handles rounding, packing and storing.
;
; cglobal arguments (x86inc): 4 arguments loaded into registers, 14
; general-purpose registers and 16 vector registers used; the first two
; arguments are named tmp and ts.
;
; Per iteration the loop adds the rounding constant warp8x8t_rnd to m7 and
; m0, shifts both right arithmetically by 15, packs them to signed words and
; stores 16 bytes at [tmpq] and 16 bytes at [tmpq+ts*2], then advances tmpq
; by ts*4.
; NOTE(review): the *2 / *4 scaling suggests ts counts 16-bit elements and
; that two rows are produced per iteration - confirm against the C prototype.
; NOTE(review): m7/m0 (results) and r4d (loop counter) are not set up in this
; function; presumably .main / .main2 provide them - confirm there.
5221*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
    ; r7m >> 11 picks the dword entry of warp8x8_shift that is broadcast to m13.
    ; NOTE(review): r7m appears to be pixel_max, as annotated in
    ; warp_affine_8x8_16bpc; if so the index is 0 for 1023 and 1 for 4095.
5222*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m
5223*c0909341SAndroid Build Coastguard Worker    lea                  r9, [$$]                ; r9 = section base for table addressing
5224*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
5225*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [r9-$$+warp8x8_shift+r6*4]
5226*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [warp8x8t_rnd]      ; rounding term added before the >> 15
5227*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
5228*c0909341SAndroid Build Coastguard Worker    jmp .start                                   ; first pass skips .main2 and the pointer advance
5229*c0909341SAndroid Build Coastguard Worker.loop:
5230*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
5231*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+tsq*4]
5232*c0909341SAndroid Build Coastguard Worker.start:
5233*c0909341SAndroid Build Coastguard Worker    paddd                m7, m14
5234*c0909341SAndroid Build Coastguard Worker    paddd                m0, m14
5235*c0909341SAndroid Build Coastguard Worker    psrad                m7, 15
5236*c0909341SAndroid Build Coastguard Worker    psrad                m0, 15
5237*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0
    ; packssdw works per 128-bit lane, so the qwords are ordered
    ; m7.lo, m0.lo, m7.hi, m0.hi. q3120 regroups them so that the low half
    ; holds all eight values from m7 and the high half all eight from m0.
5238*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, q3120
5239*c0909341SAndroid Build Coastguard Worker    mova         [tmpq+tsq*0], xm7
5240*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmpq+tsq*2], m7, 1
5241*c0909341SAndroid Build Coastguard Worker    dec                 r4d
5242*c0909341SAndroid Build Coastguard Worker    jg .loop
5243*c0909341SAndroid Build Coastguard Worker.end:
5244*c0909341SAndroid Build Coastguard Worker    RET
5245*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8_16bpc: entry point and store loop.
; .main/.main2 (below) leave two rows of dword results in m7 and m0; this
; loop narrows, scales and clamps them and writes two 16-byte rows (8 words
; each) per pass. .main sets r4d = 4, so four passes write 8 rows in total.
; Constants set up here and kept live for the whole function:
;   m13 = warp8x8_shift[pixel_max >> 11]   (consumed by .h)
;   m14 = warp8x8_rnd  [pixel_max >> 11]   (pmulhrsw factor in .start)
;   m15 = pixel_max broadcast to every word (upper clamp in .start)
5246*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
5247*c0909341SAndroid Build Coastguard Worker                                          alpha, beta, filter, tmp1, delta, \
5248*c0909341SAndroid Build Coastguard Worker                                          my, gamma
    ; r6 (the register named tmp2) is used as a scratch table index:
    ; pixel_max >> 11 is 0 for pixel_max < 2048 and 1 for 2048..4095.
5249*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m
    ; filterq = section start ($$); constants are addressed as
    ; filterq-$$+symbol so one base register serves every table load.
    ; .main later rebases filterq onto mc_warp_filter.
5250*c0909341SAndroid Build Coastguard Worker    lea             filterq, [$$]
5251*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
5252*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [filterq-$$+warp8x8_shift+r6*4]
5253*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [filterq-$$+warp8x8_rnd  +r6*4]
5254*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r7m ; pixel_max
    ; First pass: .main does the setup and falls through into .main2, so the
    ; first two output rows are already in m7/m0 when it returns.
5255*c0909341SAndroid Build Coastguard Worker    call .main
5256*c0909341SAndroid Build Coastguard Worker    jmp .start
5257*c0909341SAndroid Build Coastguard Worker.loop:
    ; Later passes: compute the next two rows, then step dst down two rows.
5258*c0909341SAndroid Build Coastguard Worker    call .main2
5259*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5260*c0909341SAndroid Build Coastguard Worker.start:
    ; Keep the upper half of every dword, pack to words with unsigned
    ; saturation (negative values become 0), scale by m14 with rounding and
    ; clamp to pixel_max.
5261*c0909341SAndroid Build Coastguard Worker    psrad                m7, 16
5262*c0909341SAndroid Build Coastguard Worker    psrad                m0, 16
5263*c0909341SAndroid Build Coastguard Worker    packusdw             m7, m0
5264*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m14
5265*c0909341SAndroid Build Coastguard Worker    pminsw               m7, m15
    ; packusdw works per 128-bit lane, leaving the qwords ordered
    ; m7.lo, m0.lo, m7.hi, m0.hi; q3120 reorders them so the low half holds
    ; the row from m7 and the high half the row from m0.
5266*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, q3120
5267*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm7
5268*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m7, 1
    ; r4d is the row-pair counter initialised to 4 inside .main.
5269*c0909341SAndroid Build Coastguard Worker    dec                 r4d
5270*c0909341SAndroid Build Coastguard Worker    jg .loop
5271*c0909341SAndroid Build Coastguard Worker.end:
5272*c0909341SAndroid Build Coastguard Worker    RET
; .main: one-time setup plus priming of the vertical filter window, shared by
; warp_affine_8x8_16bpc and warp_affine_8x8t_16bpc (which calls it through
; mangle(...).main). It falls through into .main2.
; Expects: the filter register (r9) = $$, m13 = horizontal shift, srcq/ssq set.
; Sets:    m11 = 0, m12 = pd_32768, filterq = mc_warp_filter,
;          r4d = 4 (row-pair counter; abcd is no longer needed by then).
; Returns: two rows of dword results in m7 and m0 (produced by WARP_V).
; .main2 is the per-pass entry: it filters two more source rows and runs
; WARP_V twice.
; NOTE(review): WARP_V is defined outside this chunk; it presumably applies
; the vertical filter and rotates the row-pair registers m1-m6 - confirm
; against the macro definition.
5273*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5274*c0909341SAndroid Build Coastguard Worker.main:
5275*c0909341SAndroid Build Coastguard Worker    ; Stack args offset by one (r4m -> r5m etc.) due to call
    ; On WIN64 only the first four arguments arrive in registers, so abcd and
    ; mx are fetched from their stack slots here.
5276*c0909341SAndroid Build Coastguard Worker%if WIN64
5277*c0909341SAndroid Build Coastguard Worker    mov               abcdq, r5m
5278*c0909341SAndroid Build Coastguard Worker    mov                 mxd, r6m
5279*c0909341SAndroid Build Coastguard Worker%endif
    ; alpha = abcd[0], beta = abcd[1], both sign-extended 16-bit values.
5280*c0909341SAndroid Build Coastguard Worker    movsx            alphad, word [abcdq+2*0]
5281*c0909341SAndroid Build Coastguard Worker    movsx             betad, word [abcdq+2*1]
5282*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [pd_32768]
5283*c0909341SAndroid Build Coastguard Worker    pxor                m11, m11
5284*c0909341SAndroid Build Coastguard Worker    add             filterq, mc_warp_filter-$$
5285*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [ssq*3]
    ; Bias mx once: +512 rounds the later >> 10, +(64<<10) adds 64 to the
    ; resulting filter index.
5286*c0909341SAndroid Build Coastguard Worker    add                 mxd, 512+(64<<10)
5287*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [alphaq*3]
5288*c0909341SAndroid Build Coastguard Worker    sub                srcq, tmp1q    ; src -= src_stride*3
    ; .h forms the next row's start as (mx + 3*alpha) + beta, so beta is
    ; reduced by 3*alpha up front.
5289*c0909341SAndroid Build Coastguard Worker    sub               betad, tmp2d    ; beta -= alpha*3
5290*c0909341SAndroid Build Coastguard Worker    mov                 myd, r7m
    ; Prime the window with source rows 0-6. Each .h result sits in the upper
    ; word of every dword of m0; psrld moves row N to the lower word and the
    ; following pblendw 0xaa takes the upper words from row N+1, so each of
    ; m1..m6 holds an interleaved pair of adjacent rows (01, 12, ... 56).
5291*c0909341SAndroid Build Coastguard Worker    call .h
5292*c0909341SAndroid Build Coastguard Worker    psrld                m1, m0, 16
5293*c0909341SAndroid Build Coastguard Worker    call .h
5294*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m0, 0xaa ; 01
5295*c0909341SAndroid Build Coastguard Worker    psrld                m2, m0, 16
5296*c0909341SAndroid Build Coastguard Worker    call .h
5297*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m0, 0xaa ; 12
5298*c0909341SAndroid Build Coastguard Worker    psrld                m3, m0, 16
5299*c0909341SAndroid Build Coastguard Worker    call .h
5300*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m0, 0xaa ; 23
5301*c0909341SAndroid Build Coastguard Worker    psrld                m4, m0, 16
5302*c0909341SAndroid Build Coastguard Worker    call .h
5303*c0909341SAndroid Build Coastguard Worker    pblendw              m4, m0, 0xaa ; 34
5304*c0909341SAndroid Build Coastguard Worker    psrld                m5, m0, 16
5305*c0909341SAndroid Build Coastguard Worker    call .h
5306*c0909341SAndroid Build Coastguard Worker    pblendw              m5, m0, 0xaa ; 45
5307*c0909341SAndroid Build Coastguard Worker    psrld                m6, m0, 16
5308*c0909341SAndroid Build Coastguard Worker    call .h
5309*c0909341SAndroid Build Coastguard Worker    pblendw              m6, m0, 0xaa ; 56
    ; Vertical parameters: delta = abcd[2], gamma = abcd[3]; my gets the same
    ; rounding/index bias as mx. r4 (abcd) is then reused as the loop counter.
5310*c0909341SAndroid Build Coastguard Worker    movsx            deltad, word [abcdq+2*2]
5311*c0909341SAndroid Build Coastguard Worker    movsx            gammad, word [abcdq+2*3]
5312*c0909341SAndroid Build Coastguard Worker    add                 myd, 512+(64<<10)
5313*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 4
5314*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [deltaq*3]
5315*c0909341SAndroid Build Coastguard Worker    sub              gammad, tmp1d    ; gamma -= delta*3
5316*c0909341SAndroid Build Coastguard Worker.main2:
    ; Filter the next source row, append it as the newest pair and let WARP_V
    ; produce one output row in m7.
5317*c0909341SAndroid Build Coastguard Worker    call .h
5318*c0909341SAndroid Build Coastguard Worker    psrld                m7, m6, 16
5319*c0909341SAndroid Build Coastguard Worker    pblendw              m7, m0, 0xaa ; 67
5320*c0909341SAndroid Build Coastguard Worker    WARP_V                7, 1, 3, 5, 7
    ; Same again for the following row; this output row is returned in m0.
5321*c0909341SAndroid Build Coastguard Worker    call .h
5322*c0909341SAndroid Build Coastguard Worker    psrld               m10, m5, 16
5323*c0909341SAndroid Build Coastguard Worker    pblendw             m10, m0, 0xaa ; 78
5324*c0909341SAndroid Build Coastguard Worker    WARP_V                0, 2, 4, 6, 10
5325*c0909341SAndroid Build Coastguard Worker    ret
; .h: horizontal 8-tap pass over one source row, producing 8 results.
; In:  srcq    = pointer into the current source row
;      mxd     = biased x position of output pixel 0 for this row
;      alphaq  = position step between horizontally adjacent output pixels
;      betaq   = row step, already reduced by 3*alpha (see the lea below)
;      filterq = base of the 8-byte-per-entry filter table
;      m11 = 0, m12 = rounding constant, m13 = per-dword left-shift count
; Out: m0   = eight dword results in pixel order 0-3 | 4-7
;      srcq += ssq, mxd = start position for the next row
; Clobbers: tmp1, tmp2, m8, m9, m10.
; Output pixel n reads 8 words starting at srcq + 2*(n-3); pixels n and n+4
; share one ymm register (low lane / high lane). Its filter index is the
; running position >> 10, where the position advances by alpha per pixel.
5326*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5327*c0909341SAndroid Build Coastguard Worker.h:
    ; Positions are derived with lea before each destructive shr so that the
    ; unshifted value is still available for the next step.
5328*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [mxq+alphaq*4]
5329*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [mxq+alphaq*1]
5330*c0909341SAndroid Build Coastguard Worker    movu               xm10, [srcq-6]
5331*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [srcq+2], 1
5332*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10 ; 0
5333*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10 ; 4
5334*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+mxq  *8]
5335*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1
5336*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+alphaq*4]
5337*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmp2q+alphaq*1]
5338*c0909341SAndroid Build Coastguard Worker    movu                xm8, [srcq-4]
5339*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [srcq+4], 1
5340*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10 ; 1
5341*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10 ; 5
5342*c0909341SAndroid Build Coastguard Worker    movq                xm9, [filterq+tmp2q*8]
5343*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [filterq+tmp1q*8], 1
5344*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [mxq+alphaq*4]
5345*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [mxq+alphaq*1]
5346*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10 ; 2
5347*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10 ; 6
    ; Interleaving with the zero register puts each coefficient byte in the
    ; high byte of a word (coefficient * 256) before the multiply-add.
5348*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m11, m0
5349*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10
5350*c0909341SAndroid Build Coastguard Worker    movu               xm10, [srcq-2]
5351*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [srcq+6], 1
5352*c0909341SAndroid Build Coastguard Worker    punpcklbw            m9, m11, m9
5353*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m8
5354*c0909341SAndroid Build Coastguard Worker    movq                xm8, [filterq+mxq  *8]
5355*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [filterq+tmp1q*8], 1
5356*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+alphaq*4]
    ; tmp2 still holds (row start + 3*alpha) unshifted; adding the reduced
    ; beta yields row start + original beta for the next call.
5357*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmp2q+betaq] ; mx += beta
5358*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m9 ; 0 1   4 5
5359*c0909341SAndroid Build Coastguard Worker    movu                xm9, [srcq+0]
5360*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [srcq+8], 1
5361*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10 ; 3
5362*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10 ; 7
5363*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m8
5364*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m10
5365*c0909341SAndroid Build Coastguard Worker    movq               xm10, [filterq+tmp2q*8]
5366*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [filterq+tmp1q*8], 1
5367*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m11, m10
5368*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m10
5369*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; Two levels of horizontal adds reduce the four partial sums per pixel to
    ; a single dword per pixel.
5370*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m9 ; 2 3   6 7
5371*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m8 ; 0 1 2 3   4 5 6 7
5372*c0909341SAndroid Build Coastguard Worker    vpsllvd              m0, m13
5373*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12 ; rounded 14-bit result in upper 16 bits of dword
5374*c0909341SAndroid Build Coastguard Worker    ret
5375*c0909341SAndroid Build Coastguard Worker
; BIDIR_FN: store-loop body shared by the functions that expand it below.
; Requirements on the expanding function:
;   - a local .main routine that returns 128 bytes of output words in m0-m3
;     and advances its own input pointers;
;   - wq = address of the .wN label matching the block width;
;   - dstq, strideq and hd loaded; stride3q is computed here.
; Each .wN section only differs in how the 128 bytes from one .main call are
; laid out across rows.
5376*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 0
5377*c0909341SAndroid Build Coastguard Worker    call .main
5378*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5379*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w4: 8 bytes per row, four rows per ymm register. There is no loop, so
    ; one .main call covers every supported height (4, 8, or all of m0-m3 =
    ; 16 rows).
5380*c0909341SAndroid Build Coastguard Worker.w4:
5381*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm0
5382*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5383*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
5384*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
5385*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
5386*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
5387*c0909341SAndroid Build Coastguard Worker    je .ret
5388*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5389*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm1
5390*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
5391*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
5392*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5393*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5394*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5395*c0909341SAndroid Build Coastguard Worker    je .ret
5396*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5397*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm2
5398*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5399*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m2, 1
5400*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5401*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5402*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5403*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm3
5404*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm3
5405*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m3, 1
5406*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm3
5407*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
5408*c0909341SAndroid Build Coastguard Worker.ret:
5409*c0909341SAndroid Build Coastguard Worker    RET
    ; w8: 16 bytes per row, two rows per ymm register, eight rows per .main
    ; call. h == 4 returns after the first four rows; otherwise execution
    ; joins the loop at .w8_loop_start to store rows 4-7 from m2/m3.
5410*c0909341SAndroid Build Coastguard Worker.w8:
5411*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
5412*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
5413*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
5414*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
5415*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
5416*c0909341SAndroid Build Coastguard Worker    jne .w8_loop_start
5417*c0909341SAndroid Build Coastguard Worker    RET
5418*c0909341SAndroid Build Coastguard Worker.w8_loop:
5419*c0909341SAndroid Build Coastguard Worker    call .main
5420*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5421*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
5422*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
5423*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
5424*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
5425*c0909341SAndroid Build Coastguard Worker.w8_loop_start:
5426*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5427*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm2
5428*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m2, 1
5429*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm3
5430*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m3, 1
5431*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
5432*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5433*c0909341SAndroid Build Coastguard Worker    RET
    ; w16: one ymm register per row, four rows per .main call. The loop label
    ; precedes the entry label so the first pass reuses the initial .main.
5434*c0909341SAndroid Build Coastguard Worker.w16_loop:
5435*c0909341SAndroid Build Coastguard Worker    call .main
5436*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5437*c0909341SAndroid Build Coastguard Worker.w16:
5438*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5439*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
5440*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
5441*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
5442*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5443*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5444*c0909341SAndroid Build Coastguard Worker    RET
    ; w32: two ymm registers per row, two rows per .main call.
5445*c0909341SAndroid Build Coastguard Worker.w32_loop:
5446*c0909341SAndroid Build Coastguard Worker    call .main
5447*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5448*c0909341SAndroid Build Coastguard Worker.w32:
5449*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
5450*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
5451*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m2
5452*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m3
5453*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5454*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5455*c0909341SAndroid Build Coastguard Worker    RET
    ; w64: four ymm registers per row, one row per .main call.
5456*c0909341SAndroid Build Coastguard Worker.w64_loop:
5457*c0909341SAndroid Build Coastguard Worker    call .main
5458*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5459*c0909341SAndroid Build Coastguard Worker.w64:
5460*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
5461*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
5462*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
5463*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
5464*c0909341SAndroid Build Coastguard Worker    dec                  hd
5465*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5466*c0909341SAndroid Build Coastguard Worker    RET
    ; w128: eight ymm registers per row, so each row needs two .main calls
    ; (the second one fills bytes 128-255 of the same row).
5467*c0909341SAndroid Build Coastguard Worker.w128_loop:
5468*c0909341SAndroid Build Coastguard Worker    call .main
5469*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5470*c0909341SAndroid Build Coastguard Worker.w128:
5471*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
5472*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
5473*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
5474*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
5475*c0909341SAndroid Build Coastguard Worker    call .main
5476*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*4], m0
5477*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*5], m1
5478*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*6], m2
5479*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*7], m3
5480*c0909341SAndroid Build Coastguard Worker    dec                  hd
5481*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
5482*c0909341SAndroid Build Coastguard Worker    RET
5483*c0909341SAndroid Build Coastguard Worker%endmacro
5484*c0909341SAndroid Build Coastguard Worker
; Scratch register t0 for avg_16bpc / w_avg_16bpc: r5 on WIN64, r7 elsewhere.
; r5 is the register named h in those functions; both of them finish using
; t0 before executing "movifnidn hd, hm", so sharing r5 on WIN64 is safe.
; NOTE(review): presumably r7 is avoided on WIN64 because it would have to be
; saved there while the functions only declare 7 GPRs - confirm against
; x86inc's register mapping.
5485*c0909341SAndroid Build Coastguard Worker%if WIN64
5486*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
5487*c0909341SAndroid Build Coastguard Worker%else
5488*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
5489*c0909341SAndroid Build Coastguard Worker%endif
5490*c0909341SAndroid Build Coastguard Worker
; avg_16bpc: combines the two intermediate buffers tmp1 and tmp2 into dst.
; Setup: r6 = avg_avx2_table; wq = table base + sign-extended 32-bit entry
; number tzcnt(w), i.e. the address of the matching .wN label in BIDIR_FN.
; m4 / m5 = bidir_rnd / bidir_mul entry selected by pixel_max >> 11,
; broadcast to the whole register.
5491*c0909341SAndroid Build Coastguard Workercglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
5492*c0909341SAndroid Build Coastguard Worker%define base r6-avg_avx2_table
5493*c0909341SAndroid Build Coastguard Worker    lea                  r6, [avg_avx2_table]
5494*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5495*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r6m ; pixel_max
5496*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
5497*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
5498*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+bidir_rnd+t0*4]
5499*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+bidir_mul+t0*4]
    ; h is loaded only after the last use of t0 (t0 may alias h's register).
5500*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5501*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
5502*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
5503*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main: processes 64 words (4 x 32 bytes) from each input and advances
    ; both input pointers by 128 bytes. Per word:
    ;   s   = saturating tmp1 + tmp2
    ;   s   = max(s, rnd) - rnd          (never below zero)
    ;   out = high 16 bits of s * mul
    ; Results are returned in m0-m3 for BIDIR_FN to store.
5504*c0909341SAndroid Build Coastguard Worker.main:
5505*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+32*0]
5506*c0909341SAndroid Build Coastguard Worker    paddsw               m0, [tmp2q+32*0]
5507*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+32*1]
5508*c0909341SAndroid Build Coastguard Worker    paddsw               m1, [tmp2q+32*1]
5509*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp1q+32*2]
5510*c0909341SAndroid Build Coastguard Worker    paddsw               m2, [tmp2q+32*2]
5511*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp1q+32*3]
5512*c0909341SAndroid Build Coastguard Worker    paddsw               m3, [tmp2q+32*3]
5513*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
5514*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
5515*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m4
5516*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m4
5517*c0909341SAndroid Build Coastguard Worker    pmaxsw               m2, m4
5518*c0909341SAndroid Build Coastguard Worker    pmaxsw               m3, m4
5519*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m4
5520*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m4
5521*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4
5522*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m4
5523*c0909341SAndroid Build Coastguard Worker    pmulhw               m0, m5
5524*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m5
5525*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m5
5526*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m5
5527*c0909341SAndroid Build Coastguard Worker    ret
5528*c0909341SAndroid Build Coastguard Worker
; w_avg_16bpc: weighted combination of tmp1 and tmp2 written to dst.
; Register roles after setup:
;   m6 = dword {low word: 16-weight, high word: weight}, broadcast; both
;        halves are scaled by 4 when bit 11 of pixel_max is set
;   m7 = (pd_65538 + pixel_max per word) << 7, added to every product sum
;   m8 = pixel_max broadcast to every word (final clamp)
;   wq = address of the .wN label for this width (jump-table lookup)
5529*c0909341SAndroid Build Coastguard Workercglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
5530*c0909341SAndroid Build Coastguard Worker    lea                  r6, [w_avg_avx2_table]
5531*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5532*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r6m ; weight
5533*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, r7m ; pixel_max
5534*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [r6-w_avg_avx2_table+pd_65538]
5535*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
5536*c0909341SAndroid Build Coastguard Worker    paddw                m7, m8
5537*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; r6 is free again once wq is final; build (weight << 16) - (weight - 16)
    ; in t0d.
5538*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [t0-16]
5539*c0909341SAndroid Build Coastguard Worker    shl                 t0d, 16
5540*c0909341SAndroid Build Coastguard Worker    sub                 t0d, r6d ; 16-weight, weight
5541*c0909341SAndroid Build Coastguard Worker    pslld                m7, 7
    ; Rotating right by 30 equals rotating left by 2; r6d gets the weights
    ; times 4, and cmovz falls back to the unscaled pair when bit 11
    ; (0x800) of pixel_max is clear.
5542*c0909341SAndroid Build Coastguard Worker    rorx                r6d, t0d, 30 ; << 2
5543*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
5544*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, t0d
5545*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5546*c0909341SAndroid Build Coastguard Worker    movd                xm6, r6d
5547*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, xm6
5548*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
5549*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main: processes 64 words from each input and advances both pointers by
    ; 128 bytes. Words of tmp2 (low) and tmp1 (high) are interleaved so each
    ; pmaddwd yields tmp2*(16-weight) + tmp1*weight as a dword; m7 is added,
    ; the sum is shifted right by 8, packed back to words with unsigned
    ; saturation and clamped to pixel_max. The unpack-low/high followed by a
    ; pack restores the original word order within each 128-bit lane.
    ; Loads for the next register are interleaved with the arithmetic of the
    ; current one. Results are returned in m0-m3; m4 and m5 are scratch.
5550*c0909341SAndroid Build Coastguard Worker.main:
5551*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+32*0]
5552*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp2q+32*0]
5553*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0, m4
5554*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
5555*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+32*1]
5556*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp2q+32*1]
5557*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6
5558*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
5559*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5560*c0909341SAndroid Build Coastguard Worker    paddd                m0, m7
5561*c0909341SAndroid Build Coastguard Worker    psrad                m5, 8
5562*c0909341SAndroid Build Coastguard Worker    psrad                m0, 8
5563*c0909341SAndroid Build Coastguard Worker    packusdw             m0, m5
5564*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m1, m4
5565*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4
5566*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+32*2]
5567*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp2q+32*2]
5568*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6
5569*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6
5570*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5571*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
5572*c0909341SAndroid Build Coastguard Worker    psrad                m5, 8
5573*c0909341SAndroid Build Coastguard Worker    psrad                m1, 8
5574*c0909341SAndroid Build Coastguard Worker    packusdw             m1, m5
5575*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m2, m4
5576*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4
5577*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+32*3]
5578*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp2q+32*3]
5579*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
5580*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
5581*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6
5582*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6
5583*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5584*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7
5585*c0909341SAndroid Build Coastguard Worker    psrad                m5, 8
5586*c0909341SAndroid Build Coastguard Worker    psrad                m2, 8
5587*c0909341SAndroid Build Coastguard Worker    packusdw             m2, m5
5588*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3, m4
5589*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4
5590*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6
5591*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6
5592*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5593*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7
5594*c0909341SAndroid Build Coastguard Worker    psrad                m5, 8
5595*c0909341SAndroid Build Coastguard Worker    psrad                m3, 8
5596*c0909341SAndroid Build Coastguard Worker    packusdw             m3, m5
5597*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m8
5598*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m8
5599*c0909341SAndroid Build Coastguard Worker    pminsw               m2, m8
5600*c0909341SAndroid Build Coastguard Worker    pminsw               m3, m8
5601*c0909341SAndroid Build Coastguard Worker    ret
5602*c0909341SAndroid Build Coastguard Worker
; mask_16bpc: blends tmp1 and tmp2 into dst using a per-pixel byte mask.
; Register roles after setup:
;   r7  = mask_avx2_table (base for the jump table and constant loads)
;   m8  = pw_64, m9 = bidir_rnd entry, m10 = bidir_mul entry; the latter two
;         are selected by pixel_max >> 11
;   maskq = mask pointer, fetched from its argument slot
;   wq  = address of the .wN label for this width
5603*c0909341SAndroid Build Coastguard Workercglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
5604*c0909341SAndroid Build Coastguard Worker%define base r7-mask_avx2_table
5605*c0909341SAndroid Build Coastguard Worker    lea                  r7, [mask_avx2_table]
5606*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5607*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; pixel_max
5608*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5609*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
5610*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
5611*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+pw_64]
5612*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+bidir_rnd+r6*4]
5613*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+bidir_mul+r6*4]
5614*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5615*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5616*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
5617*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5618*c0909341SAndroid Build Coastguard Worker.main:
    ; MASK n: produces output register m<n> from 16 mask bytes and 16 words
    ; of each input at block offset n. The mask bytes are zero-extended to
    ; words (m), 64-m is formed, and the (m, 64-m) pairs are multiplied
    ; against the interleaved (tmp1, tmp2) words. The dword sums are shifted
    ; right by 5, packed with signed saturation, then put through the same
    ; max/subtract rnd and high-multiply by mul steps as avg_16bpc.
    ; Clobbers m4-m7.
5619*c0909341SAndroid Build Coastguard Worker%macro MASK 1
5620*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, [maskq+16*%1]
5621*c0909341SAndroid Build Coastguard Worker    mova                m%1, [tmp1q+32*%1]
5622*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmp2q+32*%1]
5623*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m%1, m6
5624*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m6
5625*c0909341SAndroid Build Coastguard Worker    psubw                m7, m8, m5
5626*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5, m7 ; m, 64-m
5627*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m7
5628*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6     ; tmp1 * m + tmp2 * (64-m)
5629*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m5
5630*c0909341SAndroid Build Coastguard Worker    psrad                m4, 5
5631*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 5
5632*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m4
5633*c0909341SAndroid Build Coastguard Worker    pmaxsw              m%1, m9
5634*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m9
5635*c0909341SAndroid Build Coastguard Worker    pmulhw              m%1, m10
5636*c0909341SAndroid Build Coastguard Worker%endmacro
    ; Four expansions fill m0-m3 (64 pixels); the pointers then advance by
    ; 64 mask bytes and 128 bytes of each input.
5637*c0909341SAndroid Build Coastguard Worker    MASK                  0
5638*c0909341SAndroid Build Coastguard Worker    MASK                  1
5639*c0909341SAndroid Build Coastguard Worker    MASK                  2
5640*c0909341SAndroid Build Coastguard Worker    MASK                  3
5641*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*4
5642*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
5643*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
5644*c0909341SAndroid Build Coastguard Worker    ret
5645*c0909341SAndroid Build Coastguard Worker
; w_mask_420_16bpc
;
; Blends the two intermediate buffers tmp1 and tmp2 into dst, using a per-pixel
; weight derived from |tmp1 - tmp2| (see the W_MASK macro in .main), and writes
; a byte mask to maskq. Every mask byte is built from two horizontally
; pair-summed weights plus the bias in m14, shifted right by 2.
;
; Inputs, as named by cglobal and the operand comments below:
;   dst, stride, tmp1, tmp2  - loaded into registers by cglobal
;   w (wm), h (hm), mask (maskmp), sign (r7m), pixel_max (r8m)
;
; Constant registers set up in the prologue and live for the whole function:
;   m10 = pw_27615, m11 = pw_64,
;   m12 = bidir_rnd[pixel_max >> 11], m13 = bidir_mul[pixel_max >> 11],
;   m14 = [pw_2] - sign, broadcast to every word (mask rounding bias)
;
; .main produces one batch: m0-m3 = four 32-byte vectors of output pixels,
; m4/m5 = pair-summed weights; it advances tmp1q/tmp2q and uses m6-m9 as
; scratch. It is called once before the width dispatch, so every .wN label is
; entered with a batch already computed; the .wN_loop labels compute the next.
5646*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
5647*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_420_avx2_table
5648*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_420_avx2_table]
5649*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of w, used as the jump-table index
5650*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
5651*c0909341SAndroid Build Coastguard Worker    movd                xm0, r7m ; sign
5652*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5653*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11 ; pixel_max >> 11 indexes bidir_rnd / bidir_mul
5654*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4] ; 32-bit table entry, rebased by "add wq, r7" below
5655*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
5656*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pw_64]
5657*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+bidir_rnd+r6*4]
5658*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+bidir_mul+r6*4]
5659*c0909341SAndroid Build Coastguard Worker    movd               xm14, [base+pw_2]
5660*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5661*c0909341SAndroid Build Coastguard Worker    psubw              xm14, xm0 ; [pw_2] - sign
5662*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m14, xm14 ; bias added to the weight sums before ">> 2"
5663*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5664*c0909341SAndroid Build Coastguard Worker    call .main ; first batch, shared by every width path
5665*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5666*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; continue at the table-selected width path
    ; .w4: the single batch from the prologue holds 16 rows of 4 pixels;
    ; 4, 8 or 16 rows are stored depending on h, and 16 mask bytes are written.
5667*c0909341SAndroid Build Coastguard Worker.w4:
5668*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5 ; add neighbouring dwords (two word pair-sums at a time)
5669*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5670*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5671*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4
5672*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
5673*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xm5
5674*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
5675*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5676*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
5677*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
5678*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
5679*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm4
5680*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5681*c0909341SAndroid Build Coastguard Worker    jl .w4_end ; h < 8: only the four rows above are stored
5682*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5683*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
5684*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
5685*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
5686*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5687*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5688*c0909341SAndroid Build Coastguard Worker    je .w4_end ; still the flags of "cmp hd, 8" (lea/stores leave them intact): h == 8
5689*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5690*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5691*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5692*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m2, 1
5693*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5694*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5695*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5696*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm3
5697*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm3
5698*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m3, 1
5699*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm3
5700*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
5701*c0909341SAndroid Build Coastguard Worker.w4_end:
5702*c0909341SAndroid Build Coastguard Worker    RET
    ; .w8: one batch holds 8 rows of 8 pixels (one 16-byte lane per row) and
    ; yields 16 mask bytes.
5703*c0909341SAndroid Build Coastguard Worker.w8_loop:
5704*c0909341SAndroid Build Coastguard Worker    call .main
5705*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5706*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
5707*c0909341SAndroid Build Coastguard Worker.w8:
5708*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m4, m5, 0x21 ; m6 = { high lane of m4, low lane of m5 }
5709*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m5, 0xf0 ; m4 = { low lane of m4, high lane of m5 }
5710*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5711*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
5712*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5713*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
5714*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm5
5715*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
5716*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
5717*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
5718*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
5719*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm4
5720*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
5721*c0909341SAndroid Build Coastguard Worker    jl .w8_end ; fewer than 8 rows were left: rows 4-7 of this batch are not stored
5722*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5723*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm2
5724*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m2, 1
5725*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm3
5726*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m3, 1
5727*c0909341SAndroid Build Coastguard Worker    jg .w8_loop ; flags still come from "sub hd, 8" above
5728*c0909341SAndroid Build Coastguard Worker.w8_end:
5729*c0909341SAndroid Build Coastguard Worker    RET
    ; .w16: one batch is 4 rows of 16 pixels (one vector per row) and yields
    ; 16 mask bytes.
5730*c0909341SAndroid Build Coastguard Worker.w16_loop:
5731*c0909341SAndroid Build Coastguard Worker    call .main
5732*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5733*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
5734*c0909341SAndroid Build Coastguard Worker.w16:
5735*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m4, m5
5736*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m5
5737*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
5738*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
5739*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5740*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
5741*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm5
5742*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm4, q3120
5743*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5744*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
5745*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
5746*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
5747*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm4
5748*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5749*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5750*c0909341SAndroid Build Coastguard Worker    RET
    ; .w32: one batch is 2 rows of 32 pixels (m4 = pair sums of the first row,
    ; m5 = of the second). Two batches (4 rows) are processed per iteration
    ; and produce 32 mask bytes.
5751*c0909341SAndroid Build Coastguard Worker.w32_loop:
5752*c0909341SAndroid Build Coastguard Worker    call .main
5753*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5754*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5755*c0909341SAndroid Build Coastguard Worker.w32:
5756*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5757*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
5758*c0909341SAndroid Build Coastguard Worker    psrlw               m15, m4, 2 ; keep rows 0-1 result in m15 across the next .main
5759*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
5760*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
5761*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m2
5762*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m3
5763*c0909341SAndroid Build Coastguard Worker    call .main
5764*c0909341SAndroid Build Coastguard Worker    mova                 m6, [deint_shuf] ; loaded after .main because W_MASK 3, 6 overwrites m6
5765*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5766*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
5767*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5768*c0909341SAndroid Build Coastguard Worker    packuswb            m15, m4
5769*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m6, m15 ; reorder dwords after the per-lane pack
5770*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*0], m0
5771*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*1], m1
5772*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*0], m2
5773*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*1], m3
5774*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
5775*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5776*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5777*c0909341SAndroid Build Coastguard Worker    RET
    ; .w64: one batch is a single row of 64 pixels (m4 = pair sums of the left
    ; half, m5 = of the right half). Two rows per iteration, 32 mask bytes.
5778*c0909341SAndroid Build Coastguard Worker.w64_loop:
5779*c0909341SAndroid Build Coastguard Worker    call .main
5780*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5781*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5782*c0909341SAndroid Build Coastguard Worker.w64:
5783*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5784*c0909341SAndroid Build Coastguard Worker    paddw               m15, m14, m5
5785*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
5786*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
5787*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*2], m2
5788*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*3], m3
5789*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4 ; no available registers
5790*c0909341SAndroid Build Coastguard Worker    call .main
5791*c0909341SAndroid Build Coastguard Worker    paddw                m4, [maskq] ; add the first row's sums parked in the mask buffer
5792*c0909341SAndroid Build Coastguard Worker    mova                 m6, [deint_shuf]
5793*c0909341SAndroid Build Coastguard Worker    paddw                m5, m15
5794*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5795*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 2
5796*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5 ; 0 2 4 6   1 3 5 7
5797*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m6, m4
5798*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m0
5799*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
5800*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*2], m2
5801*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*3], m3
5802*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4 ; final mask bytes replace the parked sums
5803*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5804*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5805*c0909341SAndroid Build Coastguard Worker    RET
    ; .w128: one batch is half a row (64 pixels). Four batches cover two rows
    ; per iteration and produce 64 mask bytes. First-row sums are parked in
    ; [maskq+32*0], [maskq+32*1], m15 and the not-yet-written second dst row.
5806*c0909341SAndroid Build Coastguard Worker.w128_loop:
5807*c0909341SAndroid Build Coastguard Worker    call .main
5808*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5809*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
5810*c0909341SAndroid Build Coastguard Worker.w128:
5811*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5812*c0909341SAndroid Build Coastguard Worker    paddw                m5, m14
5813*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
5814*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
5815*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*2], m2
5816*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*3], m3
5817*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m4
5818*c0909341SAndroid Build Coastguard Worker    mova     [dstq+strideq], m5 ; scratch: start of dst row 1, overwritten with pixels below
5819*c0909341SAndroid Build Coastguard Worker    call .main
5820*c0909341SAndroid Build Coastguard Worker    paddw                m4, m14
5821*c0909341SAndroid Build Coastguard Worker    paddw               m15, m14, m5
5822*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*4], m0
5823*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*5], m1
5824*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*6], m2
5825*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*7], m3
5826*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m4
5827*c0909341SAndroid Build Coastguard Worker    call .main
5828*c0909341SAndroid Build Coastguard Worker    paddw                m4, [maskq+32*0]
5829*c0909341SAndroid Build Coastguard Worker    paddw                m5, [dstq+strideq] ; read the parked sums back before row 1 is stored
5830*c0909341SAndroid Build Coastguard Worker    mova                 m6, [deint_shuf]
5831*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5832*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 2
5833*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
5834*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m6, m4
5835*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m0
5836*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
5837*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*2], m2
5838*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*3], m3
5839*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m4
5840*c0909341SAndroid Build Coastguard Worker    call .main
5841*c0909341SAndroid Build Coastguard Worker    paddw                m4, [maskq+32*1]
5842*c0909341SAndroid Build Coastguard Worker    mova                 m6, [deint_shuf]
5843*c0909341SAndroid Build Coastguard Worker    paddw                m5, m15
5844*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
5845*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 2
5846*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
5847*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m6, m4
5848*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*4], m0
5849*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*5], m1
5850*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*6], m2
5851*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*7], m3
5852*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m4
5853*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5854*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
5855*c0909341SAndroid Build Coastguard Worker    RET
5856*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main: compute one batch of 4 x 32 bytes from tmp1q/tmp2q.
    ;   out: m0-m3 = blended pixels, m4 = phaddw of the weights of vectors 0/1,
    ;        m5 = phaddw of the weights of vectors 2/3
    ;   clobbers m6-m9; advances tmp1q and tmp2q by 32*4 bytes.
5857*c0909341SAndroid Build Coastguard Worker.main:
    ; W_MASK dst, mask [, pw_64 = 11, rnd = 12, mul = 13] (register numbers)
    ;   m<dst>  = blended pixels for the 32 bytes at offset 32*dst
    ;   m<mask> = per-pixel weight "m" (words)
    ;   uses m7-m9 as scratch and reads m10 (threshold constant)
5858*c0909341SAndroid Build Coastguard Worker%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
5859*c0909341SAndroid Build Coastguard Worker    mova                m%1, [tmp1q+32*%1]
5860*c0909341SAndroid Build Coastguard Worker    mova                m%2, [tmp2q+32*%1]
5861*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m%2, m%1 ; interleave (tmp2, tmp1) word pairs for pmaddwd
5862*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m%2, m%1
5863*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m%2
5864*c0909341SAndroid Build Coastguard Worker    pabsw               m%1, m%1 ; |tmp1 - tmp2|
5865*c0909341SAndroid Build Coastguard Worker    psubusw              m7, m10, m%1 ; unsigned saturating: clamps at 0 for large differences
5866*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 10       ; 64-m
5867*c0909341SAndroid Build Coastguard Worker    psubw               m%2, m%3, m7  ; m
5868*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m7, m%2 ; interleave (64-m, m) to match the (tmp2, tmp1) pairs
5869*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m%2
5870*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m8 ; (64-m)*tmp2 + m*tmp1 as dwords
5871*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m9
5872*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 5
5873*c0909341SAndroid Build Coastguard Worker    psrad                m7, 5
5874*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m7
5875*c0909341SAndroid Build Coastguard Worker    pmaxsw              m%1, m%4 ; max with rnd, then subtract it: result is never negative
5876*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m%4
5877*c0909341SAndroid Build Coastguard Worker    pmulhw              m%1, m%5 ; high 16 bits of the product with mul
5878*c0909341SAndroid Build Coastguard Worker%endmacro
5879*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4
5880*c0909341SAndroid Build Coastguard Worker    W_MASK                1, 5
5881*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5 ; sum horizontally adjacent weights
5882*c0909341SAndroid Build Coastguard Worker    W_MASK                2, 5
5883*c0909341SAndroid Build Coastguard Worker    W_MASK                3, 6
5884*c0909341SAndroid Build Coastguard Worker    phaddw               m5, m6
5885*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
5886*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
5887*c0909341SAndroid Build Coastguard Worker    ret
5888*c0909341SAndroid Build Coastguard Worker
; w_mask_422_16bpc
;
; Blends the two intermediate buffers tmp1 and tmp2 into dst with the same
; per-pixel weighting as W_MASK (the macro defined earlier in this file,
; inside w_mask_420_16bpc's .main) and writes a byte mask to maskq.
; Here .main itself finishes the mask: horizontally adjacent weights are
; summed, sign is subtracted and the result is halved with rounding (pavgb
; against zero). The width paths therefore only store pixels.
;
; Inputs, as named by cglobal and the operand comments below:
;   dst, stride, tmp1, tmp2  - loaded into registers by cglobal
;   w (wm), h (hm), mask (maskmp), sign (r7m), pixel_max (r8m)
;
; Constant registers set up in the prologue:
;   m10 = pw_27615, m11 = pw_64,
;   m12 = bidir_rnd[pixel_max >> 11], m13 = bidir_mul[pixel_max >> 11],
;   m14 = sign broadcast to every byte, m15 = deint_shuf
;
; .main is called once before the width dispatch, so every .wN label is
; entered with a batch (m0-m3) already computed.
5889*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
5890*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx2_table
5891*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_422_avx2_table]
5892*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of w, used as the jump-table index
5893*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
5894*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        m14, r7m ; sign
5895*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5896*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11 ; pixel_max >> 11 indexes bidir_rnd / bidir_mul
5897*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4] ; 32-bit table entry, rebased by "add wq, r7" below
5898*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pw_27615]
5899*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pw_64]
5900*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+bidir_rnd+r6*4]
5901*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+bidir_mul+r6*4]
5902*c0909341SAndroid Build Coastguard Worker    mova                m15, [base+deint_shuf]
5903*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5904*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5905*c0909341SAndroid Build Coastguard Worker    call .main ; first batch, shared by every width path
5906*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5907*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; continue at the table-selected width path
    ; .w4: the single batch from the prologue holds 16 rows of 4 pixels;
    ; 4, 8 or 16 rows are stored depending on h.
5908*c0909341SAndroid Build Coastguard Worker.w4:
5909*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
5910*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5911*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
5912*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
5913*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
5914*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5915*c0909341SAndroid Build Coastguard Worker    jl .w4_end ; h < 8: only the four rows above are stored
5916*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5917*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
5918*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
5919*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
5920*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5921*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5922*c0909341SAndroid Build Coastguard Worker    je .w4_end ; still the flags of "cmp hd, 8" (lea/stores leave them intact): h == 8
5923*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5924*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm2
5925*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm2
5926*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m2, 1
5927*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
5928*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm2
5929*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5930*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm3
5931*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm3
5932*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m3, 1
5933*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm3
5934*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
5935*c0909341SAndroid Build Coastguard Worker.w4_end:
5936*c0909341SAndroid Build Coastguard Worker    RET
    ; .w8: one batch holds 8 rows of 8 pixels (one 16-byte lane per row).
5937*c0909341SAndroid Build Coastguard Worker.w8_loop:
5938*c0909341SAndroid Build Coastguard Worker    call .main
5939*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5940*c0909341SAndroid Build Coastguard Worker.w8:
5941*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
5942*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
5943*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
5944*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
5945*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
5946*c0909341SAndroid Build Coastguard Worker    jl .w8_end ; fewer than 8 rows were left: rows 4-7 of this batch are not stored
5947*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5948*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm2
5949*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m2, 1
5950*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm3
5951*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m3, 1
5952*c0909341SAndroid Build Coastguard Worker    jg .w8_loop ; flags still come from "sub hd, 8" above
5953*c0909341SAndroid Build Coastguard Worker.w8_end:
5954*c0909341SAndroid Build Coastguard Worker    RET
    ; .w16: one batch is 4 rows of 16 pixels (one vector per row).
5955*c0909341SAndroid Build Coastguard Worker.w16_loop:
5956*c0909341SAndroid Build Coastguard Worker    call .main
5957*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5958*c0909341SAndroid Build Coastguard Worker.w16:
5959*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5960*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
5961*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
5962*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
5963*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5964*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5965*c0909341SAndroid Build Coastguard Worker    RET
    ; .w32: one batch is 2 rows of 32 pixels (two vectors per row).
5966*c0909341SAndroid Build Coastguard Worker.w32_loop:
5967*c0909341SAndroid Build Coastguard Worker    call .main
5968*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5969*c0909341SAndroid Build Coastguard Worker.w32:
5970*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
5971*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
5972*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m2
5973*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m3
5974*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5975*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5976*c0909341SAndroid Build Coastguard Worker    RET
    ; .w64: one batch is exactly one row of 64 pixels.
5977*c0909341SAndroid Build Coastguard Worker.w64_loop:
5978*c0909341SAndroid Build Coastguard Worker    call .main
5979*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5980*c0909341SAndroid Build Coastguard Worker.w64:
5981*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
5982*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
5983*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
5984*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
5985*c0909341SAndroid Build Coastguard Worker    dec                  hd
5986*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5987*c0909341SAndroid Build Coastguard Worker    RET
    ; .w128: two batches per row (left and right 64 pixels).
5988*c0909341SAndroid Build Coastguard Worker.w128_loop:
5989*c0909341SAndroid Build Coastguard Worker    call .main
5990*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5991*c0909341SAndroid Build Coastguard Worker.w128:
5992*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
5993*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
5994*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
5995*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
5996*c0909341SAndroid Build Coastguard Worker    call .main
5997*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*4], m0
5998*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*5], m1
5999*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*6], m2
6000*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*7], m3
6001*c0909341SAndroid Build Coastguard Worker    dec                  hd
6002*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
6003*c0909341SAndroid Build Coastguard Worker    RET
6004*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main: compute one batch of 4 x 32 bytes from tmp1q/tmp2q and emit its
    ; 32 mask bytes.
    ;   out: m0-m3 = blended pixels
    ;   clobbers m4-m9; advances tmp1q/tmp2q by 32*4 bytes and maskq by 32.
6005*c0909341SAndroid Build Coastguard Worker.main:
6006*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4
6007*c0909341SAndroid Build Coastguard Worker    W_MASK                1, 5
6008*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5 ; sum horizontally adjacent weights
6009*c0909341SAndroid Build Coastguard Worker    W_MASK                2, 5
6010*c0909341SAndroid Build Coastguard Worker    W_MASK                3, 6
6011*c0909341SAndroid Build Coastguard Worker    phaddw               m5, m6
6012*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
6013*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
6014*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6015*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
6016*c0909341SAndroid Build Coastguard Worker    psubb                m4, m14 ; pair sum - sign
6017*c0909341SAndroid Build Coastguard Worker    pavgb                m4, m5 ; average with zero: (x + 1) >> 1
6018*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m15, m4 ; reorder dwords after the per-lane pack (deint_shuf)
6019*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
6020*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6021*c0909341SAndroid Build Coastguard Worker    ret
6022*c0909341SAndroid Build Coastguard Worker
6023*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
6024*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx2_table
6025*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_444_avx2_table]
6026*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
6027*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
6028*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
6029*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
6030*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]
6031*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pw_27615]
6032*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pw_64]
6033*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+bidir_rnd+r6*4]
6034*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+bidir_mul+r6*4]
6035*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
6036*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
6037*c0909341SAndroid Build Coastguard Worker    call .main
6038*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
6039*c0909341SAndroid Build Coastguard Worker    jmp                  wq
6040*c0909341SAndroid Build Coastguard Worker.w4:
6041*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
6042*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
6043*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
6044*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
6045*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
6046*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
6047*c0909341SAndroid Build Coastguard Worker    jl .w4_end
6048*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6049*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
6050*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
6051*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
6052*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
6053*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
6054*c0909341SAndroid Build Coastguard Worker    je .w4_end
6055*c0909341SAndroid Build Coastguard Worker    call .main
6056*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6057*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
6058*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
6059*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
6060*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
6061*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm0
6062*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6063*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
6064*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
6065*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
6066*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
6067*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
6068*c0909341SAndroid Build Coastguard Worker.w4_end:
6069*c0909341SAndroid Build Coastguard Worker    RET
6070*c0909341SAndroid Build Coastguard Worker.w8_loop:
6071*c0909341SAndroid Build Coastguard Worker    call .main
6072*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6073*c0909341SAndroid Build Coastguard Worker.w8:
6074*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
6075*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
6076*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
6077*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
6078*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6079*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
6080*c0909341SAndroid Build Coastguard Worker.w8_end:
6081*c0909341SAndroid Build Coastguard Worker    RET
6082*c0909341SAndroid Build Coastguard Worker.w16_loop:
6083*c0909341SAndroid Build Coastguard Worker    call .main
6084*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
6085*c0909341SAndroid Build Coastguard Worker.w16:
6086*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
6087*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
6088*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6089*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
6090*c0909341SAndroid Build Coastguard Worker    RET
6091*c0909341SAndroid Build Coastguard Worker.w32_loop:
6092*c0909341SAndroid Build Coastguard Worker    call .main
6093*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6094*c0909341SAndroid Build Coastguard Worker.w32:
6095*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6096*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
6097*c0909341SAndroid Build Coastguard Worker    dec                  hd
6098*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
6099*c0909341SAndroid Build Coastguard Worker    RET
6100*c0909341SAndroid Build Coastguard Worker.w64_loop:
6101*c0909341SAndroid Build Coastguard Worker    call .main
6102*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6103*c0909341SAndroid Build Coastguard Worker.w64:
6104*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6105*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
6106*c0909341SAndroid Build Coastguard Worker    call .main
6107*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
6108*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m1
6109*c0909341SAndroid Build Coastguard Worker    dec                  hd
6110*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
6111*c0909341SAndroid Build Coastguard Worker    RET
6112*c0909341SAndroid Build Coastguard Worker.w128_loop:
6113*c0909341SAndroid Build Coastguard Worker    call .main
6114*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6115*c0909341SAndroid Build Coastguard Worker.w128:
6116*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6117*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
6118*c0909341SAndroid Build Coastguard Worker    call .main
6119*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
6120*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m1
6121*c0909341SAndroid Build Coastguard Worker    call .main
6122*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*4], m0
6123*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*5], m1
6124*c0909341SAndroid Build Coastguard Worker    call .main
6125*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*6], m0
6126*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*7], m1
6127*c0909341SAndroid Build Coastguard Worker    dec                  hd
6128*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
6129*c0909341SAndroid Build Coastguard Worker    RET
6130*c0909341SAndroid Build Coastguard WorkerALIGN function_align
6131*c0909341SAndroid Build Coastguard Worker.main:
6132*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 2, 4, 5, 6
6133*c0909341SAndroid Build Coastguard Worker    W_MASK                1, 3, 4, 5, 6
6134*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
6135*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120
6136*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*2
6137*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*2
6138*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m2
6139*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6140*c0909341SAndroid Build Coastguard Worker    ret
6141*c0909341SAndroid Build Coastguard Worker
; blend_16bpc: per-pixel masked blend of tmp into dst. Pixels are handled as
; 16-bit words; the mask is read as bytes and zero-extended to words.
; In the derivation below, a = dst pixel, b = tmp pixel, m = mask byte.
6142*c0909341SAndroid Build Coastguard Worker; (a * (64 - m) + b * m + 32) >> 6
6143*c0909341SAndroid Build Coastguard Worker; = (((b - a) * m + 32) >> 6) + a
6144*c0909341SAndroid Build Coastguard Worker; = (((b - a) * (m << 9) + 16384) >> 15) + a
6145*c0909341SAndroid Build Coastguard Worker;   except m << 9 overflows int16_t when m == 64 (which is possible),
6146*c0909341SAndroid Build Coastguard Worker;   but if we negate m it works out (-64 << 9 == -32768).
6147*c0909341SAndroid Build Coastguard Worker; = (((a - b) * (m * -512) + 16384) >> 15) + a
; The last form maps directly onto psubw / pmullw / pmulhrsw / paddw below.
; The code dispatches on width through a jump table; the paths visible here
; process 4 (w4), 4 (w8), 2 (w16) and 1 (w32) rows per loop iteration.
6148*c0909341SAndroid Build Coastguard Workercglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
6149*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx2_table
6150*c0909341SAndroid Build Coastguard Worker    lea                  r6, [blend_avx2_table] ; r6 = jump table address, also the anchor for base-relative loads
6151*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of w (log2(w) for power-of-two widths) = table index
6152*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
6153*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4] ; table entries are signed 32-bit offsets relative to the table
6154*c0909341SAndroid Build Coastguard Worker    movifnidn         maskq, maskmp
6155*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_m512] ; m6 = mask multiplier (the -512 of the formula, going by the constant's name)
6156*c0909341SAndroid Build Coastguard Worker    add                  wq, r6 ; wq = absolute address of the width-specific path
6157*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dsq*3] ; r6 is repurposed as 3*stride; 'base' must not be used past this point
6158*c0909341SAndroid Build Coastguard Worker    jmp                  wq
6159*c0909341SAndroid Build Coastguard Worker.w4:
    ; 4 pixels x 4 rows per iteration: consumes 16 mask bytes and 32 bytes of tmp.
6160*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [maskq] ; 16 mask bytes -> 16 words
6161*c0909341SAndroid Build Coastguard Worker    movq                xm0, [dstq+dsq*0] ; row 0 -> low qword of the low lane
6162*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [dstq+dsq*1] ; row 1 -> high qword of the low lane
6163*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [dstq+dsq*2] ; row 2, broadcast so it can be blended into the high lane
6164*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [dstq+r6   ] ; row 3, likewise
6165*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0x30 ; dwords 4-5 <- row 2
6166*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0xc0 ; dwords 6-7 <- row 3
6167*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq] ; a - b
6168*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
6169*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
6170*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m6 ; m * multiplier
6171*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3 ; ((a - b) * scaled_m + 16384) >> 15
6172*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; + a
6173*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1 ; xm1 = rows 2 and 3
6174*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
6175*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
6176*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*2], xm1
6177*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+r6   ], xm1
6178*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
6179*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6180*c0909341SAndroid Build Coastguard Worker    jg .w4
6181*c0909341SAndroid Build Coastguard Worker    RET
6182*c0909341SAndroid Build Coastguard Worker.w8:
    ; 8 pixels x 4 rows per iteration: consumes 32 mask bytes and 64 bytes of tmp.
    ; m0 = rows 0|1, m1 = rows 2|3 (one row per 128-bit lane).
6183*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m4, [maskq+16*0]
6184*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, [maskq+16*1]
6185*c0909341SAndroid Build Coastguard Worker    mova                xm0, [dstq+dsq*0]
6186*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [dstq+dsq*1], 1
6187*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*2]
6188*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [dstq+r6   ], 1
6189*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+32*0] ; a - b
6190*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+32*1]
6191*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
6192*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
6193*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m6 ; m * multiplier
6194*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m6
6195*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4 ; rounded high multiply, see formula above
6196*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
6197*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; + a
6198*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
6199*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
6200*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
6201*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*2], xm1
6202*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r6   ], m1, 1
6203*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
6204*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6205*c0909341SAndroid Build Coastguard Worker    jg .w8
6206*c0909341SAndroid Build Coastguard Worker    RET
6207*c0909341SAndroid Build Coastguard Worker.w16:
    ; 16 pixels x 2 rows per iteration: one full ymm register per row.
6208*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m4, [maskq+16*0]
6209*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, [maskq+16*1]
6210*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+dsq*0]
6211*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+ 32*0] ; a - b
6212*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+dsq*1]
6213*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+ 32*1]
6214*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
6215*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
6216*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m6 ; m * multiplier
6217*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m6
6218*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4 ; rounded high multiply, see formula above
6219*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
6220*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; + a
6221*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
6222*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
6223*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
6224*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6225*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6226*c0909341SAndroid Build Coastguard Worker    jg .w16
6227*c0909341SAndroid Build Coastguard Worker    RET
6228*c0909341SAndroid Build Coastguard Worker.w32:
    ; 32 pixels x 1 row per iteration: two ymm registers cover the row.
6229*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m4, [maskq+16*0]
6230*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, [maskq+16*1]
6231*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+32*0]
6232*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+32*0] ; a - b
6233*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+32*1]
6234*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+32*1]
6235*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
6236*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
6237*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m6 ; m * multiplier
6238*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m6
6239*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4 ; rounded high multiply, see formula above
6240*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
6241*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; + a
6242*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
6243*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6244*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
6245*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
6246*c0909341SAndroid Build Coastguard Worker    dec                  hd
6247*c0909341SAndroid Build Coastguard Worker    jg .w32
6248*c0909341SAndroid Build Coastguard Worker    RET
6249*c0909341SAndroid Build Coastguard Worker
; blend_v_16bpc: blends tmp into dst with weights that depend only on the
; column. There is no mask argument: the weights are read from
; obmc_masks_avx2 at byte offset w*2, once per call, and every path computes
;   dst += pmulhrsw(dst - tmp, weight)
; NOTE(review): the table entries are presumably pre-scaled for pmulhrsw
; (no pmullw is applied here) - confirm against the obmc_masks_avx2 definition.
; The w2/w4 paths are assembled with xmm registers; INIT_YMM is switched on
; just before .w8 for the wider paths.
6250*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
6251*c0909341SAndroid Build Coastguard Workercglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
6252*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_avx2_table
6253*c0909341SAndroid Build Coastguard Worker    lea                  r5, [blend_v_avx2_table] ; r5 = jump table address, also the anchor for base-relative loads
6254*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of w = table index
6255*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
6256*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; table entries are signed 32-bit offsets relative to the table
6257*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
6258*c0909341SAndroid Build Coastguard Worker    jmp                  wq
6259*c0909341SAndroid Build Coastguard Worker.w2:
6260*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+obmc_masks_avx2+2*2] ; 2 column weights, repeated for every row
6261*c0909341SAndroid Build Coastguard Worker.w2_loop:
    ; 2 pixels x 2 rows per iteration: consumes 8 bytes of tmp.
6262*c0909341SAndroid Build Coastguard Worker    movd                 m0, [dstq+dsq*0] ; row 0 -> dword 0
6263*c0909341SAndroid Build Coastguard Worker    pinsrd               m0, [dstq+dsq*1], 1 ; row 1 -> dword 1
6264*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tmpq]
6265*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
6266*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, m1 ; dst - tmp
6267*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
6268*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
6269*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0
6270*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], m0, 1
6271*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6272*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6273*c0909341SAndroid Build Coastguard Worker    jg .w2_loop
6274*c0909341SAndroid Build Coastguard Worker    RET
6275*c0909341SAndroid Build Coastguard Worker.w4:
6276*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [base+obmc_masks_avx2+4*2] ; 4 column weights in both qwords
6277*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; 4 pixels x 2 rows per iteration: consumes 16 bytes of tmp.
6278*c0909341SAndroid Build Coastguard Worker    movq                 m0, [dstq+dsq*0] ; row 0 -> low qword
6279*c0909341SAndroid Build Coastguard Worker    movhps               m0, [dstq+dsq*1] ; row 1 -> high qword
6280*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq] ; dst - tmp
6281*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
6282*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
6283*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
6284*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
6285*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
6286*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6287*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6288*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
6289*c0909341SAndroid Build Coastguard Worker    RET
6290*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
6291*c0909341SAndroid Build Coastguard Worker.w8:
6292*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [base+obmc_masks_avx2+8*2] ; 8 column weights in both lanes
6293*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; 8 pixels x 2 rows per iteration (one row per 128-bit lane).
6294*c0909341SAndroid Build Coastguard Worker    mova                xm0, [dstq+dsq*0]
6295*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [dstq+dsq*1], 1
6296*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq] ; dst - tmp
6297*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
6298*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
6299*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
6300*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
6301*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
6302*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6303*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6304*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
6305*c0909341SAndroid Build Coastguard Worker    RET
6306*c0909341SAndroid Build Coastguard Worker.w16:
6307*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+obmc_masks_avx2+16*2] ; 16 column weights
6308*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; 16 pixels x 2 rows per iteration; both rows share the weights in m4.
6309*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+dsq*0]
6310*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+ 32*0] ; dst - tmp
6311*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+dsq*1]
6312*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+ 32*1]
6313*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
6314*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
6315*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
6316*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
6317*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
6318*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
6319*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
6320*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6321*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6322*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
6323*c0909341SAndroid Build Coastguard Worker    RET
6324*c0909341SAndroid Build Coastguard Worker.w32:
    ; This path uses m6/m7, but the cglobal line only requests 6 vector
    ; registers (third numeric argument, per the x86inc convention), so on
    ; WIN64, where xmm6/xmm7 are callee-saved, they are saved and restored by hand.
    ; NOTE(review): [rsp+8]/[rsp+24] looks like the caller-provided shadow
    ; space - confirm that the prologue leaves rsp unmodified for this function.
6325*c0909341SAndroid Build Coastguard Worker%if WIN64
6326*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+ 8], xmm6
6327*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+24], xmm7
6328*c0909341SAndroid Build Coastguard Worker%endif
6329*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+obmc_masks_avx2+32*2] ; weights for columns 0-15
6330*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+obmc_masks_avx2+32*3] ; weights for columns 16-23, in both lanes
6331*c0909341SAndroid Build Coastguard Worker.w32_loop:
    ; 2 rows per iteration. Only the first 24 pixels of each row are loaded,
    ; blended and stored (one ymm + one xmm per row); pixels 24-31 of dst are
    ; never touched. tmp rows are 32*2 bytes apart.
    ; m0/m1 = columns 0-15 of rows 0/1; m2 = columns 16-23 of row 0 (low
    ; lane) and row 1 (high lane), with the matching tmp data in m5.
6332*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+dsq*0+32*0]
6333*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0, [tmpq      +32*0]
6334*c0909341SAndroid Build Coastguard Worker    mova                xm2,     [dstq+dsq*0+32*1]
6335*c0909341SAndroid Build Coastguard Worker    mova                xm5,     [tmpq      +32*1]
6336*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+dsq*1+32*0]
6337*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1, [tmpq      +32*2]
6338*c0909341SAndroid Build Coastguard Worker    vinserti128          m2,     [dstq+dsq*1+32*1], 1
6339*c0909341SAndroid Build Coastguard Worker    vinserti128          m5,     [tmpq      +32*3], 1
6340*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
6341*c0909341SAndroid Build Coastguard Worker    psubw                m5, m2, m5 ; dst - tmp for columns 16-23
6342*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
6343*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m6
6344*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7
6345*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
6346*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
6347*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
6348*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0+32*0], m0
6349*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*1+32*0], m1
6350*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0+32*1], xm2
6351*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1+32*1], m2, 1
6352*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6353*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6354*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
6355*c0909341SAndroid Build Coastguard Worker%if WIN64
6356*c0909341SAndroid Build Coastguard Worker    movaps             xmm6, [rsp+ 8]
6357*c0909341SAndroid Build Coastguard Worker    movaps             xmm7, [rsp+24]
6358*c0909341SAndroid Build Coastguard Worker%endif
6359*c0909341SAndroid Build Coastguard Worker    RET
6360*c0909341SAndroid Build Coastguard Worker
; BLEND_H_ROW dst_off, tmp_off [, inc_tmp]
; Blends two consecutive vector registers' worth of 16-bit pixels:
;   dst[x] += pmulhrsw(dst[x] - tmp[x], m4)
;   %1 = dst offset, in units of 32 bytes, relative to dstq
;   %2 = tmp offset, in units of 32 bytes, relative to tmpq
;   %3 = optional amount (units of 32 bytes) to add to tmpq; defaults to 0,
;        which emits no add at all
; In:       m4 = weight vector (set up by the caller)
; Clobbers: m0-m3; tmpq when %3 is non-zero
; Note that tmpq is advanced after this invocation's tmp loads, so any later
; invocation's tmp_off is relative to the already-advanced pointer.
6361*c0909341SAndroid Build Coastguard Worker%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
6362*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+32*(%1+0)]
6363*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+32*(%2+0)] ; dst - tmp
6364*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+32*(%1+1)]
6365*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+32*(%2+1)]
6366*c0909341SAndroid Build Coastguard Worker%if %3
6367*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*%3
6368*c0909341SAndroid Build Coastguard Worker%endif
6369*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
6370*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
6371*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
6372*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
6373*c0909341SAndroid Build Coastguard Worker    mova   [dstq+32*(%1+0)], m0
6374*c0909341SAndroid Build Coastguard Worker    mova   [dstq+32*(%1+1)], m1
6375*c0909341SAndroid Build Coastguard Worker%endmacro
6376*c0909341SAndroid Build Coastguard Worker
; blend_h_16bpc: blends tmp into dst with one weight per row:
;   dst += pmulhrsw(dst - tmp, weight[row])
; The weights are read from obmc_masks_avx2 starting at byte offset h*2, and
; only the first h*3/4 rows are processed.
; The 'mask' argument name is never loaded from the caller; maskq is used
; purely as a pointer into the weight table. It is biased to point just past
; the last weight used, and hq runs from -(h*3/4) up to 0, so [maskq+hq*2]
; always addresses the current row's weight and the loops end when hq
; reaches zero.
; The w2/w4 paths are assembled with xmm registers; INIT_YMM is switched on
; just before .w8 for the wider paths.
6377*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
6378*c0909341SAndroid Build Coastguard Workercglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
6379*c0909341SAndroid Build Coastguard Worker%define base r5-blend_h_avx2_table
6380*c0909341SAndroid Build Coastguard Worker    lea                  r5, [blend_h_avx2_table] ; r5 = jump table address, also the anchor for base-relative loads
6381*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of w = table index
6382*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
6383*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; table entries are signed 32-bit offsets relative to the table
6384*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
6385*c0909341SAndroid Build Coastguard Worker    lea               maskq, [base+obmc_masks_avx2+hq*2] ; first weight for this block height
6386*c0909341SAndroid Build Coastguard Worker    lea                  hd, [hq*3]
6387*c0909341SAndroid Build Coastguard Worker    shr                  hd, 2 ; h * 3/4
6388*c0909341SAndroid Build Coastguard Worker    lea               maskq, [maskq+hq*2] ; bias maskq past the last weight used
6389*c0909341SAndroid Build Coastguard Worker    neg                  hq ; hq = -(rows to process), counts up to 0
6390*c0909341SAndroid Build Coastguard Worker    jmp                  wq
6391*c0909341SAndroid Build Coastguard Worker.w2:
    ; 2 pixels x 2 rows per iteration: consumes 8 bytes of tmp.
6392*c0909341SAndroid Build Coastguard Worker    movd                 m0, [dstq+dsq*0] ; row 0 -> dword 0
6393*c0909341SAndroid Build Coastguard Worker    pinsrd               m0, [dstq+dsq*1], 1 ; row 1 -> dword 1
6394*c0909341SAndroid Build Coastguard Worker    movd                 m2, [maskq+hq*2] ; weights of rows 0 and 1 (two words)
6395*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tmpq]
6396*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
6397*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m2 ; words: w0 w0 w1 w1 -> one weight pair per row
6398*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, m1 ; dst - tmp
6399*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
6400*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
6401*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0
6402*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], m0, 1
6403*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6404*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
6405*c0909341SAndroid Build Coastguard Worker    jl .w2
6406*c0909341SAndroid Build Coastguard Worker    RET
6407*c0909341SAndroid Build Coastguard Worker.w4:
    ; NOTE(review): blend_shuf is defined outside this view; it presumably
    ; replicates weight word 0 across the low qword (row 0) and weight word 1
    ; across the high qword (row 1) - confirm against its definition.
6408*c0909341SAndroid Build Coastguard Worker    mova                 m3, [blend_shuf]
6409*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; 4 pixels x 2 rows per iteration: consumes 16 bytes of tmp.
6410*c0909341SAndroid Build Coastguard Worker    movq                 m0, [dstq+dsq*0] ; row 0 -> low qword
6411*c0909341SAndroid Build Coastguard Worker    movhps               m0, [dstq+dsq*1] ; row 1 -> high qword
6412*c0909341SAndroid Build Coastguard Worker    movd                 m2, [maskq+hq*2] ; weights of rows 0 and 1 (two words)
6413*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq] ; dst - tmp
6414*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
6415*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3 ; spread the two weights over their rows
6416*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
6417*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
6418*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
6419*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
6420*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6421*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
6422*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
6423*c0909341SAndroid Build Coastguard Worker    RET
6424*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
6425*c0909341SAndroid Build Coastguard Worker.w8:
6426*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [blend_shuf]
6427*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m3, 0x0c ; low lane <- low qword of blend_shuf twice, high lane <- its high qword twice
6428*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; 8 pixels x 2 rows per iteration (one row per 128-bit lane).
6429*c0909341SAndroid Build Coastguard Worker    mova                xm0, [dstq+dsq*0]
6430*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [dstq+dsq*1], 1
6431*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [maskq+hq*2] ; both rows' weights in every dword
6432*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmpq] ; dst - tmp
6433*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
6434*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3 ; per-lane select; presumably row 0's weight in the low lane, row 1's in the high lane
6435*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
6436*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
6437*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
6438*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
6439*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6440*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
6441*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
6442*c0909341SAndroid Build Coastguard Worker    RET
6443*c0909341SAndroid Build Coastguard Worker.w16:
    ; 16 pixels x 2 rows per iteration; each row gets its own broadcast weight.
6444*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [maskq+hq*2] ; weight of row 0
6445*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, [maskq+hq*2+2] ; weight of row 1
6446*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [dstq+dsq*0]
6447*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+ 32*0] ; dst - tmp
6448*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [dstq+dsq*1]
6449*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+ 32*1]
6450*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
6451*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
6452*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
6453*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
6454*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
6455*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
6456*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
6457*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6458*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
6459*c0909341SAndroid Build Coastguard Worker    jl .w16
6460*c0909341SAndroid Build Coastguard Worker    RET
6461*c0909341SAndroid Build Coastguard Worker.w32:
    ; 1 row per iteration from here on; m4 = this row's weight, as required
    ; by BLEND_H_ROW. tmpq advances by 32*2 bytes per row.
6462*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [maskq+hq*2]
6463*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0, 0, 2
6464*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
6465*c0909341SAndroid Build Coastguard Worker    inc                  hq
6466*c0909341SAndroid Build Coastguard Worker    jl .w32
6467*c0909341SAndroid Build Coastguard Worker    RET
6468*c0909341SAndroid Build Coastguard Worker.w64:
    ; Two BLEND_H_ROW invocations cover the row; the second one advances tmpq
    ; by the whole row (32*4 bytes).
6469*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [maskq+hq*2]
6470*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0, 0
6471*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           2, 2, 4
6472*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
6473*c0909341SAndroid Build Coastguard Worker    inc                  hq
6474*c0909341SAndroid Build Coastguard Worker    jl .w64
6475*c0909341SAndroid Build Coastguard Worker    RET
6476*c0909341SAndroid Build Coastguard Worker.w128:
    ; Four BLEND_H_ROW invocations cover the row. The second one already
    ; advances tmpq by the whole row (32*8 bytes), so the last two address tmp
    ; with negative offsets: -4 and -2 refer to row offsets 4 and 6.
6477*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [maskq+hq*2]
6478*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0,  0
6479*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           2,  2, 8
6480*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           4, -4
6481*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           6, -2
6482*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
6483*c0909341SAndroid Build Coastguard Worker    inc                  hq
6484*c0909341SAndroid Build Coastguard Worker    jl .w128
6485*c0909341SAndroid Build Coastguard Worker    RET
6486*c0909341SAndroid Build Coastguard Worker
; emu_edge_16bpc(bw, bh, iw, ih, x, y, dst, dstride, src, sstride)
;
; Builds a bw x bh block of 16-bit pixels at dst from the iw x ih picture at
; src. (x, y) is the picture position of the block's top-left corner and may
; lie outside the picture; every block pixel that falls outside the picture is
; filled by replicating the nearest picture edge pixel.
;
; Pixel indices are scaled by 2 when forming addresses, while dstride and
; sstride are added to pointers unscaled (i.e. they are byte strides).
;
; Outline:
;   1. clamp the source pointer to picture position (x, y)
;   2. compute top/bottom/left/right extension sizes, each clipped to
;      [0, dimension - 1], and the size of the directly-copied centre
;   3. for each centre row: replicate the left pixel, copy the centre,
;      replicate the right pixel (one of four v_loop specialisations)
;   4. replicate the last centre row downwards (bottom extension)
;   5. replicate the first centre row upwards (top extension)
;
; Register notes:
;   r12 holds 0 during setup (used as the lower clamp bound) and is reused as
;   a destination pointer inside v_loop.
;   r3 / r2 (ih / iw) are overwritten with bh-1 / bw-1 once ih / iw are dead.
;   All row/column loops are bottom-tested and step 16 pixels at a time, so
;   they may write past the requested width (see the assumption below).
;   Stores that use mova need an aligned destination address.
;   NOTE(review): dst rows are presumably 32-byte aligned by the caller - confirm.
6487*c0909341SAndroid Build Coastguard Workercglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
6488*c0909341SAndroid Build Coastguard Worker                                   bottomext, rightext
6489*c0909341SAndroid Build Coastguard Worker    ; we assume that the buffer (stride) is larger than width, so we can
6490*c0909341SAndroid Build Coastguard Worker    ; safely overwrite by a few bytes
6491*c0909341SAndroid Build Coastguard Worker
6492*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    ; r12 = 0 is the lower clamp bound for all iclip() computations below.
    ; The clamps test the sign flag of a subtraction rather than doing a full
    ; signed compare, so they rely on the operands not overflowing.
6493*c0909341SAndroid Build Coastguard Worker    xor                r12d, r12d
6494*c0909341SAndroid Build Coastguard Worker    lea                 r10, [ihq-1]
6495*c0909341SAndroid Build Coastguard Worker    cmp                  yq, ihq
6496*c0909341SAndroid Build Coastguard Worker    cmovs               r10, yq
6497*c0909341SAndroid Build Coastguard Worker    test                 yq, yq
6498*c0909341SAndroid Build Coastguard Worker    cmovs               r10, r12
6499*c0909341SAndroid Build Coastguard Worker    imul                r10, sstrideq
6500*c0909341SAndroid Build Coastguard Worker    add                srcq, r10
6501*c0909341SAndroid Build Coastguard Worker
6502*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(x, 0, iw - 1)
6503*c0909341SAndroid Build Coastguard Worker    lea                 r10, [iwq-1]
6504*c0909341SAndroid Build Coastguard Worker    cmp                  xq, iwq
6505*c0909341SAndroid Build Coastguard Worker    cmovs               r10, xq
6506*c0909341SAndroid Build Coastguard Worker    test                 xq, xq
6507*c0909341SAndroid Build Coastguard Worker    cmovs               r10, r12
6508*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+r10*2]
6509*c0909341SAndroid Build Coastguard Worker
6510*c0909341SAndroid Build Coastguard Worker    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    ; lea does not touch flags, so the cmovs below still sees the sign of the
    ; preceding sub. r3 (ih) is dead after that sub and becomes bh - 1.
6511*c0909341SAndroid Build Coastguard Worker    lea          bottomextq, [yq+bhq]
6512*c0909341SAndroid Build Coastguard Worker    sub          bottomextq, ihq
6513*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bhq-1]
6514*c0909341SAndroid Build Coastguard Worker    cmovs        bottomextq, r12
6515*c0909341SAndroid Build Coastguard Worker
    ; y is no longer needed as such: its register is renamed to topext and
    ; negated in place.
6516*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
6517*c0909341SAndroid Build Coastguard Worker                bottomext, rightext
6518*c0909341SAndroid Build Coastguard Worker
6519*c0909341SAndroid Build Coastguard Worker    ; top_ext = iclip(-y, 0, bh - 1)
6520*c0909341SAndroid Build Coastguard Worker    neg             topextq
6521*c0909341SAndroid Build Coastguard Worker    cmovs           topextq, r12
6522*c0909341SAndroid Build Coastguard Worker    cmp          bottomextq, bhq
6523*c0909341SAndroid Build Coastguard Worker    cmovns       bottomextq, r3
    ; Clamp when top_ext >= bh (cmovns), same as the other three extensions.
    ; Clamping only when top_ext > bh would leave top_ext == bh, giving
    ; center_h == 0 and making the bottom-tested row loop write one row at
    ; dst + bh*dstride, outside the block.
6524*c0909341SAndroid Build Coastguard Worker    cmp             topextq, bhq
6525*c0909341SAndroid Build Coastguard Worker    cmovns          topextq, r3
6526*c0909341SAndroid Build Coastguard Worker
6527*c0909341SAndroid Build Coastguard Worker    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    ; r2 (iw) is dead after the sub and becomes bw - 1.
6528*c0909341SAndroid Build Coastguard Worker    lea           rightextq, [xq+bwq]
6529*c0909341SAndroid Build Coastguard Worker    sub           rightextq, iwq
6530*c0909341SAndroid Build Coastguard Worker    lea                  r2, [bwq-1]
6531*c0909341SAndroid Build Coastguard Worker    cmovs         rightextq, r12
6532*c0909341SAndroid Build Coastguard Worker
    ; x is no longer needed as such: its register is renamed to leftext and
    ; negated in place.
6533*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
6534*c0909341SAndroid Build Coastguard Worker                bottomext, rightext
6535*c0909341SAndroid Build Coastguard Worker
6536*c0909341SAndroid Build Coastguard Worker    ; left_ext = iclip(-x, 0, bw - 1)
6537*c0909341SAndroid Build Coastguard Worker    neg            leftextq
6538*c0909341SAndroid Build Coastguard Worker    cmovs          leftextq, r12
6539*c0909341SAndroid Build Coastguard Worker    cmp           rightextq, bwq
6540*c0909341SAndroid Build Coastguard Worker    cmovns        rightextq, r2
6541*c0909341SAndroid Build Coastguard Worker    cmp            leftextq, bwq
6542*c0909341SAndroid Build Coastguard Worker    cmovns         leftextq, r2
6543*c0909341SAndroid Build Coastguard Worker
    ; bh's register becomes centerh (updated in place), iw's register becomes
    ; centerw, and ih's register (r3) is scratch from here on.
6544*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
6545*c0909341SAndroid Build Coastguard Worker                dst, dstride, src, sstride, bottomext, rightext
6546*c0909341SAndroid Build Coastguard Worker
6547*c0909341SAndroid Build Coastguard Worker    ; center_h = bh - top_ext - bottom_ext
6548*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bottomextq+topextq]
6549*c0909341SAndroid Build Coastguard Worker    sub            centerhq, r3
6550*c0909341SAndroid Build Coastguard Worker
6551*c0909341SAndroid Build Coastguard Worker    ; blk += top_ext * PXSTRIDE(dst_stride)
    ; The advanced dst (address of the first centre row) is stored to r9m,
    ; the memory home of argument 9; it is reloaded as the source row of the
    ; top extension at .top.
6552*c0909341SAndroid Build Coastguard Worker    mov                  r2, topextq
6553*c0909341SAndroid Build Coastguard Worker    imul                 r2, dstrideq
6554*c0909341SAndroid Build Coastguard Worker    add                dstq, r2
6555*c0909341SAndroid Build Coastguard Worker    mov                 r9m, dstq
6556*c0909341SAndroid Build Coastguard Worker
6557*c0909341SAndroid Build Coastguard Worker    ; center_w = bw - left_ext - right_ext
6558*c0909341SAndroid Build Coastguard Worker    mov            centerwq, bwq
6559*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rightextq+leftextq]
6560*c0909341SAndroid Build Coastguard Worker    sub            centerwq, r3
6561*c0909341SAndroid Build Coastguard Worker
; v_loop need_left_ext, need_right_ext, suffix
; Emits the per-row loop over the center_h centre rows. %1 / %2 select at
; assembly time whether code for the left / right extension is generated;
; %3 makes the local labels of each instantiation unique.
; Per row: dst[0..left_ext) = src[0], dst[left_ext..+center_w) = src[0..),
; dst[left_ext+center_w..+right_ext) = src[center_w-1].
; Clobbers r3, r12, m0; advances dstq/srcq by one row each iteration and
; counts centerhq down.
6562*c0909341SAndroid Build Coastguard Worker%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
6563*c0909341SAndroid Build Coastguard Worker.v_loop_%3:
6564*c0909341SAndroid Build Coastguard Worker%if %1
6565*c0909341SAndroid Build Coastguard Worker    ; left extension
    ; broadcast the first source pixel of the row and store it in 16-pixel
    ; steps until left_ext pixels are covered
6566*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
6567*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [srcq]
6568*c0909341SAndroid Build Coastguard Worker.left_loop_%3:
6569*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r3*2], m0
6570*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
6571*c0909341SAndroid Build Coastguard Worker    cmp                  r3, leftextq
6572*c0909341SAndroid Build Coastguard Worker    jl .left_loop_%3
6573*c0909341SAndroid Build Coastguard Worker
6574*c0909341SAndroid Build Coastguard Worker    ; body
    ; r12 = start of the centre part of this row; it overwrites any excess
    ; pixels the left loop stored beyond left_ext
6575*c0909341SAndroid Build Coastguard Worker    lea                 r12, [dstq+leftextq*2]
6576*c0909341SAndroid Build Coastguard Worker%endif
6577*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
6578*c0909341SAndroid Build Coastguard Worker.body_loop_%3:
6579*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r3*2]
6580*c0909341SAndroid Build Coastguard Worker%if %1
6581*c0909341SAndroid Build Coastguard Worker    movu         [r12+r3*2], m0
6582*c0909341SAndroid Build Coastguard Worker%else
6583*c0909341SAndroid Build Coastguard Worker    movu        [dstq+r3*2], m0
6584*c0909341SAndroid Build Coastguard Worker%endif
6585*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
6586*c0909341SAndroid Build Coastguard Worker    cmp                  r3, centerwq
6587*c0909341SAndroid Build Coastguard Worker    jl .body_loop_%3
6588*c0909341SAndroid Build Coastguard Worker
6589*c0909341SAndroid Build Coastguard Worker%if %2
6590*c0909341SAndroid Build Coastguard Worker    ; right extension
    ; r12 = first pixel after the centre part; fill with the last centre
    ; source pixel (src[center_w - 1])
6591*c0909341SAndroid Build Coastguard Worker%if %1
6592*c0909341SAndroid Build Coastguard Worker    lea                 r12, [r12+centerwq*2]
6593*c0909341SAndroid Build Coastguard Worker%else
6594*c0909341SAndroid Build Coastguard Worker    lea                 r12, [dstq+centerwq*2]
6595*c0909341SAndroid Build Coastguard Worker%endif
6596*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
6597*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [srcq+centerwq*2-2]
6598*c0909341SAndroid Build Coastguard Worker.right_loop_%3:
6599*c0909341SAndroid Build Coastguard Worker    movu         [r12+r3*2], m0
6600*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
6601*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rightextq
6602*c0909341SAndroid Build Coastguard Worker    jl .right_loop_%3
6603*c0909341SAndroid Build Coastguard Worker
6604*c0909341SAndroid Build Coastguard Worker%endif
6605*c0909341SAndroid Build Coastguard Worker    add                dstq, dstrideq
6606*c0909341SAndroid Build Coastguard Worker    add                srcq, sstrideq
6607*c0909341SAndroid Build Coastguard Worker    dec            centerhq
6608*c0909341SAndroid Build Coastguard Worker    jg .v_loop_%3
6609*c0909341SAndroid Build Coastguard Worker%endmacro
6610*c0909341SAndroid Build Coastguard Worker
    ; Dispatch to the v_loop variant matching which horizontal extensions are
    ; non-zero, so the row loop contains no per-row branches on them.
6611*c0909341SAndroid Build Coastguard Worker    test           leftextq, leftextq
6612*c0909341SAndroid Build Coastguard Worker    jnz .need_left_ext
6613*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
6614*c0909341SAndroid Build Coastguard Worker    jnz .need_right_ext
6615*c0909341SAndroid Build Coastguard Worker    v_loop                0, 0, 0
6616*c0909341SAndroid Build Coastguard Worker    jmp .body_done
6617*c0909341SAndroid Build Coastguard Worker
6618*c0909341SAndroid Build Coastguard Worker.need_left_ext:
6619*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
6620*c0909341SAndroid Build Coastguard Worker    jnz .need_left_right_ext
6621*c0909341SAndroid Build Coastguard Worker    v_loop                1, 0, 1
6622*c0909341SAndroid Build Coastguard Worker    jmp .body_done
6623*c0909341SAndroid Build Coastguard Worker
6624*c0909341SAndroid Build Coastguard Worker.need_left_right_ext:
6625*c0909341SAndroid Build Coastguard Worker    v_loop                1, 1, 2
6626*c0909341SAndroid Build Coastguard Worker    jmp .body_done
6627*c0909341SAndroid Build Coastguard Worker
6628*c0909341SAndroid Build Coastguard Worker.need_right_ext:
6629*c0909341SAndroid Build Coastguard Worker    v_loop                0, 1, 3
6630*c0909341SAndroid Build Coastguard Worker
6631*c0909341SAndroid Build Coastguard Worker.body_done:
6632*c0909341SAndroid Build Coastguard Worker    ; bottom edge extension
    ; dstq now points at the row after the last centre row; srcq is set to
    ; that last centre row. Work column-strip-wise: load 16 pixels of the
    ; source row once, then store them to bottom_ext consecutive rows.
    ; r1 (centerh, now dead) is the pixel column, r4 (leftext, now dead) the
    ; remaining row count, r3 the running store address.
6633*c0909341SAndroid Build Coastguard Worker    test         bottomextq, bottomextq
6634*c0909341SAndroid Build Coastguard Worker    jz .top
6635*c0909341SAndroid Build Coastguard Worker    mov                srcq, dstq
6636*c0909341SAndroid Build Coastguard Worker    sub                srcq, dstrideq
6637*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
6638*c0909341SAndroid Build Coastguard Worker.bottom_x_loop:
6639*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1*2]
6640*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1*2]
6641*c0909341SAndroid Build Coastguard Worker    mov                  r4, bottomextq
6642*c0909341SAndroid Build Coastguard Worker.bottom_y_loop:
6643*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
6644*c0909341SAndroid Build Coastguard Worker    add                  r3, dstrideq
6645*c0909341SAndroid Build Coastguard Worker    dec                  r4
6646*c0909341SAndroid Build Coastguard Worker    jg .bottom_y_loop
6647*c0909341SAndroid Build Coastguard Worker    add                  r1, 16
6648*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
6649*c0909341SAndroid Build Coastguard Worker    jl .bottom_x_loop
6650*c0909341SAndroid Build Coastguard Worker
6651*c0909341SAndroid Build Coastguard Worker.top:
6652*c0909341SAndroid Build Coastguard Worker    ; top edge extension
    ; source row = first centre row (address saved in r9m above);
    ; destination = the original dst argument, reloaded from its memory home.
    ; Same column-strip scheme as the bottom extension, for top_ext rows.
6653*c0909341SAndroid Build Coastguard Worker    test            topextq, topextq
6654*c0909341SAndroid Build Coastguard Worker    jz .end
6655*c0909341SAndroid Build Coastguard Worker    mov                srcq, r9m
6656*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
6657*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
6658*c0909341SAndroid Build Coastguard Worker.top_x_loop:
6659*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1*2]
6660*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1*2]
6661*c0909341SAndroid Build Coastguard Worker    mov                  r4, topextq
6662*c0909341SAndroid Build Coastguard Worker.top_y_loop:
6663*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
6664*c0909341SAndroid Build Coastguard Worker    add                  r3, dstrideq
6665*c0909341SAndroid Build Coastguard Worker    dec                  r4
6666*c0909341SAndroid Build Coastguard Worker    jg .top_y_loop
6667*c0909341SAndroid Build Coastguard Worker    add                  r1, 16
6668*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
6669*c0909341SAndroid Build Coastguard Worker    jl .top_x_loop
6670*c0909341SAndroid Build Coastguard Worker
6671*c0909341SAndroid Build Coastguard Worker.end:
6672*c0909341SAndroid Build Coastguard Worker    RET
6673*c0909341SAndroid Build Coastguard Worker
; resize_16bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0, pxmax)
;
; Horizontally resamples h rows of 16-bit pixels from src into dst, producing
; 8 output pixels per inner-loop iteration with an 8-tap filter.
;
; The source position of each output pixel is a fixed-point value with 14
; fractional bits (the integer source column is obtained with a shift by 14):
; it starts at mx0 and advances by dx per output pixel. Bits 8..13 of the
; position select one of 64 entries in resize_filter (8 bytes per entry).
; Positions are clipped to [0, src_w - 8]; for clipped lanes the loaded
; pixels are rearranged with shuffle masks from resize_shuf to emulate the
; picture edge. Results are clamped to [0, pxmax].
;
; Long-lived vector registers:
;   m3 = [pd_64] (minuend of the final subtraction before the >> 7)
;   m4 = positions of the 8 pixels currently being produced
;   m5 = dx*8 (position step per iteration)   m6 = (src_w - 8) << 14
;   m7 = pxmax (words, low 128 bits)          m8 = positions at row start
; r8-r11 are scratch indices; x counts output pixels within the row.
; The store uses mova with a 16-byte operand, so dst + x*2 must be aligned.
; NOTE(review): presumably dst rows are at least 16-byte aligned - confirm.
6674*c0909341SAndroid Build Coastguard Workercglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
6675*c0909341SAndroid Build Coastguard Worker                                 dst_w, h, src_w, dx, mx0, pxmax
    ; Adjust the in-memory argument copies before broadcasting them:
    ; mx0 -= 4 source pixels (4 << 14 in position units), src_w -= 8.
6676*c0909341SAndroid Build Coastguard Worker    sub          dword mx0m, 4<<14
6677*c0909341SAndroid Build Coastguard Worker    sub        dword src_wm, 8
6678*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, dxm
6679*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, mx0m
6680*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, src_wm
    ; src_w's register is reused as the column counter x; the dx and mx0
    ; register names are dropped (those values were read from memory above).
6681*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    ; r7 = address of the section start; constants are addressed relative to
    ; it through the `base` define.
6682*c0909341SAndroid Build Coastguard Worker    LEA                  r7, $$
6683*c0909341SAndroid Build Coastguard Worker%define base r7-$$
6684*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+pd_64]
6685*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm7, pxmaxm
6686*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
6687*c0909341SAndroid Build Coastguard Worker    pslld                m5, 3                      ; dx*8
    ; m6 = (src_w - 8) in position units: upper clip bound for mx
6688*c0909341SAndroid Build Coastguard Worker    pslld                m6, 14
6689*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2                     ; mx+[0..7]*dx
6690*c0909341SAndroid Build Coastguard Worker.loop_y:
6691*c0909341SAndroid Build Coastguard Worker    xor                  xd, xd
6692*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8             ; per-line working version of mx
6693*c0909341SAndroid Build Coastguard Worker.loop_x:
    ; m10 is only a temporary for the 6-bit filter-index mask here; it is
    ; overwritten with source pixels further down.
6694*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pd_63]
6695*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
6696*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m4, m2
6697*c0909341SAndroid Build Coastguard Worker    psrad                m9, m4, 8          ; filter offset (unmasked)
6698*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m6             ; iclip(mx, 0, src_w-8)
6699*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m0         ; pshufb offset
6700*c0909341SAndroid Build Coastguard Worker    psrad                m0, 14             ; clipped src_x offset
6701*c0909341SAndroid Build Coastguard Worker    psrad                m1, 14             ; pshufb edge_emu offset
6702*c0909341SAndroid Build Coastguard Worker    pand                 m9, m10            ; filter offset (masked)
6703*c0909341SAndroid Build Coastguard Worker    ; load source pixels
    ; Each output pixel needs 8 consecutive source pixels (16 bytes) starting
    ; at its clipped column. Pixels 0-3 go to the low halves of m10-m13,
    ; pixels 4-7 to the high halves, so m10 = {px0 | px4}, m11 = {px1 | px5},
    ; m12 = {px2 | px6}, m13 = {px3 | px7}.
6704*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm0
6705*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm0, 1
6706*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm0, 2
6707*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm0, 3
6708*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
6709*c0909341SAndroid Build Coastguard Worker    movu               xm10, [srcq+r8*2]
6710*c0909341SAndroid Build Coastguard Worker    movu               xm11, [srcq+r9*2]
6711*c0909341SAndroid Build Coastguard Worker    movu               xm12, [srcq+r10*2]
6712*c0909341SAndroid Build Coastguard Worker    movu               xm13, [srcq+r11*2]
6713*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm0
6714*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm0, 1
6715*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm0, 2
6716*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm0, 3
6717*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [srcq+r8*2], 1
6718*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [srcq+r9*2], 1
6719*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [srcq+r10*2], 1
6720*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [srcq+r11*2], 1
    ; m1 is all-zero when no lane was clipped: skip the edge emulation.
6721*c0909341SAndroid Build Coastguard Worker    ptest                m1, m1
6722*c0909341SAndroid Build Coastguard Worker    jz .filter
    ; Edge emulation: the per-lane offsets in m1 are signed, so they are
    ; sign-extended to 64 bits (movsxd for the even lanes, arithmetic shift of
    ; the packed pair for the odd lanes) before indexing resize_shuf, whose
    ; entries are addressed relative to resize_shuf+8 in 2-byte steps.
    ; m14, m15, m0, m2 hold the shuffle masks for m10, m11, m12, m13.
6723*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm1
6724*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
6725*c0909341SAndroid Build Coastguard Worker    movsxd               r8, r9d
6726*c0909341SAndroid Build Coastguard Worker    sar                  r9, 32
6727*c0909341SAndroid Build Coastguard Worker    movsxd              r10, r11d
6728*c0909341SAndroid Build Coastguard Worker    sar                 r11, 32
6729*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
6730*c0909341SAndroid Build Coastguard Worker    movu               xm14, [base+resize_shuf+8+r8*2]
6731*c0909341SAndroid Build Coastguard Worker    movu               xm15, [base+resize_shuf+8+r9*2]
6732*c0909341SAndroid Build Coastguard Worker    movu                xm0, [base+resize_shuf+8+r10*2]
6733*c0909341SAndroid Build Coastguard Worker    movu                xm2, [base+resize_shuf+8+r11*2]
6734*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm1
6735*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
6736*c0909341SAndroid Build Coastguard Worker    movsxd               r8, r9d
6737*c0909341SAndroid Build Coastguard Worker    sar                  r9, 32
6738*c0909341SAndroid Build Coastguard Worker    movsxd              r10, r11d
6739*c0909341SAndroid Build Coastguard Worker    sar                 r11, 32
6740*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [base+resize_shuf+8+r8*2], 1
6741*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, [base+resize_shuf+8+r9*2], 1
6742*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [base+resize_shuf+8+r10*2], 1
6743*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [base+resize_shuf+8+r11*2], 1
6744*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14
6745*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m15
6746*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m0
6747*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m2
6748*c0909341SAndroid Build Coastguard Worker.filter:
    ; Fetch the 8-byte filter of each output pixel by its masked index in m9,
    ; using the same lane layout as the pixels: filters for pixels 0-3 in the
    ; low 8 bytes, pixels 4-7 in the high 8 bytes of xm14, xm15, xm0, xm2.
6749*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm9
6750*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm9, 1
6751*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm9, 2
6752*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm9, 3
6753*c0909341SAndroid Build Coastguard Worker    vextracti128        xm9, m9, 1
6754*c0909341SAndroid Build Coastguard Worker    movq               xm14, [base+resize_filter+r8*8]
6755*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+resize_filter+r9*8]
6756*c0909341SAndroid Build Coastguard Worker    movq                xm0, [base+resize_filter+r10*8]
6757*c0909341SAndroid Build Coastguard Worker    movq                xm2, [base+resize_filter+r11*8]
6758*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm9
6759*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm9, 1
6760*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm9, 2
6761*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm9, 3
6762*c0909341SAndroid Build Coastguard Worker    movhps             xm14, [base+resize_filter+r8*8]
6763*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+resize_filter+r9*8]
6764*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [base+resize_filter+r10*8]
6765*c0909341SAndroid Build Coastguard Worker    movhps              xm2, [base+resize_filter+r11*8]
    ; sign-extend the byte taps to words: low 8 bytes -> low 128-bit half,
    ; high 8 bytes -> high half, matching the pixel registers
6766*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m14, xm14
6767*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m15, xm15
6768*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m0, xm0
6769*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m2, xm2
    ; multiply pixels by taps (pairwise dword sums), then reduce with three
    ; horizontal adds so that m10 = one 32-bit sum per output pixel:
    ; pixels 0-3 in the low half, pixels 4-7 in the high half
6770*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m14
6771*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m15
6772*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m0
6773*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m2
6774*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m11
6775*c0909341SAndroid Build Coastguard Worker    phaddd              m12, m13
6776*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m12
    ; result = (m3 - sum) >> 7, with m3 = [pd_64]
    ; NOTE(review): subtracting the sum suggests resize_filter stores negated
    ; taps and that pd_64 is the rounding term - confirm against the tables.
6777*c0909341SAndroid Build Coastguard Worker    psubd               m10, m3, m10
6778*c0909341SAndroid Build Coastguard Worker    psrad               m10, 7
    ; pack the 8 dwords to unsigned-saturated words (clamps negatives to 0),
    ; then clamp to pxmax and store 8 pixels
6779*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m10, 1
6780*c0909341SAndroid Build Coastguard Worker    packusdw           xm10, xm0
6781*c0909341SAndroid Build Coastguard Worker    pminsw             xm10, xm7
6782*c0909341SAndroid Build Coastguard Worker    mova        [dstq+xq*2], xm10
    ; advance all 8 positions by dx*8 and move on to the next 8 pixels
6783*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
6784*c0909341SAndroid Build Coastguard Worker    add                  xd, 8
6785*c0909341SAndroid Build Coastguard Worker    cmp                  xd, dst_wd
6786*c0909341SAndroid Build Coastguard Worker    jl .loop_x
6787*c0909341SAndroid Build Coastguard Worker    add                dstq, dst_strideq
6788*c0909341SAndroid Build Coastguard Worker    add                srcq, src_strideq
6789*c0909341SAndroid Build Coastguard Worker    dec                  hd
6790*c0909341SAndroid Build Coastguard Worker    jg .loop_y
6791*c0909341SAndroid Build Coastguard Worker    RET
6792*c0909341SAndroid Build Coastguard Worker
6793*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
6794