xref: /aosp_15_r20/external/libdav1d/src/x86/mc_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2018-2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2018-2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 32
32*c0909341SAndroid Build Coastguard Worker
; Blend-weight table. After the four leading zero bytes, every entry is a
; byte pair whose two values sum to 64. The "; N" markers below label the
; group belonging to block size N; with this layout the group for size N
; starts at byte offset 2*N from obmc_masks and holds N pairs.
33*c0909341SAndroid Build Coastguard Worker; dav1d_obmc_masks[] with 64-x interleaved
34*c0909341SAndroid Build Coastguard Workerobmc_masks:     db  0,  0,  0,  0
35*c0909341SAndroid Build Coastguard Worker                ; 2
36*c0909341SAndroid Build Coastguard Worker                db 45, 19, 64,  0
37*c0909341SAndroid Build Coastguard Worker                ; 4
38*c0909341SAndroid Build Coastguard Worker                db 39, 25, 50, 14, 59,  5, 64,  0
39*c0909341SAndroid Build Coastguard Worker                ; 8
40*c0909341SAndroid Build Coastguard Worker                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
41*c0909341SAndroid Build Coastguard Worker                ; 16
42*c0909341SAndroid Build Coastguard Worker                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
43*c0909341SAndroid Build Coastguard Worker                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
44*c0909341SAndroid Build Coastguard Worker                ; 32
45*c0909341SAndroid Build Coastguard Worker                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
46*c0909341SAndroid Build Coastguard Worker                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
47*c0909341SAndroid Build Coastguard Worker                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
48*c0909341SAndroid Build Coastguard Worker                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
49*c0909341SAndroid Build Coastguard Worker
; Byte-index tables, 16 bytes per db row (warp_8x8_shufA/B and subpel_h_shuf4
; span two rows, i.e. 32 bytes). bilin_h_shuf4 is loaded in put_bilin's .h_w4
; path and used as a pshufb control: it pairs each byte x with x+1 within the
; low 8 bytes (row 0) and again within the high 8 bytes (row 1).
; NOTE(review): the other tables are presumably pshufb controls for routines
; outside this excerpt - confirm at their use sites.
; rescale_mul is different: it stores the dwords 0..7, not byte indices.
50*c0909341SAndroid Build Coastguard Workerwarp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
51*c0909341SAndroid Build Coastguard Worker                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
52*c0909341SAndroid Build Coastguard Workerwarp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
53*c0909341SAndroid Build Coastguard Worker                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
54*c0909341SAndroid Build Coastguard Workersubpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
55*c0909341SAndroid Build Coastguard Worker                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
56*c0909341SAndroid Build Coastguard Workersubpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
57*c0909341SAndroid Build Coastguard Workersubpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
58*c0909341SAndroid Build Coastguard Workersubpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
59*c0909341SAndroid Build Coastguard Workersubpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
60*c0909341SAndroid Build Coastguard Workersubpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
61*c0909341SAndroid Build Coastguard Workersubpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
62*c0909341SAndroid Build Coastguard Workerbilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
63*c0909341SAndroid Build Coastguard Workerbilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
64*c0909341SAndroid Build Coastguard Workerdeint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
65*c0909341SAndroid Build Coastguard Workerblend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
66*c0909341SAndroid Build Coastguard Workerpb_8x0_8x8:     db  0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8,  8,  8,  8,  8,  8
67*c0909341SAndroid Build Coastguard Workerbdct_lb_dw:     db  0,  0,  0,  0,  4,  4,  4,  4,  8,  8,  8,  8, 12, 12, 12, 12
68*c0909341SAndroid Build Coastguard Workerwswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
69*c0909341SAndroid Build Coastguard Workerresize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
70*c0909341SAndroid Build Coastguard Workerrescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7
71*c0909341SAndroid Build Coastguard Worker
72*c0909341SAndroid Build Coastguard Workerwm_420_sign:    dd 0x01020102, 0x01010101
73*c0909341SAndroid Build Coastguard Workerwm_422_sign:    dd 0x80808080, 0x7f7f7f7f
74*c0909341SAndroid Build Coastguard Worker
; Small constants. The name prefix mirrors the element width used in the
; definition: pb_* = bytes, pw_* = 16-bit words, pd_* = one dword, pq_* = one
; qword. The pb_*/pw_* entries are repeated to fill exactly 4 bytes, so they
; can be fetched with a dword broadcast (pw_2048 is loaded via vpbroadcastd in
; put_bilin's horizontal path). pw_m256 holds -256.
75*c0909341SAndroid Build Coastguard Workerpb_64:   times 4 db 64
76*c0909341SAndroid Build Coastguard Workerpw_m256: times 2 dw -256
77*c0909341SAndroid Build Coastguard Workerpw_15:   times 2 dw 15
78*c0909341SAndroid Build Coastguard Workerpw_32:   times 2 dw 32
79*c0909341SAndroid Build Coastguard Workerpw_34:   times 2 dw 34
80*c0909341SAndroid Build Coastguard Workerpw_258:  times 2 dw 258
81*c0909341SAndroid Build Coastguard Workerpw_512:  times 2 dw 512
82*c0909341SAndroid Build Coastguard Workerpw_1024: times 2 dw 1024
83*c0909341SAndroid Build Coastguard Workerpw_2048: times 2 dw 2048
84*c0909341SAndroid Build Coastguard Workerpw_6903: times 2 dw 6903
85*c0909341SAndroid Build Coastguard Workerpw_8192: times 2 dw 8192
86*c0909341SAndroid Build Coastguard Workerpd_32:           dd 32
87*c0909341SAndroid Build Coastguard Workerpd_63:           dd 63
88*c0909341SAndroid Build Coastguard Workerpd_512:          dd 512
89*c0909341SAndroid Build Coastguard Workerpd_32768:        dd 32768
90*c0909341SAndroid Build Coastguard Workerpd_0x3ff:        dd 0x3ff
91*c0909341SAndroid Build Coastguard Workerpd_0x4000:       dd 0x4000
92*c0909341SAndroid Build Coastguard Workerpq_0x40000000:   dq 0x40000000
93*c0909341SAndroid Build Coastguard Worker
; Tables defined outside this file and imported through x86inc's cextern.
; z_filter_s is read at offset +2 by put_bilin's .h path as a shuffle control.
94*c0909341SAndroid Build Coastguard Workercextern mc_subpel_filters
95*c0909341SAndroid Build Coastguard Workercextern mc_warp_filter2
96*c0909341SAndroid Build Coastguard Workercextern resize_filter
97*c0909341SAndroid Build Coastguard Workercextern z_filter_s
98*c0909341SAndroid Build Coastguard Worker
; subpel_filters = address of the imported mc_subpel_filters table minus 8.
; NOTE(review): the -8 bias presumably compensates for 1-based filter indices
; with 8-byte entries - confirm against the code that indexes it.
99*c0909341SAndroid Build Coastguard Worker%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
100*c0909341SAndroid Build Coastguard Worker
; BASE_JMP_TABLE name, suffix, w0, w1, ...
;
; Emits a table of 16-bit offsets, one per listed width, and defines
; <name>_<suffix>_table as the table address minus w0 bytes.
;   entry = (<base>_w<width>) - <base>, where <base> is whatever
;           <name>_<suffix> is %xdefine'd to at the point of invocation
;           (e.g. put_avx2, which names the .put label of put_bilin).
; The bias lets callers index with 2*tzcnt(width), as put_bilin does via
; [r7+wq*2+table_offset(put,)]: for a first width of 2 or 4, 2*tzcnt(width)
; equals the width itself, so the first entry lands at the table start.
101*c0909341SAndroid Build Coastguard Worker%macro BASE_JMP_TABLE 3-*
102*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - %3)
103*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%2
104*c0909341SAndroid Build Coastguard Worker    %%table:
    ; one iteration per width argument; the rotate brings the next width
    ; into the third parameter slot
105*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
106*c0909341SAndroid Build Coastguard Worker        dw %%base %+ _w%3 - %%base
107*c0909341SAndroid Build Coastguard Worker        %rotate 1
108*c0909341SAndroid Build Coastguard Worker    %endrep
109*c0909341SAndroid Build Coastguard Worker%endmacro
110*c0909341SAndroid Build Coastguard Worker
; HV_JMP_TABLE op, filter, suffix, types, w0, w1, ...
;
; Emits up to three tables of 16-bit offsets for the function
; <private_prefix>_<op>_<filter>_8bpc_<suffix>:
;   types & 1 -> <op>_<filter>_h_<suffix>_table,  entries for .h_w<width>
;   types & 2 -> <op>_<filter>_v_<suffix>_table,  entries for .v_w<width>
;   types & 4 -> <op>_<filter>_hv_<suffix>_table, entries for .hv_w<width>
; Every entry is measured from <op>_<suffix> (e.g. put_avx2), not from the
; function's own start, and every table symbol is biased by -w0 bytes in the
; same way as BASE_JMP_TABLE.
; Each %rep rotates the parameter list once per width (%0 - 4 times); the
; "%rotate 4" that follows completes a full cycle, so the next section sees
; the parameters in their original positions again.
111*c0909341SAndroid Build Coastguard Worker%macro HV_JMP_TABLE 5-*
112*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
113*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%3
114*c0909341SAndroid Build Coastguard Worker    %assign %%types %4
    ; horizontal-only table
115*c0909341SAndroid Build Coastguard Worker    %if %%types & 1
116*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_h_%3_table  (%%h  - %5)
117*c0909341SAndroid Build Coastguard Worker        %%h:
118*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
119*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .h_w%5 - %%base
120*c0909341SAndroid Build Coastguard Worker            %rotate 1
121*c0909341SAndroid Build Coastguard Worker        %endrep
122*c0909341SAndroid Build Coastguard Worker        %rotate 4
123*c0909341SAndroid Build Coastguard Worker    %endif
    ; vertical-only table
124*c0909341SAndroid Build Coastguard Worker    %if %%types & 2
125*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_v_%3_table  (%%v  - %5)
126*c0909341SAndroid Build Coastguard Worker        %%v:
127*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
128*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .v_w%5 - %%base
129*c0909341SAndroid Build Coastguard Worker            %rotate 1
130*c0909341SAndroid Build Coastguard Worker        %endrep
131*c0909341SAndroid Build Coastguard Worker        %rotate 4
132*c0909341SAndroid Build Coastguard Worker    %endif
    ; combined horizontal+vertical table; it is the last section, so the
    ; parameter list is not rotated back afterwards
133*c0909341SAndroid Build Coastguard Worker    %if %%types & 4
134*c0909341SAndroid Build Coastguard Worker        %xdefine %1_%2_hv_%3_table (%%hv - %5)
135*c0909341SAndroid Build Coastguard Worker        %%hv:
136*c0909341SAndroid Build Coastguard Worker        %rep %0 - 4
137*c0909341SAndroid Build Coastguard Worker            dw %%prefix %+ .hv_w%5 - %%base
138*c0909341SAndroid Build Coastguard Worker            %rotate 1
139*c0909341SAndroid Build Coastguard Worker        %endrep
140*c0909341SAndroid Build Coastguard Worker    %endif
141*c0909341SAndroid Build Coastguard Worker%endmacro
142*c0909341SAndroid Build Coastguard Worker
; BIDIR_JMP_TABLE name, suffix, w0, w1, ...
;
; Emits a table of 32-bit offsets, one per listed width, for the function
; <private_prefix>_<name>_8bpc_<suffix>, and defines <name>_<suffix>_table as
; the table address minus 2*w0 bytes.
;   entry = (<function>.w<width>) - <name>_<suffix>_table
; Offsets are therefore relative to the biased table symbol itself, so a
; caller can add the loaded entry to the same pointer it indexed with.
; NOTE(review): the 2*w0 bias matches an index of 4*tzcnt(width) for a first
; width of 2 or 4 - confirm at the use sites, which are outside this excerpt.
;
; At least one width is required: the bias and the first entry both read the
; third parameter, so the declared minimum parameter count is 3.
143*c0909341SAndroid Build Coastguard Worker%macro BIDIR_JMP_TABLE 3-*
144*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*%3)
145*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_%2_table
146*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
147*c0909341SAndroid Build Coastguard Worker    %%table:
    ; one iteration per width argument
148*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
149*c0909341SAndroid Build Coastguard Worker        dd %%prefix %+ .w%3 - %%base
150*c0909341SAndroid Build Coastguard Worker        %rotate 1
151*c0909341SAndroid Build Coastguard Worker    %endrep
152*c0909341SAndroid Build Coastguard Worker%endmacro
153*c0909341SAndroid Build Coastguard Worker
; SCALED_JMP_TABLE name, suffix, w0, w1, ...
;
; Emits three consecutive tables of 16-bit offsets for the function
; <private_prefix>_<name>_8bpc_<suffix>, each with one entry per width:
;   <name>_<suffix>_table     -> .w<width>
;   <name>_<suffix>_dy1_table -> .dy1_w<width>
;   <name>_<suffix>_dy2_table -> .dy2_w<width>
; Entries are measured from the start of the function, and each table symbol
; is biased by -w0 bytes in the same way as BASE_JMP_TABLE.
; NOTE(review): judging by the internal label names, the dy1/dy2 variants
; presumably serve vertical steps of 1024 and 2048 - confirm in the scaled
; functions, which are outside this excerpt.
; After each %rep (one rotation per width) the "%rotate 2" completes a full
; cycle, so the next table starts with the parameters in original order.
;
; At least one width is required: the bias and the first entry both read the
; third parameter, so the declared minimum parameter count is 3.
154*c0909341SAndroid Build Coastguard Worker%macro SCALED_JMP_TABLE 3-*
155*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - %3)
156*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
157*c0909341SAndroid Build Coastguard Worker%%table:
158*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
159*c0909341SAndroid Build Coastguard Worker        dw %%base %+ .w%3 - %%base
160*c0909341SAndroid Build Coastguard Worker        %rotate 1
161*c0909341SAndroid Build Coastguard Worker    %endrep
162*c0909341SAndroid Build Coastguard Worker    %rotate 2
163*c0909341SAndroid Build Coastguard Worker%%dy_1024:
164*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
165*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
166*c0909341SAndroid Build Coastguard Worker        dw %%base %+ .dy1_w%3 - %%base
167*c0909341SAndroid Build Coastguard Worker        %rotate 1
168*c0909341SAndroid Build Coastguard Worker    %endrep
169*c0909341SAndroid Build Coastguard Worker    %rotate 2
170*c0909341SAndroid Build Coastguard Worker%%dy_2048:
171*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
172*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
173*c0909341SAndroid Build Coastguard Worker        dw %%base %+ .dy2_w%3 - %%base
174*c0909341SAndroid Build Coastguard Worker        %rotate 1
175*c0909341SAndroid Build Coastguard Worker    %endrep
176*c0909341SAndroid Build Coastguard Worker%endmacro
177*c0909341SAndroid Build Coastguard Worker
; Reference points for the BASE/HV jump tables: the .put and .prep labels
; inside the bilinear put/prep functions. put_bilin loads put_avx2 into r7
; and adds the 16-bit table entry to it to form the jump target.
178*c0909341SAndroid Build Coastguard Worker%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
179*c0909341SAndroid Build Coastguard Worker%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
180*c0909341SAndroid Build Coastguard Worker
; table_offset(type, fn): displacement from the reference label
; <type><SUFFIX> to the jump table <type><fn><SUFFIX>_table, for use inside
; an address expression that already holds the reference label in a register.
; fn may be empty, as in table_offset(put,).
; NOTE(review): SUFFIX is not defined in this file; presumably x86inc sets it
; to _avx2 under INIT_XMM/INIT_YMM avx2 - confirm.
181*c0909341SAndroid Build Coastguard Worker%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
182*c0909341SAndroid Build Coastguard Worker
; Jump-table instantiations. The trailing numeric arguments are the block
; widths that receive an entry; the first of them also determines the table
; bias. For HV_JMP_TABLE the argument before the widths is the type mask:
; 7 = h + v + hv, 3 = h + v, 1 = h only.
; blend_h repeats 32 at the end, so its last three entries all point at the
; same .w32 label.
183*c0909341SAndroid Build Coastguard WorkerBASE_JMP_TABLE   put,  avx2,            2, 4, 8, 16, 32, 64, 128
184*c0909341SAndroid Build Coastguard WorkerBASE_JMP_TABLE   prep, avx2,               4, 8, 16, 32, 64, 128
185*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE     put,  bilin, avx2,  7, 2, 4, 8, 16, 32, 64, 128
186*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE     prep, bilin, avx2,  7,    4, 8, 16, 32, 64, 128
187*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE     put,  6tap,  avx2,  3, 2, 4, 8, 16, 32, 64, 128
188*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE     put,  8tap,  avx2,  3, 2, 4, 8, 16, 32, 64, 128
189*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE     prep, 6tap,  avx2,  1,    4, 8, 16, 32, 64, 128
190*c0909341SAndroid Build Coastguard WorkerHV_JMP_TABLE     prep, 8tap,  avx2,  1,    4, 8, 16, 32, 64, 128
191*c0909341SAndroid Build Coastguard WorkerSCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
192*c0909341SAndroid Build Coastguard WorkerSCALED_JMP_TABLE prep_8tap_scaled, avx2,   4, 8, 16, 32, 64, 128
193*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  avg, avx2,                4, 8, 16, 32, 64, 128
194*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  w_avg, avx2,              4, 8, 16, 32, 64, 128
195*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  mask, avx2,               4, 8, 16, 32, 64, 128
196*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  w_mask_420, avx2,         4, 8, 16, 32, 64, 128
197*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  w_mask_422, avx2,         4, 8, 16, 32, 64, 128
198*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  w_mask_444, avx2,         4, 8, 16, 32, 64, 128
199*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  blend, avx2,              4, 8, 16, 32
200*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  blend_v, avx2,         2, 4, 8, 16, 32
201*c0909341SAndroid Build Coastguard WorkerBIDIR_JMP_TABLE  blend_h, avx2,         2, 4, 8, 16, 32, 32, 32
202*c0909341SAndroid Build Coastguard Worker
203*c0909341SAndroid Build Coastguard WorkerSECTION .text
204*c0909341SAndroid Build Coastguard Worker
205*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
206*c0909341SAndroid Build Coastguard Workercglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
207*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r6m ; mx
208*c0909341SAndroid Build Coastguard Worker    lea                  r7, [put_avx2]
209*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
210*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
211*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
212*c0909341SAndroid Build Coastguard Worker    jnz .h
213*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
214*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
215*c0909341SAndroid Build Coastguard Worker    jnz .v
216*c0909341SAndroid Build Coastguard Worker.put:
217*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put,)]
218*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
219*c0909341SAndroid Build Coastguard Worker    jmp                  wq
220*c0909341SAndroid Build Coastguard Worker.put_w2:
221*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [srcq+ssq*0]
222*c0909341SAndroid Build Coastguard Worker    movzx               r7d, word [srcq+ssq*1]
223*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
224*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
225*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7w
226*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
227*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
228*c0909341SAndroid Build Coastguard Worker    jg .put_w2
229*c0909341SAndroid Build Coastguard Worker    RET
230*c0909341SAndroid Build Coastguard Worker.put_w4:
231*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [srcq+ssq*0]
232*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [srcq+ssq*1]
233*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
234*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6d
235*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7d
236*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
237*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
238*c0909341SAndroid Build Coastguard Worker    jg .put_w4
239*c0909341SAndroid Build Coastguard Worker    RET
240*c0909341SAndroid Build Coastguard Worker.put_w8:
241*c0909341SAndroid Build Coastguard Worker    mov                  r6, [srcq+ssq*0]
242*c0909341SAndroid Build Coastguard Worker    mov                  r7, [srcq+ssq*1]
243*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
244*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6
245*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r7
246*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
247*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
248*c0909341SAndroid Build Coastguard Worker    jg .put_w8
249*c0909341SAndroid Build Coastguard Worker    RET
250*c0909341SAndroid Build Coastguard Worker.put_w16:
251*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
252*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
253*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
254*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
255*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
256*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
257*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
258*c0909341SAndroid Build Coastguard Worker    jg .put_w16
259*c0909341SAndroid Build Coastguard Worker    RET
260*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
261*c0909341SAndroid Build Coastguard Worker.put_w32:
262*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
263*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
264*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
265*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
266*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
267*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
268*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
269*c0909341SAndroid Build Coastguard Worker    jg .put_w32
270*c0909341SAndroid Build Coastguard Worker    RET
271*c0909341SAndroid Build Coastguard Worker.put_w64:
272*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+32*0]
273*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+32*1]
274*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+32*0]
275*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+32*1]
276*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
277*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+32*0], m0
278*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+32*1], m1
279*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+32*0], m2
280*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+32*1], m3
281*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
282*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
283*c0909341SAndroid Build Coastguard Worker    jg .put_w64
284*c0909341SAndroid Build Coastguard Worker    RET
285*c0909341SAndroid Build Coastguard Worker.put_w128:
286*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+32*0]
287*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+32*1]
288*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+32*2]
289*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+32*3]
290*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
291*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
292*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
293*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m2
294*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m3
295*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
296*c0909341SAndroid Build Coastguard Worker    dec                  hd
297*c0909341SAndroid Build Coastguard Worker    jg .put_w128
298*c0909341SAndroid Build Coastguard Worker    RET
299*c0909341SAndroid Build Coastguard Worker.h:
300*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
301*c0909341SAndroid Build Coastguard Worker    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
302*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255
303*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [z_filter_s+2]
304*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16
305*c0909341SAndroid Build Coastguard Worker    movd                xm5, mxyd
306*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
307*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
308*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
309*c0909341SAndroid Build Coastguard Worker    jnz .hv
310*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
311*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_2048]
312*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
313*c0909341SAndroid Build Coastguard Worker    jmp                  wq
314*c0909341SAndroid Build Coastguard Worker.h_w2:
315*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+ssq*0]
316*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [srcq+ssq*1], 1
317*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
318*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4
319*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm5
320*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3
321*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
322*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm0, 0
323*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm0, 2
324*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
325*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
326*c0909341SAndroid Build Coastguard Worker    jg .h_w2
327*c0909341SAndroid Build Coastguard Worker    RET
328*c0909341SAndroid Build Coastguard Worker.h_w4:
329*c0909341SAndroid Build Coastguard Worker    mova                xm4, [bilin_h_shuf4]
330*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
331*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
332*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*1]
333*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
334*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4
335*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm5
336*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3
337*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
338*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
339*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 1
340*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
341*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
342*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
343*c0909341SAndroid Build Coastguard Worker    RET
344*c0909341SAndroid Build Coastguard Worker.h_w8:
345*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
346*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
347*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
348*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4
349*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm4
350*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm5
351*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm5
352*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3
353*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm3
354*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
355*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
356*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
357*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
358*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
359*c0909341SAndroid Build Coastguard Worker    jg .h_w8
360*c0909341SAndroid Build Coastguard Worker    RET
361*c0909341SAndroid Build Coastguard Worker.h_w16:
362*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+8*0]
363*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+8*0], 1
364*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0+8*1]
365*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1+8*1], 1
366*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
367*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
368*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
369*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
370*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
371*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
372*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
373*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
374*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
375*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
376*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
377*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
378*c0909341SAndroid Build Coastguard Worker    jg .h_w16
379*c0909341SAndroid Build Coastguard Worker    RET
380*c0909341SAndroid Build Coastguard Worker.h_w32:
381*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0]
382*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1]
383*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
384*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
385*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
386*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
387*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
388*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
389*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
390*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
391*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
392*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
393*c0909341SAndroid Build Coastguard Worker    dec                  hd
394*c0909341SAndroid Build Coastguard Worker    jg .h_w32
395*c0909341SAndroid Build Coastguard Worker    RET
396*c0909341SAndroid Build Coastguard Worker.h_w64:
    ; Horizontal filter path for 64-byte-wide rows; one row per iteration.
    ; The row is handled as two 32-byte halves: source byte offsets 0 and 8
    ; feed the first store, offsets 32 and 40 (8*4, 8*5) feed the second.
    ; m3/m4/m5 are set up before this excerpt (see their use below).
397*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0]
398*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1]
399*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
400*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
401*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
402*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
403*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
404*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
405*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
406*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*4]
407*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*5]
408*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; all four loads of this row are done
409*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
410*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
411*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
412*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
413*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
414*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
415*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
416*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
417*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
418*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
419*c0909341SAndroid Build Coastguard Worker    dec                  hd
420*c0909341SAndroid Build Coastguard Worker    jg .h_w64
421*c0909341SAndroid Build Coastguard Worker    RET
422*c0909341SAndroid Build Coastguard Worker.h_w128:
    ; Horizontal filter path for 128-byte-wide rows.
    ; r6 is a negative column offset stepping -96, -64, -32, 0; the +32*3 in
    ; each address turns that into byte offsets 0, 32, 64, 96 within the row.
    ; The outer loop (.h_w128) re-initialises r6 for every row.
423*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*3
424*c0909341SAndroid Build Coastguard Worker.h_w128_loop:
425*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+32*3+8*0]
426*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+32*3+8*1]
427*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
428*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
429*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
430*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
431*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
432*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
433*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
434*c0909341SAndroid Build Coastguard Worker    mova     [dstq+r6+32*3], m0
435*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
436*c0909341SAndroid Build Coastguard Worker    jle .h_w128_loop ; continue while r6 <= 0, i.e. four passes per row
437*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
438*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
439*c0909341SAndroid Build Coastguard Worker    dec                  hd
440*c0909341SAndroid Build Coastguard Worker    jg .h_w128
441*c0909341SAndroid Build Coastguard Worker    RET
442*c0909341SAndroid Build Coastguard Worker.v:
    ; Vertical-only setup: builds the filter constants in m4/m5 and jumps to
    ; the width-specific .v_w* code through the _bilin_v jump table.
    ; mxyd*255 + 16 == (mxyd << 8) + (16 - mxyd), so each broadcast word of m4
    ; has (16 - mxy) in its low byte and mxy in its high byte (assuming
    ; 0 <= mxy <= 16 - TODO confirm against the caller). pmaddubsw on rows
    ; interleaved as (a, b) byte pairs then yields a*(16 - mxy) + b*mxy.
    ; m5 = pw_2048; if that is packed words of 2048 as the name suggests,
    ; pmulhrsw by it is a rounding shift right by 4.
443*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
444*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255
445*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_2048]
446*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16
447*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; table entry is an offset relative to r7
448*c0909341SAndroid Build Coastguard Worker    movd                xm4, mxyd
449*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4
450*c0909341SAndroid Build Coastguard Worker    jmp                  wq
451*c0909341SAndroid Build Coastguard Worker.v_w2:
    ; Vertical filter, 2 pixels wide, two output rows per iteration.
    ; xm0 carries the most recently read row into the next iteration, so each
    ; pass reads only two new rows. The digits in the trailing comments are
    ; row indices held in words 0 and 1 of the register.
    ; m4/m5 are the constants prepared in .v.
452*c0909341SAndroid Build Coastguard Worker    movd                xm0,      [srcq+ssq*0]
453*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
454*c0909341SAndroid Build Coastguard Worker    pinsrw              xm1, xm0, [srcq+ssq*1], 1 ; 0 1
455*c0909341SAndroid Build Coastguard Worker    lea                srcq,      [srcq+ssq*2]
456*c0909341SAndroid Build Coastguard Worker    pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
457*c0909341SAndroid Build Coastguard Worker    pshuflw             xm1, xm1, q2301           ; 1 0
458*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm0
459*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm4
460*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
461*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1
    ; Because of the swapped row order above, word 1 holds the first output
    ; row and word 0 the second.
462*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm1, 1
463*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm1, 0
464*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
465*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
466*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
467*c0909341SAndroid Build Coastguard Worker    RET
468*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Vertical filter, 4 pixels wide, two output rows per iteration.
    ; Rows are loaded as dwords; the blends build (row0, row1) in xm1 and
    ; (row1, row2) in xm2 so one punpcklbw pairs each row with the row below.
    ; xm0 keeps the last row read for the next iteration.
    ; m4/m5 are the constants prepared in .v.
469*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+ssq*0]
470*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
471*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [srcq+ssq*1]
472*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
473*c0909341SAndroid Build Coastguard Worker    vpblendd            xm1, xm2, xm0, 0x01 ; 0 1
474*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]
475*c0909341SAndroid Build Coastguard Worker    vpblendd            xm2, xm0, 0x02      ; 1 2
476*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2
477*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm4
478*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
479*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1
480*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm1 ; dword 0 = first output row
481*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm1, 1 ; dword 1 = second output row
482*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
483*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
484*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
485*c0909341SAndroid Build Coastguard Worker    RET
486*c0909341SAndroid Build Coastguard Worker.v_w8:
    ; Vertical filter, 8 pixels wide, two output rows per iteration.
    ; xm1 = row0 interleaved with row1, xm2 = row1 interleaved with row2;
    ; xm0 keeps the last row read (row2) for the next iteration.
    ; m4/m5 are the constants prepared in .v.
487*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
488*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
489*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+ssq*1]
490*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
491*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm0, xm2
492*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
493*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm2, xm0
494*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm4
495*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm4
496*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
497*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm5
498*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm2
499*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm1 ; low 8 bytes = first output row
500*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm1 ; high 8 bytes = second output row
501*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
502*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
503*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
504*c0909341SAndroid Build Coastguard Worker    RET
505*c0909341SAndroid Build Coastguard Worker.v_w16:
    ; Vertical filter, 16 pixels wide, two output rows per iteration.
    ; Each 128-bit lane handles one output row: the blends build
    ; m2 = (row0 | row1) and m3 = (row1 | row2), low lane first.
    ; m0 keeps the last row read for the next iteration; only its low lane
    ; is consumed by the 0x0f blend.
    ; m4/m5 are the constants prepared in .v.
506*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
507*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
508*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [srcq+ssq*1]
509*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
510*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, m0, 0x0f ; 0 1
511*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
512*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0xf0     ; 1 2
513*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3
514*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
515*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
516*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
517*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
518*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
519*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
520*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm1 ; low lane = first output row
521*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m1, 1 ; high lane = second output row
522*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
523*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
524*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
525*c0909341SAndroid Build Coastguard Worker    RET
526*c0909341SAndroid Build Coastguard Worker.v_w32:
    ; PUT_BILIN_V_W32: vertical filter over a 32-byte-wide column strip, two
    ; output rows per iteration, looping until hd reaches zero. It is expanded
    ; here for the 32-wide case and again inside .v_w128_loop.
    ; Uses m0-m3 as scratch and advances srcq, dstq and hd; m4/m5 are the
    ; constants prepared in .v. The macro contains no RET, so the caller
    ; decides what follows the loop.
527*c0909341SAndroid Build Coastguard Worker%macro PUT_BILIN_V_W32 0
528*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
529*c0909341SAndroid Build Coastguard Worker%%loop:
530*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
531*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
    ; first output row: previous row (m0) blended with the row below (m3)
532*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0, m3
533*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0, m3
534*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row kept for the next iteration
535*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
536*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
537*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
538*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
539*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
    ; second output row: m3 blended with the newly loaded row in m0
540*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0
541*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0
542*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
543*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4
544*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
545*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
546*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
547*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
548*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
549*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
550*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
551*c0909341SAndroid Build Coastguard Worker    jg %%loop
552*c0909341SAndroid Build Coastguard Worker%endmacro
553*c0909341SAndroid Build Coastguard Worker    PUT_BILIN_V_W32
554*c0909341SAndroid Build Coastguard Worker    RET
555*c0909341SAndroid Build Coastguard Worker.v_w64:
    ; Vertical filter, 64 pixels wide, one output row per iteration.
    ; m0 and m1 hold the previous source row (bytes 0-31 and 32-63); each
    ; pass loads the next row, blends it with the previous one, and then
    ; keeps the new row in m0/m1 for the following pass.
    ; m4/m5 are the constants prepared in .v.
556*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+32*0]
557*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+32*1]
558*c0909341SAndroid Build Coastguard Worker.v_w64_loop:
559*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
560*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+32*0]
561*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m0, m3
562*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m3
563*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
564*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
565*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
566*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
567*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m0
568*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3 ; new row becomes the previous row (left half)
569*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+32*1]
570*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m2
571*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m1, m3
572*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3
573*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
574*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
575*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
576*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
577*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m1
578*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; new row becomes the previous row (right half)
579*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m2
580*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
581*c0909341SAndroid Build Coastguard Worker    dec                  hd
582*c0909341SAndroid Build Coastguard Worker    jg .v_w64_loop
583*c0909341SAndroid Build Coastguard Worker    RET
584*c0909341SAndroid Build Coastguard Worker.v_w128:
    ; Vertical filter, 128 pixels wide, done as four 32-byte column strips
    ; with PUT_BILIN_V_W32 processing every row of a strip before moving on.
    ; r6d packs two values: the low byte keeps the original h so hd can be
    ; reloaded per strip (this relies on h fitting in 8 bits), and bits 8 and
    ; up count the strips still to do after the current one (starts at 3).
    ; r4/r7 remember the top of the current strip in src/dst. r7 is
    ; overwritten here; it was last needed for the jump-table dispatch.
585*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(3<<8)]
586*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
587*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
588*c0909341SAndroid Build Coastguard Worker.v_w128_loop:
589*c0909341SAndroid Build Coastguard Worker    PUT_BILIN_V_W32
590*c0909341SAndroid Build Coastguard Worker    add                  r4, 32
591*c0909341SAndroid Build Coastguard Worker    add                  r7, 32
592*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; restore the row count for the next strip
593*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
594*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
595*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one strip done; result <= 0 after the fourth
596*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop
597*c0909341SAndroid Build Coastguard Worker    RET
598*c0909341SAndroid Build Coastguard Worker.hv:
    ; Combined horizontal + vertical setup, then dispatch to the
    ; width-specific .hv_w* code through the _bilin_hv jump table.
    ; In the formula below, src[] reads as the horizontally filtered
    ; intermediate (assumes m5 holds the horizontal coefficients set before
    ; this excerpt - confirm).
599*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
600*c0909341SAndroid Build Coastguard Worker    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    ; How the constants implement it: paddb doubles the byte coefficients in
    ; m5, so the pmaddubsw results are 2x scaled. pmulhw of a 2x-scaled row
    ; difference by (mxy << 11) gives (mxy * diff) >> 4, and pavgw of a
    ; 2x-scaled row with m7 gives row + 8 if pw_15 is packed words of 15, as
    ; its name suggests.
601*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
602*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
603*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
604*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_15]
605*c0909341SAndroid Build Coastguard Worker    movd                xm6, mxyd
606*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; table entry is an offset relative to r7
607*c0909341SAndroid Build Coastguard Worker    paddb                m5, m5
608*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
609*c0909341SAndroid Build Coastguard Worker    jmp                  wq
610*c0909341SAndroid Build Coastguard Worker.hv_w2:
    ; Horizontal + vertical filter, 2 pixels wide, two output rows per
    ; iteration. xm0 carries the horizontally filtered rows of the previous
    ; pass; shufps takes its upper half (the last row) as the "row above".
    ; xm4 (shuffle mask) and xm5 are set before this excerpt; xm6/xm7 are the
    ; constants prepared in .hv. Digits in the comments are row indices.
611*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]
612*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4
613*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm5
614*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
615*c0909341SAndroid Build Coastguard Worker    movd                xm1, [srcq+ssq*1]
616*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
617*c0909341SAndroid Build Coastguard Worker    pinsrd              xm1, [srcq+ssq*0], 1
618*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm4
619*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm5             ; 1 _ 2 _
620*c0909341SAndroid Build Coastguard Worker    shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
621*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm1
622*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm2 ; row below minus row above
623*c0909341SAndroid Build Coastguard Worker    pmulhw              xm1, xm6
624*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm7
625*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
626*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 4
627*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1
628*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm1, 0 ; word 0 = first output row
629*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm1, 2 ; word 2 = second output row
630*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
631*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
632*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
633*c0909341SAndroid Build Coastguard Worker    RET
634*c0909341SAndroid Build Coastguard Worker.hv_w4:
    ; Horizontal + vertical filter, 4 pixels wide, two output rows per
    ; iteration. This path replaces the shuffle mask in xm4 with
    ; bilin_h_shuf4. xm0 carries the horizontally filtered rows of the
    ; previous pass; shufps takes its upper half as the "row above".
    ; xm5 is set before this excerpt; xm6/xm7 are the constants from .hv.
635*c0909341SAndroid Build Coastguard Worker    mova                xm4, [bilin_h_shuf4]
636*c0909341SAndroid Build Coastguard Worker    movddup             xm0, [srcq+ssq*0]
637*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4
638*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm5
639*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
640*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*1]
641*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
642*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ssq*0]
643*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm4
644*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm5             ; 1 2
645*c0909341SAndroid Build Coastguard Worker    shufps              xm2, xm0, xm1, q1032 ; 0 1
646*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm1
647*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm2 ; row below minus row above
648*c0909341SAndroid Build Coastguard Worker    pmulhw              xm1, xm6
649*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm7
650*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
651*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 4
652*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1
653*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm1
654*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm1, 1
655*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
656*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
657*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
658*c0909341SAndroid Build Coastguard Worker    RET
659*c0909341SAndroid Build Coastguard Worker.hv_w8:
    ; Horizontal + vertical filter, 8 pixels wide, two output rows per
    ; iteration, one row per 128-bit lane. m0 carries the horizontally
    ; filtered rows of the previous pass; vperm2i128 0x21 pairs its high lane
    ; (the last row) with the low lane of the new rows to form the "rows
    ; above". m4/m5 are set before this excerpt; m6/m7 are the .hv constants.
660*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
661*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
662*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
663*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
664*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
665*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
666*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*0], 1
667*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
668*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5           ; 1 2
669*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m1, 0x21 ; 0 1
670*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
671*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2 ; row below minus row above
672*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
673*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m7
674*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
675*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
676*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m1, 1
677*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm2 ; low 8 bytes = first row, high 8 = second row
678*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm1
679*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm1
680*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
681*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
682*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
683*c0909341SAndroid Build Coastguard Worker    RET
684*c0909341SAndroid Build Coastguard Worker.hv_w16:
    ; Horizontal + vertical filter, 16 pixels wide, two output rows per
    ; iteration. Each source row is loaded as two 16-byte pieces, offset 0
    ; into the low lane and offset 8 into the high lane. m0 holds the
    ; horizontally filtered previous row across iterations.
    ; m4/m5 are set before this excerpt; m6/m7 are the .hv constants.
685*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+8*0]
686*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0+8*1], 1
687*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
688*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
689*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
690*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*1+8*0]
691*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1+8*1], 1
692*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
693*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*0+8*0]
694*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*0+8*1], 1
695*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
696*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
697*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
    ; first output row: previous row (m0) and first new row (m2)
698*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2, m0
699*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
700*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m7
701*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
702*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3, m5 ; second new row, kept in m0 for the next pass
    ; second output row: m2 and the second new row (m0)
703*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0, m2
704*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m6
705*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m7
706*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
707*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
708*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 4
709*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3
    ; packuswb works per lane, so the two rows come out interleaved by
    ; qword; vpermq regroups them as low lane = first row, high = second.
710*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
711*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm1
712*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m1, 1
713*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
714*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
715*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
716*c0909341SAndroid Build Coastguard Worker    RET
717*c0909341SAndroid Build Coastguard Worker.hv_w128:
    ; Horizontal + vertical filter for widths 128, 64 and 32, processed as
    ; 32-byte column strips, one output row per inner iteration.
    ; For 128 and 64, r6d packs the original h in its low bits (reloaded via
    ; r6b, so h must fit in 8 bits) and, from bit 16 up, the number of strips
    ; left after the current one (3 resp. 1). r4/r7 remember the top of the
    ; current strip in src/dst.
    ; NOTE(review): the .hv_w32 entry does not initialise r6d, r4 or r7 here;
    ; the single-strip exit relies on the value already in r6d being below
    ; 1<<16 - confirm against the function header outside this excerpt.
718*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(3<<16)]
719*c0909341SAndroid Build Coastguard Worker    jmp .hv_w32_start
720*c0909341SAndroid Build Coastguard Worker.hv_w64:
721*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(1<<16)]
722*c0909341SAndroid Build Coastguard Worker.hv_w32_start:
723*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
724*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
725*c0909341SAndroid Build Coastguard Worker.hv_w32:
    ; m8 is used as scratch below; on WIN64 xmm8 is callee-saved, so it is
    ; parked in the r4m slot and restored before RET.
726*c0909341SAndroid Build Coastguard Worker%if WIN64
727*c0909341SAndroid Build Coastguard Worker    movaps              r4m, xmm8
728*c0909341SAndroid Build Coastguard Worker%endif
729*c0909341SAndroid Build Coastguard Worker.hv_w32_loop0:
    ; start of a strip: horizontally filter its first row into m0/m1
730*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0]
731*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1]
732*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
733*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
734*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
735*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
736*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
737*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
738*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*0]
739*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+8*1]
740*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
741*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
742*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
743*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
744*c0909341SAndroid Build Coastguard Worker    psubw                m8, m2, m0
745*c0909341SAndroid Build Coastguard Worker    pmulhw               m8, m6
746*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m7
747*c0909341SAndroid Build Coastguard Worker    paddw                m8, m0
748*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; new row becomes the previous row
749*c0909341SAndroid Build Coastguard Worker    psubw                m2, m3, m1
750*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m6
751*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m7
752*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
753*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; new row becomes the previous row
754*c0909341SAndroid Build Coastguard Worker    psrlw                m8, 4
755*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 4
756*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m2
757*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m8
758*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
759*c0909341SAndroid Build Coastguard Worker    dec                  hd
760*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
    ; strip finished: step 32 bytes right, reload h, count the strip off
761*c0909341SAndroid Build Coastguard Worker    add                  r4, 32
762*c0909341SAndroid Build Coastguard Worker    add                  r7, 32
763*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
764*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
765*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
766*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16
767*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop0
768*c0909341SAndroid Build Coastguard Worker%if WIN64
769*c0909341SAndroid Build Coastguard Worker    movaps             xmm8, r4m
770*c0909341SAndroid Build Coastguard Worker%endif
771*c0909341SAndroid Build Coastguard Worker    RET
772*c0909341SAndroid Build Coastguard Worker
773*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    ; Entry point of the 8 bpc "prep" bilinear function: selects the filter
    ; path from the two subpel fractions.
    ;   mx != 0           -> .h (defined further down, outside this excerpt)
    ;   mx == 0, my != 0  -> .v (likewise)
    ;   mx == 0, my == 0  -> .prep, the unfiltered path below
    ; r6 holds the address of prep%+SUFFIX, the base the jump-table entries
    ; are relative to. wd = tzcnt(w), used as the jump-table index.
774*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r5m ; mx
775*c0909341SAndroid Build Coastguard Worker    lea                  r6, [prep%+SUFFIX]
776*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
777*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
778*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
779*c0909341SAndroid Build Coastguard Worker    jnz .h
780*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
781*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
782*c0909341SAndroid Build Coastguard Worker    jnz .v
783*c0909341SAndroid Build Coastguard Worker.prep:
    ; No filtering: dispatch to the width-specific .prep_w* code, which
    ; widens the source bytes to words and scales them by 16.
784*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep,)]
785*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
786*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3] ; lets the loops address 4 rows at once
787*c0909341SAndroid Build Coastguard Worker    jmp                  wq
788*c0909341SAndroid Build Coastguard Worker.prep_w4:
    ; Unfiltered prep, 4 pixels wide, four rows per iteration.
    ; Gathers 4 x 4 source bytes into xm0, zero-extends them to 16 words,
    ; multiplies by 16 (psllw 4) and stores 32 bytes contiguously to tmp.
789*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+strideq*0]
790*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [srcq+strideq*1], 1
791*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [srcq+strideq*2], 2
792*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [srcq+stride3q ], 3
793*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
794*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, xm0
795*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
796*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
797*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
798*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
799*c0909341SAndroid Build Coastguard Worker    jg .prep_w4
800*c0909341SAndroid Build Coastguard Worker    RET
801*c0909341SAndroid Build Coastguard Worker.prep_w8:
    ; Unfiltered prep, 8 pixels wide, four rows per iteration.
    ; Two rows are packed per xmm register, zero-extended to words,
    ; multiplied by 16 (psllw 4) and stored as 2 x 32 contiguous bytes.
802*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*0]
803*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+strideq*1]
804*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*2]
805*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+stride3q ]
806*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
807*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, xm0
808*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, xm1
809*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
810*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
811*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
812*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
813*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
814*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
815*c0909341SAndroid Build Coastguard Worker    jg .prep_w8
816*c0909341SAndroid Build Coastguard Worker    RET
817*c0909341SAndroid Build Coastguard Worker.prep_w16:
    ; Unfiltered prep, 16 pixels wide, four rows per iteration.
    ; Each pmovzxbw loads one 16-byte row straight from memory as 16 words;
    ; the rows are multiplied by 16 (psllw 4) and stored as 4 x 32
    ; contiguous bytes.
818*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+strideq*0]
819*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+strideq*1]
820*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+strideq*2]
821*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+stride3q ]
822*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
823*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
824*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
825*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
826*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
827*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
828*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
829*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
830*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
831*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
832*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
833*c0909341SAndroid Build Coastguard Worker    jg .prep_w16
834*c0909341SAndroid Build Coastguard Worker    RET
835*c0909341SAndroid Build Coastguard Worker.prep_w32:
    ; Unfiltered path, 32 pixels per row. Each iteration handles two rows;
    ; a row is read as two 16-byte halves (offsets 16*0 and 16*1), each
    ; zero-extended to sixteen 16-bit words and multiplied by 16.
836*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+strideq*0+16*0]
837*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+strideq*0+16*1]
838*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+strideq*1+16*0]
839*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+strideq*1+16*1]
840*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
841*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
842*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
843*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
844*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
    ; m0/m1 form output row 0 and m2/m3 output row 1 (64 bytes per row).
845*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
846*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
847*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
848*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
849*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
    ; Two rows were consumed; loop while rows remain.
850*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
851*c0909341SAndroid Build Coastguard Worker    jg .prep_w32
852*c0909341SAndroid Build Coastguard Worker    RET
853*c0909341SAndroid Build Coastguard Worker.prep_w64:
    ; Unfiltered path, 64 pixels per row. One row per iteration, read as four
    ; 16-byte chunks that are zero-extended to words and multiplied by 16,
    ; giving 128 bytes of output per row.
854*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+16*0]
855*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+16*1]
856*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+16*2]
857*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+16*3]
858*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
859*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
860*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
861*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
862*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
863*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
864*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
865*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
866*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
867*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
    ; dec updates ZF/SF/OF, which is all the signed jg test needs.
868*c0909341SAndroid Build Coastguard Worker    dec                  hd
869*c0909341SAndroid Build Coastguard Worker    jg .prep_w64
870*c0909341SAndroid Build Coastguard Worker    RET
871*c0909341SAndroid Build Coastguard Worker.prep_w128:
    ; Unfiltered path, 128 pixels per row. One row per iteration, processed
    ; as two halves of 64 pixels so that only m0-m3 are needed.
    ; First half: source bytes 0..63 -> output bytes 0..127 of this row.
872*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+16*0]
873*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+16*1]
874*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+16*2]
875*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+16*3]
876*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
877*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
878*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
879*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
880*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
881*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
882*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
883*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
    ; Second half: source bytes 64..127 of the same row.
884*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [srcq+16*4]
885*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [srcq+16*5]
886*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [srcq+16*6]
887*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, [srcq+16*7]
    ; Both pointers are advanced to the next row before the second group of
    ; stores, which is why those stores use negative offsets from tmpq.
888*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*8
889*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
890*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
891*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
892*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
893*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
894*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*4], m0
895*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*3], m1
896*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*2], m2
897*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*1], m3
898*c0909341SAndroid Build Coastguard Worker    dec                  hd
899*c0909341SAndroid Build Coastguard Worker    jg .prep_w128
900*c0909341SAndroid Build Coastguard Worker    RET
901*c0909341SAndroid Build Coastguard Worker.h:
    ; Horizontal filter setup: builds the coefficient vector m5 and the byte
    ; shuffle m4, then either continues at .hv (my != 0) or jumps to the
    ; width-specific horizontal-only loop.
902*c0909341SAndroid Build Coastguard Worker    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
903*c0909341SAndroid Build Coastguard Worker    ; = (16 - mx) * src[x] + mx * src[x + 1]
    ; mx*255 + 16 == (mx << 8) + (16 - mx), so the word built below holds
    ; (16 - mx) in its low byte and mx in its high byte, i.e. the two
    ; pmaddubsw multipliers for a (src[x], src[x + 1]) byte pair.
    ; NOTE(review): this packing presumably relies on mx being 1..15 here
    ; -- confirm against the caller.
904*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255
    ; m4 = shuffle mask used by the w8 and wider loops (the w4 loop loads its
    ; own mask). The table is defined outside this chunk; presumably it pairs
    ; every source byte with its right-hand neighbour -- confirm.
905*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [z_filter_s+2]
906*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16
907*c0909341SAndroid Build Coastguard Worker    movd                xm5, mxyd
908*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
    ; m5 = the coefficient word replicated into every 16-bit lane.
909*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
    ; A non-zero my is handled by the .hv code.
910*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
911*c0909341SAndroid Build Coastguard Worker    jnz .hv
    ; Width dispatch: fetch a 16-bit table entry indexed by wq, add the base
    ; held in r6 and jump there. Both r6 and the index in wq are prepared
    ; before this chunk.
912*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
913*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; stride3q = 3 * stride, used to address the fourth row of a group.
914*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
915*c0909341SAndroid Build Coastguard Worker    jmp                  wq
916*c0909341SAndroid Build Coastguard Worker.h_w4:
    ; Horizontal filter, 4 pixels per row, four rows per iteration.
    ; This width uses its own shuffle mask instead of the one loaded in .h.
917*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [bilin_h_shuf4]
918*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
    ; Load 8 bytes from each of four rows: rows 0/1 fill the low/high half
    ; of xm0 and rows 2/3 the low/high half of xm1.
919*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*0]
920*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+strideq*1]
921*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*2]
922*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+stride3q ]
923*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; Join both halves so one shuffle + multiply-add covers all four rows.
924*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm1, 1
    ; Arrange the bytes into pairs (mask defined outside this chunk), then
    ; multiply each pair by the two coefficient bytes in m5 and add them,
    ; producing 16 words = four rows of four outputs.
925*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
926*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
927*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
928*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
929*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
930*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
931*c0909341SAndroid Build Coastguard Worker    RET
932*c0909341SAndroid Build Coastguard Worker.h_w8:
    ; Horizontal filter, 8 pixels per row, four rows per iteration.
933*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
    ; 16 bytes are loaded per row; each 32-byte register holds two rows, one
    ; per 128-bit lane (m0 = rows 0/1, m1 = rows 2/3).
934*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
935*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*1], 1
936*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*2]
937*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+stride3q ], 1
938*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; Shuffle each lane with the mask in m4, then apply the two filter
    ; coefficients in m5; every lane yields 8 output words.
939*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
940*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
941*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
942*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
943*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
944*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
945*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
946*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
947*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
948*c0909341SAndroid Build Coastguard Worker    RET
949*c0909341SAndroid Build Coastguard Worker.h_w16:
    ; Horizontal filter, 16 pixels per row, four rows per iteration.
950*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
    ; One register per row. The two 16-byte loads of a row start 8 bytes
    ; apart and therefore overlap; each 128-bit lane later yields 8 of the
    ; row's 16 output words.
951*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+8*0]
952*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*0+8*1], 1
953*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1+8*0]
954*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*1+8*1], 1
955*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*2+8*0]
956*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*2+8*1], 1
957*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+stride3q +8*0]
958*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+stride3q +8*1], 1
959*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; Shuffle with the mask in m4, then multiply-add with the coefficient
    ; pairs in m5.
960*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
961*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
962*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
963*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
964*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
965*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
966*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
967*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
    ; Four output rows of 32 bytes each.
968*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
969*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
970*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
971*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
972*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
973*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
974*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop
975*c0909341SAndroid Build Coastguard Worker    RET
976*c0909341SAndroid Build Coastguard Worker.h_w32:
    ; Horizontal filter, 32 pixels per row, two rows per iteration.
977*c0909341SAndroid Build Coastguard Worker.h_w32_loop:
    ; Two registers per row (m0/m1 = row 0, m2/m3 = row 1). The 16-byte
    ; loads of a row start at byte offsets 0, 8, 16 and 24, so neighbouring
    ; loads overlap by 8 bytes.
978*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+8*0]
979*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*0+8*1], 1
980*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*0+8*2]
981*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*0+8*3], 1
982*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*1+8*0]
983*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*1+8*1], 1
984*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+strideq*1+8*2]
985*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+strideq*1+8*3], 1
986*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
    ; Shuffle with the mask in m4, then multiply-add with the coefficient
    ; pairs in m5.
987*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
988*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
989*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
990*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
991*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
992*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
993*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
994*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
    ; Two output rows of 64 bytes each.
995*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
996*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
997*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
998*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
999*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
1000*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1001*c0909341SAndroid Build Coastguard Worker    jg .h_w32_loop
1002*c0909341SAndroid Build Coastguard Worker    RET
1003*c0909341SAndroid Build Coastguard Worker.h_w64:
    ; Horizontal filter, 64 pixels per row, one row per iteration. The label
    ; is also the loop target. Eight 16-byte loads at 8-byte steps (offsets
    ; 0..56) fill the lanes of m0-m3; neighbouring loads overlap by 8 bytes.
1004*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+8*0]
1005*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+8*1], 1
1006*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+8*2]
1007*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+8*3], 1
1008*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+8*4]
1009*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+8*5], 1
1010*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+8*6]
1011*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+8*7], 1
1012*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
    ; Shuffle with the mask in m4, then multiply-add with the coefficient
    ; pairs in m5.
1013*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1014*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1015*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1016*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
1017*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1018*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1019*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1020*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
    ; One output row of 128 bytes.
1021*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
1022*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
1023*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
1024*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
1025*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
1026*c0909341SAndroid Build Coastguard Worker    dec                  hd
1027*c0909341SAndroid Build Coastguard Worker    jg .h_w64
1028*c0909341SAndroid Build Coastguard Worker    RET
1029*c0909341SAndroid Build Coastguard Worker.h_w128:
    ; Horizontal filter, 128 pixels per row, one row per iteration. The row
    ; is processed as two halves of 64 pixels so that m0-m3 suffice; the
    ; label is also the loop target.
    ; First half: 16-byte loads at byte offsets 0..56.
1030*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+8*0]
1031*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+8*1], 1
1032*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+8*2]
1033*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+8*3], 1
1034*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+8*4]
1035*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+8*5], 1
1036*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+8*6]
1037*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+8*7], 1
1038*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1039*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1040*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1041*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
1042*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1043*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1044*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1045*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1046*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
1047*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m1
1048*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m2
1049*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m3
    ; Second half: 16-byte loads at byte offsets 64..120 of the same row
    ; (the last load therefore covers source bytes 120..135).
1050*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+8* 8]
1051*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+8* 9], 1
1052*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+8*10]
1053*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+8*11], 1
1054*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+8*12]
1055*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+8*13], 1
1056*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+8*14]
1057*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+8*15], 1
    ; Both pointers move to the next row here, before the second group of
    ; stores, which is why those stores use negative offsets from tmpq.
1058*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*8
1059*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1060*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1061*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1062*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1063*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
1064*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1065*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1066*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1067*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1068*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*4], m0
1069*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*3], m1
1070*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*2], m2
1071*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*1], m3
1072*c0909341SAndroid Build Coastguard Worker    dec                  hd
1073*c0909341SAndroid Build Coastguard Worker    jg .h_w128
1074*c0909341SAndroid Build Coastguard Worker    RET
1075*c0909341SAndroid Build Coastguard Worker.v:
    ; Vertical filter setup: builds the coefficient vector m6 and jumps to
    ; the width-specific loop.
    ; NOTE(review): WIN64_SPILL_XMM is an x86inc macro defined outside this
    ; file chunk; presumably it preserves xmm6 on Win64 because m6 is written
    ; below -- confirm.
1076*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
    ; Fetch the 16-bit jump-table entry for this width (index in wq, base in
    ; r6, both prepared before this chunk).
1077*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    ; mxyd*255 + 16 == (mxyd << 8) + (16 - mxyd): low byte = 16 - weight,
    ; high byte = weight, the same packing the .h setup uses.
    ; NOTE(review): mxyd presumably holds my here (the branch into .v is
    ; outside this chunk) -- confirm.
1078*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 255
1079*c0909341SAndroid Build Coastguard Worker    add                mxyd, 16
1080*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; stride3q = 3 * stride, used to address the fourth row of a group.
1081*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
    ; m6 = the coefficient word replicated into every 16-bit lane.
1082*c0909341SAndroid Build Coastguard Worker    movd                xm6, mxyd
1083*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
1084*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1085*c0909341SAndroid Build Coastguard Worker.v_w4:
    ; Vertical filter, 4 pixels per row, four output rows per iteration.
    ; Row 0 is loaded once up front; afterwards m0 always carries the last
    ; row of the previous iteration.
1086*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+strideq*0]
1087*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
    ; The digits in the trailing comments are the source rows held in dwords
    ; 0, 1, 4 and 5, i.e. the low 8 bytes of each 128-bit lane, which is the
    ; part punpcklbw consumes below.
1088*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [srcq+strideq*2]
1089*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [srcq+strideq*1]
1090*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [srcq+stride3q ]
1091*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1092*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x05     ; 0 2 2 2
    ; srcq has already advanced, so this is row 4 of the current group and
    ; becomes row 0 of the next iteration.
1093*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [srcq+strideq*0]
1094*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x0f     ; 1 1 3 3
1095*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
1096*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xaa     ; 0 1 2 3
1097*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x55     ; 1 2 3 4
    ; Interleave each row with the row below it so every byte pair is
    ; (src[x], src[x + stride]), then apply the two coefficients in m6.
1098*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2
1099*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
1100*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1101*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1102*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1103*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1104*c0909341SAndroid Build Coastguard Worker    RET
1105*c0909341SAndroid Build Coastguard Worker.v_w8:
    ; Vertical filter, 8 pixels per row, four output rows per iteration.
    ; Row 0 is loaded once up front; afterwards m0 always carries the last
    ; row of the previous iteration.
1106*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+strideq*0]
1107*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
    ; The digits in the trailing comments are the source rows held in the
    ; four 64-bit quarters of the register, lowest first.
1108*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+strideq*2]
1109*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+strideq*1]
1110*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+stride3q ]
1111*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1112*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x03     ; 0 2 2 2
    ; srcq has already advanced, so this is row 4 of the current group and
    ; becomes row 0 of the next iteration.
1113*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*0]
1114*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0xcc     ; 1 3 1 3
1115*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, m1, 0xf0 ; 1 3 2 2
1116*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m1, 0x0f     ; 0 2 1 3
1117*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0xc0     ; 1 3 2 4
    ; Low unpack pairs rows 0/1 (low lane) and 1/2 (high lane); high unpack
    ; pairs rows 2/3 and 3/4. m1 then holds output rows 0-1, m2 rows 2-3.
1118*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3
1119*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
1120*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
1121*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1122*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m1
1123*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m2
1124*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
1125*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1126*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1127*c0909341SAndroid Build Coastguard Worker    RET
1128*c0909341SAndroid Build Coastguard Worker.v_w16:
    ; Vertical filter, 16 pixels per row, four output rows per iteration.
    ; Every row is broadcast to both 128-bit lanes. Row 0 is loaded once up
    ; front; afterwards m0 carries the last row of the previous iteration.
1129*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+strideq*0]
1130*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
1131*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [srcq+strideq*1]
1132*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [srcq+strideq*2]
1133*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [srcq+stride3q ]
1134*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; shufpd with 0x0c takes the low 8 bytes of both inputs in the low lane
    ; and the high 8 bytes of both inputs in the high lane, so each result
    ; holds two rows ("a b" in the trailing comments): pixels 0-7 of a and b
    ; in the low lane, pixels 8-15 of a and b in the high lane.
1135*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m2, 0x0c ; 0 2
    ; srcq has already advanced, so this is row 4 (row 0 of the next pass).
1136*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+strideq*0]
1137*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m3, 0x0c     ; 1 3
1138*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m0, 0x0c     ; 2 4
    ; Low unpacks pair rows 0/1 and 1/2, high unpacks pair rows 2/3 and 3/4,
    ; each yielding one full 16-pixel row of (upper, lower) byte pairs.
1139*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m1
1140*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m1, m2
1141*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m1
1142*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1143*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1144*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m6
1145*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1146*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
    ; Stored in output-row order 0, 1, 2, 3.
1147*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m3
1148*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m5
1149*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m4
1150*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m1
1151*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*4
1152*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1153*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1154*c0909341SAndroid Build Coastguard Worker    RET
1155*c0909341SAndroid Build Coastguard Worker.v_w32:
    ; Vertical filter, 32 pixels per row, four output rows per iteration.
    ; NOTE(review): q3120 is a shuffle constant defined outside this chunk;
    ; presumably it selects qwords 0, 2, 1, 3 so that the in-lane low unpack
    ; yields pixels 0-15 and the high unpack pixels 16-31 -- confirm.
    ; Row 0 is loaded once up front; afterwards m0 carries the last row of
    ; the previous iteration.
1156*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [srcq+strideq*0], q3120
1157*c0909341SAndroid Build Coastguard Worker.v_w32_loop:
1158*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [srcq+strideq*1], q3120
1159*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [srcq+strideq*2], q3120
1160*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [srcq+stride3q ], q3120
1161*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; Output row 0: rows 0 and 1 interleaved, then filtered with m6.
1162*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m1
1163*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m0, m1
    ; Row 0 is no longer needed, so m0 is reloaded with row 4 (srcq has
    ; already advanced); it is row 0 of the next iteration.
1164*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [srcq+strideq*0], q3120
1165*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1166*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m6
1167*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m4
1168*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m5
    ; Output row 1: rows 1 and 2.
1169*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m1, m2
1170*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1171*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1172*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
    ; Output row 2: rows 2 and 3 (stored further down).
1173*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m2, m3
1174*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
1175*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m6
1176*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1177*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m4
1178*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m1
    ; tmpq moves past all four output rows here; the remaining stores use
    ; negative offsets.
1179*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*8
    ; Output row 3: rows 3 and 4.
1180*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3, m0
1181*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0
1182*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
1183*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1184*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*4], m5
1185*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*3], m2
1186*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*2], m1
1187*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*1], m3
1188*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1189*c0909341SAndroid Build Coastguard Worker    jg .v_w32_loop
1190*c0909341SAndroid Build Coastguard Worker    RET
1191*c0909341SAndroid Build Coastguard Worker.v_w64:
    ; Vertical filter, 64 pixels per row, two output rows per iteration.
    ; A row occupies two registers (32-byte halves at offsets 32*0 and 32*1).
    ; m0/m1 hold the upper row on loop entry; they are loaded once here and
    ; refreshed inside the loop with the last row read.
    ; NOTE(review): q3120 is defined outside this chunk; presumably it orders
    ; the qwords 0, 2, 1, 3 so the in-lane unpacks yield contiguous pixels
    ; -- confirm.
1192*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [srcq+strideq*0+32*0], q3120
1193*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [srcq+strideq*0+32*1], q3120
1194*c0909341SAndroid Build Coastguard Worker.v_w64_loop:
    ; m2/m3 = row 1 (the row below the one in m0/m1).
1195*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [srcq+strideq*1+32*0], q3120
1196*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [srcq+strideq*1+32*1], q3120
1197*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
    ; Output row 0, left half: rows 0 and 1 interleaved and filtered.
1198*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m2
1199*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m2
1200*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1201*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m6
1202*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m4
1203*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m0
    ; Output row 0, right half.
1204*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m1, m3
1205*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m1, m3
    ; Row 0 is fully consumed; reload m0/m1 with row 2 (srcq has already
    ; advanced by two rows). It is the upper row of the next iteration.
1206*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [srcq+strideq*0+32*0], q3120
1207*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [srcq+strideq*0+32*1], q3120
1208*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1209*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m6
1210*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*2], m4
1211*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*3], m5
    ; tmpq moves past both output rows; the stores below use negative
    ; offsets.
1212*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*8
    ; Output row 1: rows 1 and 2, left half (m4/m2) and right half (m5/m3).
1213*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m2, m0
1214*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0
1215*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m3, m1
1216*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
1217*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1218*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1219*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m6
1220*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1221*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*4], m4
1222*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*3], m2
1223*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*2], m5
1224*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-32*1], m3
1225*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1226*c0909341SAndroid Build Coastguard Worker    jg .v_w64_loop
1227*c0909341SAndroid Build Coastguard Worker    RET
1228*c0909341SAndroid Build Coastguard Worker.v_w128:
    ; Vertical filter, 128 pixels per row, processed as four vertical strips
    ; of 32 pixels. The inner loop walks one strip top to bottom, two rows
    ; per iteration; the outer loop steps to the next strip.
    ; r6d packs both loop counters: the row count in its low byte and the
    ; number of strips still to do after the current one (3) from bit 8 up.
    ; This yields exactly four strips provided 0 < h < 256.
    ; r6 was the jump-table base and is overwritten here.
1229*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(3<<8)]
    ; r3/r5 remember the top of the current strip in the source and output.
1230*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcq
1231*c0909341SAndroid Build Coastguard Worker    mov                  r5, tmpq
1232*c0909341SAndroid Build Coastguard Worker.v_w128_loop0:
    ; First row of the strip; inside the loop m0 is refreshed with the last
    ; row read so each source row is loaded only once per strip.
1233*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [srcq+strideq*0], q3120
1234*c0909341SAndroid Build Coastguard Worker.v_w128_loop:
1235*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [srcq+strideq*1], q3120
1236*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
    ; Output row 0 of the pair: rows 0 and 1 interleaved.
1237*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m0, m1
1238*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0, m1
    ; Row 2 (srcq has already advanced); upper row of the next iteration.
1239*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [srcq+strideq*0], q3120
1240*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1241*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
    ; Output row 1 of the pair: rows 1 and 2 interleaved.
1242*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m1, m0
1243*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m0
1244*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1245*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
    ; A full output row is 32*8 = 256 bytes (128 words), so the second row
    ; of the pair lands 32*8 bytes after the first.
1246*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m2
1247*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m3
1248*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*8], m4
1249*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*9], m1
1250*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*16
1251*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1252*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop
    ; Next strip: 32 source bytes to the right, 64 output bytes (32 words)
    ; to the right, starting again from the top row.
1253*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
1254*c0909341SAndroid Build Coastguard Worker    add                  r5, 64
    ; Restore the row count from the low byte of the packed counter.
1255*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1256*c0909341SAndroid Build Coastguard Worker    mov                srcq, r3
1257*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r5
    ; Count one strip down; the result stays positive until the fourth
    ; strip has been processed.
1258*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1259*c0909341SAndroid Build Coastguard Worker    jg .v_w128_loop0
1260*c0909341SAndroid Build Coastguard Worker    RET
1261*c0909341SAndroid Build Coastguard Worker.hv:
1262*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
1263*c0909341SAndroid Build Coastguard Worker    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
1264*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
1265*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
1266*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
1267*c0909341SAndroid Build Coastguard Worker    movd                xm6, mxyd
1268*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
1269*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1270*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1271*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1272*c0909341SAndroid Build Coastguard Worker.hv_w4:
1273*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [bilin_h_shuf4]
1274*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*0]
1275*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1276*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1277*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1278*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*1]
1279*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+strideq*2]
1280*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+stride3q ]
1281*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1282*c0909341SAndroid Build Coastguard Worker    movhps              xm2, [srcq+strideq*0]
1283*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm2, 1
1284*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1285*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5        ; 1 2 3 4
1286*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m1, m0, 0xc0
1287*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q2103 ; 0 1 2 3
1288*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1289*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1290*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1291*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1292*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1293*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1294*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1295*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1296*c0909341SAndroid Build Coastguard Worker    RET
1297*c0909341SAndroid Build Coastguard Worker.hv_w8:
1298*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+strideq*0]
1299*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1300*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1301*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1302*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1]
1303*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*2], 1
1304*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+stride3q ]
1305*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1306*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*0], 1
1307*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1308*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1309*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5           ; 1 2
1310*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m0, m1, 0x21 ; 0 1
1311*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5       ; 3 4
1312*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m1, m0, 0x21 ; 2 3
1313*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3
1314*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1315*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1316*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0, m2
1317*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1318*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1319*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m1
1320*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m3
1321*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
1322*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1323*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1324*c0909341SAndroid Build Coastguard Worker    RET
1325*c0909341SAndroid Build Coastguard Worker.hv_w16:
1326*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+8*0]
1327*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*0+8*1], 1
1328*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1329*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1330*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
1331*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1+8*0]
1332*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*1+8*1], 1
1333*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1334*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*0+8*0]
1335*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*0+8*1], 1
1336*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1337*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1338*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1339*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, m0
1340*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1341*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1342*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5
1343*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
1344*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1345*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1346*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m3
1347*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m2
1348*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
1349*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1350*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
1351*c0909341SAndroid Build Coastguard Worker    RET
1352*c0909341SAndroid Build Coastguard Worker.hv_w32:
1353*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+8*0]
1354*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+8*1], 1
1355*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+8*2]
1356*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+8*3], 1
1357*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1358*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1359*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1360*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1361*c0909341SAndroid Build Coastguard Worker.hv_w32_loop:
1362*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1363*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+8*0]
1364*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+8*1], 1
1365*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1366*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1367*c0909341SAndroid Build Coastguard Worker    psubw                m3, m2, m0
1368*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1369*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1370*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
1371*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+8*2]
1372*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+8*3], 1
1373*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1374*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1375*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m3
1376*c0909341SAndroid Build Coastguard Worker    psubw                m3, m2, m1
1377*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1378*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1379*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1380*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m3
1381*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
1382*c0909341SAndroid Build Coastguard Worker    dec                  hd
1383*c0909341SAndroid Build Coastguard Worker    jg .hv_w32_loop
1384*c0909341SAndroid Build Coastguard Worker    RET
1385*c0909341SAndroid Build Coastguard Worker.hv_w128:
1386*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(7<<8)]
1387*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 256
1388*c0909341SAndroid Build Coastguard Worker    jmp .hv_w64_start
1389*c0909341SAndroid Build Coastguard Worker.hv_w64:
1390*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(3<<8)]
1391*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 128
1392*c0909341SAndroid Build Coastguard Worker.hv_w64_start:
1393*c0909341SAndroid Build Coastguard Worker%if WIN64
1394*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
1395*c0909341SAndroid Build Coastguard Worker%endif
1396*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
1397*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
1398*c0909341SAndroid Build Coastguard Worker.hv_w64_loop0:
1399*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+8*0]
1400*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*0+8*1], 1
1401*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1402*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1403*c0909341SAndroid Build Coastguard Worker.hv_w64_loop:
1404*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+strideq*1+8*0]
1405*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+strideq*1+8*1], 1
1406*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1407*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+strideq*0+8*0]
1408*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+strideq*0+8*1], 1
1409*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1410*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1411*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1412*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, m0
1413*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1414*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1415*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m5
1416*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
1417*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1418*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1419*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+r6*0], m3
1420*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+r6*1], m2
1421*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r6*2]
1422*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1423*c0909341SAndroid Build Coastguard Worker    jg .hv_w64_loop
1424*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
1425*c0909341SAndroid Build Coastguard Worker    add                  r7, 32
1426*c0909341SAndroid Build Coastguard Worker    movzx                hd, r3b
1427*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
1428*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
1429*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 1<<8
1430*c0909341SAndroid Build Coastguard Worker    jg .hv_w64_loop0
1431*c0909341SAndroid Build Coastguard Worker%if WIN64
1432*c0909341SAndroid Build Coastguard Worker    POP                  r7
1433*c0909341SAndroid Build Coastguard Worker%endif
1434*c0909341SAndroid Build Coastguard Worker    RET
1435*c0909341SAndroid Build Coastguard Worker
; Filter-type selectors consumed by the FN entry stubs (loaded into t0d/t1d).
; Each value packs two row offsets into the filter table declared below,
; every offset being a multiple of 15 (the table's middle dimension):
;   bits 16 and up : <set index> * 15
;   low bits       : <set index> * 15
;     REGULAR -> high = set 0, low = set 3
;     SMOOTH  -> high = set 1, low = set 4
;     SHARP   -> high = set 2, low = set 3
; put_6tap_8bpc adds the selector to mx * 0x010101 (resp. my * 0x010101);
; its own comments label the resulting high field "6tap" and the low byte
; "4tap", i.e. the low field picks the shorter filter variant.
1436*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8]
1437*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15
1438*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1439*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP   (2*15 << 16) | 3*15
1440*c0909341SAndroid Build Coastguard Worker
; FN fn, type, type_h, type_v [, jmp_to]
;
; Emits a small entry-point stub named <fn>_<type>_8bpc that records which
; filter types were requested and then hands control to a shared body.
;   %1 (fn)     - base name of the entry point
;   %2 (type)   - name suffix identifying the filter combination
;   %3 (type_h) - horizontal filter type; FILTER_<type_h> is loaded into t0d
;   %4 (type_v) - vertical filter type;   FILTER_<type_v> is loaded into t1d
;   %5 (jmp_to) - optional: symbol (without prefix/suffix) of the shared body
;                 to jump to
; When type_h and type_v are textually identical, t1d is copied from t0d
; instead of loading the same immediate a second time.
; When %5 is omitted no jump is emitted, so execution continues with
; whatever code is assembled immediately after the macro invocation.
1441*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
1442*c0909341SAndroid Build Coastguard Workercglobal %1_%2_8bpc
1443*c0909341SAndroid Build Coastguard Worker    mov                 t0d, FILTER_%3
1444*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4
1445*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d
1446*c0909341SAndroid Build Coastguard Worker%else
1447*c0909341SAndroid Build Coastguard Worker    mov                 t1d, FILTER_%4
1448*c0909341SAndroid Build Coastguard Worker%endif
1449*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; skip the jump in the last filter
1450*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1451*c0909341SAndroid Build Coastguard Worker%endif
1452*c0909341SAndroid Build Coastguard Worker%endmacro
1453*c0909341SAndroid Build Coastguard Worker
; Choose which numbered registers back the temporaries t0/t1 that the FN
; stubs use to pass the packed filter selectors to the shared body:
; registers 4 and 5 on WIN64, registers 7 and 8 on every other target.
; NOTE(review): presumably picked so the temporaries do not overlap any
; register that still holds an incoming argument under the respective
; calling convention - confirm against the x86inc register mapping.
1454*c0909341SAndroid Build Coastguard Worker%if WIN64
1455*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5
1456*c0909341SAndroid Build Coastguard Worker%else
1457*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8
1458*c0909341SAndroid Build Coastguard Worker%endif
1459*c0909341SAndroid Build Coastguard Worker
; Instantiate the put_8tap_<type>_8bpc entry points whose horizontal and
; vertical filter types are both REGULAR or SMOOTH (no SHARP combination is
; instantiated here). Arguments after the name suffix are: horizontal type,
; vertical type, shared body to jump to.
; The first three stubs jump to put_6tap_8bpc. The last one ("regular")
; deliberately omits the jump target: it must remain the final invocation
; so that it falls straight through into the function assembled right
; after this list.
1460*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap,
1461*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
1462*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
1463*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
1464*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular,        REGULAR, REGULAR
1465*c0909341SAndroid Build Coastguard Worker
1466*c0909341SAndroid Build Coastguard Workercglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
1467*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1468*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1469*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1470*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
1471*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx2]
1472*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
1473*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1474*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1475*c0909341SAndroid Build Coastguard Worker    jnz .h
1476*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1477*c0909341SAndroid Build Coastguard Worker    jnz .v
1478*c0909341SAndroid Build Coastguard Worker.put:
1479*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1480*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put,)]
1481*c0909341SAndroid Build Coastguard Worker    add                  wq, r8
1482*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
1483*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dsq*3]
1484*c0909341SAndroid Build Coastguard Worker%if WIN64
1485*c0909341SAndroid Build Coastguard Worker    pop                  r8
1486*c0909341SAndroid Build Coastguard Worker%endif
1487*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1488*c0909341SAndroid Build Coastguard Worker.h_w2:
1489*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1490*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq-1]
1491*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
1492*c0909341SAndroid Build Coastguard Worker    je .h_w4
1493*c0909341SAndroid Build Coastguard Worker    mova                xm3, [subpel_h_shuf4]
1494*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
1495*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
1496*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*1]
1497*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1498*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm3
1499*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm4
1500*c0909341SAndroid Build Coastguard Worker    phaddw              xm0, xm0
1501*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm5
1502*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 6
1503*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
1504*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm0, 0
1505*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm0, 1
1506*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1507*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1508*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop
1509*c0909341SAndroid Build Coastguard Worker    RET
1510*c0909341SAndroid Build Coastguard Worker.h_w4:
1511*c0909341SAndroid Build Coastguard Worker    mova                xm3, [subpel_h_shufA]
1512*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
1513*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
1514*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*1]
1515*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1516*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm3
1517*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm3
1518*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm4
1519*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm4
1520*c0909341SAndroid Build Coastguard Worker    phaddw              xm0, xm1
1521*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm5
1522*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 6
1523*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
1524*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
1525*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 1
1526*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1527*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1528*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
1529*c0909341SAndroid Build Coastguard Worker    RET
1530*c0909341SAndroid Build Coastguard Worker.h:
1531*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1532*c0909341SAndroid Build Coastguard Worker    jnz .hv
1533*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
1534*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1535*c0909341SAndroid Build Coastguard Worker    jle .h_w2
1536*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
1537*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1538*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [z_filter_s+ 2] ; 01
1539*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1540*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [z_filter_s+ 6] ; 23
1541*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
1542*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [z_filter_s+10] ; 45
1543*c0909341SAndroid Build Coastguard Worker    lea                 mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
1544*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put, _6tap_h)]
1545*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [mxq+0]
1546*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [mxq+2]
1547*c0909341SAndroid Build Coastguard Worker    add                  wq, r8
1548*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [mxq+4]
1549*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1550*c0909341SAndroid Build Coastguard Worker.h_w8:
1551*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
1552*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m%1, m4
1553*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%2, m8
1554*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m%1, m6
1555*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m9
1556*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m7
1557*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%1, m10
1558*c0909341SAndroid Build Coastguard Worker    paddw               m%2, m5
1559*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%3
1560*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%2
1561*c0909341SAndroid Build Coastguard Worker    psraw               m%1, 6
1562*c0909341SAndroid Build Coastguard Worker%endmacro
1563*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
1564*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1
1565*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1566*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            0, 1, 2
1567*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1568*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
1569*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
1570*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
1571*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1572*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1573*c0909341SAndroid Build Coastguard Worker    jg .h_w8
1574*c0909341SAndroid Build Coastguard Worker    RET
1575*c0909341SAndroid Build Coastguard Worker.h_w16:
1576*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+8*0]
1577*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+8*0], 1
1578*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0+8*1]
1579*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1+8*1], 1
1580*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            0, 2, 3
1581*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1582*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            1, 2, 3
1583*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1584*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
1585*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
1586*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1587*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1588*c0909341SAndroid Build Coastguard Worker    jg .h_w16
1589*c0909341SAndroid Build Coastguard Worker    RET
1590*c0909341SAndroid Build Coastguard Worker.h_w32:
1591*c0909341SAndroid Build Coastguard Worker    xor                 r6d, r6d
1592*c0909341SAndroid Build Coastguard Worker    jmp .h_start
1593*c0909341SAndroid Build Coastguard Worker.h_w64:
1594*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*1
1595*c0909341SAndroid Build Coastguard Worker    jmp .h_start
1596*c0909341SAndroid Build Coastguard Worker.h_w128:
1597*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*3
1598*c0909341SAndroid Build Coastguard Worker.h_start:
1599*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
1600*c0909341SAndroid Build Coastguard Worker    sub                dstq, r6
1601*c0909341SAndroid Build Coastguard Worker    mov                  r4, r6
1602*c0909341SAndroid Build Coastguard Worker.h_loop:
1603*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+8*0]
1604*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+8*1]
1605*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            0, 2, 3
1606*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H            1, 2, 3
1607*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1608*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r6], m0
1609*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
1610*c0909341SAndroid Build Coastguard Worker    jle .h_loop
1611*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
1612*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
1613*c0909341SAndroid Build Coastguard Worker    mov                  r6, r4
1614*c0909341SAndroid Build Coastguard Worker    dec                  hd
1615*c0909341SAndroid Build Coastguard Worker    jg .h_loop
1616*c0909341SAndroid Build Coastguard Worker    RET
1617*c0909341SAndroid Build Coastguard Worker.v:
1618*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       9, 12
1619*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1620*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1621*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1622*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1623*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, wd
1624*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
1625*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_512]
1626*c0909341SAndroid Build Coastguard Worker    lea                 myq, [r8+myq*8+subpel_filters+1-put_avx2]
1627*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, [myq+0]
1628*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [myq+2]
1629*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [myq+4]
1630*c0909341SAndroid Build Coastguard Worker    add                  r6, r8
1631*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1632*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1633*c0909341SAndroid Build Coastguard Worker    jmp                  r6
1634*c0909341SAndroid Build Coastguard Worker.v_w2:
; 6-tap vertical-only filter for 2-pixel-wide blocks, two output rows per
; loop iteration. The setup code just before the dispatch jump left
; m5/m6/m7 = the three broadcast coefficient pairs, m8 = pw_512 and
; nsq = -ssq.
; Prologue: gather source rows 0..4 (src-2*ss .. src+2*ss), one row per
; dword, then interleave vertically adjacent rows byte-wise so that
; pmaddubsw can apply one coefficient pair per output word.
1635*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+nsq*2]
1636*c0909341SAndroid Build Coastguard Worker    pinsrw              xm2, [srcq+nsq*1], 2
1637*c0909341SAndroid Build Coastguard Worker    pinsrw              xm2, [srcq+ssq*0], 4
1638*c0909341SAndroid Build Coastguard Worker    pinsrw              xm2, [srcq+ssq*1], 6 ; 0 1 2 3
1639*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1640*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]    ; 4
1641*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
1642*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2, xm3        ; 01 12
1643*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm2, xm3             ; 23 34
1644*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
; Loop state on entry: xm1 and xm2 hold the two oldest interleaved row
; pairs, xm0 the newest row loaded so far. Each pass loads two more rows
; and emits output rows a and b. Row numbers in the comments are those of
; the first iteration.
1645*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [srcq+ssq*1]    ; 5
1646*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1647*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm1, xm5        ; a0 b0
1648*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2             ; slide the row-pair window down
1649*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm6             ; a1 b1
1650*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm2
1651*c0909341SAndroid Build Coastguard Worker    vpblendd            xm2, xm0, xm4, 0x02  ; 4 5
1652*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]    ; 6
1653*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm0, 0x02       ; 5 6
1654*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm2, xm4             ; 45 56
1655*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm2, xm7        ; a2 b2
1656*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm4
1657*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm8             ; rounding multiply by m8 (pw_512)
1658*c0909341SAndroid Build Coastguard Worker    packuswb            xm3, xm3             ; saturate to unsigned bytes
1659*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm3, 0          ; row a
1660*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm3, 2          ; row b
1661*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1662*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1663*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1664*c0909341SAndroid Build Coastguard Worker    RET
1665*c0909341SAndroid Build Coastguard Worker.v_w4:
; 6-tap vertical-only filter for 4-pixel-wide blocks, two output rows per
; loop iteration. Same scheme as .v_w2 but each source row fills a whole
; dword. Expects m5/m6/m7 = broadcast coefficient pairs, m8 = pw_512 and
; nsq = -ssq, as set up before the dispatch jump.
; Prologue: gather rows 0..4 (src-2*ss .. src+2*ss) and interleave
; vertically adjacent rows byte-wise for pmaddubsw.
1666*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+nsq*2]
1667*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+nsq*1], 1
1668*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*0], 2
1669*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*1], 3 ; 0 1 2 3
1670*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1671*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]    ; 4
1672*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
1673*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2, xm3        ; 01 12
1674*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm2, xm3             ; 23 34
1675*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
; Loop state on entry: xm1 and xm2 hold the two oldest interleaved row
; pairs, xm0 the newest row loaded so far. Row numbers in the comments are
; those of the first iteration.
1676*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [srcq+ssq*1]    ; 5
1677*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1678*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm1, xm5        ; a0 b0
1679*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2             ; slide the row-pair window down
1680*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm6             ; a1 b1
1681*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm2
1682*c0909341SAndroid Build Coastguard Worker    vpblendd            xm2, xm0, xm4, 0x02  ; 4 5
1683*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]    ; 6
1684*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm0, 0x02       ; 5 6
1685*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm2, xm4             ; 45 56
1686*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm2, xm7        ; a2 b2
1687*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm4
1688*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm8             ; rounding multiply by m8 (pw_512)
1689*c0909341SAndroid Build Coastguard Worker    packuswb            xm3, xm3             ; saturate to unsigned bytes
1690*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm3             ; row a = dword 0
1691*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm3, 1          ; row b = dword 1
1692*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1693*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1694*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1695*c0909341SAndroid Build Coastguard Worker    RET
1696*c0909341SAndroid Build Coastguard Worker.v_w8:
; 6-tap vertical-only filter for 8-pixel-wide blocks, two output rows per
; loop iteration: the low 128-bit lane computes row a, the high lane row b.
; Expects m5/m6/m7 = broadcast coefficient pairs, m8 = pw_512 and
; nsq = -ssq, as set up before the dispatch jump.
; Prologue: load rows 0..4 (src-2*ss .. src+2*ss). Each vpblendd 0x30
; replaces the low qword of the high lane, so a register ends up holding
; row n in the low lane and row n+1 in the high lane.
1697*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+nsq*2]    ; 0
1698*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+nsq*1]    ; 1
1699*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]    ; 2
1700*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]    ; 3
1701*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1702*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]    ; 4
1703*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0x30        ; 0 1
1704*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x30        ; 1 2
1705*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3      ; 01 12
1706*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0x30        ; 2 3
1707*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30        ; 3 4
1708*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4      ; 23 34
1709*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
; Loop state on entry: m1 and m2 hold the two oldest interleaved row pairs,
; m0 the newest row loaded so far. Row numbers in the comments are those of
; the first iteration.
1710*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]    ; 5
1711*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1712*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m5  ; a0 b0
1713*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2      ; slide the row-pair window down
1714*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6      ; a1 b1
1715*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1716*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, m4, 0x30    ; 4 5
1717*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]    ; 6
1718*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30        ; 5 6
1719*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4      ; 45 56
1720*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m7  ; a2 b2
1721*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1722*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m8      ; rounding multiply by m8 (pw_512)
1723*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m3, 1   ; row b words
1724*c0909341SAndroid Build Coastguard Worker    packuswb            xm3, xm4     ; low qword = row a, high qword = row b
1725*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm3
1726*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm3
1727*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1728*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1729*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1730*c0909341SAndroid Build Coastguard Worker    RET
1731*c0909341SAndroid Build Coastguard Worker.v_w16:
1732*c0909341SAndroid Build Coastguard Worker.v_w32:
1733*c0909341SAndroid Build Coastguard Worker.v_w64:
1734*c0909341SAndroid Build Coastguard Worker.v_w128:
; 6-tap vertical-only filter for widths 16..128 (all four labels share this
; code). The block is processed as 16-pixel-wide column strips
; (.v_w16_loop0); inside a strip two output rows are produced per
; iteration (.v_w16_loop).
; r6d packs both loop counters: r6d = h + (w-16)*16, so the low byte holds
; h and the bits above it count the strips still to do after the current
; one (this relies on h fitting in one byte).
; Expects m5/m6/m7 = broadcast coefficient pairs, m8 = pw_512 and
; nsq = -ssq, as set up before the dispatch jump.
1735*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-128]
; NOTE(review): x86inc macro, presumably preserves the extra callee-saved
; xmm registers on Win64 because m9-m11 are used below - confirm.
1736*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       12
1737*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*2]
1738*c0909341SAndroid Build Coastguard Worker.v_w16_loop0:
; Per-strip prologue: load rows 0..4 into both lanes, then use shufpd so
; that each lane handles one 8-pixel half of the strip (low lane = pixels
; 0-7, high lane = pixels 8-15). r4/r7 are the running source/destination
; pointers for this strip; srcq/dstq stay at the top of the strip.
1739*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [srcq+nsq*2]    ; 0
1740*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+nsq*1]    ; 1
1741*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2]
1742*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]    ; 2
1743*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [srcq+ssq*1]    ; 3
1744*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
1745*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r4+ssq*0]      ; 4
1746*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m0, 0x0c        ; 0 2
1747*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m1, 0x0c        ; 1 3
1748*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3, m4 ; 01
1749*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4     ; 23
1750*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m2, 0x0c        ; 2 4
1751*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4, m0 ; 12
1752*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m0     ; 34
1753*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
; Loop state on entry: m1/m3 = row pairs feeding output row a, m2/m4 = row
; pairs feeding output row b, m0 = newest two rows in shufpd layout.
; m10 accumulates row a, m11 row b. Row numbers in the comments are those
; of the first iteration.
1754*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [r4+ssq*1]      ; 5
1755*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m1, m5  ; a0
1756*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
1757*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m2, m5  ; b0
1758*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1759*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6      ; a1
1760*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1761*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6      ; b1
1762*c0909341SAndroid Build Coastguard Worker    paddw               m10, m3
1763*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r4+ssq*0]      ; 6
1764*c0909341SAndroid Build Coastguard Worker    paddw               m11, m4
1765*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m9, 0x0d    ; 4 5
1766*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m9, m3, 0x0c    ; 5 6
1767*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m0  ; 45
1768*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m0      ; 56
1769*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m3, m7  ; a2
1770*c0909341SAndroid Build Coastguard Worker    paddw               m10, m9
1771*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m4, m7  ; b2
1772*c0909341SAndroid Build Coastguard Worker    paddw               m11, m9
1773*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m8      ; rounding multiply by m8 (pw_512)
1774*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m8
1775*c0909341SAndroid Build Coastguard Worker    packuswb            m10, m11     ; per lane: 8 bytes of row a, 8 of row b
1776*c0909341SAndroid Build Coastguard Worker    vpermq              m10, m10, q3120 ; regroup: low lane = row a, high lane = row b
1777*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*0], xm10
1778*c0909341SAndroid Build Coastguard Worker    vextracti128 [r7+dsq*1], m10, 1
1779*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
1780*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1781*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
; Strip finished: step 16 pixels to the right, reload h from the low byte
; of r6d and decrement the strip counter held in the bits above it.
1782*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
1783*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1784*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1785*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1786*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop0
1787*c0909341SAndroid Build Coastguard Worker    RET
1788*c0909341SAndroid Build Coastguard Worker.hv:
; Combined horizontal + vertical filtering. Widths above 4 branch to
; .hv_w8; widths 2 and 4 share the setup below and then split
; (.hv_w4 / fall through to the w2 path).
; Register roles after setup:
;   m6      = 4 horizontal coefficient bytes (offset +2 in the 8-byte
;             filter row selected by the low byte of mx)
;   m9-m11  = vertical coefficient pairs, sign-extended to words
;             (filter bytes 1..6 of the row selected by my)
;   m7, m8  = pw_8192 / pd_512 rounding constants
;   nsq     = -ssq
1789*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 16
1790*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1791*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
1792*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb             ; filter index = low byte of mx
1793*c0909341SAndroid Build Coastguard Worker    dec                srcq                  ; start one pixel to the left
1794*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [r8+mxq*8+subpel_filters-put_avx2+2]
1795*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb             ; candidate index: low byte of my
1796*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16              ; default index: bits 16+ of my
1797*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1798*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd             ; h < 6: use the low-byte index instead
1799*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r8+myq*8+subpel_filters+1-put_avx2]
1800*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]
1801*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1802*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pd_512]
1803*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
1804*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1805*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q0000       ; vertical taps 0,1
1806*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1807*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q1111       ; vertical taps 2,3
1808*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q2222       ; vertical taps 4,5
1809*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1810*c0909341SAndroid Build Coastguard Worker    je .hv_w4
; w == 2 path. Prologue: horizontally filter rows 0..4 (src-2*ss ..
; src+2*ss) and interleave vertically adjacent rows as word pairs for
; pmaddwd. m5 holds the subpel_h_shuf4 byte shuffle (table not shown here).
1811*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [subpel_h_shuf4]
1812*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+nsq*2]
1813*c0909341SAndroid Build Coastguard Worker    movhps              xm2, [srcq+nsq*1]    ; 0 1
1814*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
1815*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*1]    ; 2 3
1816*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1817*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ssq*0]    ; 4
1818*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m1, 0x30        ; row 4 into the high lane
1819*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
1820*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm5
1821*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1822*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm6
1823*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0
1824*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7              ; rounding multiply by m7 (pw_8192)
1825*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m2, 1
1826*c0909341SAndroid Build Coastguard Worker    palignr             xm0, xm2, 4
1827*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm0  ; 01 12
1828*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm0       ; 23 34
1829*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
; Each pass horizontally filters two new rows, runs the 6-tap vertical
; filter with 32-bit accumulation and stores two 2-pixel output rows.
; xm0 carries the most recent horizontally filtered rows between passes.
1830*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ssq*1]
1831*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1832*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ssq*0]
1833*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm5
1834*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm6
1835*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm9, xm1  ; a0 b0
1836*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
1837*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm10      ; a1 b1
1838*c0909341SAndroid Build Coastguard Worker    phaddw              xm4, xm4
1839*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm2
1840*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm7
1841*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm4, xm0, 12
1842*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm4
1843*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm4       ; 45 56
1844*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm11, xm2 ; a2 b2
1845*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm8       ; add rounding bias m8 (pd_512)
1846*c0909341SAndroid Build Coastguard Worker    paddd               xm3, xm4
1847*c0909341SAndroid Build Coastguard Worker    psrad               xm3, 10
1848*c0909341SAndroid Build Coastguard Worker    packssdw            xm3, xm3
1849*c0909341SAndroid Build Coastguard Worker    packuswb            xm3, xm3       ; saturate to unsigned bytes
1850*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm3, 0    ; row a
1851*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm3, 1    ; row b
1852*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1853*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1854*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
1855*c0909341SAndroid Build Coastguard Worker    RET
1856*c0909341SAndroid Build Coastguard Worker.hv_w4:
; Combined horizontal + vertical filtering for 4-pixel-wide blocks, two
; output rows per loop iteration. Uses the registers prepared at .hv:
; m6 = horizontal coefficients, m9-m11 = vertical coefficient pairs
; (words), m7 = pw_8192, m8 = pd_512, nsq = -ssq.
; Prologue: horizontally filter rows 0..4 (src-2*ss .. src+2*ss) and
; interleave vertically adjacent rows as word pairs for pmaddwd.
; m5 holds the full subpel_h_shuf4 byte shuffle (table not shown here).
1857*c0909341SAndroid Build Coastguard Worker    mova                 m5, [subpel_h_shuf4]
1858*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+nsq*2]
1859*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+nsq*1]
1860*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ssq*0]
1861*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
1862*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1863*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0]    ; 4
1864*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0xcc ; 0 1
1865*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xcc ; 2 3
1866*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
1867*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
1868*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
1869*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
1870*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
1871*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m6
1872*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m1
1873*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m0
1874*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7       ; rounding multiply by m7 (pw_8192)
1875*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
1876*c0909341SAndroid Build Coastguard Worker    palignr              m3, m0, m2, 4
1877*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3   ; 01 12
1878*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3       ; 23 34
1879*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
; Each pass horizontally filters two new rows, runs the 6-tap vertical
; filter with 32-bit accumulation and stores two 4-pixel output rows.
; m0 carries the most recent horizontally filtered rows between passes.
1880*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
1881*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1882*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9, m1   ; a0 b0
1883*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1884*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m10      ; a1 b1
1885*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2
1886*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
1887*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m2, 0xcc ; 5 6
1888*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5
1889*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
1890*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4
1891*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
1892*c0909341SAndroid Build Coastguard Worker    palignr              m2, m4, m0, 12
1893*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
1894*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4       ; 45 56
1895*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m11, m2  ; a2 b2
1896*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8       ; add rounding bias m8 (pd_512)
1897*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4
1898*c0909341SAndroid Build Coastguard Worker    psrad                m3, 10
1899*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m3, 1
1900*c0909341SAndroid Build Coastguard Worker    packssdw            xm3, xm4
1901*c0909341SAndroid Build Coastguard Worker    packuswb            xm3, xm3      ; saturate to unsigned bytes
1902*c0909341SAndroid Build Coastguard Worker    pshuflw             xm3, xm3, q3120 ; reorder words so each dword is one output row
1903*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm3
1904*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm3, 1
1905*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1906*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1907*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1908*c0909341SAndroid Build Coastguard Worker    RET
1909*c0909341SAndroid Build Coastguard Worker.hv_w8:
; Combined horizontal + vertical 6-tap filtering for widths above 4. The
; block is processed as 8-pixel-wide column strips (.hv_w8_loop0); inside
; a strip two output rows are produced per iteration (.hv_w8_loop).
; Register roles after setup:
;   m10-m12 = horizontal coefficient pairs (filter bytes 1..6 of the row
;             selected by bits 16+ of mx), broadcast as words
;   m13-m15 = vertical coefficient pairs, sign-extended to words
;   m8, m9  = byte shuffles loaded from z_filter_s+6 / z_filter_s+10
;   nsq     = -ssq
;   r6d     = h + (w-8)*32: low byte holds h, the bits above it count the
;             strips still to do after the current one (relies on h
;             fitting in one byte)
1910*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1911*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2               ; start two pixels to the left
1912*c0909341SAndroid Build Coastguard Worker    lea                 mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
1913*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16
1914*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [mxq+0]
1915*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [mxq+2]
1916*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m12, [mxq+4]
1917*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb             ; candidate index: low byte of my
1918*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16              ; default index: bits 16+ of my
1919*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1920*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd             ; h < 6: use the low-byte index instead
1921*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r8+myq*8+subpel_filters+1-put_avx2]
1922*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-64]
1923*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z_filter_s+ 6]
1924*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1925*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [z_filter_s+10]
1926*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
1927*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1928*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q0000       ; vertical taps 0,1
1929*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1930*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q1111       ; vertical taps 2,3
1931*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*4]
1932*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q2222       ; vertical taps 4,5
1933*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
; Per-strip prologue: load rows 0..4 (src-2*ss .. src+2*ss) two rows per
; register (one per lane), filter them horizontally and interleave
; vertically adjacent rows as word pairs. r4/r7 are the running
; source/destination pointers for this strip. m7 is reloaded here because
; the inner loop reuses it for pixel data.
1934*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [z_filter_s+2]
1935*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+nsq*2]
1936*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2]
1937*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+nsq*1]
1938*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [srcq+ssq*0]
1939*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
1940*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+ssq*1], 1 ; 1 3
1941*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0xf0        ; 0 2
1942*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r4+ssq*0], 1   ; 2 4
1943*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_8192]
; HV_H_6TAP_W8: horizontal 6-tap pass over the rows held in %1. Three byte
; shuffles of %1 (%4, %5, %6) are multiplied by the coefficient pairs in
; m10/m11/m12 with pmaddubsw and summed; the word result replaces %1.
; %2 and %3 are clobbered. %2 may alias %4, since the shuffle register is
; read before %2 is written.
1944*c0909341SAndroid Build Coastguard Worker%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3]
1945*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, %4
1946*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, m10
1947*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, %5
1948*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, m11
1949*c0909341SAndroid Build Coastguard Worker    pshufb               %1, %6
1950*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, m12
1951*c0909341SAndroid Build Coastguard Worker    paddw                %2, %3
1952*c0909341SAndroid Build Coastguard Worker    paddw                %1, %2
1953*c0909341SAndroid Build Coastguard Worker%endmacro
1954*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m3, m1, m2, m7, m8, m9
1955*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m4, m1, m2, m7, m8, m9
1956*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m0, m1, m2, m7, m8, m9
1957*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
1958*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
1959*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1960*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5      ; rounding multiply by m5 (pw_8192)
1961*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
1962*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
1963*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4  ; 01
1964*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4      ; 23
1965*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0  ; 12
1966*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 34
1967*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
; Loop state on entry: m1/m3 = row pairs feeding output row a, m2/m4 = row
; pairs feeding output row b, m0 = newest two horizontally filtered rows.
; m5 accumulates row a and m6 row b as 32-bit sums. Because all sixteen
; registers are in use, the shuffle mask and the two rounding constants
; are reloaded from memory on every pass.
1968*c0909341SAndroid Build Coastguard Worker    movu                xm7, [r4+ssq*1]
1969*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
1970*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r4+ssq*0], 1 ; 5 6
1971*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13, m1 ; a0
1972*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1973*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13, m2 ; b0
1974*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1975*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14     ; a1
1976*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m14     ; b1
1977*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
1978*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [z_filter_s+2]
1979*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
1980*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m7, m3, m4, m3, m8, m9
1981*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_8192]
1982*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pd_512]
1983*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
1984*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4      ; add rounding bias (pd_512)
1985*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
1986*c0909341SAndroid Build Coastguard Worker    mova                 m4, m0
1987*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, q3120
1988*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, 0x05 ; 4 5
1989*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0  ; 45
1990*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15, m3 ; a2
1991*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 56
1992*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
1993*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15, m4 ; b2
1994*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
1995*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
1996*c0909341SAndroid Build Coastguard Worker    psrad                m6, 10
1997*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
1998*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m5, 1
1999*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm6     ; saturate to unsigned bytes
2000*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm5, q3120 ; regroup: low qword = row a, high qword = row b
2001*c0909341SAndroid Build Coastguard Worker    movq         [r7+dsq*0], xm5
2002*c0909341SAndroid Build Coastguard Worker    movhps       [r7+dsq*1], xm5
2003*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2004*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2005*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
; Strip finished: step 8 pixels to the right, reload h from the low byte
; of r6d and decrement the strip counter held in the bits above it.
2006*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2007*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2008*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
2009*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
2010*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
2011*c0909341SAndroid Build Coastguard Worker    RET
2012*c0909341SAndroid Build Coastguard Worker
; Entry points for the filter combinations that involve the SHARP filter
; in at least one direction; the first four name put_8tap_8bpc as their
; shared implementation. PUT_8TAP_FN is defined outside this excerpt.
; NOTE(review): the last invocation passes no target, presumably so that
; it falls straight through into put_8tap_8bpc below - confirm against the
; macro definition.
2013*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,  put_8tap_8bpc
2014*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH, put_8tap_8bpc
2015*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp,  REGULAR, SHARP,  put_8tap_8bpc
2016*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_8bpc
2017*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp,          SHARP,   SHARP
2018*c0909341SAndroid Build Coastguard Worker
; put_8tap_8bpc: shared 8-tap "put" entry point. Named arguments are
; dst, ds (dst stride), src, ss (src stride), w, h, mx, my; ss3 is an
; extra scratch register name.
; mx and my are each replicated into three bytes (multiply by 0x010101)
; and combined with t0d / t1d, giving the byte layout described in the
; comments below. NOTE(review): t0d/t1d are presumably loaded with the
; filter-type offsets by the PUT_8TAP_FN stub - confirm.
; Dispatch: a non-zero mx fraction (bits 8-11) goes to .h; if both
; fractions are zero, control jumps to the .put label of the 6-tap
; function; otherwise execution falls through to .v.
2019*c0909341SAndroid Build Coastguard Workercglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
2020*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2021*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2022*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2023*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
2024*c0909341SAndroid Build Coastguard Worker    lea                  r8, [put_avx2]      ; base address for table and filter lookups
2025*c0909341SAndroid Build Coastguard Worker    movsxd               wq, wm
2026*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2027*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00           ; horizontal fraction non-zero?
2028*c0909341SAndroid Build Coastguard Worker    jnz .h
2029*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00           ; vertical fraction non-zero?
2030*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
2031*c0909341SAndroid Build Coastguard Worker.v:
; 8-tap vertical-only setup: pick the filter row, broadcast its four
; coefficient pairs and jump to the width-specific loop.
; Register roles on exit:
;   m7      = pw_512 rounding constant
;   m8-m11  = coefficient pairs (filter bytes 0..7), broadcast as words
;   ss3q    = 3*ssq
;   srcq    = src - 3*ss (topmost row the 8-tap filter reads)
2032*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 15
2033*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb             ; candidate index: low byte of my
2034*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16              ; default index: bits 16+ of my
2035*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2036*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd             ; h < 6: use the low-byte index instead
2037*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, wd              ; table index from the lowest set bit of w
2038*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)] ; 16-bit offset of the handler
2039*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_512]
2040*c0909341SAndroid Build Coastguard Worker    lea                 myq, [r8+myq*8+subpel_filters-put_avx2]
2041*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+0]
2042*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [myq+2]
2043*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [myq+4]
2044*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [myq+6]
2045*c0909341SAndroid Build Coastguard Worker    add                  r6, r8              ; offset + base = handler address
2046*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2047*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
2048*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; Vertical-only 8-tap filter, w == 2. Every loop iteration produces two output
; rows (a and b). Source rows are kept as byte-interleaved pairs so that one
; pmaddubsw applies two adjacent taps: xm1 = 01 12, xm2 = 23 34, xm3 = 45 56;
; the loop builds 67 78 from the two rows it loads. xm0 carries the most
; recently loaded row into the next iteration.
; Uses m7 and m8..m11 as set up in .v.
2049*c0909341SAndroid Build Coastguard Worker.v_w2:
2050*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+ssq*0] ; row 0
2051*c0909341SAndroid Build Coastguard Worker    pinsrw              xm2, [srcq+ssq*1], 2 ; row 1 -> low word of dword 1
2052*c0909341SAndroid Build Coastguard Worker    pinsrw              xm2, [srcq+ssq*2], 4 ; row 2 -> low word of dword 2
2053*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2054*c0909341SAndroid Build Coastguard Worker    pinsrw              xm2, [srcq+ssq*0], 6 ; 0 1 2 3
2055*c0909341SAndroid Build Coastguard Worker    movd                xm3, [srcq+ssq*1] ; row 4
2056*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm1, [srcq+ssq*2] ; row 5
2057*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2058*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0] ; row 6
2059*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm1, 0x02       ; 4 5
2060*c0909341SAndroid Build Coastguard Worker    vpblendd            xm1, xm0, 0x02       ; 5 6
2061*c0909341SAndroid Build Coastguard Worker    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
2062*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm3, xm1             ; 45 56
2063*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2, xm4        ; 01 12
2064*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm2, xm4             ; 23 34
2065*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
2066*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm1, xm8        ; a0 b0
2067*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2             ; slide window: 23 34 becomes next 01 12
2068*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm9             ; a1 b1
2069*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm2
2070*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3             ; slide window: 45 56 becomes next 23 34
2071*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm10            ; a2 b2
2072*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm3
2073*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [srcq+ssq*1]    ; row 7
2074*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2075*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
2076*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]    ; row 8, kept for the next iteration
2077*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm0, 0x02       ; 7 8
2078*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm3, xm4             ; 67 78
2079*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm3, xm11       ; a3 b3
2080*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm4
2081*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm7             ; rounding scale by the pw_512 constant
2082*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm5             ; saturate to unsigned bytes
2083*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm5, 0 ; row a = word 0
2084*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm5, 2 ; row b = word 2
2085*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2086*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2087*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
2088*c0909341SAndroid Build Coastguard Worker    RET
; Vertical-only 8-tap filter, w == 4. Same structure as the w == 2 code, but
; each row occupies a full dword (4 pixels): xm1 = 01 12, xm2 = 23 34,
; xm3 = 45 56, and each iteration builds 67 78 and writes two 4-pixel rows.
; Uses m7 and m8..m11 as set up in .v.
2089*c0909341SAndroid Build Coastguard Worker.v_w4:
2090*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+ssq*0] ; row 0
2091*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*1], 1 ; row 1
2092*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*2], 2 ; row 2
2093*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2094*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+ssq*0], 3 ; 0 1 2 3
2095*c0909341SAndroid Build Coastguard Worker    movd                xm3, [srcq+ssq*1] ; row 4
2096*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm1, [srcq+ssq*2] ; row 5
2097*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2098*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0] ; row 6
2099*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm1, 0x02       ; 4 5
2100*c0909341SAndroid Build Coastguard Worker    vpblendd            xm1, xm0, 0x02       ; 5 6
2101*c0909341SAndroid Build Coastguard Worker    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
2102*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm3, xm1             ; 45 56
2103*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2, xm4        ; 01 12
2104*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm2, xm4             ; 23 34
2105*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2106*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm1, xm8        ; a0 b0
2107*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2             ; slide window: 23 34 becomes next 01 12
2108*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm9             ; a1 b1
2109*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm2
2110*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3             ; slide window: 45 56 becomes next 23 34
2111*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm10            ; a2 b2
2112*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm3
2113*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [srcq+ssq*1]    ; row 7
2114*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2115*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
2116*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [srcq+ssq*0]    ; row 8, kept for the next iteration
2117*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm0, 0x02       ; 7 8
2118*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm3, xm4             ; 67 78
2119*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm3, xm11       ; a3 b3
2120*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm4
2121*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm7             ; rounding scale by the pw_512 constant
2122*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm5             ; saturate to unsigned bytes
2123*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm5 ; row a = dword 0
2124*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm5, 1 ; row b = dword 1
2125*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2126*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2127*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2128*c0909341SAndroid Build Coastguard Worker    RET
; Vertical-only 8-tap filter, w == 8. Each 8-pixel row is one qword. vpblendd
; with mask 0x30 replaces dwords 4-5 (the low qword of the upper 128-bit lane)
; with the following row, so the low qwords of the two lanes hold rows n and
; n+1; punpcklbw of two such registers then gives the interleaved pairs
; (lane 0 feeds output row a, lane 1 feeds output row b).
; m1 = 01 12, m2 = 23 34, m3 = 45 56; m0 carries the newest row across
; iterations. Uses m7 and m8..m11 as set up in .v.
2129*c0909341SAndroid Build Coastguard Worker.v_w8:
2130*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*0] ; row 0
2131*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1] ; row 1
2132*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*2] ; row 2
2133*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2134*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+ssq*0] ; row 3
2135*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1] ; row 4
2136*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+ssq*2] ; row 5
2137*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2138*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0] ; row 6
2139*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x30 ; 0 | 1
2140*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m2, 0x30 ; 1 | 2
2141*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4      ; 01 12
2142*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m5, 0x30 ; 2 | 3
2143*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m3, 0x30 ; 3 | 4
2144*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5      ; 23 34
2145*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m6, 0x30 ; 4 | 5
2146*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m0, 0x30 ; 5 | 6
2147*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m6      ; 45 56
2148*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
2149*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1] ; row 7
2150*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2151*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, m8  ; a0 b0
2152*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2      ; slide window: 23 34 becomes next 01 12
2153*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9      ; a1 b1
2154*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
2155*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3      ; slide window: 45 56 becomes next 23 34
2156*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m10     ; a2 b2
2157*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
2158*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, m4, 0x30 ; 6 | 7
2159*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*0] ; row 8, kept for the next iteration
2160*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30 ; 7 | 8
2161*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4      ; 67 78
2162*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, m11 ; a3 b3
2163*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4
2164*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7      ; rounding scale by the pw_512 constant
2165*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m5, 1   ; xm4 = row b words (upper lane)
2166*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm4     ; low qword = row a, high qword = row b
2167*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm5
2168*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm5
2169*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2170*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2171*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
2172*c0909341SAndroid Build Coastguard Worker    RET
; Vertical-only 8-tap filter for w >= 16; the .v_w16/.v_w32/.v_w64/.v_w128
; labels all share this code. The block is processed as 16-pixel-wide columns
; (outer loop .v_w16_loop0); for each column the inner loop .v_w16_loop emits
; two rows per iteration, using r4/r7 as the per-column src/dst row cursors.
;
; r6d packs both loop counters: r6d = h + (w*8 - 128)*2 = h + (w/16 - 1)*256,
; so the low byte is h and the bits above it count the columns still to do.
; (This relies on h fitting in a byte -- not checked here; presumably
; guaranteed by the callers.)
;
; After each shufpd, a register holds row n in the low qword and row n+3 in the
; high qword of each lane (lane 0 = pixels 0-7, lane 1 = pixels 8-15), so
; punpcklbw/punpckhbw of two neighbouring registers yield the row pairs noted
; in the inline comments. Uses m7 and m8..m11 as set up in .v.
; NOTE(review): WIN64_PUSH_XMM is an x86inc macro defined outside this chunk.
2173*c0909341SAndroid Build Coastguard Worker.v_w16:
2174*c0909341SAndroid Build Coastguard Worker.v_w32:
2175*c0909341SAndroid Build Coastguard Worker.v_w64:
2176*c0909341SAndroid Build Coastguard Worker.v_w128:
2177*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-128]
2178*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       15
2179*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*2] ; r6d = h + (w/16 - 1)*256
2180*c0909341SAndroid Build Coastguard Worker.v_w16_loop0:
2181*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+ssq*0] ; row 0
2182*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [srcq+ssq*1] ; row 1
2183*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ss3q] ; r4 = source row cursor for this column
2184*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [srcq+ssq*2] ; row 2
2185*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [r4+ssq*0] ; row 3
2186*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq ; r7 = destination row cursor for this column
2187*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [r4+ssq*1] ; row 4
2188*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r4+ssq*2] ; row 5
2189*c0909341SAndroid Build Coastguard Worker    add                  r4, ss3q
2190*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r4+ssq*0] ; row 6
2191*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, 0x0c ; rows 0 / 3
2192*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m1, 0x0c ; rows 1 / 4
2193*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4, m5 ; 01
2194*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m5     ; 34
2195*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m2, 0x0c ; rows 2 / 5
2196*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5, m6 ; 12
2197*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6     ; 45
2198*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m3, 0x0c ; rows 3 / 6
2199*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m6, m0 ; 23
2200*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0     ; 56
2201*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
2202*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [r4+ssq*1] ; row 7
2203*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
2204*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m1, m8  ; a0
2205*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m2, m8  ; b0
2206*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3      ; slide window: 23 becomes next 01
2207*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4      ; slide window: 34 becomes next 12
2208*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9      ; a1
2209*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m9      ; b1
2210*c0909341SAndroid Build Coastguard Worker    paddw               m13, m3
2211*c0909341SAndroid Build Coastguard Worker    paddw               m14, m4
2212*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5      ; slide window: 45 becomes next 23
2213*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6      ; slide window: 56 becomes next 34
2214*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m10     ; a2
2215*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m10     ; b2
2216*c0909341SAndroid Build Coastguard Worker    paddw               m13, m5
2217*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [r4+ssq*0] ; row 8
2218*c0909341SAndroid Build Coastguard Worker    paddw               m14, m6
2219*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m0, m12, 0x0d ; rows 6 / 7 (row 6 taken from m0's high qwords)
2220*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m12, m5, 0x0c ; rows 7 / 8, kept for the next iteration
2221*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m0  ; 67
2222*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0      ; 78
2223*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m5, m11 ; a3
2224*c0909341SAndroid Build Coastguard Worker    paddw               m13, m12
2225*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m6, m11 ; b3
2226*c0909341SAndroid Build Coastguard Worker    paddw               m14, m12
2227*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m7      ; rounding scale by the pw_512 constant
2228*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7
2229*c0909341SAndroid Build Coastguard Worker    packuswb            m13, m14     ; per lane: a (8 px), b (8 px)
2230*c0909341SAndroid Build Coastguard Worker    vpermq              m13, m13, q3120 ; reorder qwords to a (16 px) | b (16 px)
2231*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*0], xm13 ; row a
2232*c0909341SAndroid Build Coastguard Worker    vextracti128 [r7+dsq*1], m13, 1 ; row b
2233*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2234*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2235*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
2236*c0909341SAndroid Build Coastguard Worker    add                srcq, 16 ; advance to the next 16-pixel column
2237*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2238*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; reload h from the low byte of r6
2239*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one column done
2240*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop0
2241*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal path setup. Taken when the 0xf00 test on mxd at the function entry
; is non-zero. If my is also non-zero control goes to .hv; otherwise this is a
; horizontal-only filter. Widths <= 4 are delegated to the 6-tap function's
; .h_w2 label; wider blocks load the three byte shuffles (m6/m7/m8), the eight
; filter coefficients (m9 = bytes 0-3, m10 = bytes 4-7, each broadcast as a
; dword) and jump through the put/_8tap_h table indexed by tzcnt(w).
; m5 = rounding constant from pw_34, added before the final >> 6.
; NOTE(review): .h_w2/.h_w4 are defined here as well -- presumably so the jump
; table can reference them; the table itself is outside this chunk.
2242*c0909341SAndroid Build Coastguard Worker.h:
2243*c0909341SAndroid Build Coastguard Worker.h_w2:
2244*c0909341SAndroid Build Coastguard Worker.h_w4:
2245*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; any vertical fraction?
2246*c0909341SAndroid Build Coastguard Worker    jnz .hv
2247*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
2248*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2249*c0909341SAndroid Build Coastguard Worker    jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2 ; w <= 4: handled by the 6-tap function
2250*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
2251*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; table index = tzcnt(w)
2252*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufA]
2253*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; mxd = 8tap_h index
2254*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufB]
2255*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3 ; start three pixels left of the output pixel
2256*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufC]
2257*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)] ; 16-bit table entry
2258*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [r8+mxq*8+subpel_filters-put_avx2+0] ; coefficients 0-3
2259*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+4] ; coefficients 4-7
2260*c0909341SAndroid Build Coastguard Worker    add                  wq, r8 ; table entry is an offset from r8
2261*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; dispatch to the width-specific .h_w* code
2262*c0909341SAndroid Build Coastguard Worker.h_w8:
; PUT_8TAP_H dst/src, tmp1, tmp2, tmp3
; Horizontal 8-tap filter of the bytes in m%1; the rounded 16-bit results are
; left in m%1 (not packed to bytes). m%2..m%4 are clobbered as scratch.
; Expects the registers loaded in .h: m6/m7/m8 = subpel_h_shufA/B/C byte
; shuffles, m9/m10 = coefficients 0-3 / 4-7, m5 = rounding constant.
; With A/B/C = m%1 shuffled by shufA/B/C, it computes
;   m%1 = (phaddw(A*m9 + B*m10, B*m9 + C*m10) + m5) >> 6
; where '*' is pmaddubsw. The shuffle tables themselves are outside this chunk.
2263*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
2264*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m%1, m7 ; B
2265*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m%1, m8 ; C
2266*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m6      ; A
2267*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%4, m%2, m9 ; B * coefficients 0-3
2268*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%2, m10     ; B * coefficients 4-7
2269*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m10     ; C * coefficients 4-7
2270*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%1, m9      ; A * coefficients 0-3
2271*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%4
2272*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%2
2273*c0909341SAndroid Build Coastguard Worker    phaddw              m%1, m%3     ; add adjacent partial sums
2274*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m5      ; rounding
2275*c0909341SAndroid Build Coastguard Worker    psraw               m%1, 6
2276*c0909341SAndroid Build Coastguard Worker%endmacro
; Body of the .h_w8 loop (the .h_w8 label sits just before the PUT_8TAP_H macro
; definition). Horizontal-only filter, w == 8: two rows per iteration, one row
; per 128-bit lane of m0.
2277*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0] ; row 0 -> lane 0
2278*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1 ; row 1 -> lane 1
2279*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2280*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 1, 2, 3
2281*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2282*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1 ; low qword = row 0, high qword = row 1
2283*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
2284*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
2285*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2286*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2287*c0909341SAndroid Build Coastguard Worker    jg .h_w8
2288*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal-only filter, w == 16: two rows per iteration. m0 filters the
; pixels loaded at offset 0 and m1 those at offset 8, each with row 0 in lane 0
; and row 1 in lane 1, so the per-lane packuswb yields one full 16-pixel row
; per lane.
2289*c0909341SAndroid Build Coastguard Worker.h_w16:
2290*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+8*0]
2291*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+8*0], 1
2292*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*0+8*1]
2293*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*1+8*1], 1
2294*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 2, 3, 4
2295*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2296*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            1, 2, 3, 4
2297*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; lane 0 = row 0, lane 1 = row 1
2298*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
2299*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
2300*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2301*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2302*c0909341SAndroid Build Coastguard Worker    jg .h_w16
2303*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal-only filter for w == 32/64/128: one row per outer iteration,
; 32 pixels per inner iteration. r6 starts at -(w - 32) and counts up in steps
; of 32 until it becomes positive; srcq/dstq are biased by -r6 once so that
; [srcq+r6] / [dstq+r6] start at the first column. r4 keeps the initial r6 so
; it can be reset at the start of each row.
2304*c0909341SAndroid Build Coastguard Worker.h_w32:
2305*c0909341SAndroid Build Coastguard Worker    xor                 r6d, r6d ; one 32-pixel chunk per row
2306*c0909341SAndroid Build Coastguard Worker    jmp .h_start
2307*c0909341SAndroid Build Coastguard Worker.h_w64:
2308*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*1 ; two chunks per row
2309*c0909341SAndroid Build Coastguard Worker    jmp .h_start
2310*c0909341SAndroid Build Coastguard Worker.h_w128:
2311*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*3 ; four chunks per row
2312*c0909341SAndroid Build Coastguard Worker.h_start:
2313*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; bias pointers so index r6 addresses column 0
2314*c0909341SAndroid Build Coastguard Worker    sub                dstq, r6
2315*c0909341SAndroid Build Coastguard Worker    mov                  r4, r6 ; remember the per-row start offset
2316*c0909341SAndroid Build Coastguard Worker.h_loop:
2317*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+8*0]
2318*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+8*1]
2319*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            0, 2, 3, 4
2320*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H            1, 2, 3, 4
2321*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2322*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r6], m0
2323*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
2324*c0909341SAndroid Build Coastguard Worker    jle .h_loop ; more chunks left in this row
2325*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next row
2326*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
2327*c0909341SAndroid Build Coastguard Worker    mov                  r6, r4 ; reset the column offset
2328*c0909341SAndroid Build Coastguard Worker    dec                  hd
2329*c0909341SAndroid Build Coastguard Worker    jg .h_loop
2330*c0909341SAndroid Build Coastguard Worker    RET
; Combined horizontal + vertical path, common setup for w <= 4 (wider blocks
; branch to .hv_w8, which is outside this chunk).
; Horizontal: the byte-0 index of mx (4tap_h per the entry comments) selects
; the filter and only its middle four coefficients (bytes 2..5) are loaded into
; m7; srcq is moved one pixel left accordingly.
; Vertical: index chosen as in .v (byte-0 index when h < 6, else the 8tap_v
; index); the eight byte coefficients are sign-extended to words and split into
; pairs m10..m13 (taps 0-1, 2-3, 4-5, 6-7) for pmaddwd. srcq is moved three
; rows up.
; m8 = pw_8192 constant for the pmulhrsw after the horizontal pass,
; m9 = pd_512 constant added before the final psrad by 10.
; NOTE(review): WIN64_SPILL_XMM is an x86inc macro defined outside this chunk.
2331*c0909341SAndroid Build Coastguard Worker.hv:
2332*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      14, 16
2333*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2334*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
2335*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; mxd = byte 0 of mx (4tap_h index)
2336*c0909341SAndroid Build Coastguard Worker    dec                srcq ; start one pixel left of the output pixel
2337*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [r8+mxq*8+subpel_filters-put_avx2+2] ; coefficients 2-5
2338*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd = byte 0 of my (4tap_v index)
2339*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; myd = 8tap_v index
2340*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2341*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the 4tap_v index instead
2342*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2] ; all 8 vertical coefficients
2343*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2344*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q ; start three rows above the output row
2345*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2346*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2347*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_8192]
2348*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_512]
2349*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000 ; taps 0-1
2350*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111 ; taps 2-3
2351*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222 ; taps 4-5
2352*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q3333 ; taps 6-7
2353*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2354*c0909341SAndroid Build Coastguard Worker    je .hv_w4
; Horizontal + vertical filter, w == 2 (fall-through from the .hv setup when
; w != 4). Seven source rows are loaded 8 bytes each, filtered horizontally
; (pshufb with subpel_h_shuf4, pmaddubsw with m7, phaddw, pmulhrsw with m8) to
; 16-bit intermediates, and arranged as word-interleaved row pairs:
; xm1 = 01 12, xm2 = 23 34, xm3 = 45 56. Each loop iteration filters two more
; rows horizontally, builds 67 78, applies the vertical taps m10..m13 with
; pmaddwd, and writes two 2-pixel output rows.
; xm0 carries the latest horizontally filtered rows into the next iteration.
2355*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shuf4]
2356*c0909341SAndroid Build Coastguard Worker    movq                xm2, [srcq+ssq*0] ; row 0
2357*c0909341SAndroid Build Coastguard Worker    movhps              xm2, [srcq+ssq*1] ; row 1
2358*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*2] ; row 2
2359*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2360*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*0] ; row 3
2361*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1] ; row 4
2362*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*2] ; row 5
2363*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2364*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ssq*0] ; row 6
2365*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x30 ; row 4 -> upper lane, low qword
2366*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0x30 ; row 6 -> upper lane, low qword
2367*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0xc0 ; row 5 -> upper lane, high qword
2368*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
2369*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6
2370*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2371*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2372*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0 ; lane 0 = rows 0 1 2 3, lane 1 = rows 4 5 6 (one dword per row)
2373*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8
2374*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m2, 1 ; xm3 = rows 4 5 6
2375*c0909341SAndroid Build Coastguard Worker    palignr             xm4, xm3, xm2, 4 ; rows 1 2 3 4
2376*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm4  ; 01 12
2377*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm4       ; 23 34
2378*c0909341SAndroid Build Coastguard Worker    pshufd              xm0, xm3, q2121 ; rows 5 6 5 6
2379*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0       ; 45 56
2380*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
2381*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ssq*1] ; row 7
2382*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2383*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ssq*0] ; row 8
2384*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm6
2385*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm7
2386*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm1, xm10 ; a0 b0
2387*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2       ; slide window: 23 34 becomes next 01 12
2388*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm2, xm11      ; a1 b1
2389*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm2
2390*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3       ; slide window: 45 56 becomes next 23 34
2391*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm12      ; a2 b2
2392*c0909341SAndroid Build Coastguard Worker    phaddw              xm4, xm4       ; rows 7 8 7 8
2393*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm8
2394*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm3
2395*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm4, xm0, 12 ; previous last row (6) followed by rows 7 8
2396*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm4       ; keep rows 7 8 for the next iteration
2397*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0       ; 67 78
2398*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm3, xm13 ; a3 b3
2399*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm9       ; rounding
2400*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm4
2401*c0909341SAndroid Build Coastguard Worker    psrad               xm5, 10
2402*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5
2403*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm5       ; saturate to unsigned bytes
2404*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm5, 0 ; row a = word 0
2405*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm5, 1 ; row b = word 1
2406*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2407*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2408*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
2409*c0909341SAndroid Build Coastguard Worker    RET
2410*c0909341SAndroid Build Coastguard Worker.hv_w4:
2411*c0909341SAndroid Build Coastguard Worker    mova                 m6, [subpel_h_shuf4]
2412*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
2413*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
2414*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*2]
2415*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2416*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+ssq*0]
2417*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
2418*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0xcc ; 0 1
2419*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*2]
2420*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2421*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ssq*0]
2422*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m5, 0xcc ; 2 3
2423*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m4, 0xcc ; 4 5
2424*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
2425*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6
2426*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6
2427*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
2428*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2429*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2430*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
2431*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2432*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0
2433*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m1
2434*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8
2435*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m8
2436*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4
2437*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4   ; 01 12
2438*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4       ; 23 34
2439*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121
2440*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0       ; 45 56
2441*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2442*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
2443*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2444*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m10  ; a0 b0
2445*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2446*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11      ; a1 b1
2447*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
2448*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2449*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12      ; a2 b2
2450*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
2451*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*0]
2452*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, 0xcc ; 7 8
2453*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6
2454*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7
2455*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4
2456*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8
2457*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m0, 12
2458*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
2459*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0       ; 67 78
2460*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, m13  ; a3 b3
2461*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9
2462*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
2463*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
2464*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m5, 1
2465*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm4
2466*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm5
2467*c0909341SAndroid Build Coastguard Worker    pshuflw             xm5, xm5, q3120
2468*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm5
2469*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm5, 1
2470*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2471*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2472*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2473*c0909341SAndroid Build Coastguard Worker    RET
2474*c0909341SAndroid Build Coastguard Worker.hv_w8:
2475*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16
2476*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2477*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
2478*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+0]
2479*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [r8+mxq*8+subpel_filters-put_avx2+4]
2480*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2481*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2482*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2483*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
2484*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
2485*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2486*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
2487*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2488*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2489*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000
2490*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111
2491*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222
2492*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q3333
2493*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-64]
2494*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*4]
2495*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
2496*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufA]
2497*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]
2498*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ss3q]
2499*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufB]
2500*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*1]
2501*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2502*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shufC]
2503*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ssq*2]
2504*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [r4+ssq*0]
2505*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0xf0      ; 0 3
2506*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r4+ssq*1], 1 ; 1 4
2507*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r4+ssq*2], 1 ; 2 5
2508*c0909341SAndroid Build Coastguard Worker    add                  r4, ss3q
2509*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r4+ssq*0], 1 ; 3 6
2510*c0909341SAndroid Build Coastguard Worker%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
; Horizontal filter step used by the .hv_w8 path.
;   %1      in: packed source bytes, out: filtered words (unrounded)
;   %2-%4   scratch registers, clobbered
;   %5-%7   pshufb masks (subpel_h_shufA/B/C at the call sites in this file)
; Implicit inputs: m10 and m11, loaded in .hv_w8 from bytes 0-3 and 4-7 of the
; selected subpel_filters entry.
; No rounding is applied here; the visible callers follow up with
; pmulhrsw against pw_8192 and a vpermq q3120.
; The instruction order is load-bearing: each shuffle mask is read no later
; than the instruction that first writes the matching scratch register, which
; is what allows the .hv_w8_loop call to pass the same registers for tmp[1-3]
; and shuf[1-3].
2511*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, %6 ; tmp2 = src gathered by shuf2
2512*c0909341SAndroid Build Coastguard Worker    pshufb               %4, %1, %7 ; tmp3 = src gathered by shuf3
2513*c0909341SAndroid Build Coastguard Worker    pshufb               %1, %5 ; src  = src gathered by shuf1 (last read of %5)
2514*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, %3, m10 ; tmp1 = shuf2 gather * filter bytes 0-3
2515*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %4, m11 ; tmp3 = shuf3 gather * filter bytes 4-7
2516*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, m11 ; tmp2 = shuf2 gather * filter bytes 4-7
2517*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, m10 ; src  = shuf1 gather * filter bytes 0-3
2518*c0909341SAndroid Build Coastguard Worker    paddw                %2, %4 ; tmp1 += tmp3
2519*c0909341SAndroid Build Coastguard Worker    paddw                %1, %3 ; src  += tmp2
2520*c0909341SAndroid Build Coastguard Worker    phaddw               %1, %2 ; add adjacent word pairs of src and tmp1 into %1
2521*c0909341SAndroid Build Coastguard Worker%endmacro
2522*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m4, m1, m2, m3, m7, m8, m9
2523*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m5, m1, m2, m3, m7, m8, m9
2524*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m6, m1, m2, m3, m7, m8, m9
2525*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m0, m1, m2, m3, m7, m8, m9
2526*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]
2527*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
2528*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
2529*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, q3120
2530*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
2531*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
2532*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7
2533*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7
2534*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120
2535*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5  ; 01
2536*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
2537*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6  ; 12
2538*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6      ; 45
2539*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m7  ; 23
2540*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 56
2541*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
2542*c0909341SAndroid Build Coastguard Worker    vextracti128        r6m, m0, 1 ; not enough registers
2543*c0909341SAndroid Build Coastguard Worker    movu                xm0, [r4+ssq*1]
2544*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
2545*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r4+ssq*0], 1 ; 7 8
2546*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m1, m12 ; a0
2547*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m2, m12 ; b0
2548*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2549*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2550*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13     ; a1
2551*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13     ; b1
2552*c0909341SAndroid Build Coastguard Worker    paddd                m8, m3
2553*c0909341SAndroid Build Coastguard Worker    paddd                m9, m4
2554*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2555*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2556*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m14     ; a2
2557*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14     ; b2
2558*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5
2559*c0909341SAndroid Build Coastguard Worker    paddd                m9, m6
2560*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufB]
2561*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufC]
2562*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [subpel_h_shufA]
2563*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m0, m5, m6, m7, m5, m6, m7
2564*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_8192]
2565*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pd_512]
2566*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, r6m
2567*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
2568*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
2569*c0909341SAndroid Build Coastguard Worker    paddd                m9, m7
2570*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120 ; 7 8
2571*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m7, 0x04  ; 6 7
2572*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7  ; 67
2573*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 78
2574*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, m15 ; a3
2575*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
2576*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m6, m15 ; b3
2577*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9
2578*c0909341SAndroid Build Coastguard Worker    psrad                m8, 10
2579*c0909341SAndroid Build Coastguard Worker    psrad                m7, 10
2580*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m7
2581*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m8, 1
2582*c0909341SAndroid Build Coastguard Worker    packuswb            xm8, xm7
2583*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm8, q3120
2584*c0909341SAndroid Build Coastguard Worker    movq         [r7+dsq*0], xm7
2585*c0909341SAndroid Build Coastguard Worker    movhps       [r7+dsq*1], xm7
2586*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2587*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2588*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
2589*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2590*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2591*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
2592*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
2593*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
2594*c0909341SAndroid Build Coastguard Worker    RET
2595*c0909341SAndroid Build Coastguard Worker
2596*c0909341SAndroid Build Coastguard Worker%if WIN64 ; pick the t0/t1 scratch registers per ABI
; DECLARE_REG_TMP is defined outside this chunk; presumably it binds the
; t0/t1 aliases to the listed register numbers (t0d/t1d are read at the top
; of prep_6tap_8bpc below). Only the second register differs on WIN64.
2597*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4
2598*c0909341SAndroid Build Coastguard Worker%else
2599*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
2600*c0909341SAndroid Build Coastguard Worker%endif
2601*c0909341SAndroid Build Coastguard Worker
; Entry points for the prep 8-tap filter-type combinations listed below.
; NOTE(review): the FN macro is defined earlier in the file and is not visible
; here. Presumably each line emits a prep_8tap_<name> entry that loads the two
; filter-type constants into t0/t1 and jumps to the function named in the last
; argument; confirm against the FN definition.
; The final "regular" line names no jump target; presumably it falls through
; into the cglobal that follows - TODO confirm.
2602*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap,
2603*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
2604*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
2605*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
2606*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular,        REGULAR, REGULAR
2607*c0909341SAndroid Build Coastguard Worker
2608*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns
2609*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2610*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
2611*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2612*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
2613*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep%+SUFFIX]
2614*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
2615*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2616*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2617*c0909341SAndroid Build Coastguard Worker    jnz .h
2618*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2619*c0909341SAndroid Build Coastguard Worker    jnz .v
2620*c0909341SAndroid Build Coastguard Worker.prep:
2621*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2622*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep,)]
2623*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
2624*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2625*c0909341SAndroid Build Coastguard Worker%if WIN64
2626*c0909341SAndroid Build Coastguard Worker    pop                  r7
2627*c0909341SAndroid Build Coastguard Worker%endif
2628*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2629*c0909341SAndroid Build Coastguard Worker.v:
2630*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10, 12
2631*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2632*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2633*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2634*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
2635*c0909341SAndroid Build Coastguard Worker    lea                 myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
2636*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pw_8192]
2637*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [myq+0]
2638*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
2639*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [myq+2]
2640*c0909341SAndroid Build Coastguard Worker    neg                 nsq
2641*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+4]
2642*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2643*c0909341SAndroid Build Coastguard Worker    jg .v_w16
2644*c0909341SAndroid Build Coastguard Worker    je .v_w8
2645*c0909341SAndroid Build Coastguard Worker.v_w4:
2646*c0909341SAndroid Build Coastguard Worker    movd                xm2, [srcq+nsq*2]
2647*c0909341SAndroid Build Coastguard Worker    pinsrd              xm2, [srcq+nsq*1], 1
2648*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [srcq+ssq*0]
2649*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [srcq+ssq*1]
2650*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [srcq+ssq*2]
2651*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [deint_shuf4]
2652*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xeb
2653*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m0
2654*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0x60 ; 0 1 2 _   2 3 4 _
2655*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5       ; 01  12    23  34
2656*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2657*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2658*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [srcq+nsq*1], 1
2659*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [srcq+ssq*0]
2660*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [srcq+ssq*1]
2661*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, 0xeb
2662*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [srcq+ssq*2]
2663*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m0
2664*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x60 ; 4 5 6 _   6 7 8 _
2665*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5       ; 45  56    67  78
2666*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m6   ; a0  b0    c0  d0
2667*c0909341SAndroid Build Coastguard Worker    vperm2i128           m1, m2, 0x21 ; 23  34    45  56
2668*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m8   ; a2  b2    c2  d2
2669*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7       ; a1  b1    c1  d1
2670*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
2671*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
2672*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9
2673*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2674*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m3
2675*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2676*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2677*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2678*c0909341SAndroid Build Coastguard Worker    RET
2679*c0909341SAndroid Build Coastguard Worker.v_w8:
2680*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+nsq*2]
2681*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+nsq*1]
2682*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
2683*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
2684*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*2]
2685*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0x30
2686*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x30
2687*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3     ; 01 12
2688*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0x30
2689*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
2690*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4     ; 23 34
2691*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
2692*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2693*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6     ; a0
2694*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+nsq*1]
2695*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m7 ; a1
2696*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m2, m6 ; b0
2697*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*0]
2698*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m3, 0x30
2699*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x30
2700*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
2701*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0, m3 ; 45 56
2702*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
2703*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*2]
2704*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x30
2705*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0x30
2706*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3     ; 67 78
2707*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m7 ; b1
2708*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
2709*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m8 ; a2
2710*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
2711*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m8 ; b2
2712*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
2713*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9
2714*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9
2715*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m4
2716*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m5
2717*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
2718*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2719*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
2720*c0909341SAndroid Build Coastguard Worker    RET
2721*c0909341SAndroid Build Coastguard Worker.v_w16:
2722*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*2-32]
2723*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+nsq*2]
2724*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       12
2725*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
2726*c0909341SAndroid Build Coastguard Worker.v_w16_loop0:
2727*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [srcq+ssq*0]
2728*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
2729*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+ssq*1]
2730*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
2731*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [r5+ssq*0]
2732*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [r5+ssq*1]
2733*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
2734*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r5+ssq*0]
2735*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m0, 0x0c
2736*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m1, 0x0c
2737*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3, m4 ; 01
2738*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4     ; 23
2739*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m2, 0x0c
2740*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4, m0 ; 12
2741*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m0     ; 34
2742*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
2743*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [r5+ssq*1]
2744*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m1, m6 ; a0
2745*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
2746*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m2, m6 ; b0
2747*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2748*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7     ; a1
2749*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2750*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7     ; b1
2751*c0909341SAndroid Build Coastguard Worker    paddw               m10, m3
2752*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r5+ssq*0]
2753*c0909341SAndroid Build Coastguard Worker    paddw               m11, m4
2754*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m5, 0x0d
2755*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m5, m3, 0x0c
2756*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m0 ; 45
2757*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m0     ; 56
2758*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m8 ; a2
2759*c0909341SAndroid Build Coastguard Worker    paddw               m10, m5
2760*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m4, m8 ; b2
2761*c0909341SAndroid Build Coastguard Worker    paddw               m11, m5
2762*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m9
2763*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m9
2764*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], m10
2765*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*2], m11
2766*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*4]
2767*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2768*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
2769*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
2770*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2771*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
2772*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
2773*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop0
2774*c0909341SAndroid Build Coastguard Worker    RET
2775*c0909341SAndroid Build Coastguard Worker.h_w4:
2776*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2777*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2778*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [subpel_h_shufA]
2779*c0909341SAndroid Build Coastguard Worker    dec                srcq
2780*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
2781*c0909341SAndroid Build Coastguard Worker    lea                  r3, [ssq*3]
2782*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
2783*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
2784*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*2]
2785*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*1]
2786*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x30
2787*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+r3   ]
2788*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2789*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0x30
2790*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
2791*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
2792*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
2793*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
2794*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
2795*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2796*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
2797*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2798*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2799*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
2800*c0909341SAndroid Build Coastguard Worker    RET
2801*c0909341SAndroid Build Coastguard Worker.h:
2802*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2803*c0909341SAndroid Build Coastguard Worker    jnz .hv
2804*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_8192]
2805*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2806*c0909341SAndroid Build Coastguard Worker    je .h_w4
2807*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10
2808*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2809*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [z_filter_s+ 2]
2810*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2811*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [z_filter_s+ 6]
2812*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2813*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [z_filter_s+10]
2814*c0909341SAndroid Build Coastguard Worker    lea                 mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX]
2815*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep, _6tap_h)]
2816*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [mxq+0]
2817*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [mxq+2]
2818*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
2819*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [mxq+4]
2820*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2821*c0909341SAndroid Build Coastguard Worker.h_w8:
2822*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
2823*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1], 1
2824*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2825*c0909341SAndroid Build Coastguard Worker%macro PREP_6TAP_H 0
; Horizontal 6-tap filter of the source bytes held in m0, for the prep .h
; paths. Takes no macro arguments; it relies on registers set up in .h:
;   m0          in: source bytes, out: filtered and rounded words
;   m1, m2      scratch, clobbered
;   m3, m5, m6  pshufb masks loaded from z_filter_s+2, +6 and +10
;   m7, m8, m9  coefficient byte pairs broadcast from mxq+0, +2 and +4, where
;               mxq points at byte 1 of the selected subpel_filters entry
;   m4          pw_8192, so the final pmulhrsw computes (sum + 2) >> 2
2826*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m3 ; gather bytes for the first coefficient pair
2827*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7 ; m1 = partial sum, coefficient pair at mxq+0
2828*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m5 ; gather bytes for the second coefficient pair
2829*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8 ; m2 = partial sum, coefficient pair at mxq+2
2830*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6 ; gather bytes for the third pair (in place)
2831*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9 ; m0 = partial sum, coefficient pair at mxq+4
2832*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; combine the first two partial sums
2833*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; m0 = sum of all three partial sums
2834*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4 ; round and scale by pw_8192
2835*c0909341SAndroid Build Coastguard Worker%endmacro
2836*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H
2837*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
2838*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2839*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2840*c0909341SAndroid Build Coastguard Worker    jg .h_w8
2841*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16: horizontal-only filtering, two rows per iteration. Each row is
; filtered in a single PREP_6TAP_H pass: the low lane of m0 is loaded from
; byte offset 0 of the row and the high lane from byte offset 8. One 32-byte
; result is stored per row, so tmpq advances by 64 bytes per iteration.
2842*c0909341SAndroid Build Coastguard Worker.h_w16:
2843*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0+8*0]
2844*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0+8*1], 1
2845*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H
2846*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
2847*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*1+8*0]
2848*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*1+8*1], 1
2849*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2850*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H
2851*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m0
2852*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
2853*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2854*c0909341SAndroid Build Coastguard Worker    jg .h_w16
2855*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32/.h_w64/.h_w128: horizontal-only filtering, one row at a time in
; chunks of 32 source bytes. r6 is the (non-positive) byte offset of the
; current chunk relative to the last chunk of the row: it starts at 0, -32
; or -96 and the inner loop runs while it is <= 0, i.e. 1, 2 or 4 times.
2856*c0909341SAndroid Build Coastguard Worker.h_w32:
2857*c0909341SAndroid Build Coastguard Worker    xor                 r6d, r6d
2858*c0909341SAndroid Build Coastguard Worker    jmp .h_start
2859*c0909341SAndroid Build Coastguard Worker.h_w64:
2860*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*1
2861*c0909341SAndroid Build Coastguard Worker    jmp .h_start
2862*c0909341SAndroid Build Coastguard Worker.h_w128:
2863*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*3
2864*c0909341SAndroid Build Coastguard Worker.h_start:
; Bias srcq by -r6 so that [srcq+r6] addresses the start of the row, and keep
; the initial offset in r5 to reset r6 at the start of every row.
2865*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
2866*c0909341SAndroid Build Coastguard Worker    mov                  r5, r6
2867*c0909341SAndroid Build Coastguard Worker.h_loop:
; Two PREP_6TAP_H passes per chunk (16-byte loads at offsets 0/8 and 16/24),
; producing 64 bytes of output in tmpq.
2868*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+r6+8*0]
2869*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+r6+8*1], 1
2870*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H
2871*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
2872*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+r6+8*2]
2873*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+r6+8*3], 1
2874*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H
2875*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m0
2876*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
2877*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
2878*c0909341SAndroid Build Coastguard Worker    jle .h_loop
; Row finished: step to the next source row and restore the chunk offset.
2879*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
2880*c0909341SAndroid Build Coastguard Worker    mov                  r6, r5
2881*c0909341SAndroid Build Coastguard Worker    dec                  hd
2882*c0909341SAndroid Build Coastguard Worker    jg .h_loop
2883*c0909341SAndroid Build Coastguard Worker    RET
; .hv: combined horizontal + vertical filtering. w == 4 falls through to
; .hv_w4, every other width is handled by .hv_w8.
2884*c0909341SAndroid Build Coastguard Worker.hv:
2885*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      14, 16
2886*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2887*c0909341SAndroid Build Coastguard Worker    jne .hv_w8
; .hv_w4 register roles, as set up below:
;   m6      = subpel_h_shuf4 (pshufb mask for the horizontal pass)
;   m7      = 4 horizontal coefficients: bytes 2..5 of the 8-byte filter
;             entry indexed by the low byte of mx
;   m8, m9  = pw_8192 (pmulhrsw scale), pd_32 (rounding before the >> 6)
;   m10-m12 = vertical coefficient pairs, sign-extended to words, taken from
;             bytes 1-2, 3-4 and 5-6 of the selected filter entry
;   m13     = deint_shuf4 zero-extended to dwords (vpermd index)
;   nsq     = -ssq
; The vertical filter index is the low byte of my when h == 4, otherwise
; my >> 16.
2888*c0909341SAndroid Build Coastguard Worker.hv_w4:
2889*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2890*c0909341SAndroid Build Coastguard Worker    dec                srcq
2891*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
2892*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2893*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2894*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2895*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
2896*c0909341SAndroid Build Coastguard Worker    mova                 m6, [subpel_h_shuf4]
2897*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX]
2898*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
2899*c0909341SAndroid Build Coastguard Worker    pmovzxbd            m13, [deint_shuf4]
2900*c0909341SAndroid Build Coastguard Worker    neg                 nsq
2901*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_8192]
2902*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pd_32]
2903*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
; Prologue: load the five rows src-2*ss .. src+2*ss (rows 0..4) and run the
; horizontal pass on them.
2904*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+nsq*2]
2905*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2906*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+nsq*1]
2907*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000
2908*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ssq*0]
2909*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111
2910*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*1]
2911*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222
2912*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+ssq*2]
2913*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0xcc ; 0 1
2914*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xcc ; 2 3
2915*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
2916*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
2917*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6
2918*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2919*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2920*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2921*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m1       ; 0 1 2 3
2922*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m0       ; 4
2923*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8
2924*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8
2925*c0909341SAndroid Build Coastguard Worker    palignr              m0, m2, 4
2926*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0   ; 01 12
2927*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0       ; 23 34
; Main loop: four new source rows in, four output rows (32 bytes) out per
; iteration. m1/m2 carry the interleaved row pairs and m0 the most recent
; horizontally filtered rows into the next iteration.
2928*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2929*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m10, m1  ; a0 b0
2930*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2931*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2, m10  ; c0 d0
2932*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+nsq*1]
2933*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11      ; a1 b1
2934*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*0]
2935*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
2936*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*1]
2937*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xcc ; 5 6
2938*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ssq*2]
2939*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0xcc ; 7 8
2940*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
2941*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
2942*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2943*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2944*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m2       ; 5 6 7 8
2945*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
2946*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9
2947*c0909341SAndroid Build Coastguard Worker    paddd                m4, m9
2948*c0909341SAndroid Build Coastguard Worker    palignr              m2, m1, m0, 12
2949*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
2950*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0   ; 45 56
2951*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0       ; 67 78
2952*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11, m1  ; c1 d1
2953*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
2954*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m1  ; a2 b2
2955*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
2956*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m2  ; c2 d2
2957*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
2958*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
2959*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
2960*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
2961*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m13, m4
2962*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
2963*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
2964*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2965*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2966*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: combined horizontal + vertical filtering for every width other
; than 4, processed in vertical strips of 8 columns.
; Register roles, as set up below:
;   m10-m12 = horizontal coefficient pairs (words at bytes 1, 3 and 5 of the
;             filter entry indexed by mx >> 16)
;   m13-m15 = vertical coefficient pairs, sign-extended to words; the filter
;             index is the low byte of my when h < 6, otherwise my >> 16
;   m8, m9  = z_filter_s+6 and z_filter_s+10 (shuffle masks handed to
;             HV_H_6TAP_W8, a macro defined outside this excerpt)
;   r6d     = h + 256*(w/8 - 1): the low byte keeps h for reloading, the
;             upper bits count the remaining 8-column strips
; r7 (the table base until now) is reused: first as 2*ss+2 to rewind srcq,
; then as the per-strip output pointer.
2967*c0909341SAndroid Build Coastguard Worker.hv_w8:
2968*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2969*c0909341SAndroid Build Coastguard Worker    lea                 mxq, [r7+mxq*8+subpel_filters+1-prep_avx2]
2970*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16
2971*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [mxq+0]
2972*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [mxq+2]
2973*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m12, [mxq+4]
2974*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2975*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2976*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2977*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
2978*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r7+myq*8+subpel_filters+1-prep_avx2]
2979*c0909341SAndroid Build Coastguard Worker    lea                  r7, [ssq*2+2]
2980*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z_filter_s+ 6]
2981*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
2982*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [z_filter_s+10]
2983*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2984*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-64]
2985*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q0000
2986*c0909341SAndroid Build Coastguard Worker    sub                srcq, r7
2987*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q1111
2988*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*4]
2989*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q2222
; Per-strip prologue: r5 walks the source rows, r7 walks the output. The first
; five rows are loaded, filtered horizontally and interleaved into row pairs.
2990*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
2991*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [z_filter_s+2]
2992*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*0]
2993*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
2994*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*1]
2995*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [r5+ssq*0]
2996*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
2997*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r5+ssq*1], 1 ; 1 3
2998*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
2999*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0xf0      ; 0 2
3000*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r5+ssq*0], 1 ; 2 4
3001*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_8192]
3002*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m3, m1, m2, m7, m8, m9
3003*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m4, m1, m2, m7, m8, m9
3004*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m0, m1, m2, m7, m8, m9
3005*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
3006*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
3007*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
3008*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
3009*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
3010*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
3011*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4  ; 01
3012*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4      ; 23
3013*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0  ; 12
3014*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 34
; Inner loop: two new source rows in, two output rows out per iteration.
; m3 and m4 are temporarily reused for the z_filter_s+2 mask and the
; pw_8192/pd_32 constants because all 16 vector registers are live.
3015*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3016*c0909341SAndroid Build Coastguard Worker    movu                xm7, [r5+ssq*1]
3017*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3018*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r5+ssq*0], 1 ; 5 6
3019*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13, m1 ; a0
3020*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3021*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13, m2 ; b0
3022*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3023*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14     ; a1
3024*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m14     ; b1
3025*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3026*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [z_filter_s+2]
3027*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
3028*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP_W8         m7, m3, m4, m3, m8, m9
3029*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_8192]
3030*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pd_32]
3031*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
3032*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
3033*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
3034*c0909341SAndroid Build Coastguard Worker    mova                 m4, m0
3035*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, q3120
3036*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, 0x05
3037*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0  ; 45
3038*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15, m3 ; a2
3039*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 67
3040*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
3041*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15, m4 ; b2
3042*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
3043*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3044*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
3045*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
3046*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
; Output rows are w*2 bytes apart in tmp: row 0 at r7, row 1 at r7+w*2.
3047*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], xm5
3048*c0909341SAndroid Build Coastguard Worker    vextracti128  [r7+wq*2], m5, 1
3049*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*4]
3050*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3051*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
; Next strip: 8 source bytes / 16 output bytes to the right; reload h from
; the low byte of r6 and decrement the strip counter held in its upper bits.
3052*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
3053*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3054*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
3055*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
3056*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
3057*c0909341SAndroid Build Coastguard Worker    RET
3058*c0909341SAndroid Build Coastguard Worker
; Instantiations of the prep filter-type combinations that involve SHARP.
; PREP_8TAP_FN is defined outside this excerpt. NOTE(review): presumably each
; line emits an entry point that sets up t0d/t1d for the two filter types
; and transfers control to the function named in the last argument; the final
; line names no target and sits directly above prep_8tap_8bpc -- confirm that
; it is meant to fall through into it.
3059*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
3060*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_8bpc
3061*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
3062*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_8bpc
3063*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp,          SHARP,   SHARP
3064*c0909341SAndroid Build Coastguard Worker
; prep_8tap_8bpc entry and dispatch.
; mx and my are each replicated into three bytes (imul by 0x010101) and added
; to t0d/t1d, so a single register carries the fraction in bits 8-15 plus two
; filter-table indices in the low byte and in bits 16 and up.
; NOTE(review): t0d/t1d are not assigned in this excerpt; presumably they are
; provided by the PREP_8TAP_FN entry points -- confirm.
; Dispatch: mx fraction != 0 -> .h; else my fraction != 0 -> .v (falls
; through); both zero -> the .prep label of prep_6tap_8bpc.
; r7 holds the address of prep%+SUFFIX and is used as the base for the
; table-relative addressing that follows.
3065*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
3066*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
3067*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3068*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
3069*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
3070*c0909341SAndroid Build Coastguard Worker    lea                  r7, [prep%+SUFFIX]
3071*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
3072*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3073*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
3074*c0909341SAndroid Build Coastguard Worker    jnz .h
3075*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3076*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep
; .v: vertical-only filtering, common setup.
;   m7      = pw_8192 (pmulhrsw scale)
;   m8-m11  = the four coefficient pairs of the selected 8-byte filter entry
;             (words at byte offsets 0, 2, 4 and 6)
;   srcq    is moved up by three rows (stride3q = 3*stride)
; Width dispatch: w > 8 -> .v_w16, w == 8 -> .v_w8, otherwise fall through
; to .v_w4.
3077*c0909341SAndroid Build Coastguard Worker.v:
3078*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 15
3079*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
3080*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16  ; Note that the code is 8-tap only, having
3081*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
3082*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd ; had a negligible effect on performance.
3083*c0909341SAndroid Build Coastguard Worker    lea                 myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
3084*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3085*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q
3086*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]
3087*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, [myq+0]
3088*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [myq+2]
3089*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, [myq+4]
3090*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [myq+6]
3091*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3092*c0909341SAndroid Build Coastguard Worker    jg .v_w16
3093*c0909341SAndroid Build Coastguard Worker    je .v_w8
; .v_w4: vertical-only filtering for widths below 8. Rows are loaded 4 bytes
; at a time and packed into dword slots; the digits in the comments below are
; row numbers per dword, and two-digit groups are byte-interleaved row pairs.
; The prologue gathers rows 0..6 into m1 (01 12 23 34) and m2 (23 34 45 56);
; m0 keeps row 6 for the first loop iteration and m5 holds deint_shuf4.
3094*c0909341SAndroid Build Coastguard Worker.v_w4:
3095*c0909341SAndroid Build Coastguard Worker    movd                xm0, [srcq+strideq*0]
3096*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [srcq+strideq*2]
3097*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [srcq+strideq*1]
3098*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
3099*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [srcq+strideq*0]
3100*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
3101*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
3102*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [srcq+strideq*1]
3103*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [srcq+strideq*2]
3104*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
3105*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [srcq+stride3q ]
3106*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [deint_shuf4]
3107*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
3108*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
3109*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
3110*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3       ; 01  12    23  34
3111*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
3112*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3           ; 23  34    45  56
; Main loop: four new rows in, four output rows (32 bytes) out per iteration.
; m1, m2 and m0 are carried over to the next iteration.
3113*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
3114*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3115*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [srcq+strideq*0], 1
3116*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [srcq+strideq*1]
3117*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [srcq+strideq*2]
3118*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 _ _ _
3119*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [srcq+stride3q ]
3120*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m4, 0x20     ; 6 7 8 _   8 9 _ _
3121*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
3122*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5           ; 67  78    89  9a
3123*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m8
3124*c0909341SAndroid Build Coastguard Worker    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
3125*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
3126*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
3127*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
3128*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m11
3129*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
3130*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m10
3131*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
3132*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7
3133*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m3
3134*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3135*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3136*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
3137*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only filtering for w == 8. Rows are loaded 8 bytes at a
; time; each register holds two byte-interleaved row pairs, one per 128-bit
; lane (e.g. "01 12"). The prologue builds m1 = 01 12, m2 = 23 34 and
; m3 = 45 56 from rows 0..6; m0 keeps row 6 for the loop.
3138*c0909341SAndroid Build Coastguard Worker.v_w8:
3139*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+strideq*0]
3140*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+strideq*1]
3141*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+strideq*2]
3142*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+stride3q ]
3143*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3144*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+strideq*0]
3145*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+strideq*1]
3146*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*2]
3147*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x30
3148*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m2, 0x30
3149*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4 ; 01 12
3150*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m5, 0x30
3151*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m3, 0x30
3152*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5 ; 23 34
3153*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m6, 0x30
3154*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m0, 0x30
3155*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m6 ; 45 56
; Main loop: four new rows in, four output rows (64 bytes) out per iteration.
; m5 accumulates the "a" outputs and m6 the "b" outputs; the a0..a3 / b0..b3
; comments name the tap pair (m8..m11) applied by each pmaddubsw.
3156*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
3157*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+stride3q ]
3158*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3159*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m2, m9  ; a1
3160*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m2, m8  ; b0
3161*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m0, m4, 0x30
3162*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*0]
3163*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
3164*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4      ; 67 78
3165*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8      ; a0
3166*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, m9  ; b1
3167*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
3168*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3169*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m10     ; a2
3170*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4
3171*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
3172*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+strideq*1]
3173*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, m4, 0x30
3174*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*2]
3175*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0x30
3176*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4      ; 89 9a
3177*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m11 ; a3
3178*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4
3179*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m10 ; b2
3180*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4
3181*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, m11 ; b3
3182*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4
3183*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7
3184*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7
3185*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m5
3186*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m6
3187*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
3188*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3189*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
3190*c0909341SAndroid Build Coastguard Worker    RET
3191*c0909341SAndroid Build Coastguard Worker.v_w16:
3192*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*2-32]
3193*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       15
3194*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
3195*c0909341SAndroid Build Coastguard Worker.v_w16_loop0:
3196*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [srcq+strideq*0]
3197*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [srcq+strideq*1]
3198*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+strideq*2]
3199*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [r5+strideq*1]
3200*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [r5+strideq*0]
3201*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3202*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [r5+strideq*0]
3203*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [r5+strideq*1]
3204*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3205*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [r5+strideq*0]
3206*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3207*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, 0x0c
3208*c0909341SAndroid Build Coastguard Worker    shufpd               m5, m1, 0x0c
3209*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4, m5 ; 01
3210*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m5     ; 34
3211*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m2, 0x0c
3212*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5, m6 ; 12
3213*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6     ; 45
3214*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m3, 0x0c
3215*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m6, m0 ; 23
3216*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0     ; 56
3217*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
3218*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [r5+strideq*1]
3219*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3220*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m1, m8  ; a0
3221*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m2, m8  ; b0
3222*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3223*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3224*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9      ; a1
3225*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m9      ; b1
3226*c0909341SAndroid Build Coastguard Worker    paddw               m13, m3
3227*c0909341SAndroid Build Coastguard Worker    paddw               m14, m4
3228*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3229*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3230*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m10     ; a2
3231*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m10     ; b2
3232*c0909341SAndroid Build Coastguard Worker    paddw               m13, m5
3233*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [r5+strideq*0]
3234*c0909341SAndroid Build Coastguard Worker    paddw               m14, m6
3235*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m0, m12, 0x0d
3236*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m12, m5, 0x0c
3237*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m0  ; 67
3238*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m0      ; 78
3239*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m5, m11 ; a3
3240*c0909341SAndroid Build Coastguard Worker    paddw               m13, m12
3241*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m6, m11 ; b3
3242*c0909341SAndroid Build Coastguard Worker    paddw               m14, m12
3243*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m7
3244*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7
3245*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], m13
3246*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*2], m14
3247*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*4]
3248*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3249*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
3250*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
3251*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3252*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
3253*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
3254*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop0
3255*c0909341SAndroid Build Coastguard Worker    RET
; .h / .h_w4: entry of the horizontal filtering path.
; Branches to .hv when bits 8-11 of my are set, hands width 4 to the .h_w4
; code of the prep_6tap function, and otherwise loads the shuffle masks and
; filter coefficients and dispatches through a 16-bit jump table.
; Registers prepared for PREP_8TAP_H: m4 = pw_8192, m5-m7 = subpel_h_shufA/B/C,
; m8 = filter bytes 0-3, m9 = filter bytes 4-7.
; NOTE(review): r7 is set outside this chunk; the addressing below implies it
; holds the address of prep%+SUFFIX - confirm.
3256*c0909341SAndroid Build Coastguard Worker.h:
3257*c0909341SAndroid Build Coastguard Worker.h_w4:
3258*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; presumably: is a vertical filter selected as well?
3259*c0909341SAndroid Build Coastguard Worker    jnz .hv
    ; pmulhrsw by 8192 computes (x + 2) >> 2 on signed words.
3260*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_8192]
3261*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3262*c0909341SAndroid Build Coastguard Worker    je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 ; width 4 reuses the 6-tap function's code
3263*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10
3264*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [subpel_h_shufA]
3265*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; w = index of its lowest set bit (log2 for a power-of-two width)
3266*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufB]
3267*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufC]
3268*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; filter index is taken from the upper 16 bits of mx
3269*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3 ; start reading 3 bytes to the left of src
    ; Load the 16-bit jump table entry for this width index.
3270*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
    ; Each filter occupies 8 bytes in subpel_filters; split it into two dwords.
3271*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
3272*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
3273*c0909341SAndroid Build Coastguard Worker    add                  wq, r7 ; table entry is an offset relative to r7
3274*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w8: horizontal filtering, two rows per iteration.
; Loads 16 source bytes from each of two consecutive rows into the two lanes
; of m0, filters them with PREP_8TAP_H and stores 32 bytes to tmp.
3275*c0909341SAndroid Build Coastguard Worker.h_w8:
3276*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0]
3277*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*1], 1
3278*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
; PREP_8TAP_H: horizontal filter step shared by all .h_w* paths.
; The definition sits inside the loop body but emits no code by itself;
; code is only emitted at each invocation.
;   in:       m0    = source bytes (one 16-byte group per lane)
;             m5-m7 = byte shuffle masks (subpel_h_shufA/B/C)
;             m8/m9 = the two coefficient dwords of the filter
;             m4    = multiplier for the final pmulhrsw
;   out:      m0    = filtered signed words
;   clobbers: m1, m2, m3
3279*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_H 0
3280*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m5
3281*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m6
3282*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0, m7
3283*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8 ; shufA * coefficients 0-3
3284*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m8 ; shufB * coefficients 0-3
3285*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9 ; shufB * coefficients 4-7
3286*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9 ; shufC * coefficients 4-7
3287*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3288*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
    ; Per lane: sums from m1 land in the low four words, sums from m0 in the high four.
3289*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1, m0
3290*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
3291*c0909341SAndroid Build Coastguard Worker%endmacro
3292*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3293*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3294*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3295*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed per iteration
3296*c0909341SAndroid Build Coastguard Worker    jg .h_w8
3297*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16: horizontal filtering, two rows per iteration.
; For each row, the two lanes of m0 are loaded with 16-byte groups starting
; at byte offsets 0 and 8 (overlapping), filtered by PREP_8TAP_H and stored
; as 32 bytes. Row 0 goes to tmp+0, row 1 to tmp+32.
3298*c0909341SAndroid Build Coastguard Worker.h_w16:
3299*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*0+8*0]
3300*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*0+8*1], 1
3301*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3302*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
3303*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+strideq*1+8*0]
3304*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+strideq*1+8*1], 1
3305*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3306*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3307*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m0
3308*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
3309*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed per iteration
3310*c0909341SAndroid Build Coastguard Worker    jg .h_w16
3311*c0909341SAndroid Build Coastguard Worker    RET
; .h_w32 / .h_w64 / .h_w128: horizontal filtering, one row at a time.
; r6 is a non-positive column offset that counts up to zero in steps of 32:
; it starts at 0, -32 or -96, giving 1, 2 or 4 passes of .h_loop per row.
; srcq is biased by -r6 so that [srcq+r6] addresses the start of the row.
; r5 keeps the initial r6 for reloading at the start of each new row.
3312*c0909341SAndroid Build Coastguard Worker.h_w32:
3313*c0909341SAndroid Build Coastguard Worker    xor                 r6d, r6d
3314*c0909341SAndroid Build Coastguard Worker    jmp .h_start
3315*c0909341SAndroid Build Coastguard Worker.h_w64:
3316*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*1
3317*c0909341SAndroid Build Coastguard Worker    jmp .h_start
3318*c0909341SAndroid Build Coastguard Worker.h_w128:
3319*c0909341SAndroid Build Coastguard Worker    mov                  r6, -32*3
3320*c0909341SAndroid Build Coastguard Worker.h_start:
3321*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; srcq += -r6
3322*c0909341SAndroid Build Coastguard Worker    mov                  r5, r6 ; remember the per-row starting offset
3323*c0909341SAndroid Build Coastguard Worker.h_loop:
    ; Byte offsets 0 and 8 form the first 32-byte output store.
3324*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+r6+8*0]
3325*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+r6+8*1], 1
3326*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3327*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*0], m0
    ; Byte offsets 16 and 24 form the second 32-byte output store.
3328*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+r6+8*2]
3329*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+r6+8*3], 1
3330*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H
3331*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+32*1], m0
3332*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32*2
3333*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
3334*c0909341SAndroid Build Coastguard Worker    jle .h_loop ; more columns remain in this row while r6 <= 0
3335*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq ; next source row
3336*c0909341SAndroid Build Coastguard Worker    mov                  r6, r5 ; rewind the column offset
3337*c0909341SAndroid Build Coastguard Worker    dec                  hd
3338*c0909341SAndroid Build Coastguard Worker    jg .h_loop
3339*c0909341SAndroid Build Coastguard Worker    RET
; .hv: setup for combined horizontal + vertical filtering.
; Width 4 branches to .hv_w4; every other width falls through to the setup
; below and then jumps to .hv_w8.
; Registers prepared for .hv_w8:
;   m10/m11 = horizontal filter bytes 0-3 / 4-7 (broadcast dwords)
;   m12-m15 = vertical coefficient pairs (0,1) (2,3) (4,5) (6,7), sign-extended
;             to words and broadcast to every dword
;   srcq    = moved 3 bytes left and 3 rows up
3340*c0909341SAndroid Build Coastguard Worker.hv:
3341*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      16
3342*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3343*c0909341SAndroid Build Coastguard Worker    je .hv_w4
3344*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; horizontal filter index from the upper 16 bits of mx
3345*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
3346*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
3347*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
    ; Vertical filter index: low byte of my when h == 4, else my >> 16.
3348*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3349*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3350*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3351*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3352*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
3353*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3354*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q ; start 3 rows above src
    ; Duplicate each coefficient byte, then arithmetic-shift to get signed words.
3355*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3356*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3357*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000
3358*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111
3359*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222
3360*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q3333
3361*c0909341SAndroid Build Coastguard Worker    jmp .hv_w8
; .hv_w4: combined horizontal + vertical filtering for width 4.
; The horizontal pass uses only the 4 filter bytes at offset +2 (m8); the
; vertical pass uses all 8 coefficients (m12-m15, as word pairs).
; Seven source rows are filtered horizontally up front; each loop iteration
; then loads four more rows and stores 32 bytes of output.
; In the lane comments, "a" and "b" denote the low and high 128-bit lanes.
; Constants: m7 = subpel_h_shuf4, m9 = deint_shuf4 widened to dwords for
; vpermd, m10 = pw_8192 (pmulhrsw rounding), m11 = pd_32 (added before the
; final >> 6).
3362*c0909341SAndroid Build Coastguard Worker.hv_w4:
3363*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; horizontal filter index from the low byte of mx
3364*c0909341SAndroid Build Coastguard Worker    dec                srcq ; start 1 byte to the left of src
3365*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    ; Vertical filter index: low byte of my when h == 4, else my >> 16.
3366*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3367*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3368*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3369*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3370*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
3371*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3372*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q ; start 3 rows above src
3373*c0909341SAndroid Build Coastguard Worker    mova                 m7, [subpel_h_shuf4]
3374*c0909341SAndroid Build Coastguard Worker    pmovzxbd             m9, [deint_shuf4]
3375*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pw_8192]
3376*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3377*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3378*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pd_32]
    ; m12-m15 = vertical coefficient pairs (0,1) (2,3) (4,5) (6,7).
3379*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q0000
3380*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q1111
3381*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m0, q2222
3382*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m0, q3333
    ; Load source rows 0-6 (8 bytes each, broadcast to every qword).
3383*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+strideq*0]
3384*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+strideq*1]
3385*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [srcq+strideq*2]
3386*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+stride3q ]
3387*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3388*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+strideq*0]
3389*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+strideq*1]
3390*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+strideq*2]
3391*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0xcc ; 0 1
3392*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m5, 0xcc ; 2 3
3393*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m6, 0xcc ; 4 5
3394*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7 ; 00 01 10 11  02 03 12 13
3395*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7 ; 20 21 30 31  22 23 32 33
3396*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7 ; 40 41 50 51  42 43 52 53
3397*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7 ; 60 61 60 61  62 63 62 63
    ; Horizontal 4-tap filter: multiply-add byte pairs, then add adjacent words.
3398*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8
3399*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8
3400*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m8
3401*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8
3402*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0 ; 0a 1a 2a 3a  0b 1b 2b 3b
3403*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m1 ; 4a 5a 6a __  4b 5b 6b __
3404*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m10
3405*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m10
    ; Interleave vertically adjacent rows into word pairs for pmaddwd.
3406*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4 ; 1a 2a 3a 4a  1b 2b 3b 4b
3407*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4  ; 01 12
3408*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4      ; 23 34
    ; m0 = 5 6 5 6 per lane; its top dword (row 6) is consumed by the palignr in the loop.
3409*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121
3410*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0      ; 45 56
; Loop state on entry: m1 = 01 12, m2 = 23 34, m3 = 45 56, and the top dword
; of each lane of m0 holds the most recent horizontally filtered row.
; m5 accumulates the first two output rows (a, b), m6 the last two (c, d).
3411*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3412*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m12 ; a0 b0
3413*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m12 ; c0 d0
3414*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m13     ; a1 b1
3415*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, m13 ; c1 d1
3416*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; 45 56 becomes the oldest pair of the next iteration
3417*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14     ; a2 b2
3418*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
    ; Load the next four source rows, interleaved with the accumulation.
3419*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+stride3q ]
3420*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
3421*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
3422*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+strideq*0]
3423*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3424*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+strideq*1]
3425*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, 0xcc
3426*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+strideq*2]
3427*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m4, 0xcc
3428*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
3429*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
3430*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8
3431*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m8
3432*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
3433*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m10
    ; m3 = last row of the previous batch followed by the first three new rows.
3434*c0909341SAndroid Build Coastguard Worker    palignr              m3, m2, m0, 12
3435*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2 ; keep the new rows for the next iteration's palignr
3436*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m0  ; 67 78
3437*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0      ; 89 9a
3438*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m2, m14 ; c2 d2
3439*c0909341SAndroid Build Coastguard Worker    paddd                m6, m11 ; rounding bias for the >> 6 below
3440*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11
3441*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
3442*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m2, m15 ; a3 b3
3443*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
3444*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, m15 ; c3 d3
3445*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
3446*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3447*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
3448*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
3449*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m9, m5 ; reorder dwords across lanes using deint_shuf4
3450*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m5
3451*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3452*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4 ; four rows produced per iteration
3453*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
3454*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: combined horizontal + vertical filtering in 8-pixel-wide columns.
; r6d = h + 32*w - 256: the low byte carries h (restored into hd after each
; column), and 1<<8 is subtracted per finished column until the value is no
; longer positive.
; NOTE(review): this packing presumably relies on w being a multiple of 8 and
; h fitting in one byte - confirm against callers.
; Per column: r5 walks the source rows and r7 walks the output rows; srcq and
; tmpq advance by 8 bytes and 16 bytes per column respectively. Output rows
; are w*2 bytes apart.
; m12-m15 hold the vertical coefficient pairs set up in .hv.
; NOTE(review): HV_H_8TAP_W8 is defined outside this chunk; it presumably uses
; the m10/m11 horizontal coefficients loaded in .hv - confirm.
3455*c0909341SAndroid Build Coastguard Worker.hv_w8:
3456*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-64]
3457*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*4]
3458*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
    ; Load source rows 0-6 of this column, paired per register as noted.
3459*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufA]
3460*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+strideq*0]
3461*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+strideq*2]
3462*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [subpel_h_shufB]
3463*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+strideq*1]
3464*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3465*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [subpel_h_shufC]
3466*c0909341SAndroid Build Coastguard Worker    movu                xm6, [r5+strideq*0]
3467*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [r5+strideq*1]
3468*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3469*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0xf0          ; 0 3
3470*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r5+strideq*0], 1 ; 1 4
3471*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r5+strideq*1], 1 ; 2 5
3472*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3473*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r5+strideq*0], 1 ; 3 6
    ; Horizontal pass on all seven rows (m1-m3 are scratch, m7-m9 the shuffles).
3474*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m4, m1, m2, m3, m7, m8, m9
3475*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m5, m1, m2, m3, m7, m8, m9
3476*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m6, m1, m2, m3, m7, m8, m9
3477*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m0, m1, m2, m3, m7, m8, m9
3478*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_8192]
3479*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
3480*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
3481*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, q3120
3482*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
3483*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7
3484*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7
3485*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7
3486*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120
    ; Interleave vertically adjacent rows into word pairs for pmaddwd.
3487*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5  ; 01
3488*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
3489*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6  ; 12
3490*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6      ; 45
3491*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m7  ; 23
3492*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 56
; Inner loop: two output rows per iteration. m8 accumulates the first row (a)
; and m9 the second (b).
3493*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
    ; Park the upper lane of m0 in the output slot at [r7]; it is reloaded
    ; below and the slot is overwritten by the real store at the end.
3494*c0909341SAndroid Build Coastguard Worker    vextracti128       [r7], m0, 1 ; not enough registers
3495*c0909341SAndroid Build Coastguard Worker    movu                xm0, [r5+strideq*1]
3496*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*2]
3497*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r5+strideq*0], 1 ; 7 8
3498*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m1, m12 ; a0
3499*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m2, m12 ; b0
    ; Slide the row-pair window down by two rows while accumulating.
3500*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3501*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3502*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13     ; a1
3503*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13     ; b1
3504*c0909341SAndroid Build Coastguard Worker    paddd                m8, m3
3505*c0909341SAndroid Build Coastguard Worker    paddd                m9, m4
3506*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3507*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3508*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m14     ; a2
3509*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14     ; b2
3510*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5
3511*c0909341SAndroid Build Coastguard Worker    paddd                m9, m6
    ; m8/m9 are accumulators here, so the shuffle masks are reloaded into m5-m7.
3512*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [subpel_h_shufB]
3513*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [subpel_h_shufC]
3514*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [subpel_h_shufA]
3515*c0909341SAndroid Build Coastguard Worker    HV_H_8TAP_W8         m0, m5, m6, m7, m5, m6, m7
3516*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_8192]
3517*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pd_32]
3518*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [r7] ; reload the lane parked at the top of the loop
3519*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
3520*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7 ; rounding bias for the >> 6 below
3521*c0909341SAndroid Build Coastguard Worker    paddd                m9, m7
3522*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m0, q3120 ; 7 8
3523*c0909341SAndroid Build Coastguard Worker    shufpd               m6, m7, 0x04  ; 6 7
3524*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7  ; 67
3525*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 78
3526*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, m15 ; a3
3527*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
3528*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m6, m15 ; b3
3529*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9
3530*c0909341SAndroid Build Coastguard Worker    psrad                m8, 6
3531*c0909341SAndroid Build Coastguard Worker    psrad                m7, 6
3532*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m7
3533*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m8, q3120
    ; Store two output rows of 8 words each, w*2 bytes apart.
3534*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], xm7
3535*c0909341SAndroid Build Coastguard Worker    vextracti128  [r7+wq*2], m7, 1
3536*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*4]
3537*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3538*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
    ; Advance to the next 8-pixel column.
3539*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
3540*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3541*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; restore h from the low byte of r6
3542*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one column done
3543*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
3544*c0909341SAndroid Build Coastguard Worker    RET
3545*c0909341SAndroid Build Coastguard Worker
; movifprep dst, src
; Emits "mov dst, src" only when isprep is nonzero; expands to nothing
; otherwise.
3546*c0909341SAndroid Build Coastguard Worker%macro movifprep 2
3547*c0909341SAndroid Build Coastguard Worker %if isprep
3548*c0909341SAndroid Build Coastguard Worker    mov %1, %2
3549*c0909341SAndroid Build Coastguard Worker %endif
3550*c0909341SAndroid Build Coastguard Worker%endmacro
3551*c0909341SAndroid Build Coastguard Worker
; REMAP_REG a, b
; Redefines the preprocessor names r<a>, r<a>q and r<a>d to whatever r<b>,
; r<b>q and r<b>d currently expand to. %xdefine expands the right-hand side
; at definition time, so later changes to r<b> do not affect r<a>.
3552*c0909341SAndroid Build Coastguard Worker%macro REMAP_REG 2
3553*c0909341SAndroid Build Coastguard Worker %xdefine r%1  r%2
3554*c0909341SAndroid Build Coastguard Worker %xdefine r%1q r%2q
3555*c0909341SAndroid Build Coastguard Worker %xdefine r%1d r%2d
3556*c0909341SAndroid Build Coastguard Worker%endmacro
3557*c0909341SAndroid Build Coastguard Worker
; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
; When isprep is nonzero, shifts the register names down by one: r14 takes
; the definition of r13, r13 that of r12, ..., r1 that of r0. The previous
; r14 definition is kept in r14_save so it can be restored by
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT. Expands to nothing when isprep is 0.
; The loop runs from 14 downwards so each source name is read before it is
; itself redefined.
; NOTE(review): presumably this lets code shared between put and prep use the
; same register numbers although prep takes one argument fewer - confirm.
3558*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
3559*c0909341SAndroid Build Coastguard Worker %if isprep
3560*c0909341SAndroid Build Coastguard Worker  %xdefine r14_save r14
3561*c0909341SAndroid Build Coastguard Worker  %assign %%i 14
3562*c0909341SAndroid Build Coastguard Worker  %rep 14
3563*c0909341SAndroid Build Coastguard Worker   %assign %%j %%i-1
3564*c0909341SAndroid Build Coastguard Worker   REMAP_REG %%i, %%j
3565*c0909341SAndroid Build Coastguard Worker   %assign %%i %%i-1
3566*c0909341SAndroid Build Coastguard Worker  %endrep
3567*c0909341SAndroid Build Coastguard Worker %endif
3568*c0909341SAndroid Build Coastguard Worker%endmacro
3569*c0909341SAndroid Build Coastguard Worker
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
; Inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV. When isprep is nonzero,
; shifts the register names up by one (r1 takes the definition of r2, ...,
; r13 that of r14), then restores r14 from r14_save and removes r14_save.
; Expands to nothing when isprep is 0.
; The loop runs from 1 upwards so each source name is read before it is
; itself redefined. r14_save must already be defined when this is expanded.
3570*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
3571*c0909341SAndroid Build Coastguard Worker %if isprep
3572*c0909341SAndroid Build Coastguard Worker  %assign %%i 1
3573*c0909341SAndroid Build Coastguard Worker  %rep 13
3574*c0909341SAndroid Build Coastguard Worker   %assign %%j %%i+1
3575*c0909341SAndroid Build Coastguard Worker   REMAP_REG %%i, %%j
3576*c0909341SAndroid Build Coastguard Worker   %assign %%i %%i+1
3577*c0909341SAndroid Build Coastguard Worker  %endrep
3578*c0909341SAndroid Build Coastguard Worker  %xdefine r14 r14_save
3579*c0909341SAndroid Build Coastguard Worker  %undef r14_save
3580*c0909341SAndroid Build Coastguard Worker %endif
3581*c0909341SAndroid Build Coastguard Worker%endmacro
3582*c0909341SAndroid Build Coastguard Worker
; MC_8TAP_SCALED_RET [leave_mapping_unchanged = 1]
; Emits RET with the default register-name mapping in effect.
; The mapping is restored to default before RET is expanded. If the argument
; is nonzero (the default), the shifted mapping is re-applied afterwards so
; the code following the macro sees the same names as the code before it.
; Pass 0 to leave the default mapping active after the return.
3583*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
3584*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
3585*c0909341SAndroid Build Coastguard Worker    RET
3586*c0909341SAndroid Build Coastguard Worker %if %1
3587*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3588*c0909341SAndroid Build Coastguard Worker %endif
3589*c0909341SAndroid Build Coastguard Worker%endmacro
3590*c0909341SAndroid Build Coastguard Worker
; MC_8TAP_SCALED_H dst, tmp0..tmp6   (arguments are vector register numbers)
; Horizontal filter for two consecutive source rows. For each row it gathers
; eight 8-byte groups at srcq + {r4, r6, r7, r9, r10, r11, r13, rX},
; multiplies them with the byte coefficients in m15 / m10 and reduces every
; group to a single word.
;   in:       srcq, ssq; byte offsets in r4, r6, r7, r9, r10, r11, r13, rX;
;             m15 = coefficients for the r4/r7/r10/r13 groups,
;             m10 = coefficients for the r6/r9/r11/rX groups,
;             m12 = multiplier for the final pmulhrsw
;   out:      m%1 = 16 filtered words; srcq advanced by 2*ssq
;   clobbers: m%2-m%8
; NOTE(review): m10, m12, m15 and the offset registers are set up by the
; caller outside this chunk; their exact contents are not visible here.
3591*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
    ; First row: m%1 gathers the r4, r7, r10, r13 groups; m%2 gathers r6, r9, r11, rX.
3592*c0909341SAndroid Build Coastguard Worker    movq               xm%1, [srcq+ r4]
3593*c0909341SAndroid Build Coastguard Worker    movq               xm%2, [srcq+ r6]
3594*c0909341SAndroid Build Coastguard Worker    movhps             xm%1, [srcq+ r7]
3595*c0909341SAndroid Build Coastguard Worker    movhps             xm%2, [srcq+ r9]
3596*c0909341SAndroid Build Coastguard Worker    vinserti128         m%1, [srcq+r10], 1
3597*c0909341SAndroid Build Coastguard Worker    vinserti128         m%2, [srcq+r11], 1
3598*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m%5, [srcq+r13]
3599*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m%6, [srcq+ rX]
3600*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; Second row, same layout in m%3 / m%4.
3601*c0909341SAndroid Build Coastguard Worker    movq               xm%3, [srcq+ r4]
3602*c0909341SAndroid Build Coastguard Worker    movq               xm%4, [srcq+ r6]
3603*c0909341SAndroid Build Coastguard Worker    movhps             xm%3, [srcq+ r7]
3604*c0909341SAndroid Build Coastguard Worker    movhps             xm%4, [srcq+ r9]
3605*c0909341SAndroid Build Coastguard Worker    vinserti128         m%3, [srcq+r10], 1
3606*c0909341SAndroid Build Coastguard Worker    vinserti128         m%4, [srcq+r11], 1
3607*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m%7, [srcq+r13]
3608*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m%8, [srcq+ rX]
3609*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; Blend mask 0xc0 takes only the top qword from the broadcast registers.
3610*c0909341SAndroid Build Coastguard Worker    vpblendd            m%1, m%5, 0xc0
3611*c0909341SAndroid Build Coastguard Worker    vpblendd            m%2, m%6, 0xc0
3612*c0909341SAndroid Build Coastguard Worker    vpblendd            m%3, m%7, 0xc0
3613*c0909341SAndroid Build Coastguard Worker    vpblendd            m%4, m%8, 0xc0
3614*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%1, m15
3615*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%2, m10
3616*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m15
3617*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%4, m10
    ; Two levels of horizontal adds reduce each 8-byte group to one word.
3618*c0909341SAndroid Build Coastguard Worker    phaddw              m%1, m%2
3619*c0909341SAndroid Build Coastguard Worker    phaddw              m%3, m%4
3620*c0909341SAndroid Build Coastguard Worker    phaddw              m%1, m%3
3621*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m12
3622*c0909341SAndroid Build Coastguard Worker%endmacro
3623*c0909341SAndroid Build Coastguard Worker
3624*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED 1
3625*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
3626*c0909341SAndroid Build Coastguard Worker %assign isprep 0
3627*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
3628*c0909341SAndroid Build Coastguard Worker %xdefine base_reg r12
3629*c0909341SAndroid Build Coastguard Worker %define rndshift 10
3630*c0909341SAndroid Build Coastguard Worker%else
3631*c0909341SAndroid Build Coastguard Worker %assign isprep 1
3632*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
3633*c0909341SAndroid Build Coastguard Worker %define tmp_stridem qword [rsp+120]
3634*c0909341SAndroid Build Coastguard Worker %xdefine base_reg r11
3635*c0909341SAndroid Build Coastguard Worker %define rndshift 6
3636*c0909341SAndroid Build Coastguard Worker%endif
3637*c0909341SAndroid Build Coastguard Worker    lea            base_reg, [%1_8tap_scaled_8bpc_avx2]
3638*c0909341SAndroid Build Coastguard Worker%define base base_reg-%1_8tap_scaled_8bpc_avx2
3639*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3640*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
3641*c0909341SAndroid Build Coastguard Worker%if isprep && UNIX64
3642*c0909341SAndroid Build Coastguard Worker    movd               xm14, mxd
3643*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, xm14
3644*c0909341SAndroid Build Coastguard Worker    mov                 r5d, t0d
3645*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 5, 7
3646*c0909341SAndroid Build Coastguard Worker%else
3647*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, mxm
3648*c0909341SAndroid Build Coastguard Worker%endif
3649*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
3650*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
3651*c0909341SAndroid Build Coastguard Worker %if WIN64
3652*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hm
3653*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
3654*c0909341SAndroid Build Coastguard Worker  %define hm r5m
3655*c0909341SAndroid Build Coastguard Worker  %define dxm r8m
3656*c0909341SAndroid Build Coastguard Worker %else
3657*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
3658*c0909341SAndroid Build Coastguard Worker  %define hm r6m
3659*c0909341SAndroid Build Coastguard Worker %endif
3660*c0909341SAndroid Build Coastguard Worker %define dsm [rsp+112]
3661*c0909341SAndroid Build Coastguard Worker %define rX r1
3662*c0909341SAndroid Build Coastguard Worker %define rXd r1d
3663*c0909341SAndroid Build Coastguard Worker%else ; prep
3664*c0909341SAndroid Build Coastguard Worker %if WIN64
3665*c0909341SAndroid Build Coastguard Worker    mov                 r7d, hm
3666*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
3667*c0909341SAndroid Build Coastguard Worker  %define hm r4m
3668*c0909341SAndroid Build Coastguard Worker  %define dxm r7m
3669*c0909341SAndroid Build Coastguard Worker %else
3670*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
3671*c0909341SAndroid Build Coastguard Worker  %define hm [rsp+112]
3672*c0909341SAndroid Build Coastguard Worker %endif
3673*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
3674*c0909341SAndroid Build Coastguard Worker %define rX r14
3675*c0909341SAndroid Build Coastguard Worker %define rXd r14d
3676*c0909341SAndroid Build Coastguard Worker%endif
3677*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pd_0x3ff]
3678*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [base+pw_8192]
3679*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
3680*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+pd_512]
3681*c0909341SAndroid Build Coastguard Worker%else
3682*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+pd_32]
3683*c0909341SAndroid Build Coastguard Worker%endif
3684*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
3685*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
3686*c0909341SAndroid Build Coastguard Worker    movzx               r7d, t1b
3687*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 16
3688*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3689*c0909341SAndroid Build Coastguard Worker    cmovs               t1d, r7d
3690*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
3691*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 1024
3692*c0909341SAndroid Build Coastguard Worker    je .dy1
3693*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 2048
3694*c0909341SAndroid Build Coastguard Worker    je .dy2
3695*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
3696*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
3697*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .w2 (put build only): scaled 8-tap path producing 2 output pixels per row.
; Horizontal filtering uses the 4 bytes at offset +2 of each 8-byte
; subpel_filters entry; vertical filtering uses the full 8-byte entry.
; On entry (set up before the width dispatch): m8 = dx, m14 = mx, m9 = 0,
; m10 = pd_0x3ff, m12 = pw_8192, m13 = pd_512, srcq already moved up 3 rows.
3698*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
3699*c0909341SAndroid Build Coastguard Worker.w2:
3700*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
    ; keep only the low byte of t0 as the horizontal filter-table row offset
3701*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
    ; step one pixel left so the 4-tap window is positioned around the sample
3702*c0909341SAndroid Build Coastguard Worker    dec                srcq
3703*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
    ; interleave zero (m9) with dx -> per-column offsets [0, dx]
3704*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m8
3705*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0,1]
3706*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pd_0x4000]
3707*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
    ; (pos & 0x3ff) >> 6 = fractional filter index for each column
3708*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
3709*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
3710*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8
    ; r4/r6 = subpel_filters entry index for column 0 / column 1
3711*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
3712*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
3713*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_dw]
3714*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_s_shuf2]
3715*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
3716*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    ; m8 = all-ones in the dwords whose fractional index is zero
3717*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9
    ; pos >> 10 = integer pixel offset for each column
3718*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
    ; load source rows 0..3 (8 bytes each) into the low lanes of m0/m1
3719*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
3720*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*2]
3721*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*1]
3722*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ss3q ]
3723*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; build the pshufb control from the integer offsets
    ; NOTE(review): exact layout depends on bdct_lb_dw / subpel_s_shuf2,
    ; which are defined outside this chunk
3724*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
3725*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
    ; rows 4..7 go into the high lanes
3726*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0], 1
3727*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*2], 1
3728*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*1]
3729*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ss3q ]
3730*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; odd dwords take column 1's filter, even dwords keep column 0's
3731*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m7, 0xaa
3732*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0xc0       ; 0 1  4 5
3733*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xc0       ; 2 3  6 7
    ; columns with a zero fraction get the pd_0x4000 pattern instead of a
    ; table filter (presumably a single tap of 64, judging by the constant)
3734*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m8
3735*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
3736*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
3737*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
3738*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
3739*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
    ; scale the horizontal sums by m12 (pw_8192; a rounded >>2 if the
    ; constant is 8192 as its name suggests)
3740*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12            ; 0 1 2 3  4 5 6 7
    ; interleave neighbouring rows into word pairs ready for pmaddwd
3741*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1          ; 4 5 6 7
3742*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
3743*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0, xm2       ; 01 12
3744*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm2            ; 23 34
3745*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm1, q0321     ; 5 6 7 _
3746*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm4       ; 45 56
3747*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm4, xm1, xm4       ; 67 __
; Vertical loop: one output row (2 pixels) per iteration.
3748*c0909341SAndroid Build Coastguard Worker.w2_loop:
    ; keep only the 10-bit fractional part of the vertical position
3749*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
    ; default filter when the fraction index is 0: byte 3 = 64, rest zero
3750*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24
3751*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
    ; ZF from this shift feeds the cmovnz below (lea leaves flags untouched)
3752*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
3753*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]
3754*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]
    ; sign-extend the 8 taps to words and broadcast each tap pair
3755*c0909341SAndroid Build Coastguard Worker    movq               xm11, r6q
3756*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xm11, xm11
3757*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm11, q0000
3758*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm11, q1111
3759*c0909341SAndroid Build Coastguard Worker    pshufd             xm10, xm11, q2222
3760*c0909341SAndroid Build Coastguard Worker    pshufd             xm11, xm11, q3333
    ; multiply-accumulate the four row pairs against the four tap pairs
3761*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm3, xm8
3762*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm0, xm9
3763*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm7, xm2, xm10
3764*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm8, xm4, xm11
3765*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm6
3766*c0909341SAndroid Build Coastguard Worker    paddd               xm7, xm8
    ; add the rounding bias held in m13 (pd_512 in the put build), shift by 10
3767*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm13
3768*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm7
3769*c0909341SAndroid Build Coastguard Worker    psrad               xm5, 10
    ; saturate to unsigned bytes and store the 2 output pixels
3770*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5
3771*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm5
3772*c0909341SAndroid Build Coastguard Worker    pextrw           [dstq], xm5, 0
3773*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
3774*c0909341SAndroid Build Coastguard Worker    dec                  hd
3775*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; advance the vertical position; if no bit above the fraction is set the
    ; same source rows are reused with a new filter
3776*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
3777*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
3778*c0909341SAndroid Build Coastguard Worker    jz .w2_loop
3779*c0909341SAndroid Build Coastguard Worker    movq                xm5, [srcq]
    ; bit 10 set -> consume one new source row here; clear -> two rows
3780*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
3781*c0909341SAndroid Build Coastguard Worker    jz .w2_skip_line
3782*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; slide the row-pair window down by one row
3783*c0909341SAndroid Build Coastguard Worker    shufps              xm3, xm0, q1032     ; 01 12
3784*c0909341SAndroid Build Coastguard Worker    shufps              xm0, xm2, q1032     ; 23 34
3785*c0909341SAndroid Build Coastguard Worker    shufps              xm2, xm4, q1032     ; 45 56
    ; horizontally filter the newly loaded row
3786*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm14
3787*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm15
3788*c0909341SAndroid Build Coastguard Worker    phaddw              xm5, xm5
3789*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm12
3790*c0909341SAndroid Build Coastguard Worker    palignr             xm1, xm5, xm1, 12
3791*c0909341SAndroid Build Coastguard Worker    punpcklqdq          xm1, xm1            ; 6 7 6 7
3792*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xm1, xm5       ; 67 __
3793*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
; Two-row step: load a second new row, then shift the window by a whole pair.
3794*c0909341SAndroid Build Coastguard Worker.w2_skip_line:
3795*c0909341SAndroid Build Coastguard Worker    movhps              xm5, [srcq+ssq*1]
3796*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3797*c0909341SAndroid Build Coastguard Worker    mova                xm3, xm0            ; 01 12
3798*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm2            ; 23 34
3799*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm14
3800*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm15
3801*c0909341SAndroid Build Coastguard Worker    phaddw              xm5, xm5
3802*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm12           ; 6 7 6 7
3803*c0909341SAndroid Build Coastguard Worker    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
3804*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm1, q0321     ; 5 6 7 _
3805*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm5       ; 45 56
3806*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm4, xm1, xm5       ; 67 __
3807*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
3808*c0909341SAndroid Build Coastguard Worker%endif
; .w4 (put and prep): scaled 8-tap path producing 4 output pixels per row.
; Horizontal filtering uses the 4 bytes at offset +2 of each subpel_filters
; entry, one filter per output column; vertical filtering uses all 8 taps.
; Interleaved row pairs 01/23/45/67 stay in xm0-xm3, while pairs 12/34/56
; are parked at [rsp+0x00], [rsp+0x10] and [rsp+0x20].
3809*c0909341SAndroid Build Coastguard Worker.w4:
3810*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
3811*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+rescale_mul]
    ; keep only the low byte of t0 as the horizontal filter-table row offset
3812*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
3813*c0909341SAndroid Build Coastguard Worker    dec                srcq
3814*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
    ; scale the broadcast dx by the per-column factors in rescale_mul
3815*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m7
3816*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pd_0x4000]
3817*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
3818*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
    ; (pos & 0x3ff) >> 6 = fractional filter index for each column
3819*c0909341SAndroid Build Coastguard Worker    pand                 m0, m14, m10
3820*c0909341SAndroid Build Coastguard Worker    psrld                m0, 6
3821*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm0
    ; pull out the four filter-table indices, one per output column
3822*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
3823*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
3824*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm15, 2
3825*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm15, 3
    ; gather the four 4-byte filters into xm15 (interleaved with row loads)
3826*c0909341SAndroid Build Coastguard Worker    movd               xm15, [base+subpel_filters+r4*8+2]
3827*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_dw]
3828*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [base+subpel_s_shuf2]
3829*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
    ; m0 = all-ones in the dwords whose fractional index is zero
    ; (m9 is still zero here; it is overwritten by a row load just below)
3830*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m9
    ; pos >> 10 = integer pixel offset for each column
3831*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
    ; load source rows 0..3 (16 bytes each) into the low lanes
3832*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+ssq*0]
3833*c0909341SAndroid Build Coastguard Worker    movu                xm9, [srcq+ssq*1]
3834*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
3835*c0909341SAndroid Build Coastguard Worker    movu                xm8, [srcq+ssq*2]
3836*c0909341SAndroid Build Coastguard Worker    movu               xm10, [srcq+ss3q ]
3837*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
3838*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; build the pshufb control from the integer offsets
    ; NOTE(review): exact layout depends on bdct_lb_dw / subpel_s_shuf2,
    ; which are defined outside this chunk
3839*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
3840*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
    ; rows 4..7 go into the high lanes; the filters are duplicated there too
3841*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+ssq*0], 1
3842*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [srcq+ssq*1], 1
3843*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, xm15, 1
3844*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [srcq+ssq*2], 1
3845*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [srcq+ss3q ], 1
3846*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
    ; columns with a zero fraction get the pd_0x4000 pattern instead of a
    ; table filter (presumably a single tap of 64, judging by the constant)
3847*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m0
    ; horizontal pass over all 8 rows
3848*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
3849*c0909341SAndroid Build Coastguard Worker    pshufb               m9, m14
3850*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m14
3851*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14
3852*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
3853*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m15
3854*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m8, m15
3855*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m15
3856*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m9
3857*c0909341SAndroid Build Coastguard Worker    phaddw               m8, m10
3858*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m12                ; 0 1  4 5
3859*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12                ; 2 3  6 7
    ; form every adjacent row pair as interleaved words
3860*c0909341SAndroid Build Coastguard Worker    vextracti128        xm9, m7, 1              ; 4 5
3861*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m8, 1              ; 6 7
3862*c0909341SAndroid Build Coastguard Worker    shufps              xm4, xm7, xm8, q1032    ; 1 2
3863*c0909341SAndroid Build Coastguard Worker    shufps              xm5, xm8, xm9, q1032    ; 3 4
3864*c0909341SAndroid Build Coastguard Worker    shufps              xm6, xm9, xm3, q1032    ; 5 6
3865*c0909341SAndroid Build Coastguard Worker    psrldq             xm11, xm3, 8             ; 7 _
3866*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm7, xm4   ; 01
3867*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm7, xm4        ; 12
3868*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm8, xm5   ; 23
3869*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm8, xm5        ; 34
3870*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm9, xm6   ; 45
3871*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm9, xm6        ; 56
3872*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm11       ; 67
    ; park the odd-aligned pairs (12, 34, 56) on the stack
3873*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], xm7
3874*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], xm8
3875*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], xm9
; Vertical loop: one output row (4 pixels) per iteration.
3876*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; keep only the 10-bit fractional part of the vertical position
3877*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
    ; default filter when the fraction index is 0: byte 3 = 64, rest zero
3878*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24
3879*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
    ; ZF from this shift feeds the cmovnz below (lea leaves flags untouched)
3880*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
3881*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]
3882*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]
    ; sign-extend the 8 taps to words and broadcast each tap pair
3883*c0909341SAndroid Build Coastguard Worker    movq               xm10, r6q
3884*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xm10, xm10
3885*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm10, q0000
3886*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm10, q1111
3887*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm10, q2222
3888*c0909341SAndroid Build Coastguard Worker    pshufd             xm10, xm10, q3333
3889*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm0, xm7
3890*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm1, xm8
3891*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm2, xm9
3892*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm7, xm3, xm10
3893*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm5
3894*c0909341SAndroid Build Coastguard Worker    paddd               xm6, xm7
    ; add the rounding bias held in m13, then shift by rndshift
    ; (10 in the put build, 6 in the prep build)
3895*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm13
3896*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm6
3897*c0909341SAndroid Build Coastguard Worker    psrad               xm4, rndshift
3898*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
3899*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
    ; put: saturate to bytes, store 4 pixels, step dst by its stride
3900*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
3901*c0909341SAndroid Build Coastguard Worker    movd             [dstq], xm4
3902*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
3903*c0909341SAndroid Build Coastguard Worker%else
    ; prep: store 4 words and advance tmp by 8 bytes
3904*c0909341SAndroid Build Coastguard Worker    movq             [tmpq], xm4
3905*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3906*c0909341SAndroid Build Coastguard Worker%endif
3907*c0909341SAndroid Build Coastguard Worker    dec                  hd
3908*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; advance the vertical position; if no bit above the fraction is set the
    ; same source rows are reused with a new filter
3909*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
3910*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
3911*c0909341SAndroid Build Coastguard Worker    jz .w4_loop
3912*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq]
    ; bit 10 set -> consume one new source row here; clear -> two rows
3913*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
3914*c0909341SAndroid Build Coastguard Worker    jz .w4_skip_line
    ; one-row step: swap the register pairs with the stack pairs, so the
    ; registers now hold what used to be 12/34/56 and the stack 23/45/67
3915*c0909341SAndroid Build Coastguard Worker    mova                xm0, [rsp+0x00]
3916*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], xm1
3917*c0909341SAndroid Build Coastguard Worker    mova                xm1, [rsp+0x10]
3918*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], xm2
3919*c0909341SAndroid Build Coastguard Worker    mova                xm2, [rsp+0x20]
3920*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], xm3
    ; horizontally filter the new row and pair it with the previous last row
3921*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm14
3922*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm15
3923*c0909341SAndroid Build Coastguard Worker    phaddw              xm4, xm4
3924*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm12
3925*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm11, xm4
    ; xm11 always holds the most recent filtered row
3926*c0909341SAndroid Build Coastguard Worker    mova               xm11, xm4
3927*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
3928*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
; Two-row step: filter two new rows and shift both pair sets by one slot.
3929*c0909341SAndroid Build Coastguard Worker.w4_skip_line:
3930*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*1]
    ; 32-byte load of the stack slots at +0x10 and +0x20; the matching store
    ; below writes them back at +0x00 and +0x10 (shifts the pairs down a slot)
3931*c0909341SAndroid Build Coastguard Worker    movu                 m6, [rsp+0x10]
3932*c0909341SAndroid Build Coastguard Worker    pshufb              xm4, xm14
3933*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm14
3934*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm4, xm15
3935*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm5, xm15
3936*c0909341SAndroid Build Coastguard Worker    movu         [rsp+0x00], m6
    ; low qword = first new row, high qword = second new row
3937*c0909341SAndroid Build Coastguard Worker    phaddw              xm4, xm5
3938*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm12
    ; previous last row paired with the first new row fills the top stack slot
3939*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm9, xm11, xm4
3940*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], xm9
    ; xm11 = second new row (the new most-recent row)
3941*c0909341SAndroid Build Coastguard Worker    psrldq             xm11, xm4, 8
    ; shift the register pairs down by one and append the two new rows
3942*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm1
3943*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm2
3944*c0909341SAndroid Build Coastguard Worker    mova                xm2, xm3
3945*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm4, xm11
3946*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3947*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
; Width entry points for w >= 8. Each stores the number of 8-pixel column
; strips (w/8) in [rsp+48] and, through movifprep, the value 2*w in
; tmp_stridem, then joins the common setup at .w_start.
; NOTE(review): movifprep is defined outside this chunk; presumably it emits
; the store only in the prep build, where tmp_stridem is defined.
3948*c0909341SAndroid Build Coastguard Worker.w8:
3949*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+48], 1
3950*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
3951*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3952*c0909341SAndroid Build Coastguard Worker.w16:
3953*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+48], 2
3954*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
3955*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3956*c0909341SAndroid Build Coastguard Worker.w32:
3957*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+48], 4
3958*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
3959*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3960*c0909341SAndroid Build Coastguard Worker.w64:
3961*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+48], 8
3962*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
3963*c0909341SAndroid Build Coastguard Worker    jmp .w_start
3964*c0909341SAndroid Build Coastguard Worker.w128:
3965*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+48], 16
3966*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; Common setup for the strip loop: spill state that .hloop_prep reloads for
; every following 8-pixel strip.
3967*c0909341SAndroid Build Coastguard Worker.w_start:
3968*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
    ; put: keep the destination stride in its stack slot (dsm = [rsp+112])
3969*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
3970*c0909341SAndroid Build Coastguard Worker%endif
    ; this path takes the filter-table row offset from t0 bits 16 and up
    ; (.w2/.w4 use the low byte instead)
3971*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
    ; step 3 pixels left for the 8-tap horizontal window
3972*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
    ; scale the broadcast dx by the per-column factors in rescale_mul
3973*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
3974*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
    ; spill the filter offset, source pointer and output pointer
3975*c0909341SAndroid Build Coastguard Worker    mov            [rsp+72], t0d
3976*c0909341SAndroid Build Coastguard Worker    mov            [rsp+56], srcq
3977*c0909341SAndroid Build Coastguard Worker    mov            [rsp+64], r0q ; dstq / tmpq
3978*c0909341SAndroid Build Coastguard Worker%if UNIX64
    ; UNIX64: save h to its memory slot so .hloop_prep can reload it
3979*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
3980*c0909341SAndroid Build Coastguard Worker%endif
    ; the in-memory dx becomes the per-strip step (8 columns)
3981*c0909341SAndroid Build Coastguard Worker    shl           dword dxm, 3 ; dx*8
3982*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xm15
3983*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-7]
3984*c0909341SAndroid Build Coastguard Worker    jmp .hloop
; .hloop_prep: move on to the next 8-pixel column strip, or return once the
; strip counter at [rsp+48] reaches zero. Reloads the state that .w_start and
; .hloop spilled to the stack, then falls through into .hloop.
3985*c0909341SAndroid Build Coastguard Worker.hloop_prep:
3986*c0909341SAndroid Build Coastguard Worker    dec      dword [rsp+48]
3987*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; advance the saved output pointer by 8 bytes (put) or 16 bytes (prep)
3988*c0909341SAndroid Build Coastguard Worker    add      qword [rsp+64], 8*(isprep+1)
3989*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
    ; dxm was multiplied by 8 in .w_start, so m8 is the per-strip step
3990*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
    ; m10 is overwritten inside .hloop, so the 0x3ff mask is reloaded here
3991*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pd_0x3ff]
    ; [rsp+16] holds the previous strip's column positions (saved in .hloop)
3992*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8, [rsp+16]
    ; [rsp+72] holds the filter-table row offset saved in .w_start
3993*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+72]
    ; m9 must be zero again for the compare in .hloop
3994*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
3995*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+56]
3996*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+64] ; dstq / tmpq
; .hloop: per-strip setup for 8 output columns. Gathers one 8-tap filter per
; column, runs the horizontal pass over the first 8 source rows through
; MC_8TAP_SCALED_H, and leaves interleaved row pairs in m0-m3 for .vloop.
; Expects m14 = column positions, m15 = broadcast filter-table row offset,
; m9 = 0 and m10 = pd_0x3ff.
3997*c0909341SAndroid Build Coastguard Worker.hloop:
    ; replacement filter for columns with a zero fraction
    ; (presumably a single tap of 64, judging by the constant's name)
3998*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [base+pq_0x40000000]
    ; (pos & 0x3ff) >> 6 = fractional filter index for each column
3999*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
4000*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
4001*c0909341SAndroid Build Coastguard Worker    paddd               m15, m6
    ; m6 = all-ones in the dwords whose fractional index is zero
4002*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
    ; extract the 8 filter-table indices; note the 0,2,1,3 extraction order
4003*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m15, 1
4004*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4005*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 2
4006*c0909341SAndroid Build Coastguard Worker    pextrd              r7d, xm15, 1
4007*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm15, 3
4008*c0909341SAndroid Build Coastguard Worker    movd               r10d, xm7
4009*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm7, 2
4010*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm7, 1
4011*c0909341SAndroid Build Coastguard Worker    pextrd              rXd, xm7, 3
    ; save the column positions; .hloop_prep adds the strip step to them
4012*c0909341SAndroid Build Coastguard Worker    movu           [rsp+16], m14
    ; gather the filters: m15 = columns 0,1,4,5 and m10 = columns 2,3,6,7
    ; (the top qword of each is filled by the vpblendd further down)
4013*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+subpel_filters+ r4*8]
4014*c0909341SAndroid Build Coastguard Worker    movq               xm10, [base+subpel_filters+ r6*8]
4015*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+subpel_filters+ r7*8]
4016*c0909341SAndroid Build Coastguard Worker    movhps             xm10, [base+subpel_filters+ r9*8]
4017*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, [base+subpel_filters+r10*8], 1
4018*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [base+subpel_filters+r11*8], 1
4019*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m9, [base+subpel_filters+r13*8]
4020*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    ; pos >> 10 = integer pixel offset for each column
4021*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
4022*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m14, 1
4023*c0909341SAndroid Build Coastguard Worker    mova              [rsp], xm14
    ; the same GPRs now receive the 8 integer column offsets
    ; NOTE(review): presumably consumed as source offsets by MC_8TAP_SCALED_H,
    ; whose loads are outside this chunk
4024*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm14
4025*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm14, 2
4026*c0909341SAndroid Build Coastguard Worker    pextrd              r7d, xm14, 1
4027*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm14, 3
4028*c0909341SAndroid Build Coastguard Worker    movd               r10d, xm7
4029*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm7, 2
4030*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm7, 1
4031*c0909341SAndroid Build Coastguard Worker    pextrd              rXd, xm7, 3
    ; widen the per-column dword masks to qwords, matching the filter layout:
    ; m5 covers columns 0,1,4,5 and m6 covers columns 2,3,6,7
4032*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q1100
4033*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
    ; fill the top qwords with the filters for columns 5 and 7
4034*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m9, 0xc0
4035*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m8, 0xc0
    ; substitute the m11 filter where the fraction is zero
4036*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m5
4037*c0909341SAndroid Build Coastguard Worker    pblendvb            m10, m11, m6
4038*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [base+subpel_s_shuf8]
    ; horizontal pass; each call yields the two source rows named at its end
4039*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
4040*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
4041*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
4042*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    ; reload the vertical position and step for this strip
4043*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4044*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
    ; reorder into interleaved row pairs using subpel_s_shuf8
4045*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14    ; 01a 01b
4046*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14    ; 23a 23b
4047*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14    ; 45a 45b
4048*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14    ; 67a 67b
    ; m14 is reused: it carries the wswap shuffle into .vloop
4049*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [base+wswap]
; .vloop: produce one output row for the current 8-pixel column, then advance
; my by dy and refill the row registers when my crosses a source-row boundary.
; m0-m3 hold horizontally filtered rows as word pairs 01/23/45/67 (see the
; pshufb comments right before this label). m12, m13, m15 and m10 are set
; outside this block.
4050*c0909341SAndroid Build Coastguard Worker.vloop:
4051*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff  ; keep only the low 10 bits of my
4052*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24  ; default filter qword: byte 3 = 64, all other taps 0
4053*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
4054*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6  ; r4 = my >> 6; ZF is set when the result is 0
4055*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]  ; add the t1 offset; lea leaves the flags from shr intact
4056*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]  ; (my >> 6) != 0: load the 8-byte table entry instead
4057*c0909341SAndroid Build Coastguard Worker    movq               xm11, r6q
4058*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xm11, xm11  ; duplicate the 8 taps into both qwords
4059*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m11, xm11  ; sign-extend to words: 8 taps in each 128-bit lane
4060*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m11, q0000  ; taps 0/1 broadcast to every dword
4061*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m11, q1111  ; taps 2/3
4062*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8  ; rows 0,1 * taps 0,1
4063*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m9  ; rows 2,3 * taps 2,3
4064*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m11, q2222  ; taps 4/5
4065*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m11, q3333  ; taps 6/7
4066*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m8  ; rows 4,5 * taps 4,5
4067*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11  ; rows 6,7 * taps 6,7
4068*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4069*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
4070*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13  ; m13 is added before the shift; presumably the rounding bias (set outside this view)
4071*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4072*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
4073*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4074*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5  ; 8 dword sums -> 8 signed words
4075*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4076*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4  ; saturate to unsigned bytes
4077*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xm4  ; store 8 bytes
4078*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
4079*c0909341SAndroid Build Coastguard Worker%else
4080*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4  ; store 8 words
4081*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
4082*c0909341SAndroid Build Coastguard Worker%endif
4083*c0909341SAndroid Build Coastguard Worker    dec                  hd
4084*c0909341SAndroid Build Coastguard Worker    jz .hloop_prep  ; all rows of this column done (.hloop_prep is outside this view)
4085*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
4086*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
4087*c0909341SAndroid Build Coastguard Worker    jz .vloop  ; nothing above the low 10 bits: reuse the rows already in m0-m3
4088*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400  ; flags consumed by the jz below (the movs in between do not alter flags)
4089*c0909341SAndroid Build Coastguard Worker    mov            [rsp+52], myd  ; spill my; it is reloaded after the source loads
4090*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [rsp+ 0]  ; reload the four dwords stored by `mova [rsp], xm14` earlier
4091*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [rsp+ 8]
4092*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [rsp+ 4]
4093*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [rsp+12]
4094*c0909341SAndroid Build Coastguard Worker    jz .skip_line  ; bit 10 clear, so a higher bit is set: .skip_line fetches two rows
4095*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+r13]  ; bit 10 set: fetch one new row, 8 bytes per output pixel
4096*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [srcq+ rX]
4097*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ r4]
4098*c0909341SAndroid Build Coastguard Worker    movq                xm5, [srcq+ r6]
4099*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ r7]
4100*c0909341SAndroid Build Coastguard Worker    movhps              xm5, [srcq+ r9]
4101*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r10], 1
4102*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r11], 1
4103*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq  ; advance to the next source row
4104*c0909341SAndroid Build Coastguard Worker    mov                 myd, [rsp+52]  ; restore my and dy
4105*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
4106*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14  ; m14 = wswap (loaded before .vloop); presumably swaps the two words of each dword
4107*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
4108*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
4109*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
4110*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m6, 0xc0  ; place the broadcast qword in the top qword
4111*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m7, 0xc0
4112*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15  ; horizontal taps held in m15/m10 (set up before .vloop)
4113*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m10
4114*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5  ; two partial sums (one dword) per pixel
4115*c0909341SAndroid Build Coastguard Worker    pslld                m5, m4, 16
4116*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5  ; full 8-tap sum now in the high word of each dword
4117*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12  ; scale by m12 (set outside this view)
4118*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m1, 0xaa  ; take the odd words from the next pair: 01 -> 12 (given the swap above)
4119*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m2, 0xaa  ; 23 -> 34
4120*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m3, 0xaa  ; 45 -> 56
4121*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa  ; 67 -> 7 + new row
4122*c0909341SAndroid Build Coastguard Worker    jmp .vloop
; .skip_line: reached from .vloop when my crossed a row boundary with bit 10
; clear. Drops the oldest row pair (m0<-m1<-m2<-m3) and builds a new pair in
; m3 from two freshly loaded source rows. r4/r6/r7/r9 were reloaded from the
; stack by the caller path; r10/r11/r13/rX still hold the other pixel offsets.
4123*c0909341SAndroid Build Coastguard Worker.skip_line:
4124*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1  ; shift the row-pair window down by one pair
4125*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4126*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4127*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [srcq+r13]  ; first new row: 8 source bytes per output pixel
4128*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [srcq+ rX]
4129*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+ r4]
4130*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ r6]
4131*c0909341SAndroid Build Coastguard Worker    movhps              xm3, [srcq+ r7]
4132*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ r9]
4133*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+r10], 1
4134*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r11], 1
4135*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4136*c0909341SAndroid Build Coastguard Worker    movq                xm5, [srcq+ r4]  ; second new row
4137*c0909341SAndroid Build Coastguard Worker    movq                xm6, [srcq+ r6]
4138*c0909341SAndroid Build Coastguard Worker    movhps              xm5, [srcq+ r7]
4139*c0909341SAndroid Build Coastguard Worker    movhps              xm6, [srcq+ r9]
4140*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r10], 1
4141*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+r11], 1
4142*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m9, [srcq+r13]
4143*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [srcq+ rX]
4144*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4145*c0909341SAndroid Build Coastguard Worker    mov                 myd, [rsp+52]  ; restore my (spilled in .vloop) and dy
4146*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
4147*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m7, 0xc0  ; place the broadcast qwords in the top qword
4148*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m8, 0xc0
4149*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m9, 0xc0
4150*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m11, 0xc0
4151*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15  ; horizontal taps held in m15/m10
4152*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m10
4153*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
4154*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m10
4155*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m4  ; two partial sums (one dword) per pixel
4156*c0909341SAndroid Build Coastguard Worker    phaddw               m5, m6
4157*c0909341SAndroid Build Coastguard Worker    psrld                m4, m3, 16
4158*c0909341SAndroid Build Coastguard Worker    pslld                m6, m5, 16
4159*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4  ; first row: full sum in the low word of each dword
4160*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6  ; second row: full sum in the high word of each dword
4161*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m5, 0xaa  ; interleave: low word = first row, high word = second row
4162*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12  ; scale by m12 (set outside this view)
4163*c0909341SAndroid Build Coastguard Worker    jmp .vloop
; .dy1: dispatch to a width-specific .dy1_w* handler through a table of
; 16-bit offsets that are relative to base_reg.
4164*c0909341SAndroid Build Coastguard Worker.dy1:
4165*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]  ; table entry selected by wq (index computed outside this view)
4166*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg  ; offset -> absolute address
4167*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .dy1_w2 (assembled only for `put`): 2-pixel-wide path of the dy1 variant.
; Filters seven source rows horizontally up front, then each loop iteration
; loads two more rows and stores two output rows of 2 bytes each.
; m8, m9, m10, m12, m13 and m14 are expected to be set by code outside this view.
4168*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4169*c0909341SAndroid Build Coastguard Worker.dy1_w2:
4170*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4171*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b  ; keep only the low byte of t0
4172*c0909341SAndroid Build Coastguard Worker    dec                srcq  ; start one byte to the left
4173*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4174*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m8  ; interleave m9 and m8 dwords (presumably 0 and dx - confirm)
4175*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-1]
4176*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pd_0x4000]  ; replacement filter dword used by the pblendvb below
4177*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
4178*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10  ; m10 presumably holds the 0x3ff fraction mask - confirm
4179*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6  ; per-pixel filter index within the set
4180*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8  ; + t0 low byte = subpel_filters row
4181*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4182*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
4183*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_dw]
4184*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_s_shuf2]
4185*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]  ; bytes 2..5 of the 8-byte entry (the middle 4 taps), pixel 0
4186*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]  ; same for pixel 1
4187*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9  ; mask of pixels whose filter index equals m9 (presumably zero)
4188*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10  ; drop the low 10 bits: per-pixel source offset
4189*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]  ; rows 0 and 1 -> xm0
4190*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*2]  ; rows 2 and 3 -> xm1
4191*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*1]
4192*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ss3q ]
4193*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4194*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6  ; ZF set when my >> 6 == 0; consumed by the cmovnz below
4195*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24  ; default vertical filter: byte 3 = 64, all other taps 0
4196*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]  ; lea and mov leave the flags from shr intact
4197*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]  ; non-zero index: load the 8-byte table entry
4198*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5  ; combine source offsets with the shuffle tables
4199*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6  ; m14 = byte shuffle used on every loaded row
4200*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [srcq+ssq*0], 1  ; row 4 -> upper lane of m0
4201*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [srcq+ssq*2], 1  ; row 6 -> upper lane of m1
4202*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*1]  ; row 5
4203*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
4204*c0909341SAndroid Build Coastguard Worker    movq               xm10, r4q
4205*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xm10, xm10  ; 8 vertical taps as signed words
4206*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m7, 0xaa  ; even dwords: pixel 0 taps, odd dwords: pixel 1 taps
4207*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m8  ; where the mask is set use pd_0x4000 instead of the table taps
4208*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm10, q0000  ; vertical taps 0/1
4209*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm10, q1111  ; vertical taps 2/3
4210*c0909341SAndroid Build Coastguard Worker    pshufd             xm11, xm10, q3333  ; vertical taps 6/7 (read before xm10 is overwritten)
4211*c0909341SAndroid Build Coastguard Worker    pshufd             xm10, xm10, q2222  ; vertical taps 4/5
4212*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0xc0  ; row 5 into the top qword of m0
4213*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
4214*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
4215*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
4216*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
4217*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1  ; horizontal sums of the first seven rows
4218*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12  ; scale by m12 (set outside this view)
4219*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4220*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm1, xm0, 4
4221*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm1, q2121
4222*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0, xm2       ; 01 12
4223*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm2            ; 23 34
4224*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm4       ; 45 56
4225*c0909341SAndroid Build Coastguard Worker.dy1_w2_loop:
4226*c0909341SAndroid Build Coastguard Worker    movq                xm1, [srcq+ssq*0]  ; load the next two source rows
4227*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ssq*1]
4228*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4229*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm3, xm8  ; row pairs * vertical taps 0/1
4230*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm0, xm9  ; * taps 2/3
4231*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm7, xm2, xm10  ; * taps 4/5
4232*c0909341SAndroid Build Coastguard Worker    mova                xm3, xm0  ; slide the row-pair window for the next iteration
4233*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm2
4234*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm13  ; m13 added before the shift; presumably the rounding bias
4235*c0909341SAndroid Build Coastguard Worker    paddd               xm6, xm7
4236*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm14  ; horizontal filter on the two new rows
4237*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm15
4238*c0909341SAndroid Build Coastguard Worker    phaddw              xm1, xm1
4239*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm12
4240*c0909341SAndroid Build Coastguard Worker    palignr             xm7, xm1, xm4, 12  ; previous last row followed by the first new row
4241*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm7, xm1     ; 67 78
4242*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm7, xm2, xm11  ; * taps 6/7
4243*c0909341SAndroid Build Coastguard Worker    mova                xm4, xm1  ; keep the new rows for the next palignr
4244*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm6
4245*c0909341SAndroid Build Coastguard Worker    paddd               xm5, xm7
4246*c0909341SAndroid Build Coastguard Worker    psrad               xm5, rndshift
4247*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5
4248*c0909341SAndroid Build Coastguard Worker    packuswb            xm5, xm5  ; saturate to unsigned bytes
4249*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm5, 0  ; 2 pixels of the first output row
4250*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm5, 1  ; 2 pixels of the second output row
4251*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4252*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2  ; two rows per iteration
4253*c0909341SAndroid Build Coastguard Worker    jg .dy1_w2_loop
4254*c0909341SAndroid Build Coastguard Worker    RET
4255*c0909341SAndroid Build Coastguard Worker%endif
; .dy1_w4: 4-pixel-wide path of the dy1 variant. Filters seven source rows
; horizontally up front, then each loop iteration loads two more rows and
; emits two output rows (4 bytes each for put, 4 words each for prep).
; m8, m9, m10, m12, m13 and m14 are expected to be set by code outside this view.
4256*c0909341SAndroid Build Coastguard Worker.dy1_w4:
4257*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4258*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+rescale_mul]
4259*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b  ; keep only the low byte of t0
4260*c0909341SAndroid Build Coastguard Worker    dec                srcq  ; start one byte to the left
4261*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4262*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m7  ; multiply m8 (presumably dx) by the rescale_mul table
4263*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pd_0x4000]  ; replacement filter dword used by the pblendvb below
4264*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
4265*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
4266*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10  ; m10 presumably holds the 0x3ff fraction mask - confirm
4267*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6  ; per-pixel filter index within the set
4268*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8  ; + t0 low byte = subpel_filters row
4269*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m8, q3120
4270*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4271*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 2
4272*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm15, 1
4273*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm15, 3
4274*c0909341SAndroid Build Coastguard Worker    movd               xm15, [base+subpel_filters+r4*8+2]  ; bytes 2..5 of the 8-byte entry (the middle 4 taps)
4275*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
4276*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*0]  ; row 0
4277*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ssq*2]  ; row 2
4278*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_dw]
4279*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [base+subpel_s_shuf2]
4280*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9  ; mask of pixels whose filter index equals m9 (presumably zero)
4281*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10  ; drop the low 10 bits: per-pixel source offset
4282*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r11*8+2], 1
4283*c0909341SAndroid Build Coastguard Worker    vpblendd             m7, [base+subpel_filters+r13*8+2-20], 0x20  ; -20 so that dword 5 of the 32-byte operand is the wanted 4 bytes
4284*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*1], 1  ; row 1 -> upper lane
4285*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ss3q ], 1  ; row 3 -> upper lane
4286*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4287*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6  ; ZF set when my >> 6 == 0; consumed by the cmovnz below
4288*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24  ; default vertical filter: byte 3 = 64, all other taps 0
4289*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]  ; lea and mov leave the flags from shr intact
4290*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]  ; non-zero index: load the 8-byte table entry
4291*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5  ; combine source offsets with the shuffle tables
4292*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6  ; m14 = byte shuffle used on every loaded row
4293*c0909341SAndroid Build Coastguard Worker    movu                xm4, [srcq+ssq*0]  ; row 4
4294*c0909341SAndroid Build Coastguard Worker    movu                xm5, [srcq+ssq*2]  ; row 6
4295*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+ssq*1], 1  ; row 5 -> upper lane
4296*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
4297*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m7, 0x30  ; merge dwords 4-5 from m7 into the filter register
4298*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m15  ; duplicate the low qword of each lane
4299*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m8  ; where the mask is set use pd_0x4000 instead of the table taps
4300*c0909341SAndroid Build Coastguard Worker    movq               xm10, r4q
4301*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xm10, xm10
4302*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m10, xm10  ; 8 vertical taps as signed words in each lane
4303*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
4304*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
4305*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m14
4306*c0909341SAndroid Build Coastguard Worker    pshufb              xm5, xm14
4307*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120
4308*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
4309*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120
4310*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
4311*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000  ; vertical taps 0/1
4312*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111  ; vertical taps 2/3
4313*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222  ; vertical taps 4/5
4314*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333  ; vertical taps 6/7
4315*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
4316*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
4317*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
4318*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
4319*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3  ; horizontal sums of the first seven rows
4320*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
4321*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m12  ; scale by m12 (set outside this view)
4322*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
4323*c0909341SAndroid Build Coastguard Worker    palignr              m5, m4, m2, 4
4324*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m4, q2121
4325*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2, m5     ; 01 12
4326*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m5     ; 23 34
4327*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m3     ; 45 56
4328*c0909341SAndroid Build Coastguard Worker.dy1_w4_loop:
4329*c0909341SAndroid Build Coastguard Worker    movu               xm11, [srcq+ssq*0]  ; load the next two source rows
4330*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [srcq+ssq*1], 1
4331*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4332*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m7  ; row pairs * vertical taps 0/1
4333*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m8  ; * taps 2/3
4334*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m9  ; * taps 4/5
4335*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1  ; slide the row-pair window for the next iteration
4336*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4337*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13  ; m13 added before the shift; presumably the rounding bias
4338*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
4339*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m14  ; horizontal filter on the two new rows
4340*c0909341SAndroid Build Coastguard Worker    vpermq              m11, m11, q3120
4341*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m15
4342*c0909341SAndroid Build Coastguard Worker    phaddw              m11, m11
4343*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m12
4344*c0909341SAndroid Build Coastguard Worker    palignr              m6, m11, m3, 12  ; previous last row followed by the first new row
4345*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m11    ; 67 78
4346*c0909341SAndroid Build Coastguard Worker    mova                 m3, m11  ; keep the new rows for the next palignr
4347*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m10  ; * taps 6/7
4348*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4349*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4350*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
4351*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4352*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5  ; 8 dword sums -> 8 signed words
4353*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4354*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4  ; saturate to unsigned bytes
4355*c0909341SAndroid Build Coastguard Worker    pshuflw             xm4, xm4, q3120  ; reorder words so each dword holds one output row
4356*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm4  ; 4 pixels of the first output row
4357*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm4, 1  ; 4 pixels of the second output row
4358*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4359*c0909341SAndroid Build Coastguard Worker%else
4360*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm4, q3120  ; reorder dwords so each qword holds one output row
4361*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4  ; two rows of 4 words
4362*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
4363*c0909341SAndroid Build Coastguard Worker%endif
4364*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2  ; two rows per iteration
4365*c0909341SAndroid Build Coastguard Worker    jg .dy1_w4_loop
4366*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET
; Entry stubs for widths 8..128 of the dy1 variant. Each stores a counter at
; [rsp+72] (1, 2, 4, 8, 16 = width / 8; .dy1_hloop_prep decrements it once per
; 8-pixel column) and passes a stride of 2 * width to movifprep, then joins
; .dy1_w_start. movifprep is a macro defined outside this view; presumably it
; only emits the move in the prep build - confirm.
4367*c0909341SAndroid Build Coastguard Worker.dy1_w8:
4368*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+72], 1  ; one 8-pixel column
4369*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
4370*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4371*c0909341SAndroid Build Coastguard Worker.dy1_w16:
4372*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+72], 2  ; two 8-pixel columns
4373*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
4374*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4375*c0909341SAndroid Build Coastguard Worker.dy1_w32:
4376*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+72], 4  ; four 8-pixel columns
4377*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
4378*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4379*c0909341SAndroid Build Coastguard Worker.dy1_w64:
4380*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+72], 8  ; eight 8-pixel columns
4381*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
4382*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
4383*c0909341SAndroid Build Coastguard Worker.dy1_w128:
4384*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+72], 16  ; sixteen 8-pixel columns
4385*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256  ; falls through into .dy1_w_start
; .dy1_w_start: shared setup for widths >= 8 of the dy1 variant. Selects the
; vertical filter once, saves the per-column state on the stack, widens the
; x step to 8 pixels and enters .dy1_hloop for the first 8-pixel column.
; Stack slots written here: [rsp+76] t0, [rsp+80] srcq, [rsp+88] dst/tmp
; pointer, [rsp+96] the 8 vertical taps as words.
4386*c0909341SAndroid Build Coastguard Worker.dy1_w_start:
4387*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4388*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4389*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
4390*c0909341SAndroid Build Coastguard Worker%endif
4391*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16  ; use the upper 16 bits of t0 (the w2/w4 paths use the low byte)
4392*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3  ; start three bytes to the left
4393*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6  ; ZF set when my >> 6 == 0; consumed by the cmovnz below
4394*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24  ; default vertical filter: byte 3 = 64, all other taps 0
4395*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]  ; lea and mov leave the flags from shr intact
4396*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]  ; non-zero index: load the 8-byte table entry
4397*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]  ; multiply m8 (presumably dx) by the rescale_mul table
4398*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4399*c0909341SAndroid Build Coastguard Worker    mov            [rsp+76], t0d  ; saved for .dy1_hloop_prep
4400*c0909341SAndroid Build Coastguard Worker    mov            [rsp+80], srcq  ; source pointer of the column start, reloaded per column
4401*c0909341SAndroid Build Coastguard Worker    mov            [rsp+88], r0q ; dstq / tmpq
4402*c0909341SAndroid Build Coastguard Worker%if UNIX64
4403*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd  ; save h so .dy1_hloop_prep can reload it
4404*c0909341SAndroid Build Coastguard Worker%endif
4405*c0909341SAndroid Build Coastguard Worker    shl           dword dxm, 3 ; dx*8
4406*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xm15
4407*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-7]
4408*c0909341SAndroid Build Coastguard Worker    movq                xm0, r4q
4409*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm0, xm0  ; 8 vertical taps as signed words
4410*c0909341SAndroid Build Coastguard Worker    mova           [rsp+96], xm0  ; keep them on the stack for the column loop
4411*c0909341SAndroid Build Coastguard Worker    jmp .dy1_hloop
; .dy1_hloop_prep: step to the next 8-pixel column, or finish when the column
; counter at [rsp+72] reaches zero. Restores the state saved by .dy1_w_start
; and falls through into .dy1_hloop.
4412*c0909341SAndroid Build Coastguard Worker.dy1_hloop_prep:
4413*c0909341SAndroid Build Coastguard Worker    dec      dword [rsp+72]  ; one column done
4414*c0909341SAndroid Build Coastguard Worker    jz .ret  ; no columns left (.ret is outside this view)
4415*c0909341SAndroid Build Coastguard Worker    add      qword [rsp+88], 8*(isprep+1)  ; advance the output pointer by 8, or 16 when isprep is 1
4416*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm  ; reload the row count
4417*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm  ; dx*8 (dxm was shifted left by 3 in .dy1_w_start)
4418*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pd_0x3ff]
4419*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8, [rsp+32]  ; previous column's x positions (stored by .dy1_hloop) + dx*8
4420*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+76]  ; t0 value saved by .dy1_w_start
4421*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9  ; m9 = 0
4422*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+80]  ; source pointer saved by .dy1_w_start
4423*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+88] ; dstq / tmpq
; .dy1_hloop: horizontal setup for one 8-column strip of the dy1 path.
; On the path through .dy1_hloop_prep: m14 = per-column x positions,
; m15 = broadcast filter-table base index, m10 = pd_0x3ff mask, m9 = 0.
; NOTE(review): the first-strip entry code is outside this chunk; it presumably
; establishes the same register state -- confirm.
4424*c0909341SAndroid Build Coastguard Worker.dy1_hloop:
4425*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [base+pq_0x40000000]
; m6 = (position & m10) >> 6 is the per-column subpel filter index; adding it
; to m15 gives the row index into subpel_filters (rows are 8 bytes apart).
4426*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
4427*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
4428*c0909341SAndroid Build Coastguard Worker    paddd               m15, m6
; m6 becomes all-ones in every column whose filter index equals m9 (zero);
; those columns get m11 substituted for their filter further down.
4429*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
; Move the eight filter row indices into GPRs (rX is an alias defined
; outside this chunk).
4430*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m15, 1
4431*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4432*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 2
4433*c0909341SAndroid Build Coastguard Worker    pextrd              r7d, xm15, 1
4434*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm15, 3
4435*c0909341SAndroid Build Coastguard Worker    movd               r10d, xm7
4436*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm7, 2
4437*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm7, 1
4438*c0909341SAndroid Build Coastguard Worker    pextrd              rXd, xm7, 3
; Keep the unshifted positions; .dy1_hloop_prep adds the strip step to them.
4439*c0909341SAndroid Build Coastguard Worker    movu           [rsp+32], m14
; Gather the 8-byte filter rows. After the vpblendd fix-ups below:
;   m15 = filters for columns 0,1 (low lane) and 4,5 (high lane)
;   m10 = filters for columns 2,3 (low lane) and 6,7 (high lane)
4440*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+subpel_filters+ r4*8]
4441*c0909341SAndroid Build Coastguard Worker    movq               xm10, [base+subpel_filters+ r6*8]
4442*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+subpel_filters+ r7*8]
4443*c0909341SAndroid Build Coastguard Worker    movhps             xm10, [base+subpel_filters+ r9*8]
4444*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, [base+subpel_filters+r10*8], 1
4445*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [base+subpel_filters+r11*8], 1
4446*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m9, [base+subpel_filters+r13*8]
4447*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
; positions >> 10: the per-column source offsets, reused as [srcq+reg]
; addresses by .dy1_vloop. The GPRs are recycled for them.
4448*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
4449*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m14, 1
; NOTE(review): the reader of this [rsp+64] slot is not visible in this chunk.
4450*c0909341SAndroid Build Coastguard Worker    movq           [rsp+64], xm14
4451*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm14
4452*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm14, 2
4453*c0909341SAndroid Build Coastguard Worker    pextrd              r7d, xm14, 1
4454*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm14, 3
4455*c0909341SAndroid Build Coastguard Worker    movd               r10d, xm7
4456*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm7, 2
4457*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm7, 1
4458*c0909341SAndroid Build Coastguard Worker    pextrd              rXd, xm7, 3
; Widen the per-dword "index == 0" mask to qword granularity so it lines up
; with the 8-byte filters: m5 covers columns 0,1|4,5 and m6 covers 2,3|6,7.
4459*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q1100
4460*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
; Patch the top qword of each register with the column 5 / column 7 filter.
4461*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m9, 0xc0
4462*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m8, 0xc0
; Columns with filter index 0 use m11 (pq_0x40000000) instead of a table row.
; NOTE(review): by its name that constant is a single tap of 64 -- confirm
; against its definition.
4463*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m5
4464*c0909341SAndroid Build Coastguard Worker    pblendvb            m10, m11, m6
4465*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [base+subpel_s_shuf8]
; MC_8TAP_SCALED_H is defined outside this chunk; going by the trailing
; comments, each invocation leaves two horizontally filtered rows in m0..m3.
4466*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
4467*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
4468*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
4469*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
; Spill the second filter set: m10 is reused below, and .dy1_vloop reads the
; filters back from [rsp].
4470*c0909341SAndroid Build Coastguard Worker    movu              [rsp], m10
; Broadcast four dwords from [rsp+0x60..0x6c] for the pmaddwd in .dy1_vloop.
; NOTE(review): presumably the vertical filter as four word pairs, stored by
; dy1 setup code outside this chunk -- confirm.
4471*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [rsp+0x60]
4472*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [rsp+0x64]
4473*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [rsp+0x68]
4474*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [rsp+0x6c]
4475*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14    ; 01a 01b
4476*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14    ; 23a 23b
4477*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14    ; 45a 45b
4478*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14    ; 67a 67b
; m14 now holds the wswap shuffle used by .dy1_vloop to slide the row window.
4479*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [base+wswap]
; .dy1_vloop: emit one output row of the current 8-column strip per iteration.
; m0-m3 hold the row pairs 01/23/45/67 prepared by .dy1_hloop, m8-m11 the four
; dwords broadcast from [rsp+0x60..0x6c], m15 and [rsp] the two horizontal
; filter sets, and r4/r6/r7/r9/r10/r11/r13/rX the per-column source offsets.
4480*c0909341SAndroid Build Coastguard Worker.dy1_vloop:
; Vertical filter: multiply each row pair by its coefficient pair and sum.
4481*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
4482*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m9
4483*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m10
4484*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11
4485*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4486*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
; m13 is added before the shift. NOTE(review): presumably the rounding bias,
; set up outside this chunk.
4487*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
4488*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4489*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
; Narrow the eight dword sums to eight signed-saturated words in xm4.
4490*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4491*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
4492*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
; put: saturate to unsigned bytes and store 8 pixels, then step one dst row.
4493*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
4494*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xm4
4495*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
4496*c0909341SAndroid Build Coastguard Worker%else
; otherwise: store the eight 16-bit intermediates and step by tmp_stridem.
4497*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
4498*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
4499*c0909341SAndroid Build Coastguard Worker%endif
4500*c0909341SAndroid Build Coastguard Worker    dec                  hd
4501*c0909341SAndroid Build Coastguard Worker    jz .dy1_hloop_prep  ; strip finished: move on to the next 8 columns
; Fetch 8 source bytes per column for the next source row, using the same
; register layout as the filter gather in .dy1_hloop
; (m4 = columns 0,1|4,5 and m5 = columns 2,3|6,7).
4502*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ r4]
4503*c0909341SAndroid Build Coastguard Worker    movq                xm5, [srcq+ r6]
4504*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ r7]
4505*c0909341SAndroid Build Coastguard Worker    movhps              xm5, [srcq+ r9]
4506*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r10], 1
4507*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r11], 1
4508*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+r13]
4509*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [srcq+ rX]
; One source row is consumed per output row on this path.
4510*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
; Shuffle every row pair with wswap (m14), so that the pblendw chain below can
; drop the next row into the odd word of each dword.
; NOTE(review): by its name wswap swaps the two words of each dword -- confirm.
4511*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
4512*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
4513*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
4514*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
4515*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m6, 0xc0
4516*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m7, 0xc0
; Horizontal 8-tap filter of the new row: pmaddubsw gives four partial sums
; per column, phaddw reduces them to two, and the pslld/paddw pair folds those
; into the high word of each dword.
4517*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
4518*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, [rsp]
4519*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
4520*c0909341SAndroid Build Coastguard Worker    pslld                m5, m4, 16
4521*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
4522*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
; Slide the window down one row: each register takes its odd words (0xaa)
; from the next register, and m3 takes them from the newly filtered row.
4523*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m1, 0xaa
4524*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m2, 0xaa
4525*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m3, 0xaa
4526*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa
4527*c0909341SAndroid Build Coastguard Worker    jmp .dy1_vloop
; .dy2: dispatch for the dy2 code paths. Reads a 16-bit entry from
; %1_8tap_scaled_avx2_dy2_table at index wq, adds base_reg to form the target
; address, and jumps there.
; NOTE(review): the table is defined outside this chunk; its entries are
; presumably the .dy2_w* labels below, with wq holding a width index.
4528*c0909341SAndroid Build Coastguard Worker.dy2:
4529*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
4530*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
4531*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .dy2_w2 (assembled for the put variant only): 2-pixel-wide blocks on the dy2
; path. Each loop iteration reads four source rows and writes two output
; rows, i.e. the vertical step is two source rows per output row.
4532*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4533*c0909341SAndroid Build Coastguard Worker.dy2_w2:
4534*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4535*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b  ; keep only the low byte of t0
4536*c0909341SAndroid Build Coastguard Worker    dec                srcq       ; filter window starts one pixel to the left
4537*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
; NOTE(review): m8/m9/m10/m14 are live-in from code outside this chunk;
; presumably m8 = dx, m9 = 0, m10 = 0x3ff mask, m14 = mx.
4538*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m8
4539*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-1]
4540*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pd_0x4000]
4541*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
; (position & m10) >> 6 is the per-column filter index; adding it to the
; broadcast t0 byte gives the subpel_filters row for each column.
4542*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
4543*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
4544*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8
4545*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4546*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
4547*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_dw]
4548*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+subpel_s_shuf2]
; Only the four bytes at offset 2 of each 8-byte filter row are loaded.
4549*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
4550*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
; m8 = mask of columns whose filter index equals m9; those get m11 below.
4551*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9
4552*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
; Load source rows 0..3: rows 0 and 2 go into the two qwords of the low lane,
; rows 1 and 3 are broadcast and blended into the high lane further down.
4553*c0909341SAndroid Build Coastguard Worker    movq                xm0, [srcq+ssq*0]
4554*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [srcq+ssq*1]
4555*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [srcq+ssq*2]
4556*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [srcq+ss3q ]
4557*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Build the per-column pshufb control from the positions >> 10 and the two
; shuffle tables loaded above.
4558*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
4559*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
; Interleave the two columns' filters dword-wise, then substitute m11
; (pd_0x4000) for the masked columns.
4560*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m7, 0xaa
4561*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m8
; Rows 4 and 5 go into the top qword of each lane of m1.
4562*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [srcq+ssq*0]
4563*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [srcq+ssq*1]
4564*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; Vertical filter select: shr sets ZF when my>>6 is zero, and mov/lea leave
; flags untouched, so cmovnz keeps the 64<<24 default in that case and loads
; the subpel_filters row at index t1 + (my>>6) otherwise.
4565*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4566*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
4567*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4568*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
4569*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x30
4570*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0xc0
4571*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m3, 0xc0
; Horizontal filter of rows 0..5.
4572*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
4573*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
4574*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
4575*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
; Sign-extend the 8 vertical taps to words; xm8..xm11 then each hold one
; broadcast pair of taps for pmaddwd.
4576*c0909341SAndroid Build Coastguard Worker    movq               xm11, r4q
4577*c0909341SAndroid Build Coastguard Worker    pmovsxbw           xm11, xm11
4578*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
4579*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12            ; 0 2 _ 4  1 3 _ 5
4580*c0909341SAndroid Build Coastguard Worker    pshufd              xm8, xm11, q0000
4581*c0909341SAndroid Build Coastguard Worker    pshufd              xm9, xm11, q1111
4582*c0909341SAndroid Build Coastguard Worker    pshufd             xm10, xm11, q2222
4583*c0909341SAndroid Build Coastguard Worker    pshufd             xm11, xm11, q3333
4584*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q3110      ; 0 2 2 4  1 3 3 5
4585*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m2, 1
4586*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm2, xm1       ; 01 23
4587*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm1            ; 23 45
; Loop: xm3/xm2 hold the four oldest row pairs; m0 holds the filtered-row
; history that palignr extends with rows 6..9.
4588*c0909341SAndroid Build Coastguard Worker.dy2_w2_loop:
4589*c0909341SAndroid Build Coastguard Worker    movq                xm6, [srcq+ssq*0]
4590*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [srcq+ssq*1]
4591*c0909341SAndroid Build Coastguard Worker    movhps              xm6, [srcq+ssq*2]
4592*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [srcq+ss3q ]
4593*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Start the vertical sum with the first two tap pairs while the new rows load.
4594*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm4, xm3, xm8
4595*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm5, xm2, xm9
4596*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m7, 0x30
4597*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m1, 0xc0
4598*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
4599*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
4600*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m6
4601*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m12
; Append the four new rows to the history and rebuild the row pairs.
4602*c0909341SAndroid Build Coastguard Worker    palignr              m0, m6, m0, 8
4603*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q3221
4604*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m2, 1
4605*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm2, xm1       ; 45 67
4606*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm1            ; 67 89
4607*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm6, xm3, xm10
4608*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm7, xm2, xm11
4609*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm5
4610*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm13
4611*c0909341SAndroid Build Coastguard Worker    paddd               xm6, xm7
4612*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm6
4613*c0909341SAndroid Build Coastguard Worker    psrad               xm4, rndshift
; Narrow to bytes and store two pixels (one word) to each of two rows.
4614*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
4615*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
4616*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm4, 0
4617*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm4, 1
4618*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4619*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4620*c0909341SAndroid Build Coastguard Worker    jg .dy2_w2_loop
4621*c0909341SAndroid Build Coastguard Worker    RET
4622*c0909341SAndroid Build Coastguard Worker%endif
; .dy2_w4: setup for 4-pixel-wide blocks on the dy2 path. Computes the four
; per-column horizontal filters and shuffle controls, filters source rows 0..5
; horizontally, and leaves the row pairs 01|23 in m0 and 23|45 in m1 for
; .dy2_w4_loop, with the vertical tap pairs broadcast in m8..m11.
4623*c0909341SAndroid Build Coastguard Worker.dy2_w4:
4624*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4625*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+rescale_mul]
4626*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b  ; keep only the low byte of t0
4627*c0909341SAndroid Build Coastguard Worker    dec                srcq       ; filter window starts one pixel to the left
4628*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
; NOTE(review): m8 is live-in (presumably dx); multiplying by rescale_mul
; yields the per-column multiples named in the comment below.
4629*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m7
4630*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pd_0x4000]
4631*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm15, xm15
4632*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
; (position & m10) >> 6 is the per-column filter index, offset by the
; broadcast t0 byte to give four subpel_filters row indices.
4633*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
4634*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
4635*c0909341SAndroid Build Coastguard Worker    paddd              xm15, xm8
4636*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4637*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 1
4638*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm15, 2
4639*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm15, 3
; Gather four bytes (offset 2 of each 8-byte row) per column into xm15; the
; pinsrd loads are interleaved with the source row loads.
4640*c0909341SAndroid Build Coastguard Worker    movd               xm15, [base+subpel_filters+r4*8+2]
4641*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+bdct_lb_dw]
4642*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [base+subpel_s_shuf2]
4643*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
; m8 = mask of columns whose filter index equals m9; those get m11 below.
4644*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9
4645*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
4646*c0909341SAndroid Build Coastguard Worker    movu                xm0, [srcq+ssq*0]
4647*c0909341SAndroid Build Coastguard Worker    movu                xm2, [srcq+ssq*2]
4648*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
4649*c0909341SAndroid Build Coastguard Worker    movu                xm1, [srcq+ssq*1]
4650*c0909341SAndroid Build Coastguard Worker    movu                xm3, [srcq+ss3q ]
4651*c0909341SAndroid Build Coastguard Worker    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
4652*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Vertical filter select: shr sets ZF when my>>6 is zero, and mov/lea leave
; flags untouched, so cmovnz keeps the 64<<24 default in that case and loads
; the subpel_filters row at index t1 + (my>>6) otherwise.
4653*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4654*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
4655*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4656*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
4657*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, xm15, 1  ; same four filters in both lanes
4658*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
4659*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
; Rows 4 and 5 go into the high lanes of m2 and m3.
4660*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [srcq+ssq*0], 1
4661*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+ssq*1], 1
4662*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4663*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m8
; Horizontal filter of rows 0..5.
4664*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm14
4665*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
4666*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm14
4667*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
4668*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm15
4669*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
4670*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm15
4671*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
; Sign-extend the 8 vertical taps to words in both lanes of m11.
4672*c0909341SAndroid Build Coastguard Worker    movq               xm11, r4q
4673*c0909341SAndroid Build Coastguard Worker    punpcklqdq         xm11, xm11
4674*c0909341SAndroid Build Coastguard Worker    pmovsxbw            m11, xm11
4675*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2
4676*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m3
4677*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12    ; 0 2  _ 4
4678*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12    ; 1 3  _ 5
; m8..m11 = one broadcast pair of vertical taps each.
4679*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m11, q0000
4680*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m11, q1111
4681*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m11, q2222
4682*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m11, q3333
4683*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm0, xm1
4684*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m1     ; 23 45
4685*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m2, xm1, 1 ; 01 23
; .dy2_w4_loop: each iteration reads four more source rows and produces two
; output rows of four pixels. On entry m0 = row pairs 01|23 and m1 = 23|45;
; on exit they have advanced to 45|67 and 67|89.
4686*c0909341SAndroid Build Coastguard Worker.dy2_w4_loop:
; m6 = source rows +0 (low lane) and +2 (high lane), m7 = rows +1 and +3.
4687*c0909341SAndroid Build Coastguard Worker    movu                xm6, [srcq+ssq*0]
4688*c0909341SAndroid Build Coastguard Worker    movu                xm7, [srcq+ssq*1]
4689*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [srcq+ssq*2], 1
4690*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [srcq+ss3q ], 1
4691*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; First half of the vertical sum, using the row pairs already available.
4692*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
4693*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m9
; Horizontal filter of the four new rows.
4694*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
4695*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
4696*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
4697*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
; Fold the two partial sums of each dword into its low word (m6) or its high
; word (m7), so that pblendw can interleave the two registers.
4698*c0909341SAndroid Build Coastguard Worker    psrld                m2, m6, 16
4699*c0909341SAndroid Build Coastguard Worker    pslld                m3, m7, 16
4700*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2
4701*c0909341SAndroid Build Coastguard Worker    paddw                m7, m3
4702*c0909341SAndroid Build Coastguard Worker    pblendw              m6, m7, 0xaa   ; 67 89
4703*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m12
4704*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
; 0x21 selects the high lane of m1 and the low lane of m6.
4705*c0909341SAndroid Build Coastguard Worker    vperm2i128           m0, m1, m6, 0x21 ; 45 67
4706*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
; Second half of the vertical sum, then add m13, shift and narrow.
4707*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m0, m10
4708*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m1, m11
4709*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
4710*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
4711*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4712*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
4713*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4714*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
4715*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
; put: saturate to bytes and store 4 pixels to each of two destination rows.
4716*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
4717*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm4
4718*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm4, 1
4719*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
4720*c0909341SAndroid Build Coastguard Worker%else
; otherwise: store both rows (2 x 4 words) contiguously as 16 bytes.
4721*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
4722*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
4723*c0909341SAndroid Build Coastguard Worker%endif
4724*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4725*c0909341SAndroid Build Coastguard Worker    jg .dy2_w4_loop
; MC_8TAP_SCALED_RET is a macro defined outside this chunk.
4726*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET
; Entry points for widths 8..128 on the dy2 path. Each one stores the number
; of 8-column strips (w/8) in [rsp+40], which .dy2_hloop_prep counts down,
; and sets tmp_stridem to 2*w via movifprep before joining the shared setup at
; .dy2_w_start (.dy2_w128 falls through into it).
; NOTE(review): movifprep is defined outside this chunk; by its name it only
; emits the move when assembling the prep variant -- confirm.
4727*c0909341SAndroid Build Coastguard Worker.dy2_w8:
4728*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+40], 1
4729*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
4730*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4731*c0909341SAndroid Build Coastguard Worker.dy2_w16:
4732*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+40], 2
4733*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
4734*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4735*c0909341SAndroid Build Coastguard Worker.dy2_w32:
4736*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+40], 4
4737*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
4738*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4739*c0909341SAndroid Build Coastguard Worker.dy2_w64:
4740*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+40], 8
4741*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
4742*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
4743*c0909341SAndroid Build Coastguard Worker.dy2_w128:
4744*c0909341SAndroid Build Coastguard Worker    mov      dword [rsp+40], 16
4745*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; .dy2_w_start: shared setup for widths >= 8 on the dy2 path. Selects the
; vertical filter, saves the state that .dy2_hloop_prep reloads for every
; further 8-column strip, and enters .dy2_hloop for the first strip.
; Stack slots written here: [rsp+48] = srcq, [rsp+56] = dst/tmp pointer,
; [rsp+64] = t0d (after the shift), [rsp+0x50] = vertical taps as 8 words.
4746*c0909341SAndroid Build Coastguard Worker.dy2_w_start:
4747*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4748*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4749*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
4750*c0909341SAndroid Build Coastguard Worker%endif
; The w2/w4 paths use the low byte of t0; this path uses its upper 16 bits.
4751*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
4752*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3    ; filter window starts three pixels to the left
; Vertical filter select: shr sets ZF when my>>6 is zero, and mov/lea leave
; flags untouched, so cmovnz keeps the 64<<24 default in that case and loads
; the subpel_filters row at index t1 + (my>>6) otherwise.
4753*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
4754*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
4755*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
4756*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
4757*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
4758*c0909341SAndroid Build Coastguard Worker    movd               xm15, t0d
4759*c0909341SAndroid Build Coastguard Worker    mov            [rsp+64], t0d
4760*c0909341SAndroid Build Coastguard Worker    mov            [rsp+48], srcq
4761*c0909341SAndroid Build Coastguard Worker    mov            [rsp+56], r0q ; dstq / tmpq
; On UNIX64 the height is written back to hm so .dy2_hloop_prep can reload it.
4762*c0909341SAndroid Build Coastguard Worker%if UNIX64
4763*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
4764*c0909341SAndroid Build Coastguard Worker%endif
; From here on dxm holds the per-strip position step that .dy2_hloop_prep
; broadcasts into m8.
4765*c0909341SAndroid Build Coastguard Worker    shl           dword dxm, 3 ; dx*8
4766*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, xm15
4767*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-7]
; Sign-extend the 8 vertical taps to words and park them on the stack;
; .dy2_hloop broadcasts them pair by pair from [rsp+0x50..0x5c].
4768*c0909341SAndroid Build Coastguard Worker    movq                xm0, r4q
4769*c0909341SAndroid Build Coastguard Worker    pmovsxbw            xm0, xm0
4770*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], xm0
4771*c0909341SAndroid Build Coastguard Worker    jmp .dy2_hloop
; .dy2_hloop_prep: advance to the next 8-column strip, or finish. Decrements
; the strip counter in [rsp+40], restores the registers that the previous
; strip clobbered, and falls through into .dy2_hloop.
4772*c0909341SAndroid Build Coastguard Worker.dy2_hloop_prep:
4773*c0909341SAndroid Build Coastguard Worker    dec      dword [rsp+40]
4774*c0909341SAndroid Build Coastguard Worker    jz .ret  ; no strips left (.ret is defined outside this chunk)
; Step the saved output pointer by 8*(isprep+1) bytes.
4775*c0909341SAndroid Build Coastguard Worker    add      qword [rsp+56], 8*(isprep+1)
4776*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
; m14 = positions saved at [rsp] by .dy2_hloop plus dxm, which .dy2_w_start
; scaled to dx*8.
4777*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, dxm
4778*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+pd_0x3ff]
4779*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8, [rsp]
4780*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [rsp+64]
4781*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
; Restart from the saved source pointer and the advanced output pointer.
4782*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+48]
4783*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+56] ; dstq / tmpq
; .dy2_hloop: horizontal setup for one 8-column strip of the dy2 path.
; On the path through .dy2_hloop_prep: m14 = per-column x positions,
; m15 = broadcast filter-table base index, m10 = pd_0x3ff mask, m9 = 0.
; NOTE(review): on first entry from .dy2_w_start, m9 and m10 come from code
; outside this chunk and are presumably set the same way -- confirm.
4784*c0909341SAndroid Build Coastguard Worker.dy2_hloop:
4785*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [base+pq_0x40000000]
; m6 = (position & m10) >> 6 is the per-column subpel filter index; adding it
; to m15 gives the row index into subpel_filters (rows are 8 bytes apart).
4786*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
4787*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
4788*c0909341SAndroid Build Coastguard Worker    paddd               m15, m6
; m6 becomes all-ones in every column whose filter index equals m9 (zero).
4789*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
; Move the eight filter row indices into GPRs (rX is an alias defined
; outside this chunk).
4790*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m15, 1
4791*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm15
4792*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm15, 2
4793*c0909341SAndroid Build Coastguard Worker    pextrd              r7d, xm15, 1
4794*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm15, 3
4795*c0909341SAndroid Build Coastguard Worker    movd               r10d, xm7
4796*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm7, 2
4797*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm7, 1
4798*c0909341SAndroid Build Coastguard Worker    pextrd              rXd, xm7, 3
; Keep the unshifted positions; .dy2_hloop_prep adds the strip step to them.
4799*c0909341SAndroid Build Coastguard Worker    movu              [rsp], m14
; Gather the 8-byte filter rows. After the vpblendd fix-ups below:
;   m15 = filters for columns 0,1 (low lane) and 4,5 (high lane)
;   m10 = filters for columns 2,3 (low lane) and 6,7 (high lane)
4800*c0909341SAndroid Build Coastguard Worker    movq               xm15, [base+subpel_filters+ r4*8]
4801*c0909341SAndroid Build Coastguard Worker    movq               xm10, [base+subpel_filters+ r6*8]
4802*c0909341SAndroid Build Coastguard Worker    movhps             xm15, [base+subpel_filters+ r7*8]
4803*c0909341SAndroid Build Coastguard Worker    movhps             xm10, [base+subpel_filters+ r9*8]
4804*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, [base+subpel_filters+r10*8], 1
4805*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [base+subpel_filters+r11*8], 1
4806*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m9, [base+subpel_filters+r13*8]
4807*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
; positions >> 10: the per-column source offsets; the GPRs are recycled for
; them.
4808*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
4809*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m14, 1
4810*c0909341SAndroid Build Coastguard Worker    movd                r4d, xm14
4811*c0909341SAndroid Build Coastguard Worker    pextrd              r6d, xm14, 2
4812*c0909341SAndroid Build Coastguard Worker    pextrd              r7d, xm14, 1
4813*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm14, 3
4814*c0909341SAndroid Build Coastguard Worker    movd               r10d, xm7
4815*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm7, 2
4816*c0909341SAndroid Build Coastguard Worker    pextrd             r13d, xm7, 1
4817*c0909341SAndroid Build Coastguard Worker    pextrd              rXd, xm7, 3
; Widen the per-dword "index == 0" mask to qword granularity so it lines up
; with the 8-byte filters: m5 covers columns 0,1|4,5 and m6 covers 2,3|6,7.
4818*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q1100
4819*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
; Patch the top qword of each register with the column 5 / column 7 filter.
4820*c0909341SAndroid Build Coastguard Worker    vpblendd            m15, m9, 0xc0
4821*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m8, 0xc0
; Columns with filter index 0 use m11 (pq_0x40000000) instead of a table row.
; NOTE(review): by its name that constant is a single tap of 64 -- confirm
; against its definition.
4822*c0909341SAndroid Build Coastguard Worker    pblendvb            m15, m11, m5
4823*c0909341SAndroid Build Coastguard Worker    pblendvb            m10, m11, m6
4824*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [base+subpel_s_shuf8]
; MC_8TAP_SCALED_H is defined outside this chunk; going by the trailing
; comments, each invocation leaves two horizontally filtered rows in m0..m3.
4825*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
4826*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
4827*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
4828*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
; Broadcast the four vertical tap pairs stored at [rsp+0x50] by .dy2_w_start.
; Unlike the dy1 path, m10 is not overwritten here; the fourth pair is loaded
; into m4 instead.
4829*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [rsp+0x50]
4830*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [rsp+0x54]
4831*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [rsp+0x58]
4832*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [rsp+0x5c]
4833*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14    ; 01a 01b
4834*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14    ; 23a 23b
4835*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14    ; 45a 45b
4836*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14    ; 67a 67b
; NOTE(review): SWAP is an x86inc macro; it presumably renames registers at
; assembly time without emitting code, so from here on "m14" names the fourth
; tap pair and "m4" the old shuffle register (free as scratch) -- confirm.
4837*c0909341SAndroid Build Coastguard Worker    SWAP                m14, m4
4838*c0909341SAndroid Build Coastguard Worker.dy2_vloop:
4839*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
4840*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m9
4841*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m11
4842*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m14
4843*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4844*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
4845*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
4846*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4847*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
4848*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4849*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm5
4850*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4851*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
4852*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xm4
4853*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
4854*c0909341SAndroid Build Coastguard Worker%else
4855*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], xm4
4856*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
4857*c0909341SAndroid Build Coastguard Worker%endif
4858*c0909341SAndroid Build Coastguard Worker    dec                  hd
4859*c0909341SAndroid Build Coastguard Worker    jz .dy2_hloop_prep
4860*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
4861*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4862*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4863*c0909341SAndroid Build Coastguard Worker    movq                xm3, [srcq+ r4]
4864*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ r6]
4865*c0909341SAndroid Build Coastguard Worker    movhps              xm3, [srcq+ r7]
4866*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ r9]
4867*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [srcq+r10], 1
4868*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r11], 1
4869*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [srcq+r13]
4870*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+ rX]
4871*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4872*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m5, 0xc0
4873*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m6, 0xc0
4874*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
4875*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m10
4876*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m4
4877*c0909341SAndroid Build Coastguard Worker    movq                xm4, [srcq+ r4]
4878*c0909341SAndroid Build Coastguard Worker    movq                xm5, [srcq+ r6]
4879*c0909341SAndroid Build Coastguard Worker    movhps              xm4, [srcq+ r7]
4880*c0909341SAndroid Build Coastguard Worker    movhps              xm5, [srcq+ r9]
4881*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [srcq+r10], 1
4882*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [srcq+r11], 1
4883*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [srcq+r13]
4884*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, [srcq+ rX]
4885*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4886*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m6, 0xc0
4887*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m7, 0xc0
4888*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
4889*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m10
4890*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
4891*c0909341SAndroid Build Coastguard Worker    psrld                m5, m3, 16
4892*c0909341SAndroid Build Coastguard Worker    pslld                m6, m4, 16
4893*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
4894*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
4895*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa
4896*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12
4897*c0909341SAndroid Build Coastguard Worker    jmp .dy2_vloop
4898*c0909341SAndroid Build Coastguard Worker.ret:
4899*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET 0
4900*c0909341SAndroid Build Coastguard Worker%undef isprep
4901*c0909341SAndroid Build Coastguard Worker%endmacro
4902*c0909341SAndroid Build Coastguard Worker
; BILIN_SCALED_FN <put|prep>
; Emits the <type>_bilin_scaled_8bpc entry point. It only loads the same
; constant into t0d and t1d and then jumps (tail call) into the matching
; <type>_8tap_scaled_8bpc function, so the bilinear case shares that code.
; NOTE(review): 5*15 presumably selects the bilinear rows of subpel_filters
; in both 16-bit halves of the selector - confirm against the 8tap_scaled
; entry code, which is outside this chunk.
4903*c0909341SAndroid Build Coastguard Worker%macro BILIN_SCALED_FN 1
4904*c0909341SAndroid Build Coastguard Workercglobal %1_bilin_scaled_8bpc
4905*c0909341SAndroid Build Coastguard Worker    mov                 t0d, (5*15 << 16) | 5*15  ; same value in the low and high 16 bits
4906*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d  ; t1 receives an identical copy
4907*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
4908*c0909341SAndroid Build Coastguard Worker%endmacro
4909*c0909341SAndroid Build Coastguard Worker
; Scratch-register selection (t0, t1) for the put_*_scaled entry points that
; follow; a different pair is chosen for WIN64 than for the other ABIs.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; binds tN to the N-th listed GPR number - confirm in the shared include.
4910*c0909341SAndroid Build Coastguard Worker%if WIN64
4911*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 5
4912*c0909341SAndroid Build Coastguard Worker%else
4913*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 8
4914*c0909341SAndroid Build Coastguard Worker%endif
4915*c0909341SAndroid Build Coastguard Worker
; Shorthands that prepend the function-family name to an FN invocation, so
; each line below only lists: variant name, two filter types and an
; optional fourth argument.
; NOTE(review): FN is defined outside this chunk; its exact argument
; semantics should be checked there.
4916*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
4917*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
4918*c0909341SAndroid Build Coastguard Worker
; Instantiate the put-side scaled entry points: the bilinear front end, one
; FN stub per filter-type combination, then the shared MC_8TAP_SCALED body.
; NOTE(review): the "regular" line is the only one without a fourth argument
; and sits directly before the body; presumably that stub falls through into
; put_8tap_scaled_8bpc instead of jumping - confirm in the FN macro.
4919*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN put
4920*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
4921*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
4922*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
4923*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
4924*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
4925*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
4926*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
4927*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
4928*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
4929*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED put
4930*c0909341SAndroid Build Coastguard Worker
; Scratch-register selection (t0, t1) for the prep_*_scaled entry points that
; follow. The register numbers differ from the put variants.
; NOTE(review): presumably because prep takes a different argument list, so
; different GPRs are free on entry - confirm against the function prototypes.
4931*c0909341SAndroid Build Coastguard Worker%if WIN64
4932*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5, 4
4933*c0909341SAndroid Build Coastguard Worker%else
4934*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
4935*c0909341SAndroid Build Coastguard Worker%endif
4936*c0909341SAndroid Build Coastguard Worker
; Instantiate the prep-side scaled entry points: the bilinear front end, one
; FN stub per filter-type combination, then the shared MC_8TAP_SCALED body.
; NOTE(review): as on the put side, the "regular" line has no fourth argument
; and directly precedes the body; presumably it falls through - confirm in FN.
4937*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN prep
4938*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
4939*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
4940*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
4941*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
4942*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
4943*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
4944*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_8bpc
4945*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_8bpc
4946*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
4947*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED prep
4948*c0909341SAndroid Build Coastguard Worker
; WARP_V dst, 02, 46, 13, 57
; Vertical pass of the affine warp for one output row of 8 pixels.
; Eight filter indices are formed as (my + k*delta) >> 10 for k = 0..7
; (columns a..h), the 8-byte filters are fetched from filterq, transposed,
; and multiplied against the four row-pair registers given as arguments 2-5.
; The eight dword sums are written to the register given as argument 1.
; Inputs : myd, deltaq, gammaq, filterq, m11 (must be all zero; it is
;          cleared with pxor in warp_affine_8x8 .main).
; Outputs: register of argument 1; myd is advanced for the next row.
; Clobbers: m0, m8, m9, the register of argument 2, tmp1, tmp2.
4949*c0909341SAndroid Build Coastguard Worker%macro WARP_V 5 ; dst, 02, 46, 13, 57
4950*c0909341SAndroid Build Coastguard Worker    ; Can be done using gathers, but that's terribly slow on many CPUs
4951*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [myq+deltaq*4]  ; tmp1 = my + 4*delta (column e)
4952*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [myq+deltaq*1]  ; tmp2 = my + 1*delta (column b)
4953*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10  ; position -> filter index for column a
4954*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10  ; filter index for column e
4955*c0909341SAndroid Build Coastguard Worker    movq                xm8, [filterq+myq  *8]  ; 8 bytes per filter entry
4956*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
4957*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+deltaq*4]  ; tmp1 = my + 5*delta (column f)
4958*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmp2q+deltaq*1]  ; my   = my + 2*delta (column c)
4959*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
4960*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
4961*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+tmp2q*8]
4962*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
4963*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [myq+deltaq*4]  ; tmp1 = my + 6*delta (column g)
4964*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [myq+deltaq*1]  ; tmp2 = my + 3*delta (column d)
4965*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
4966*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
4967*c0909341SAndroid Build Coastguard Worker    movq                xm9, [filterq+myq  *8]
4968*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
4969*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+deltaq*4]  ; tmp1 = my + 7*delta (column h)
4970*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmp2q+gammaq]       ; my += gamma (gamma was pre-reduced by 3*delta in .main)
4971*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
4972*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
4973*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m0  ; interleave words of a/b (low lane) and e/f (high lane)
4974*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+tmp2q*8]
4975*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
4976*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m9, m0  ; interleave words of c/d and g/h
4977*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8, m0  ; first two filter words of every column
4978*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m8, m0  ; last two filter words of every column
4979*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
4980*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
4981*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m8  ; taps 0,2 applied to the 02 row pair (overwrites it)
4982*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m%3  ; taps 4,6 applied to the 46 row pair
4983*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
4984*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
4985*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m%4  ; taps 1,3 applied to the 13 row pair
4986*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m%5  ; taps 5,7 applied to the 57 row pair
4987*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m9  ; sum of the even taps
4988*c0909341SAndroid Build Coastguard Worker    paddd                m0, m8  ; sum of the odd taps
4989*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m0, m%2  ; dst = full 8-tap sum, one dword per column
4990*c0909341SAndroid Build Coastguard Worker%endmacro
4991*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8t_8bpc: affine warp of an 8x8 block that stores 16-bit
; intermediate values to tmp instead of 8-bit pixels.
; All filtering is done by the .main/.main2 routines of warp_affine_8x8_8bpc
; (reached through mangle()); this function only supplies the output stage.
; Each loop pass takes two rows (m7 and m0), and r4d, set to 4 inside .main,
; counts the passes.
; NOTE(review): ts appears to be counted in 16-bit elements, since the second
; row of a pass is stored ts*2 bytes after the first - confirm with the caller.
4992*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
4993*c0909341SAndroid Build Coastguard Worker%if WIN64
4994*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 0xa0  ; same amount as stack_size_padded in warp_affine_8x8_8bpc
4995*c0909341SAndroid Build Coastguard Worker%endif
4996*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
4997*c0909341SAndroid Build Coastguard Worker.loop:
4998*c0909341SAndroid Build Coastguard Worker    psrad                m7, 13  ; first row of the pair
4999*c0909341SAndroid Build Coastguard Worker    psrad                m0, 13  ; second row of the pair
5000*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0  ; dwords -> signed words, packed per 128-bit lane
5001*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m14 ; (x + (1 << 6)) >> 7
5002*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, q3120  ; regroup qwords so each 128-bit half holds one full row
5003*c0909341SAndroid Build Coastguard Worker    mova         [tmpq+tsq*0], xm7  ; first row: 8 words
5004*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmpq+tsq*2], m7, 1  ; second row: 8 words
5005*c0909341SAndroid Build Coastguard Worker    dec                 r4d  ; one row pair done
5006*c0909341SAndroid Build Coastguard Worker    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
5007*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
5008*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+tsq*4]  ; advance the output by two rows
5009*c0909341SAndroid Build Coastguard Worker    jmp .loop
5010*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8_8bpc: affine warp of an 8x8 block, writing 8-bit pixels.
; Layout of this function:
;   .start/.loop  output stage, two rows (m7, m0) per pass
;   .main         one-time setup plus horizontal filtering of the first 7 rows
;   .main2        filters two more source rows and produces two output rows
;   .h            horizontal filter for one source row, result in m0
; .main, .main2 and .end are also used by warp_affine_8x8t_8bpc.
; Constants set in .main and live afterwards: m11 = 0, m12/m13 = byte
; shuffles, m14 = pw_8192, m15 = pd_32768, r4d = number of row pairs left.
; Row-pair registers hold two source rows interleaved as 16-bit words; the
; digits in the existing comments (02, 13, ...) name the rows they hold.
5011*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
5012*c0909341SAndroid Build Coastguard Worker                                        beta, filter, tmp1, delta, my, gamma
; WIN64: the XMM save area is described by hand here; the push itself is
; issued by WIN64_PUSH_XMM inside .main.
5013*c0909341SAndroid Build Coastguard Worker%if WIN64
5014*c0909341SAndroid Build Coastguard Worker    %assign xmm_regs_used 16
5015*c0909341SAndroid Build Coastguard Worker    %assign stack_size_padded 0xa0
5016*c0909341SAndroid Build Coastguard Worker    SUB                 rsp, stack_size_padded
5017*c0909341SAndroid Build Coastguard Worker%endif
5018*c0909341SAndroid Build Coastguard Worker    call .main  ; setup + first two output rows in m7 and m0
5019*c0909341SAndroid Build Coastguard Worker    jmp .start
5020*c0909341SAndroid Build Coastguard Worker.loop:
5021*c0909341SAndroid Build Coastguard Worker    call .main2  ; next two output rows in m7 and m0
5022*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]  ; advance dst by two rows
5023*c0909341SAndroid Build Coastguard Worker.start:
5024*c0909341SAndroid Build Coastguard Worker    psrad                m7, 18
5025*c0909341SAndroid Build Coastguard Worker    psrad                m0, 18
5026*c0909341SAndroid Build Coastguard Worker    packusdw             m7, m0  ; dwords -> unsigned words, packed per 128-bit lane
5027*c0909341SAndroid Build Coastguard Worker    pavgw                m7, m11 ; (x + (1 << 10)) >> 11
5028*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m7, 1
5029*c0909341SAndroid Build Coastguard Worker    packuswb            xm7, xm0  ; words -> unsigned bytes, both rows in one xmm
5030*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm7, q3120  ; regroup dwords so each qword holds one full row
5031*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm7  ; first row: 8 pixels
5032*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm7  ; second row: 8 pixels
5033*c0909341SAndroid Build Coastguard Worker    dec                 r4d  ; one row pair done
5034*c0909341SAndroid Build Coastguard Worker    jg .loop
5035*c0909341SAndroid Build Coastguard Worker.end:
5036*c0909341SAndroid Build Coastguard Worker    RET
5037*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5038*c0909341SAndroid Build Coastguard Worker.main:
5039*c0909341SAndroid Build Coastguard Worker    ; Stack is offset due to call
5040*c0909341SAndroid Build Coastguard Worker    %assign stack_offset stack_offset + gprsize
5041*c0909341SAndroid Build Coastguard Worker    %assign stack_size stack_size + gprsize
5042*c0909341SAndroid Build Coastguard Worker    %assign stack_size_padded stack_size_padded + gprsize
5043*c0909341SAndroid Build Coastguard Worker    movifnidn         abcdq, abcdmp
5044*c0909341SAndroid Build Coastguard Worker    movifnidn           mxd, mxm
5045*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM
5046*c0909341SAndroid Build Coastguard Worker    movsx            alphad, word [abcdq+2*0]  ; alpha = abcd[0], sign-extended 16-bit
5047*c0909341SAndroid Build Coastguard Worker    movsx             betad, word [abcdq+2*1]  ; beta  = abcd[1], sign-extended 16-bit
5048*c0909341SAndroid Build Coastguard Worker    mova                m12, [warp_8x8_shufA]  ; byte shuffle used by .h for the first pmaddubsw
5049*c0909341SAndroid Build Coastguard Worker    mova                m13, [warp_8x8_shufB]  ; byte shuffle used by .h for the second pmaddubsw
5050*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [pw_8192]  ; multiplier for .h and for the 8x8t output stage
5051*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [pd_32768]  ; rounding term added in .h
5052*c0909341SAndroid Build Coastguard Worker    pxor                m11, m11  ; zero, needed by WARP_V and by pavgw in .start
5053*c0909341SAndroid Build Coastguard Worker    lea             filterq, [mc_warp_filter2]
5054*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [ssq*3+3]
5055*c0909341SAndroid Build Coastguard Worker    add                 mxd, 512+(64<<10)  ; bias: +512 before the >>10, plus 64 filter entries
5056*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [alphaq*3]
5057*c0909341SAndroid Build Coastguard Worker    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
5058*c0909341SAndroid Build Coastguard Worker    sub               betad, tmp2d    ; beta -= alpha*3
5059*c0909341SAndroid Build Coastguard Worker    mov                 myd, r6m  ; my is read from argument slot 6
5060*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 0
5061*c0909341SAndroid Build Coastguard Worker    psrld                m1, m0, 16  ; row 0 moved to the low word of each dword
5062*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 1
5063*c0909341SAndroid Build Coastguard Worker    psrld                m4, m0, 16  ; row 1 moved to the low word of each dword
5064*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 2
5065*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m0, 0xaa ; 02
5066*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 3
5067*c0909341SAndroid Build Coastguard Worker    pblendw              m4, m0, 0xaa ; 13
5068*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 4
5069*c0909341SAndroid Build Coastguard Worker    psrld                m2, m1, 16
5070*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m0, 0xaa ; 24
5071*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 5
5072*c0909341SAndroid Build Coastguard Worker    psrld                m5, m4, 16
5073*c0909341SAndroid Build Coastguard Worker    pblendw              m5, m0, 0xaa ; 35
5074*c0909341SAndroid Build Coastguard Worker    call .h  ; source row 6
5075*c0909341SAndroid Build Coastguard Worker    psrld                m3, m2, 16
5076*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m0, 0xaa ; 46
5077*c0909341SAndroid Build Coastguard Worker    movsx            deltad, word [abcdq+2*2]  ; delta = abcd[2], sign-extended 16-bit
5078*c0909341SAndroid Build Coastguard Worker    movsx            gammad, word [abcdq+2*3]  ; gamma = abcd[3], sign-extended 16-bit
5079*c0909341SAndroid Build Coastguard Worker    add                 myd, 512+(64<<10)  ; same bias as applied to mx
5080*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 4  ; 4 passes of 2 output rows; abcdq is not read after this point
5081*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [deltaq*3]
5082*c0909341SAndroid Build Coastguard Worker    sub              gammad, tmp1d    ; gamma -= delta*3
5083*c0909341SAndroid Build Coastguard Worker.main2:
5084*c0909341SAndroid Build Coastguard Worker    call .h  ; next odd source row
5085*c0909341SAndroid Build Coastguard Worker    psrld                m6, m5, 16
5086*c0909341SAndroid Build Coastguard Worker    pblendw              m6, m0, 0xaa ; 57
5087*c0909341SAndroid Build Coastguard Worker    WARP_V                7, 1, 3, 4, 6  ; first output row of the pair -> m7 (overwrites m1)
5088*c0909341SAndroid Build Coastguard Worker    call .h  ; next even source row
5089*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2  ; slide the even row pairs down by one
5090*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
5091*c0909341SAndroid Build Coastguard Worker    psrld                m3, 16
5092*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m0, 0xaa ; 68
5093*c0909341SAndroid Build Coastguard Worker    WARP_V                0, 4, 6, 1, 3  ; second output row of the pair -> m0 (overwrites m4)
5094*c0909341SAndroid Build Coastguard Worker    mova                 m4, m5  ; slide the odd row pairs down by one
5095*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
5096*c0909341SAndroid Build Coastguard Worker    ret
5097*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .h: horizontal filter for one source row.
; Eight filter indices (mx + k*alpha) >> 10, k = 0..7, are computed with the
; same lea/shr pattern as WARP_V. On return m0 holds the row result, srcq
; points at the next row and mxd has been advanced for the next row.
; Clobbers m8, m9, m10, tmp1, tmp2.
5098*c0909341SAndroid Build Coastguard Worker.h:
5099*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [mxq+alphaq*4]  ; tmp1 = mx + 4*alpha
5100*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [mxq+alphaq*1]  ; tmp2 = mx + 1*alpha
5101*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [srcq]  ; 16 source bytes copied to both 128-bit lanes
5102*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
5103*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5104*c0909341SAndroid Build Coastguard Worker    movq                xm8, [filterq+mxq  *8]  ; filter 0
5105*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [filterq+tmp1q*8], 1  ; filter 4 in the high lane
5106*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+alphaq*4]  ; tmp1 = mx + 5*alpha
5107*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmp2q+alphaq*1]  ; mx   = mx + 2*alpha
5108*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
5109*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5110*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+tmp2q*8]  ; filter 1
5111*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1  ; filter 5 in the high lane
5112*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [mxq+alphaq*4]  ; tmp1 = mx + 6*alpha
5113*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [mxq+alphaq*1]  ; tmp2 = mx + 3*alpha
5114*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
5115*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5116*c0909341SAndroid Build Coastguard Worker    movq                xm9, [filterq+mxq  *8]  ; filter 2
5117*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [filterq+tmp1q*8], 1  ; filter 6 in the high lane
5118*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+alphaq*4]  ; tmp1 = mx + 7*alpha
5119*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmp2q+betaq] ; mx += beta
5120*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
5121*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
5122*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m0  ; 0 1   4 5
5123*c0909341SAndroid Build Coastguard Worker    movq                xm0, [filterq+tmp2q*8]  ; filter 3
5124*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [filterq+tmp1q*8], 1  ; filter 7 in the high lane
5125*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m9, m0  ; 2 3   6 7
5126*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10, m12  ; source bytes arranged for filters 0 1 4 5
5127*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8
5128*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m13  ; source bytes arranged for filters 2 3 6 7
5129*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
5130*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq  ; next source row
5131*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m10  ; add adjacent partial sums of both halves
5132*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
5133*c0909341SAndroid Build Coastguard Worker    paddd                m0, m15 ; rounded 14-bit result in upper 16 bits of dword
5134*c0909341SAndroid Build Coastguard Worker    ret
5135*c0909341SAndroid Build Coastguard Worker
; BIDIR_FN op
; Shared store/loop skeleton for the two-source compound functions.
; "op" names a macro that takes a source offset and leaves 32 output bytes in
; m0 (see AVG); "op"_INC_PTR advances the source pointers.
; Expects dstq, strideq, stride3q, hd and wq to be defined by the function
; that expands this macro.
; NOTE(review): wq is jumped to directly, so it presumably already holds the
; address of one of the .wN labels (jump-table lookup done by the caller) -
; confirm in the including function.
5136*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 1 ; op
    ; first batch of output bytes -> m0
5137*c0909341SAndroid Build Coastguard Worker    %1                    0
5138*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5139*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; width 4: m0 holds 8 rows of 4 bytes; rows 0-3 are written first
5140*c0909341SAndroid Build Coastguard Worker.w4:
5141*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
5142*c0909341SAndroid Build Coastguard Worker    movd   [dstq          ], xm0
5143*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
5144*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
5145*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
5146*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
5147*c0909341SAndroid Build Coastguard Worker    je .ret
5148*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]  ; rows 4-7
5149*c0909341SAndroid Build Coastguard Worker    pextrd [dstq          ], xm0, 2
5150*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
5151*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
5152*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
5153*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
5154*c0909341SAndroid Build Coastguard Worker    je .ret
    ; any other height falls through and writes exactly 8 more rows (16 total)
5155*c0909341SAndroid Build Coastguard Worker    %1                    2
5156*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]  ; rows 8-11
5157*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
5158*c0909341SAndroid Build Coastguard Worker    movd   [dstq          ], xm0
5159*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
5160*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
5161*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
5162*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]  ; rows 12-15
5163*c0909341SAndroid Build Coastguard Worker    pextrd [dstq          ], xm0, 2
5164*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
5165*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
5166*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
5167*c0909341SAndroid Build Coastguard Worker.ret:
5168*c0909341SAndroid Build Coastguard Worker    RET
    ; width 8: 4 rows of 8 bytes per pass; loop entry is at .w8
5169*c0909341SAndroid Build Coastguard Worker.w8_loop:
5170*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
5171*c0909341SAndroid Build Coastguard Worker    %1                    0
5172*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5173*c0909341SAndroid Build Coastguard Worker.w8:
5174*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
5175*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], xm0
5176*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
5177*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
5178*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
5179*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5180*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5181*c0909341SAndroid Build Coastguard Worker    RET
    ; width 16: 4 rows per pass, produced by two op invocations of 2 rows each
5182*c0909341SAndroid Build Coastguard Worker.w16_loop:
5183*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            4
5184*c0909341SAndroid Build Coastguard Worker    %1                    0
5185*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5186*c0909341SAndroid Build Coastguard Worker.w16:
5187*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120  ; regroup qwords so each 128-bit half is one row
5188*c0909341SAndroid Build Coastguard Worker    mova         [dstq          ], xm0
5189*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
5190*c0909341SAndroid Build Coastguard Worker    %1                    2
5191*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5192*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm0
5193*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m0, 1
5194*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5195*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5196*c0909341SAndroid Build Coastguard Worker    RET
    ; width 32: 2 rows per pass, one op invocation per row
5197*c0909341SAndroid Build Coastguard Worker.w32_loop:
5198*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            4
5199*c0909341SAndroid Build Coastguard Worker    %1                    0
5200*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5201*c0909341SAndroid Build Coastguard Worker.w32:
5202*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120  ; put the 32 bytes of the row in linear order
5203*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5204*c0909341SAndroid Build Coastguard Worker    %1                    2
5205*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5206*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
5207*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5208*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5209*c0909341SAndroid Build Coastguard Worker    RET
    ; width 64: 1 row per pass, written as two 32-byte stores
5210*c0909341SAndroid Build Coastguard Worker.w64_loop:
5211*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            4
5212*c0909341SAndroid Build Coastguard Worker    %1                    0
5213*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5214*c0909341SAndroid Build Coastguard Worker.w64:
5215*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5216*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5217*c0909341SAndroid Build Coastguard Worker    %1                    2
5218*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5219*c0909341SAndroid Build Coastguard Worker    mova          [dstq+32], m0
5220*c0909341SAndroid Build Coastguard Worker    dec                  hd
5221*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5222*c0909341SAndroid Build Coastguard Worker    RET
    ; width 128: 1 row per pass, written as four 32-byte stores. The source
    ; pointers are advanced in the middle of the row, so the last two op
    ; invocations use negative offsets relative to the advanced pointers.
5223*c0909341SAndroid Build Coastguard Worker.w128_loop:
5224*c0909341SAndroid Build Coastguard Worker    %1                    0
5225*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5226*c0909341SAndroid Build Coastguard Worker.w128:
5227*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5228*c0909341SAndroid Build Coastguard Worker    mova        [dstq+0*32], m0
5229*c0909341SAndroid Build Coastguard Worker    %1                    2
5230*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5231*c0909341SAndroid Build Coastguard Worker    mova        [dstq+1*32], m0
5232*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            8
5233*c0909341SAndroid Build Coastguard Worker    %1                   -4
5234*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5235*c0909341SAndroid Build Coastguard Worker    mova        [dstq+2*32], m0
5236*c0909341SAndroid Build Coastguard Worker    %1                   -2
5237*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
5238*c0909341SAndroid Build Coastguard Worker    mova        [dstq+3*32], m0
5239*c0909341SAndroid Build Coastguard Worker    dec                  hd
5240*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
5241*c0909341SAndroid Build Coastguard Worker    RET
5242*c0909341SAndroid Build Coastguard Worker%endmacro
5243*c0909341SAndroid Build Coastguard Worker
; AVG src_offset
; Per-pixel operation plugged into BIDIR_FN for the plain average.
; Reads two consecutive 32-byte vectors of 16-bit intermediates from tmp1q and
; the matching two from tmp2q, starting at byte offset src_offset*32, adds them
; as words, scales with a rounding high multiply by m2 and packs the result to
; unsigned bytes with saturation.
;   in:  tmp1q, tmp2q = intermediate buffers, m2 = pmulhrsw multiplier
;        (avg_8bpc loads pw_1024; assuming that constant is the word 1024,
;        the multiply is equivalent to (sum + 16) >> 5)
;   out: m0 = 32 packed pixels; packuswb works per 128-bit lane, so the qwords
;        are lane-interleaved (the visible BIDIR_FN store paths fix this up
;        with vpermq q3120 before writing)
;   clobbers: m1
5244*c0909341SAndroid Build Coastguard Worker%macro AVG 1 ; src_offset
5245*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+(%1+0)*32]
5246*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tmp2q+(%1+0)*32]
5247*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+(%1+1)*32]
5248*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tmp2q+(%1+1)*32]
5249*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
5250*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
5251*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5252*c0909341SAndroid Build Coastguard Worker%endmacro
5253*c0909341SAndroid Build Coastguard Worker
; AVG_INC_PTR n
; Advances both intermediate-buffer pointers (tmp1q, tmp2q) by n 32-byte
; vectors. Modifies flags (plain add).
5254*c0909341SAndroid Build Coastguard Worker%macro AVG_INC_PTR 1
5255*c0909341SAndroid Build Coastguard Worker    add               tmp1q, %1*32
5256*c0909341SAndroid Build Coastguard Worker    add               tmp2q, %1*32
5257*c0909341SAndroid Build Coastguard Worker%endmacro
5258*c0909341SAndroid Build Coastguard Worker
; avg_8bpc(dst, stride, tmp1, tmp2, w, h)
; Averages two buffers of 16-bit intermediates into 8-bit pixels at dst.
; x86inc cglobal declaration: 4 arguments loaded into registers on entry,
; 7 GPRs and 3 vector registers used; stride3 is a scratch register name.
; Setup performed here:
;   r6 = address of the width jump table (also the anchor for 'base')
;   wd = tzcnt(w), used as the table index
;   hd = h (loaded from its argument slot unless already in the register)
;   wq = sign-extended 32-bit table entry + r6, i.e. an absolute code address
;   m2 = broadcast pw_1024, the rounding multiplier consumed by AVG
; The function body, including the stores and the return, comes from the
; BIDIR_FN expansion; only the tail of that macro is visible in this part of
; the file, so the dispatch through wq presumably happens in its head.
5259*c0909341SAndroid Build Coastguard Workercglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
5260*c0909341SAndroid Build Coastguard Worker%define base r6-avg %+ SUFFIX %+ _table
5261*c0909341SAndroid Build Coastguard Worker    lea                  r6, [avg %+ SUFFIX %+ _table]
5262*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5263*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5264*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4]
5265*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pw_1024]
5266*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
5267*c0909341SAndroid Build Coastguard Worker    BIDIR_FN            AVG
5268*c0909341SAndroid Build Coastguard Worker
; W_AVG src_offset
; Per-pixel operation plugged into BIDIR_FN for the weighted average.
; Processes two consecutive 32-byte vectors of 16-bit intermediates starting at
; byte offset src_offset*32 in tmp1q/tmp2q.
;   in:  m4 = signed 16-bit multiplier prepared by w_avg_8bpc
;        m5 = pmulhrsw multiplier (w_avg_8bpc loads pw_2048; assuming that
;             constant is the word 2048, this is the final (x + 8) >> 4)
;   out: m0 = 32 packed unsigned bytes, qwords lane-interleaved by packuswb
;   clobbers: m1, m2, m3
; The code computes  tmp1 + (((tmp1 - tmp2) * m4) >> 16)  per word and then
; rounds/shifts; which of the two last identities below applies depends on
; whether w_avg_8bpc swapped the tmp pointers and negated m4.
5269*c0909341SAndroid Build Coastguard Worker%macro W_AVG 1 ; src_offset
5270*c0909341SAndroid Build Coastguard Worker    ; (a * weight + b * (16 - weight) + 128) >> 8
5271*c0909341SAndroid Build Coastguard Worker    ; = ((a - b) * weight + (b << 4) + 128) >> 8
5272*c0909341SAndroid Build Coastguard Worker    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
5273*c0909341SAndroid Build Coastguard Worker    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
5274*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [tmp1q+(%1+0)*32]
5275*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmp2q+(%1+0)*32]
5276*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [tmp1q+(%1+1)*32]
5277*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmp2q+(%1+1)*32]
    ; signed high multiply: keeps the upper 16 bits of diff * m4
5278*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m4
5279*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m4
5280*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
5281*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5282*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5283*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5284*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5285*c0909341SAndroid Build Coastguard Worker%endmacro
5286*c0909341SAndroid Build Coastguard Worker
; W_AVG reads the same two buffers with the same element size as AVG, so the
; pointer-advance helper is shared. BIDIR_FN builds the helper name as
; <op>_INC_PTR, hence the alias.
5287*c0909341SAndroid Build Coastguard Worker%define W_AVG_INC_PTR AVG_INC_PTR
5288*c0909341SAndroid Build Coastguard Worker
; w_avg_8bpc(dst, stride, tmp1, tmp2, w, h, weight)
; Weighted average of two buffers of 16-bit intermediates into 8-bit pixels.
; x86inc cglobal declaration: 4 arguments loaded into registers, 7 GPRs and
; 6 vector registers used. The weight is the 7th argument and is only ever
; read from its memory slot (r6m); register r6 itself is used as the table
; base and later as swap scratch.
; Setup performed here:
;   wq = absolute address of the width-specific entry (table offset + r6)
;   m5 = broadcast pw_2048, final rounding multiplier for W_AVG
;   m4 = weight << 12 in every word
; pmulhw in W_AVG treats m4 as signed. For weight > 7 the shifted value has its
; sign bit set and therefore already reads as (weight-16) << 12, matching the
; third identity in W_AVG. For weight <= 7 it would be a positive number, so
; the code instead negates m4 and exchanges tmp1q/tmp2q, which matches the
; fourth identity (roles of a and b swapped).
; NOTE(review): this presumes weight is within 0..15 -- confirm against callers.
5289*c0909341SAndroid Build Coastguard Workercglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
5290*c0909341SAndroid Build Coastguard Worker%define base r6-w_avg %+ SUFFIX %+ _table
5291*c0909341SAndroid Build Coastguard Worker    lea                  r6, [w_avg %+ SUFFIX %+ _table]
5292*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5293*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5294*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, r6m ; weight
5295*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4]
5296*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_2048]
5297*c0909341SAndroid Build Coastguard Worker    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
5298*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
5299*c0909341SAndroid Build Coastguard Worker    cmp           dword r6m, 7
5300*c0909341SAndroid Build Coastguard Worker    jg .weight_gt7
    ; weight <= 7: swap the two source pointers and negate the multiplier.
    ; r6 is free to act as the temporary because every base-relative access
    ; and the wq fix-up above have already been done.
5301*c0909341SAndroid Build Coastguard Worker    mov                  r6, tmp1q
5302*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
5303*c0909341SAndroid Build Coastguard Worker    mov               tmp1q, tmp2q
5304*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0, m4 ; -weight
5305*c0909341SAndroid Build Coastguard Worker    mov               tmp2q, r6
5306*c0909341SAndroid Build Coastguard Worker.weight_gt7:
5307*c0909341SAndroid Build Coastguard Worker    BIDIR_FN          W_AVG
5308*c0909341SAndroid Build Coastguard Worker
; MASK src_offset
; Per-pixel operation plugged into BIDIR_FN for the explicit-mask blend.
; Processes two consecutive 32-byte vectors of 16-bit intermediates starting at
; byte offset src_offset*32 in tmp1q/tmp2q, together with the 32 matching mask
; bytes at maskq + src_offset*16 (one mask byte per 16-bit sample).
;   in:  m4 = all zero (mask_8bpc clears it), m5 = pmulhrsw multiplier
;        (mask_8bpc loads pw_2048)
;   out: m0 = 32 packed unsigned bytes, qwords lane-interleaved by packuswb
;   clobbers: m1, m2, m3
; In the formulas below a is the tmp1 sample and b is the tmp2 sample.
5309*c0909341SAndroid Build Coastguard Worker%macro MASK 1 ; src_offset
5310*c0909341SAndroid Build Coastguard Worker    ; (a * m + b * (64 - m) + 512) >> 10
5311*c0909341SAndroid Build Coastguard Worker    ; = ((a - b) * m + (b << 6) + 512) >> 10
5312*c0909341SAndroid Build Coastguard Worker    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
    ; The qword permute pre-arranges the mask so that the lane-wise
    ; punpcklbw/punpckhbw below pair each mask byte with its own sample.
5313*c0909341SAndroid Build Coastguard Worker    vpermq               m3,     [maskq+%1*16], q3120
5314*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [tmp2q+(%1+0)*32]
5315*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmp1q+(%1+0)*32]
    ; m3 = 0 - m, then doubled: bytes hold -2*m
5316*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m3
5317*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1     ; (b - a) << 1
5318*c0909341SAndroid Build Coastguard Worker    paddb                m3, m3
    ; Interleaving with the zero register puts each -2*m byte in the high half
    ; of a word, i.e. multiplies it by 256.
5319*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4, m3 ; -m << 9
5320*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m2
5321*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
    ; Same computation for the second vector, using the high mask bytes.
5322*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [tmp2q+(%1+1)*32]
5323*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, [tmp1q+(%1+1)*32]
5324*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
5325*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4, m3
5326*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m3
5327*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
5328*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5329*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5330*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5331*c0909341SAndroid Build Coastguard Worker%endmacro
5332*c0909341SAndroid Build Coastguard Worker
; MASK_INC_PTR n
; Advances the pointers used by MASK by n vectors' worth of samples: the two
; 16-bit intermediate buffers move n*32 bytes and the byte mask moves n*16
; bytes. Modifies flags (plain add).
5333*c0909341SAndroid Build Coastguard Worker%macro MASK_INC_PTR 1
5334*c0909341SAndroid Build Coastguard Worker    add               maskq, %1*16
5335*c0909341SAndroid Build Coastguard Worker    add               tmp2q, %1*32
5336*c0909341SAndroid Build Coastguard Worker    add               tmp1q, %1*32
5337*c0909341SAndroid Build Coastguard Worker%endmacro
5338*c0909341SAndroid Build Coastguard Worker
; mask_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
; Blends two buffers of 16-bit intermediates into 8-bit pixels using a
; per-pixel byte mask.
; x86inc cglobal declaration: 4 arguments loaded into registers, 8 GPRs and
; 6 vector registers used; stride3 is a scratch register name.
; Setup performed here:
;   r7    = address of the width jump table (also the anchor for 'base')
;   wd    = tzcnt(w), used as the table index
;   hd    = h (loaded unless already in the register)
;   maskq = mask pointer, loaded from its argument slot
;   wq    = sign-extended 32-bit table entry + r7, an absolute code address
;   m5    = broadcast pw_2048, final rounding multiplier for MASK
;   m4    = 0, used by MASK for negation and byte->word widening
; The function body comes from the BIDIR_FN expansion with MASK as the op.
5339*c0909341SAndroid Build Coastguard Workercglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
5340*c0909341SAndroid Build Coastguard Worker%define base r7-mask %+ SUFFIX %+ _table
5341*c0909341SAndroid Build Coastguard Worker    lea                  r7, [mask %+ SUFFIX %+ _table]
5342*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5343*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5344*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
5345*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r7+wq*4]
5346*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_2048]
5347*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
5348*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
5349*c0909341SAndroid Build Coastguard Worker    BIDIR_FN           MASK
5350*c0909341SAndroid Build Coastguard Worker
; W_MASK dst, mask, tmp_offset1, tmp_offset2 [, is_444 = 0]
; Derives a blend weight from the absolute difference of the two intermediate
; buffers, blends them with it, and also hands the weight back to the caller.
;   %1 = index of the vector register receiving the 32 blended pixels
;   %2 = index of the vector register receiving the mask-related output
;   %3, %4 = offsets (in 32-byte vectors) of the two sample vectors to process,
;            applied to both tmp1q and tmp2q
;   %5 = non-zero selects the 4:4:4 output form for m%2
;   in:  m6 = minuend for the saturating subtract that turns |tmp2 - tmp1|
;             into the weight, m7 = final pmulhrsw multiplier, m5 = byte
;             constant used only when %5 is set. All three are set up by the
;             callers, which are outside this part of the file.
;   out: m%1 = packed unsigned bytes (qwords lane-interleaved by packuswb)
;        m%2 = if %5: bytes m5 - (64 - m) for all 32 samples, restored to
;                     linear order by vpermq
;              else:  words holding the sum of each horizontally adjacent pair
;                     of (64 - m) values (phaddw, per 128-bit lane)
;   clobbers: m1, m2, m3
5351*c0909341SAndroid Build Coastguard Worker%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
    ; First vector: m1 = tmp2 - tmp1, m%2 = weight derived from |m1|
5352*c0909341SAndroid Build Coastguard Worker    mova                m%1, [tmp1q+32*%3]
5353*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp2q+32*%3]
5354*c0909341SAndroid Build Coastguard Worker    psubw                m1, m%1
5355*c0909341SAndroid Build Coastguard Worker    pabsw               m%2, m1
5356*c0909341SAndroid Build Coastguard Worker    psubusw             m%2, m6, m%2
5357*c0909341SAndroid Build Coastguard Worker    psrlw               m%2, 8 ; 64 - m
    ; tmp1 + (((tmp2 - tmp1) * ((64 - m) << 10)) >> 16)
5358*c0909341SAndroid Build Coastguard Worker    psllw                m2, m%2, 10
5359*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m2
5360*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m1
    ; Second vector: same steps with m1 = tmp1, m2 = difference, m3 = weight
5361*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+32*%4]
5362*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp2q+32*%4]
5363*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
5364*c0909341SAndroid Build Coastguard Worker    pabsw                m3, m2
5365*c0909341SAndroid Build Coastguard Worker    psubusw              m3, m6, m3
5366*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 8
    ; Produce the caller-visible mask output before m3 is rescaled below.
5367*c0909341SAndroid Build Coastguard Worker%if %5
5368*c0909341SAndroid Build Coastguard Worker    packuswb            m%2, m3
5369*c0909341SAndroid Build Coastguard Worker    psubb               m%2, m5, m%2
5370*c0909341SAndroid Build Coastguard Worker    vpermq              m%2, m%2, q3120
5371*c0909341SAndroid Build Coastguard Worker%else
5372*c0909341SAndroid Build Coastguard Worker    phaddw              m%2, m3
5373*c0909341SAndroid Build Coastguard Worker%endif
5374*c0909341SAndroid Build Coastguard Worker    psllw                m3, 10
5375*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m3
5376*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
5377*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m7
5378*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
5379*c0909341SAndroid Build Coastguard Worker    packuswb            m%1, m1
5380*c0909341SAndroid Build Coastguard Worker%endmacro
5381*c0909341SAndroid Build Coastguard Worker
; blend_8bpc(dst, ds, tmp, w, h, mask)
; In-place blend of the 8-bit pixels at dst with the 8-bit pixels at tmp using
; a per-pixel byte mask m:
;     dst = pmulhrsw(dst * (64 - m) + tmp * m, pw_512)
; which is (dst * (64 - m) + tmp * m + 32) >> 6, assuming pb_64/pw_512 hold the
; values their names suggest.
; x86inc cglobal declaration: 3 arguments loaded into registers, 7 GPRs and
; 7 vector registers used.
; tmp and mask are read linearly (w bytes per row, no stride); only dst uses
; the ds stride. tmp and mask are loaded with mova, so they are expected to be
; suitably aligned for the vector size used by each width path.
; Register roles after setup:
;   r6   = ds * 3 (initially the jump table base)
;   tmpq = tmp - mask, so [maskq+tmpq] addresses tmp and a single pointer
;          (maskq) is advanced per iteration
;   m4   = broadcast pb_64, m5 = broadcast pw_512
;   wq   = absolute address of the width-specific loop (table offset + base)
5382*c0909341SAndroid Build Coastguard Workercglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
5383*c0909341SAndroid Build Coastguard Worker%define base r6-blend_avx2_table
5384*c0909341SAndroid Build Coastguard Worker    lea                  r6, [blend_avx2_table]
5385*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5386*c0909341SAndroid Build Coastguard Worker    movifnidn         maskq, maskmp
5387*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5388*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4]
5389*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pb_64]
5390*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_512]
5391*c0909341SAndroid Build Coastguard Worker    sub                tmpq, maskq
5392*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
5393*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dsq*3]
5394*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 4: four rows (16 pixels) per iteration.
    ; Rows 0-1 go to the low qword of xm0 and rows 2-3 to the high qword of
    ; xm1, matching the punpcklbw/punpckhbw split used for mask and tmp.
5395*c0909341SAndroid Build Coastguard Worker.w4:
5396*c0909341SAndroid Build Coastguard Worker    movd                xm0, [dstq+dsq*0]
5397*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [dstq+dsq*1], 1
5398*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm1, [dstq+dsq*2]
5399*c0909341SAndroid Build Coastguard Worker    pinsrd              xm1, [dstq+r6   ], 3
5400*c0909341SAndroid Build Coastguard Worker    mova                xm6, [maskq]
    ; xm3 = 64 - m; xm2/xm3 become interleaved (64 - m, m) byte pairs
5401*c0909341SAndroid Build Coastguard Worker    psubb               xm3, xm4, xm6
5402*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm2, xm3, xm6
5403*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm3, xm6
5404*c0909341SAndroid Build Coastguard Worker    mova                xm6, [maskq+tmpq]
5405*c0909341SAndroid Build Coastguard Worker    add               maskq, 4*4
    ; xm0/xm1 become interleaved (dst, tmp) byte pairs; pmaddubsw then yields
    ; dst * (64 - m) + tmp * m per pixel as a word
5406*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm6
5407*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm1, xm6
5408*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
5409*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
5410*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm5
5411*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
5412*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
5413*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
5414*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 1
5415*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*2], xm0, 2
5416*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+r6   ], xm0, 3
5417*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5418*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5419*c0909341SAndroid Build Coastguard Worker    jg .w4
5420*c0909341SAndroid Build Coastguard Worker    RET
5421*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 8: four rows (32 pixels) per iteration.
    ; m1 is assembled as rows 0,1 in the low lane and rows 2,3 in the high
    ; lane (the vpblendd masks select dwords 4-5 and 6-7 respectively).
5422*c0909341SAndroid Build Coastguard Worker.w8:
5423*c0909341SAndroid Build Coastguard Worker    movq                xm1, [dstq+dsq*0]
5424*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [dstq+dsq*1]
5425*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [dstq+dsq*2]
5426*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [dstq+r6   ]
5427*c0909341SAndroid Build Coastguard Worker    mova                 m0, [maskq]
5428*c0909341SAndroid Build Coastguard Worker    mova                 m6, [maskq+tmpq]
5429*c0909341SAndroid Build Coastguard Worker    add               maskq, 8*4
5430*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0x30
5431*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xc0
5432*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m0
5433*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0
5434*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0
5435*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m6
5436*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m6
5437*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
5438*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5439*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5440*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5441*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5442*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
5443*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
5444*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
5445*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*2], xm1
5446*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+r6   ], xm1
5447*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4]
5448*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5449*c0909341SAndroid Build Coastguard Worker    jg .w8
5450*c0909341SAndroid Build Coastguard Worker    RET
5451*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 16: two rows per iteration, one row per 128-bit lane.
5452*c0909341SAndroid Build Coastguard Worker.w16:
5453*c0909341SAndroid Build Coastguard Worker    mova                 m0, [maskq]
5454*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*0]
5455*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [dstq+dsq*1], 1
5456*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m0
5457*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0
5458*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0
5459*c0909341SAndroid Build Coastguard Worker    mova                 m6, [maskq+tmpq]
5460*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
5461*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m6
5462*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m6
5463*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
5464*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5465*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5466*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5467*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5468*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
5469*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
5470*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5471*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5472*c0909341SAndroid Build Coastguard Worker    jg .w16
5473*c0909341SAndroid Build Coastguard Worker    RET
5474*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 32: one full row per iteration. The lane-wise unpack followed by
    ; the lane-wise pack restores the original byte order, so no permute is
    ; needed before the store.
5475*c0909341SAndroid Build Coastguard Worker.w32:
5476*c0909341SAndroid Build Coastguard Worker    mova                 m0, [maskq]
5477*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq]
5478*c0909341SAndroid Build Coastguard Worker    mova                 m6, [maskq+tmpq]
5479*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
5480*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m0
5481*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0
5482*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0
5483*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m6
5484*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m6
5485*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
5486*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5487*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5488*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5489*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5490*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5491*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5492*c0909341SAndroid Build Coastguard Worker    dec                  hd
5493*c0909341SAndroid Build Coastguard Worker    jg .w32
5494*c0909341SAndroid Build Coastguard Worker    RET
5495*c0909341SAndroid Build Coastguard Worker
; blend_v_8bpc(dst, ds, tmp, w, h)
; In-place blend of the 8-bit pixels at dst with the 8-bit pixels at tmp using
; weights that depend only on the column: each width path loads its weights
; once from the obmc_masks table and reuses them for every row.
; x86inc cglobal declaration: 3 arguments loaded into registers, 6 GPRs and
; 6 vector registers used. 'mask' is not an argument here; it names r5, which
; first holds the jump table base and is then rebased to obmc_masks.
; tmp is read linearly (w bytes per row, no stride); only dst uses ds.
; Register roles after setup:
;   maskq = address of obmc_masks
;   m5    = broadcast pw_512, rounding multiplier for pmulhrsw
;   wq    = absolute address of the width-specific path (table offset + base)
; Every path interleaves (dst, tmp) bytes and feeds them to pmaddubsw against
; the table bytes, so the table presumably stores (64 - m, m) byte pairs per
; column at byte offset w*2 -- confirm against the obmc_masks definition.
5496*c0909341SAndroid Build Coastguard Workercglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
5497*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_avx2_table
5498*c0909341SAndroid Build Coastguard Worker    lea                  r5, [blend_v_avx2_table]
5499*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
5500*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5501*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r5+wq*4]
5502*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_512]
5503*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
    ; All base-relative accesses are done; turn r5 into the obmc_masks pointer.
5504*c0909341SAndroid Build Coastguard Worker    add               maskq, obmc_masks-blend_v_avx2_table
5505*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 2: two rows per iteration; the 4 weight bytes are replicated so the
    ; same pairs apply to both rows.
5506*c0909341SAndroid Build Coastguard Worker.w2:
5507*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [maskq+2*2]
5508*c0909341SAndroid Build Coastguard Worker.w2_s0_loop:
5509*c0909341SAndroid Build Coastguard Worker    movd                xm0, [dstq+dsq*0]
5510*c0909341SAndroid Build Coastguard Worker    pinsrw              xm0, [dstq+dsq*1], 1
5511*c0909341SAndroid Build Coastguard Worker    movd                xm1, [tmpq]
5512*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*2
5513*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1
5514*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
5515*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm5
5516*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
5517*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm0, 0
5518*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm0, 1
5519*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5520*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5521*c0909341SAndroid Build Coastguard Worker    jg .w2_s0_loop
5522*c0909341SAndroid Build Coastguard Worker    RET
5523*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 4: two rows per iteration; the 8 weight bytes are replicated to
    ; both qwords.
5524*c0909341SAndroid Build Coastguard Worker.w4:
5525*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm2, [maskq+4*2]
5526*c0909341SAndroid Build Coastguard Worker.w4_loop:
5527*c0909341SAndroid Build Coastguard Worker    movd                xm0, [dstq+dsq*0]
5528*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [dstq+dsq*1], 1
5529*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tmpq]
5530*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
5531*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1
5532*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
5533*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm5
5534*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
5535*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
5536*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 1
5537*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5538*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5539*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
5540*c0909341SAndroid Build Coastguard Worker    RET
5541*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 8: two rows per iteration. Row 0 sits in the low qword of xm0 and
    ; row 1 in the high qword of xm1, so punpcklbw/punpckhbw pair them with
    ; the low/high halves of the 16 tmp bytes; both use the same 16 weights.
5542*c0909341SAndroid Build Coastguard Worker.w8:
5543*c0909341SAndroid Build Coastguard Worker    mova                xm3, [maskq+8*2]
5544*c0909341SAndroid Build Coastguard Worker.w8_loop:
5545*c0909341SAndroid Build Coastguard Worker    movq                xm0, [dstq+dsq*0]
5546*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        xm1, [dstq+dsq*1]
5547*c0909341SAndroid Build Coastguard Worker    mova                xm2, [tmpq]
5548*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
5549*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm2
5550*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm1, xm2
5551*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
5552*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
5553*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm5
5554*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm5
5555*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
5556*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], xm0
5557*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], xm0
5558*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5559*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5560*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5561*c0909341SAndroid Build Coastguard Worker    RET
5562*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 16: two rows per iteration, one row per 128-bit lane. m3 carries
    ; the weights for columns 0-7 and m4 those for columns 8-15, each
    ; replicated to both lanes.
5563*c0909341SAndroid Build Coastguard Worker.w16:
5564*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [maskq+16*2]
5565*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [maskq+16*3]
5566*c0909341SAndroid Build Coastguard Worker.w16_loop:
5567*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*0]
5568*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [dstq+dsq*1], 1
5569*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5570*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
5571*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5572*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5573*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5574*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
5575*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5576*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5577*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5578*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
5579*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
5580*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5581*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5582*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5583*c0909341SAndroid Build Coastguard Worker    RET
5584*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w == 32: one row per iteration. The four 16-byte weight groups are
    ; loaded as m3 = {group 0, group 2} and m4 = {group 1, group 3} so that
    ; they line up with the lane-wise punpcklbw/punpckhbw results.
5585*c0909341SAndroid Build Coastguard Worker.w32:
5586*c0909341SAndroid Build Coastguard Worker    mova                xm3, [maskq+16*4]
5587*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [maskq+16*6], 1
5588*c0909341SAndroid Build Coastguard Worker    mova                xm4, [maskq+16*5]
5589*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [maskq+16*7], 1
5590*c0909341SAndroid Build Coastguard Worker.w32_loop:
5591*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq]
5592*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5593*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
5594*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5595*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5596*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5597*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
5598*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5599*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5600*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5601*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5602*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5603*c0909341SAndroid Build Coastguard Worker    dec                  hd
5604*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5605*c0909341SAndroid Build Coastguard Worker    RET
5606*c0909341SAndroid Build Coastguard Worker
5607*c0909341SAndroid Build Coastguard Workercglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
5608*c0909341SAndroid Build Coastguard Worker%define base r5-blend_h_avx2_table
5609*c0909341SAndroid Build Coastguard Worker    lea                  r5, [blend_h_avx2_table]
5610*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
5611*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
5612*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
5613*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r5+wq*4]
5614*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_512]
5615*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
5616*c0909341SAndroid Build Coastguard Worker    lea               maskq, [base+obmc_masks+hq*2]
5617*c0909341SAndroid Build Coastguard Worker    lea                  hd, [hq*3]
5618*c0909341SAndroid Build Coastguard Worker    shr                  hd, 2 ; h * 3/4
5619*c0909341SAndroid Build Coastguard Worker    lea               maskq, [maskq+hq*2]
5620*c0909341SAndroid Build Coastguard Worker    neg                  hq
5621*c0909341SAndroid Build Coastguard Worker    jmp                  wq
5622*c0909341SAndroid Build Coastguard Worker.w2:
5623*c0909341SAndroid Build Coastguard Worker    movd                xm0, [dstq+dsq*0]
5624*c0909341SAndroid Build Coastguard Worker    pinsrw              xm0, [dstq+dsq*1], 1
5625*c0909341SAndroid Build Coastguard Worker    movd                xm2, [maskq+hq*2]
5626*c0909341SAndroid Build Coastguard Worker    movd                xm1, [tmpq]
5627*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*2
5628*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm2
5629*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1
5630*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
5631*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm5
5632*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
5633*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*0], xm0, 0
5634*c0909341SAndroid Build Coastguard Worker    pextrw     [dstq+dsq*1], xm0, 1
5635*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5636*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5637*c0909341SAndroid Build Coastguard Worker    jl .w2
5638*c0909341SAndroid Build Coastguard Worker    RET
5639*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5640*c0909341SAndroid Build Coastguard Worker.w4:
5641*c0909341SAndroid Build Coastguard Worker    mova                xm3, [blend_shuf]
5642*c0909341SAndroid Build Coastguard Worker.w4_loop:
5643*c0909341SAndroid Build Coastguard Worker    movd                xm0, [dstq+dsq*0]
5644*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [dstq+dsq*1], 1
5645*c0909341SAndroid Build Coastguard Worker    movd                xm2, [maskq+hq*2]
5646*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tmpq]
5647*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
5648*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, xm3
5649*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1
5650*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
5651*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm5
5652*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
5653*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], xm0
5654*c0909341SAndroid Build Coastguard Worker    pextrd     [dstq+dsq*1], xm0, 1
5655*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5656*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5657*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
5658*c0909341SAndroid Build Coastguard Worker    RET
5659*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5660*c0909341SAndroid Build Coastguard Worker.w8:
5661*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [blend_shuf]
5662*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m4, 0x03
5663*c0909341SAndroid Build Coastguard Worker.w8_loop:
5664*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [dstq+dsq*0]
5665*c0909341SAndroid Build Coastguard Worker    movq                xm0, [dstq+dsq*1]
5666*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0x30
5667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [maskq+hq*2]
5668*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tmpq+8*1]
5669*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tmpq+8*0], 1
5670*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
5671*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
5672*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1
5673*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5674*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5675*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
5676*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
5677*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*0], xm0
5678*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*1], xm0
5679*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5680*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5681*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
5682*c0909341SAndroid Build Coastguard Worker    RET
5683*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5684*c0909341SAndroid Build Coastguard Worker.w16:
5685*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [blend_shuf]
5686*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m4, 0x0c
5687*c0909341SAndroid Build Coastguard Worker.w16_loop:
5688*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+dsq*0]
5689*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [dstq+dsq*1], 1
5690*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [maskq+hq*2]
5691*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5692*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
5693*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
5694*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5695*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5696*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5697*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5698*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5699*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5700*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5701*c0909341SAndroid Build Coastguard Worker    mova         [dstq+dsq*0], xm0
5702*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+dsq*1], m0, 1
5703*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
5704*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
5705*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
5706*c0909341SAndroid Build Coastguard Worker    RET
5707*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5708*c0909341SAndroid Build Coastguard Worker.w32: ; w32/w64/w128
5709*c0909341SAndroid Build Coastguard Worker    sub                 dsq, r6
5710*c0909341SAndroid Build Coastguard Worker.w32_loop0:
5711*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [maskq+hq*2]
5712*c0909341SAndroid Build Coastguard Worker    mov                  wd, r6d
5713*c0909341SAndroid Build Coastguard Worker.w32_loop:
5714*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq]
5715*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
5716*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
5717*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5718*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5719*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
5720*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
5721*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
5722*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
5723*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5724*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5725*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
5726*c0909341SAndroid Build Coastguard Worker    sub                  wd, 32
5727*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5728*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5729*c0909341SAndroid Build Coastguard Worker    inc                  hq
5730*c0909341SAndroid Build Coastguard Worker    jl .w32_loop0
5731*c0909341SAndroid Build Coastguard Worker    RET
5732*c0909341SAndroid Build Coastguard Worker
; emu_edge_8bpc: edge emulation for 8-bit pixels.
; Builds a bw x bh block at dst from an iw x ih image at src whose block origin
; is (x, y). Rows/columns of the block that fall outside the image are filled
; by replicating the nearest copied pixel (see the iclip() formulas below).
; Arguments (x86inc names): bw, bh, iw, ih, x, y, dst, dstride, src, sstride.
; Scratch: r10, r12 (kept at 0 while the extents are clamped), r2, r3, m0.
; Every copy loop advances 32 bytes per store, so up to 31 bytes beyond the
; block width may be written on each row.
; NOTE(review): the mova stores to dst presumably rely on dst/dstride being
; aligned to the vector width - confirm against the callers.
5733*c0909341SAndroid Build Coastguard Workercglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
5734*c0909341SAndroid Build Coastguard Worker                             bottomext, rightext
5735*c0909341SAndroid Build Coastguard Worker    ; we assume that the buffer (stride) is larger than width, so we can
5736*c0909341SAndroid Build Coastguard Worker    ; safely overwrite by a few bytes
5737*c0909341SAndroid Build Coastguard Worker
5738*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
5739*c0909341SAndroid Build Coastguard Worker    xor                r12d, r12d
5740*c0909341SAndroid Build Coastguard Worker    lea                 r10, [ihq-1]
5741*c0909341SAndroid Build Coastguard Worker    cmp                  yq, ihq
5742*c0909341SAndroid Build Coastguard Worker    cmovs               r10, yq
5743*c0909341SAndroid Build Coastguard Worker    test                 yq, yq
5744*c0909341SAndroid Build Coastguard Worker    cmovs               r10, r12
5745*c0909341SAndroid Build Coastguard Worker    imul                r10, sstrideq
5746*c0909341SAndroid Build Coastguard Worker    add                srcq, r10
5747*c0909341SAndroid Build Coastguard Worker
5748*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(x, 0, iw - 1)
5749*c0909341SAndroid Build Coastguard Worker    lea                 r10, [iwq-1]
5750*c0909341SAndroid Build Coastguard Worker    cmp                  xq, iwq
5751*c0909341SAndroid Build Coastguard Worker    cmovs               r10, xq
5752*c0909341SAndroid Build Coastguard Worker    test                 xq, xq
5753*c0909341SAndroid Build Coastguard Worker    cmovs               r10, r12
5754*c0909341SAndroid Build Coastguard Worker    add                srcq, r10
5755*c0909341SAndroid Build Coastguard Worker
5756*c0909341SAndroid Build Coastguard Worker    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    ; (lea leaves the flags alone, so the cmovs still tests the sign of the
    ; sub; the upper clamp against bh - 1 (r3) is applied a few lines below)
5757*c0909341SAndroid Build Coastguard Worker    lea          bottomextq, [yq+bhq]
5758*c0909341SAndroid Build Coastguard Worker    sub          bottomextq, ihq
5759*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bhq-1]
5760*c0909341SAndroid Build Coastguard Worker    cmovs        bottomextq, r12
5761*c0909341SAndroid Build Coastguard Worker
    ; y is no longer needed: its register is renamed and negated in place
5762*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
5763*c0909341SAndroid Build Coastguard Worker                bottomext, rightext
5764*c0909341SAndroid Build Coastguard Worker
5765*c0909341SAndroid Build Coastguard Worker    ; top_ext = iclip(-y, 0, bh - 1)
5766*c0909341SAndroid Build Coastguard Worker    neg             topextq
5767*c0909341SAndroid Build Coastguard Worker    cmovs           topextq, r12
5768*c0909341SAndroid Build Coastguard Worker    cmp          bottomextq, bhq
5769*c0909341SAndroid Build Coastguard Worker    cmovns       bottomextq, r3
    ; The clamp must also trigger for topext == bh (hence >=, not >): otherwise
    ; center_h below becomes 0, and since v_loop is a do-while it would still
    ; store one row at dst + bh*dstride, i.e. one row past the block.
5770*c0909341SAndroid Build Coastguard Worker    cmp             topextq, bhq
5771*c0909341SAndroid Build Coastguard Worker    cmovge          topextq, r3
5772*c0909341SAndroid Build Coastguard Worker
5773*c0909341SAndroid Build Coastguard Worker    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
5774*c0909341SAndroid Build Coastguard Worker    lea           rightextq, [xq+bwq]
5775*c0909341SAndroid Build Coastguard Worker    sub           rightextq, iwq
5776*c0909341SAndroid Build Coastguard Worker    lea                  r2, [bwq-1]
5777*c0909341SAndroid Build Coastguard Worker    cmovs         rightextq, r12
5778*c0909341SAndroid Build Coastguard Worker
    ; x is no longer needed: its register is renamed and negated in place
5779*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
5780*c0909341SAndroid Build Coastguard Worker                bottomext, rightext
5781*c0909341SAndroid Build Coastguard Worker
5782*c0909341SAndroid Build Coastguard Worker    ; left_ext = iclip(-x, 0, bw - 1)
5783*c0909341SAndroid Build Coastguard Worker    neg            leftextq
5784*c0909341SAndroid Build Coastguard Worker    cmovs          leftextq, r12
5785*c0909341SAndroid Build Coastguard Worker    cmp           rightextq, bwq
5786*c0909341SAndroid Build Coastguard Worker    cmovns        rightextq, r2
5787*c0909341SAndroid Build Coastguard Worker    cmp            leftextq, bwq
5788*c0909341SAndroid Build Coastguard Worker    cmovns         leftextq, r2
5789*c0909341SAndroid Build Coastguard Worker
    ; bh/iw/ih registers are repurposed: bh becomes the center row counter,
    ; iw the center width; r2/r3 are free to use as scratch from here on
5790*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
5791*c0909341SAndroid Build Coastguard Worker                dst, dstride, src, sstride, bottomext, rightext
5792*c0909341SAndroid Build Coastguard Worker
5793*c0909341SAndroid Build Coastguard Worker    ; center_h = bh - top_ext - bottom_ext
5794*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bottomextq+topextq]
5795*c0909341SAndroid Build Coastguard Worker    sub            centerhq, r3
5796*c0909341SAndroid Build Coastguard Worker
5797*c0909341SAndroid Build Coastguard Worker    ; blk += top_ext * PXSTRIDE(dst_stride)
5798*c0909341SAndroid Build Coastguard Worker    mov                  r2, topextq
5799*c0909341SAndroid Build Coastguard Worker    imul                 r2, dstrideq
5800*c0909341SAndroid Build Coastguard Worker    add                dstq, r2
    ; remember the first center row; it is reloaded at .top as the row to
    ; replicate upwards
5801*c0909341SAndroid Build Coastguard Worker    mov                 r9m, dstq
5802*c0909341SAndroid Build Coastguard Worker
5803*c0909341SAndroid Build Coastguard Worker    ; center_w = bw - left_ext - right_ext
5804*c0909341SAndroid Build Coastguard Worker    mov            centerwq, bwq
5805*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rightextq+leftextq]
5806*c0909341SAndroid Build Coastguard Worker    sub            centerwq, r3
5807*c0909341SAndroid Build Coastguard Worker
    ; v_loop emits the per-row loop for one left/right extension combination.
    ; Each row: optional left fill with src[0], copy of centerw source bytes,
    ; optional right fill with src[centerw-1]. All inner loops are do-while
    ; loops stepping 32 bytes. Clobbers r3, m0 and (when extending) r12.
5808*c0909341SAndroid Build Coastguard Worker%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
5809*c0909341SAndroid Build Coastguard Worker.v_loop_%3:
5810*c0909341SAndroid Build Coastguard Worker%if %1
5811*c0909341SAndroid Build Coastguard Worker    ; left extension
5812*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
5813*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, [srcq]
5814*c0909341SAndroid Build Coastguard Worker.left_loop_%3:
5815*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r3], m0
5816*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
5817*c0909341SAndroid Build Coastguard Worker    cmp                  r3, leftextq
5818*c0909341SAndroid Build Coastguard Worker    jl .left_loop_%3
5819*c0909341SAndroid Build Coastguard Worker
5820*c0909341SAndroid Build Coastguard Worker    ; body
5821*c0909341SAndroid Build Coastguard Worker    lea                 r12, [dstq+leftextq]
5822*c0909341SAndroid Build Coastguard Worker%endif
5823*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
5824*c0909341SAndroid Build Coastguard Worker.body_loop_%3:
5825*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r3]
5826*c0909341SAndroid Build Coastguard Worker%if %1
5827*c0909341SAndroid Build Coastguard Worker    movu           [r12+r3], m0
5828*c0909341SAndroid Build Coastguard Worker%else
5829*c0909341SAndroid Build Coastguard Worker    movu          [dstq+r3], m0
5830*c0909341SAndroid Build Coastguard Worker%endif
5831*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
5832*c0909341SAndroid Build Coastguard Worker    cmp                  r3, centerwq
5833*c0909341SAndroid Build Coastguard Worker    jl .body_loop_%3
5834*c0909341SAndroid Build Coastguard Worker
5835*c0909341SAndroid Build Coastguard Worker%if %2
5836*c0909341SAndroid Build Coastguard Worker    ; right extension
5837*c0909341SAndroid Build Coastguard Worker%if %1
5838*c0909341SAndroid Build Coastguard Worker    add                 r12, centerwq
5839*c0909341SAndroid Build Coastguard Worker%else
5840*c0909341SAndroid Build Coastguard Worker    lea                 r12, [dstq+centerwq]
5841*c0909341SAndroid Build Coastguard Worker%endif
5842*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
5843*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, [srcq+centerwq-1]
5844*c0909341SAndroid Build Coastguard Worker.right_loop_%3:
5845*c0909341SAndroid Build Coastguard Worker    movu           [r12+r3], m0
5846*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
5847*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rightextq
5848*c0909341SAndroid Build Coastguard Worker    jl .right_loop_%3
5849*c0909341SAndroid Build Coastguard Worker
5850*c0909341SAndroid Build Coastguard Worker%endif
5851*c0909341SAndroid Build Coastguard Worker    add                dstq, dstrideq
5852*c0909341SAndroid Build Coastguard Worker    add                srcq, sstrideq
5853*c0909341SAndroid Build Coastguard Worker    dec            centerhq
5854*c0909341SAndroid Build Coastguard Worker    jg .v_loop_%3
5855*c0909341SAndroid Build Coastguard Worker%endmacro
5856*c0909341SAndroid Build Coastguard Worker
    ; dispatch to one of four v_loop instantiations so the per-row code
    ; contains no left/right branches
5857*c0909341SAndroid Build Coastguard Worker    test           leftextq, leftextq
5858*c0909341SAndroid Build Coastguard Worker    jnz .need_left_ext
5859*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
5860*c0909341SAndroid Build Coastguard Worker    jnz .need_right_ext
5861*c0909341SAndroid Build Coastguard Worker    v_loop                0, 0, 0
5862*c0909341SAndroid Build Coastguard Worker    jmp .body_done
5863*c0909341SAndroid Build Coastguard Worker
5864*c0909341SAndroid Build Coastguard Worker.need_left_ext:
5865*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
5866*c0909341SAndroid Build Coastguard Worker    jnz .need_left_right_ext
5867*c0909341SAndroid Build Coastguard Worker    v_loop                1, 0, 1
5868*c0909341SAndroid Build Coastguard Worker    jmp .body_done
5869*c0909341SAndroid Build Coastguard Worker
5870*c0909341SAndroid Build Coastguard Worker.need_left_right_ext:
5871*c0909341SAndroid Build Coastguard Worker    v_loop                1, 1, 2
5872*c0909341SAndroid Build Coastguard Worker    jmp .body_done
5873*c0909341SAndroid Build Coastguard Worker
5874*c0909341SAndroid Build Coastguard Worker.need_right_ext:
5875*c0909341SAndroid Build Coastguard Worker    v_loop                0, 1, 3
5876*c0909341SAndroid Build Coastguard Worker
5877*c0909341SAndroid Build Coastguard Worker.body_done:
5878*c0909341SAndroid Build Coastguard Worker    ; bottom edge extension
    ; dstq now points one row past the last center row; srcq is set to that
    ; last row, which is replicated downwards column-block by column-block.
    ; r1 (32-byte column offset) and r4 (row counter) are reused as scratch.
5879*c0909341SAndroid Build Coastguard Worker    test         bottomextq, bottomextq
5880*c0909341SAndroid Build Coastguard Worker    jz .top
5881*c0909341SAndroid Build Coastguard Worker    mov                srcq, dstq
5882*c0909341SAndroid Build Coastguard Worker    sub                srcq, dstrideq
5883*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
5884*c0909341SAndroid Build Coastguard Worker.bottom_x_loop:
5885*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1]
5886*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1]
5887*c0909341SAndroid Build Coastguard Worker    mov                  r4, bottomextq
5888*c0909341SAndroid Build Coastguard Worker.bottom_y_loop:
5889*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
5890*c0909341SAndroid Build Coastguard Worker    add                  r3, dstrideq
5891*c0909341SAndroid Build Coastguard Worker    dec                  r4
5892*c0909341SAndroid Build Coastguard Worker    jg .bottom_y_loop
5893*c0909341SAndroid Build Coastguard Worker    add                  r1, 32
5894*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
5895*c0909341SAndroid Build Coastguard Worker    jl .bottom_x_loop
5896*c0909341SAndroid Build Coastguard Worker
5897*c0909341SAndroid Build Coastguard Worker.top:
5898*c0909341SAndroid Build Coastguard Worker    ; top edge extension
    ; srcq = first center row (saved in r9m above), dstq = the dst argument
    ; reloaded from its memory slot; that row is replicated into the topext
    ; rows above it
5899*c0909341SAndroid Build Coastguard Worker    test            topextq, topextq
5900*c0909341SAndroid Build Coastguard Worker    jz .end
5901*c0909341SAndroid Build Coastguard Worker    mov                srcq, r9m
5902*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
5903*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
5904*c0909341SAndroid Build Coastguard Worker.top_x_loop:
5905*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1]
5906*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1]
5907*c0909341SAndroid Build Coastguard Worker    mov                  r4, topextq
5908*c0909341SAndroid Build Coastguard Worker.top_y_loop:
5909*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
5910*c0909341SAndroid Build Coastguard Worker    add                  r3, dstrideq
5911*c0909341SAndroid Build Coastguard Worker    dec                  r4
5912*c0909341SAndroid Build Coastguard Worker    jg .top_y_loop
5913*c0909341SAndroid Build Coastguard Worker    add                  r1, 32
5914*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
5915*c0909341SAndroid Build Coastguard Worker    jl .top_x_loop
5916*c0909341SAndroid Build Coastguard Worker
5917*c0909341SAndroid Build Coastguard Worker.end:
5918*c0909341SAndroid Build Coastguard Worker    RET
5919*c0909341SAndroid Build Coastguard Worker
; resize_8bpc: horizontal resampling of 8-bit pixel rows.
; For each of h rows, .loop_x produces 8 output pixels per iteration until
; x >= dst_w; every iteration stores 8 bytes, so the last store of a row may
; write past dst_w.
; Each of the 8 lanes tracks a source position mx = mx0 + i*dx. The integer
; source offset is mx >> 14 and the filter row index is (mx >> 8) & 63.
; Each output pixel is the dot product of 8 source bytes with one 8-byte row
; of resize_filter, rounded via pmulhrsw with pw_m256.
; Note: the mx0 and src_w argument memory slots are modified in place below.
5920*c0909341SAndroid Build Coastguard Workercglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
5921*c0909341SAndroid Build Coastguard Worker                                dst_w, h, src_w, dx, mx0
    ; bias the start position by -4 (in 14-bit fixed point) and reduce src_w
    ; by 8 so that the clamp in .loop_x yields the last valid 8-byte load offset
5922*c0909341SAndroid Build Coastguard Worker    sub          dword mx0m, 4<<14
5923*c0909341SAndroid Build Coastguard Worker    sub        dword src_wm, 8
5924*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, dxm
5925*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, mx0m
5926*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, src_wm
5927*c0909341SAndroid Build Coastguard Worker
    ; src_w/dx/mx0 now live in vector registers only; the 7th register becomes
    ; the output column counter x
5928*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
5929*c0909341SAndroid Build Coastguard Worker    LEA                  r7, $$
5930*c0909341SAndroid Build Coastguard Worker%define base r7-$$
5931*c0909341SAndroid Build Coastguard Worker
5932*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+pw_m256]
5933*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pd_63]
5934*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m15, [base+pb_8x0_8x8]
5935*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
5936*c0909341SAndroid Build Coastguard Worker    pslld                m5, 3                      ; dx*8
5937*c0909341SAndroid Build Coastguard Worker    pslld                m6, 14
5938*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2                     ; mx+[0..7]*dx
5939*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5940*c0909341SAndroid Build Coastguard Worker
5941*c0909341SAndroid Build Coastguard Worker    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
5942*c0909341SAndroid Build Coastguard Worker    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = (src_w-8)<<14, m7 = 0x3f, m15=0,8
5943*c0909341SAndroid Build Coastguard Worker
5944*c0909341SAndroid Build Coastguard Worker.loop_y:
5945*c0909341SAndroid Build Coastguard Worker    xor                  xd, xd
5946*c0909341SAndroid Build Coastguard Worker    mova                 m4, m8                     ; per-line working version of mx
5947*c0909341SAndroid Build Coastguard Worker
5948*c0909341SAndroid Build Coastguard Worker.loop_x:
    ; m0 = clamped position, m1 = (unclamped - clamped) >> 14, which is
    ; nonzero only for lanes whose 8-byte window crosses an image edge
5949*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m4, m2
5950*c0909341SAndroid Build Coastguard Worker    psrad                m9, m4, 8                  ; filter offset (unmasked)
5951*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m6                     ; iclip(mx, 0, src_w-8)
5952*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m0                 ; pshufb offset
5953*c0909341SAndroid Build Coastguard Worker    psrad                m0, 14                     ; clipped src_x offset
5954*c0909341SAndroid Build Coastguard Worker    psrad                m1, 14                     ; pshufb edge_emu offset
5955*c0909341SAndroid Build Coastguard Worker    pand                 m9, m7                     ; filter offset (masked)
5956*c0909341SAndroid Build Coastguard Worker
5957*c0909341SAndroid Build Coastguard Worker    ; load source pixels - this ugly code is vpgatherdq emulation since
5958*c0909341SAndroid Build Coastguard Worker    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    ; resulting qword layout: m12 = pixels for lanes 0,1,4,5 and
    ; m13 = pixels for lanes 2,3,6,7 (8 source bytes per lane)
5959*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm0
5960*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm0, 1
5961*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm0, 2
5962*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm0, 3
5963*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
5964*c0909341SAndroid Build Coastguard Worker    movq               xm12, [srcq+r8]
5965*c0909341SAndroid Build Coastguard Worker    movq               xm13, [srcq+r10]
5966*c0909341SAndroid Build Coastguard Worker    movhps             xm12, [srcq+r9]
5967*c0909341SAndroid Build Coastguard Worker    movhps             xm13, [srcq+r11]
5968*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm0
5969*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm0, 1
5970*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm0, 2
5971*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm0, 3
5972*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [srcq+r8], 1
5973*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [srcq+r10], 1
5974*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m10, [srcq+r9]
5975*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [srcq+r11]
5976*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m10, 11000000b
5977*c0909341SAndroid Build Coastguard Worker    vpblendd            m13, m11, 11000000b
5978*c0909341SAndroid Build Coastguard Worker
5979*c0909341SAndroid Build Coastguard Worker    ; if no emulation is required, we don't need to shuffle or emulate edges
5980*c0909341SAndroid Build Coastguard Worker    ; this also saves 2 quasi-vpgatherdqs
5981*c0909341SAndroid Build Coastguard Worker    vptest               m1, m1
5982*c0909341SAndroid Build Coastguard Worker    jz .filter
5983*c0909341SAndroid Build Coastguard Worker
    ; edge emulation: the signed per-lane offsets in m1 (sign-extended via
    ; movsxd/sar) index resize_shuf+4 to fetch an 8-byte pshufb control per
    ; lane, gathered in the same qword layout as the pixels above
5984*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm1
5985*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
5986*c0909341SAndroid Build Coastguard Worker    movsxd               r8, r9d
5987*c0909341SAndroid Build Coastguard Worker    sar                  r9, 32
5988*c0909341SAndroid Build Coastguard Worker    movsxd              r10, r11d
5989*c0909341SAndroid Build Coastguard Worker    sar                 r11, 32
5990*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
5991*c0909341SAndroid Build Coastguard Worker    movq               xm14, [base+resize_shuf+4+r8]
5992*c0909341SAndroid Build Coastguard Worker    movq                xm0, [base+resize_shuf+4+r10]
5993*c0909341SAndroid Build Coastguard Worker    movhps             xm14, [base+resize_shuf+4+r9]
5994*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [base+resize_shuf+4+r11]
5995*c0909341SAndroid Build Coastguard Worker    movq                 r9, xm1
5996*c0909341SAndroid Build Coastguard Worker    pextrq              r11, xm1, 1
5997*c0909341SAndroid Build Coastguard Worker    movsxd               r8, r9d
5998*c0909341SAndroid Build Coastguard Worker    sar                  r9, 32
5999*c0909341SAndroid Build Coastguard Worker    movsxd              r10, r11d
6000*c0909341SAndroid Build Coastguard Worker    sar                 r11, 32
6001*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [base+resize_shuf+4+r8], 1
6002*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [base+resize_shuf+4+r10], 1
6003*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m10, [base+resize_shuf+4+r9]
6004*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [base+resize_shuf+4+r11]
6005*c0909341SAndroid Build Coastguard Worker    vpblendd            m14, m10, 11000000b
6006*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m11, 11000000b
6007*c0909341SAndroid Build Coastguard Worker
    ; m15 (pb_8x0_8x8, "0,8" per the register note above) presumably adds 8 to
    ; the control bytes of the upper qword of each 128-bit lane so that pshufb
    ; selects from that lane's own upper 8 pixels - confirm against the table
6008*c0909341SAndroid Build Coastguard Worker    paddb               m14, m15
6009*c0909341SAndroid Build Coastguard Worker    paddb                m0, m15
6010*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m14
6011*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m0
6012*c0909341SAndroid Build Coastguard Worker
6013*c0909341SAndroid Build Coastguard Worker.filter:
    ; gather one 8-byte filter row per lane (index in m9, scaled by 8) using
    ; the same qword layout as the pixels: m10 pairs with m12, m11 with m13
6014*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm9
6015*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm9, 1
6016*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm9, 2
6017*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm9, 3
6018*c0909341SAndroid Build Coastguard Worker    vextracti128        xm9, m9, 1
6019*c0909341SAndroid Build Coastguard Worker    movq               xm10, [base+resize_filter+r8*8]
6020*c0909341SAndroid Build Coastguard Worker    movq               xm11, [base+resize_filter+r10*8]
6021*c0909341SAndroid Build Coastguard Worker    movhps             xm10, [base+resize_filter+r9*8]
6022*c0909341SAndroid Build Coastguard Worker    movhps             xm11, [base+resize_filter+r11*8]
6023*c0909341SAndroid Build Coastguard Worker    movd                r8d, xm9
6024*c0909341SAndroid Build Coastguard Worker    pextrd              r9d, xm9, 1
6025*c0909341SAndroid Build Coastguard Worker    pextrd             r10d, xm9, 2
6026*c0909341SAndroid Build Coastguard Worker    pextrd             r11d, xm9, 3
6027*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [base+resize_filter+r8*8], 1
6028*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [base+resize_filter+r10*8], 1
6029*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m14, [base+resize_filter+r9*8]
6030*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [base+resize_filter+r11*8]
6031*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m14, 11000000b
6032*c0909341SAndroid Build Coastguard Worker    vpblendd            m11, m1, 11000000b
6033*c0909341SAndroid Build Coastguard Worker
    ; multiply pixels by taps and reduce the 8 products per lane with two
    ; horizontal adds, leaving the 8 sums in lane order 0..7 in xm12
6034*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m10
6035*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m11
6036*c0909341SAndroid Build Coastguard Worker    phaddw              m12, m13
6037*c0909341SAndroid Build Coastguard Worker    vextracti128       xm13, m12, 1
6038*c0909341SAndroid Build Coastguard Worker    phaddsw            xm12, xm13
6039*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xm12, xm3                    ; x=(x+64)>>7
6040*c0909341SAndroid Build Coastguard Worker    packuswb           xm12, xm12
6041*c0909341SAndroid Build Coastguard Worker    movq          [dstq+xq], xm12
6042*c0909341SAndroid Build Coastguard Worker
    ; advance all 8 positions by 8*dx and move on to the next 8 output pixels
6043*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
6044*c0909341SAndroid Build Coastguard Worker    add                  xd, 8
6045*c0909341SAndroid Build Coastguard Worker    cmp                  xd, dst_wd
6046*c0909341SAndroid Build Coastguard Worker    jl .loop_x
6047*c0909341SAndroid Build Coastguard Worker
6048*c0909341SAndroid Build Coastguard Worker    add                dstq, dst_strideq
6049*c0909341SAndroid Build Coastguard Worker    add                srcq, src_strideq
6050*c0909341SAndroid Build Coastguard Worker    dec                  hd
6051*c0909341SAndroid Build Coastguard Worker    jg .loop_y
6052*c0909341SAndroid Build Coastguard Worker    RET
6053*c0909341SAndroid Build Coastguard Worker
; w_mask_420_8bpc: blends the two intermediate buffers tmp1/tmp2 into dst via
; the W_MASK macro and writes a reduced-resolution mask to [mask]. The looped
; width paths below emit one mask byte per four stored pixels.
; Named operands: dst, stride, tmp1, tmp2, w, h, mask, stride3. The eighth
; argument (r7m) is read as the "sign" index into wm_420_sign.
; Mask bytes are formed as (m8 - termA - termB) >> 2, where m8 is the
; sign-dependent constant and termA/termB are the W_MASK mask outputs being
; combined.
; NOTE(review): W_MASK is defined outside this excerpt. From its uses here it
; appears to leave packed pixels in m<arg1> and mask terms in m<arg2>, with
; args 3/4 selecting 32-byte chunks relative to tmp1q/tmp2q - confirm against
; the macro definition, including which scratch registers it clobbers.
6054*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
; "base" lets the constants below be addressed relative to r7 (table address).
6055*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_420_avx2_table
6056*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_420_avx2_table]
6057*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; wd = index of lowest set bit of w (table index)
6058*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
6059*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
6060*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r7+wq*4]  ; signed 32-bit table entry, relative to r7
6061*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
6062*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_2048]
6063*c0909341SAndroid Build Coastguard Worker    pmovzxbd             m9, [base+deint_shuf4]  ; byte indices -> dword indices for vpermd
6064*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+wm_420_sign+r6*4] ; 258 - sign
6065*c0909341SAndroid Build Coastguard Worker    add                  wq, r7  ; wq = absolute address of the width-specific entry
; First W_MASK is issued before dispatch; every .wN entry starts with m0/m4 live.
6066*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6067*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
6068*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
6069*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- 4 pixels wide: one dword of m0 per row ----
6070*c0909341SAndroid Build Coastguard Worker.w4:
6071*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6072*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
6073*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
6074*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
6075*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
6076*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
6077*c0909341SAndroid Build Coastguard Worker    jl .w4_end  ; h < 8: only the four rows above are stored
6078*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6079*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
6080*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
6081*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
6082*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
6083*c0909341SAndroid Build Coastguard Worker    jg .w4_h16  ; flags are still those of "cmp hd, 8" (lea/pextrd do not write flags): h > 8
6084*c0909341SAndroid Build Coastguard Worker.w4_end:
6085*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m4, 1
6086*c0909341SAndroid Build Coastguard Worker    vpblendd            xm1, xm4, xm0, 0x05  ; dwords 0,2 from upper half, 1,3 from lower half
6087*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm0, 0x0a  ; dwords 1,3 from upper half, 0,2 from lower half
6088*c0909341SAndroid Build Coastguard Worker    pshufd              xm1, xm1, q2301  ; swap adjacent dwords so xm1 lines up with xm4
6089*c0909341SAndroid Build Coastguard Worker    psubw               xm4, xm8, xm4
6090*c0909341SAndroid Build Coastguard Worker    psubw               xm4, xm1
6091*c0909341SAndroid Build Coastguard Worker    psrlw               xm4, 2
6092*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
6093*c0909341SAndroid Build Coastguard Worker    movq            [maskq], xm4  ; 8 mask bytes are written on this path
6094*c0909341SAndroid Build Coastguard Worker    RET
; h > 8 at width 4: a second W_MASK (chunks 2/3) supplies eight more rows.
6095*c0909341SAndroid Build Coastguard Worker.w4_h16:
6096*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6097*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6098*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5  ; pairwise dword sums of both term registers
6099*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6100*c0909341SAndroid Build Coastguard Worker    psubw                m4, m8, m4
6101*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
6102*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m9, m4  ; reorder dwords using the deint_shuf4 indices
6103*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
6104*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm5
6105*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
6106*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
6107*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
6108*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q], xm1, 1
6109*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6110*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
6111*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
6112*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
6113*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
6114*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm4  ; 16 mask bytes for the 16 rows
6115*c0909341SAndroid Build Coastguard Worker    RET
; ---- 8 pixels wide: 4 rows, 2*32 bytes of each tmp buffer and 8 mask bytes per iteration ----
6116*c0909341SAndroid Build Coastguard Worker.w8_loop:
6117*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 2*32
6118*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 2*32
6119*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6120*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6121*c0909341SAndroid Build Coastguard Worker    add               maskq, 8
6122*c0909341SAndroid Build Coastguard Worker.w8:
6123*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m4, 1
6124*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6125*c0909341SAndroid Build Coastguard Worker    psubw               xm4, xm8, xm4  ; m8 - lower-half terms ...
6126*c0909341SAndroid Build Coastguard Worker    psubw               xm4, xm2  ; ... - upper-half terms
6127*c0909341SAndroid Build Coastguard Worker    psrlw               xm4, 2
6128*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm4
6129*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
6130*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
6131*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
6132*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
6133*c0909341SAndroid Build Coastguard Worker    movq            [maskq], xm4
6134*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6135*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
6136*c0909341SAndroid Build Coastguard Worker    RET
; ---- 16 pixels wide: 4 rows, 4*32 bytes of each tmp buffer and 16 mask bytes per iteration ----
6137*c0909341SAndroid Build Coastguard Worker.w16_loop:
6138*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 4*32
6139*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 4*32
6140*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6141*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6142*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
6143*c0909341SAndroid Build Coastguard Worker.w16:
6144*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120  ; qword order 0,2,1,3: each 128-bit half is stored as one row
6145*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
6146*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
6147*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6148*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m4, m5  ; high qwords of each lane of m4/m5
6149*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5  ; low qwords of each lane of m4/m5
6150*c0909341SAndroid Build Coastguard Worker    psubw                m1, m8, m1
6151*c0909341SAndroid Build Coastguard Worker    psubw                m1, m4
6152*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
6153*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6154*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
6155*c0909341SAndroid Build Coastguard Worker    vpermd               m1, m9, m1
6156*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm0
6157*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m0, 1
6158*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm1
6159*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6160*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
6161*c0909341SAndroid Build Coastguard Worker    RET
; ---- 32 pixels wide: 2 rows, 4*32 bytes of each tmp buffer and 16 mask bytes per iteration ----
6162*c0909341SAndroid Build Coastguard Worker.w32_loop:
6163*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 4*32
6164*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 4*32
6165*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6166*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
6167*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
6168*c0909341SAndroid Build Coastguard Worker.w32:
6169*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6170*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
6171*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6172*c0909341SAndroid Build Coastguard Worker    psubw                m4, m8, m4  ; m8 - first-row terms ...
6173*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5  ; ... - second-row terms
6174*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
6175*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6176*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4
6177*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m9, m4
6178*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
6179*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm4
6180*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6181*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
6182*c0909341SAndroid Build Coastguard Worker    RET
; ---- 64 pixels wide: one row per iteration, mask written every second row ----
; The parity of hd selects the phase: when hd is even, this row's terms are
; parked in m10/m11 as (m8 - terms); when hd is odd they are combined with the
; parked values and 32 mask bytes are stored.
6183*c0909341SAndroid Build Coastguard Worker.w64_loop_even:
6184*c0909341SAndroid Build Coastguard Worker    psubw               m10, m8, m4
6185*c0909341SAndroid Build Coastguard Worker    psubw               m11, m8, m5
6186*c0909341SAndroid Build Coastguard Worker    dec                  hd
6187*c0909341SAndroid Build Coastguard Worker.w64_loop:
6188*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 4*32
6189*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 4*32
6190*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6191*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6192*c0909341SAndroid Build Coastguard Worker.w64:
6193*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6194*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6195*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6196*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6197*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
6198*c0909341SAndroid Build Coastguard Worker    test                 hd, 1
6199*c0909341SAndroid Build Coastguard Worker    jz .w64_loop_even  ; hd even: park this row's terms and process the next row
6200*c0909341SAndroid Build Coastguard Worker    psubw                m4, m10, m4  ; (m8 - previous row) - this row
6201*c0909341SAndroid Build Coastguard Worker    psubw                m5, m11, m5
6202*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
6203*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 2
6204*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6205*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m9, m4
6206*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
6207*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6208*c0909341SAndroid Build Coastguard Worker    dec                  hd
6209*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
6210*c0909341SAndroid Build Coastguard Worker    RET
; ---- 128 pixels wide: one row per iteration in two 64-pixel halves ----
; Same even/odd scheme as .w64: m10/m11 park the terms of the first 64 columns,
; m12/m13 those of the last 64 columns. 64 mask bytes are stored per row pair.
6211*c0909341SAndroid Build Coastguard Worker.w128_loop_even:
6212*c0909341SAndroid Build Coastguard Worker    psubw               m12, m8, m4
6213*c0909341SAndroid Build Coastguard Worker    psubw               m13, m8, m5
6214*c0909341SAndroid Build Coastguard Worker    dec                  hd
6215*c0909341SAndroid Build Coastguard Worker.w128_loop:
6216*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6217*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6218*c0909341SAndroid Build Coastguard Worker.w128:
6219*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6220*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6221*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6222*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6223*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
; tmp pointers advance a full row here, so the second half of the row is
; addressed below with negative chunk offsets (-4..-1).
6224*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 8*32
6225*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 8*32
6226*c0909341SAndroid Build Coastguard Worker    test                 hd, 1
6227*c0909341SAndroid Build Coastguard Worker    jz .w128_even
6228*c0909341SAndroid Build Coastguard Worker    psubw                m4, m10, m4
6229*c0909341SAndroid Build Coastguard Worker    psubw                m5, m11, m5
6230*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
6231*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 2
6232*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6233*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m9, m4
6234*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m4  ; mask bytes for the first 64 columns
6235*c0909341SAndroid Build Coastguard Worker    jmp .w128_odd
6236*c0909341SAndroid Build Coastguard Worker.w128_even:
6237*c0909341SAndroid Build Coastguard Worker    psubw               m10, m8, m4
6238*c0909341SAndroid Build Coastguard Worker    psubw               m11, m8, m5
; Reached on both parities (the label name notwithstanding): second half of the row.
6239*c0909341SAndroid Build Coastguard Worker.w128_odd:
6240*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, -4, -3
6241*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6242*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
6243*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, -2, -1
6244*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6245*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m0
6246*c0909341SAndroid Build Coastguard Worker    test                 hd, 1
6247*c0909341SAndroid Build Coastguard Worker    jz .w128_loop_even
6248*c0909341SAndroid Build Coastguard Worker    psubw                m4, m12, m4
6249*c0909341SAndroid Build Coastguard Worker    psubw                m5, m13, m5
6250*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
6251*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 2
6252*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6253*c0909341SAndroid Build Coastguard Worker    vpermd               m4, m9, m4
6254*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m4  ; mask bytes for the last 64 columns
6255*c0909341SAndroid Build Coastguard Worker    add               maskq, 64
6256*c0909341SAndroid Build Coastguard Worker    dec                  hd
6257*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
6258*c0909341SAndroid Build Coastguard Worker    RET
6259*c0909341SAndroid Build Coastguard Worker
; w_mask_422_8bpc: blends the two intermediate buffers tmp1/tmp2 into dst via
; the W_MASK macro and writes a reduced-resolution mask to [mask]. The looped
; width paths below emit one mask byte per two stored pixels.
; Named operands: dst, stride, tmp1, tmp2, w, h, mask, stride3. The eighth
; argument (r7m) is read as the "sign" index into wm_422_sign.
; Mask bytes are formed per byte as pavgb(m8 - packed_terms, 0), i.e.
; (m8 - packed_terms + 1) >> 1, with m8 the sign-dependent constant and m9 = 0.
; NOTE(review): W_MASK is defined outside this excerpt. From its uses here it
; appears to leave packed pixels in m<arg1> and mask terms in m<arg2>, with
; args 3/4 selecting 32-byte chunks relative to tmp1q/tmp2q - confirm against
; the macro definition, including which scratch registers it clobbers.
6260*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
; "base" lets the constants below be addressed relative to r7 (table address).
6261*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_422_avx2_table
6262*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_422_avx2_table]
6263*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; wd = index of lowest set bit of w (table index)
6264*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
6265*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
6266*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9  ; m9 = 0, second operand of every pavgb below
6267*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r7+wq*4]  ; signed 32-bit table entry, relative to r7
6268*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
6269*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_2048]
6270*c0909341SAndroid Build Coastguard Worker    pmovzxbd            m10, [base+deint_shuf4]  ; byte indices -> dword indices for vpermd
6271*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
6272*c0909341SAndroid Build Coastguard Worker    add                  wq, r7  ; wq = absolute address of the width-specific entry
6273*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
; First W_MASK is issued before dispatch; every .wN entry starts with m0/m4 live.
6274*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6275*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
6276*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- 4 pixels wide: one dword of m0 per row ----
6277*c0909341SAndroid Build Coastguard Worker.w4:
6278*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6279*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
6280*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
6281*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
6282*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
6283*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
6284*c0909341SAndroid Build Coastguard Worker    jl .w4_end  ; h < 8: only the four rows above are stored
6285*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6286*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
6287*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
6288*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
6289*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
6290*c0909341SAndroid Build Coastguard Worker    jg .w4_h16  ; flags are still those of "cmp hd, 8" (lea/pextrd do not write flags): h > 8
6291*c0909341SAndroid Build Coastguard Worker.w4_end:
6292*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
6293*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm5  ; word terms -> 16 bytes
6294*c0909341SAndroid Build Coastguard Worker    psubb               xm5, xm8, xm4
6295*c0909341SAndroid Build Coastguard Worker    pavgb               xm5, xm9  ; (x + 0 + 1) >> 1 per byte
6296*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm5, q3120  ; dword order 0,2,1,3
6297*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm5  ; 16 mask bytes are written on this path
6298*c0909341SAndroid Build Coastguard Worker    RET
; h > 8 at width 4: a second W_MASK (chunks 2/3) supplies eight more rows.
6299*c0909341SAndroid Build Coastguard Worker.w4_h16:
6300*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6301*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6302*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6303*c0909341SAndroid Build Coastguard Worker    psubb                m5, m8, m4
6304*c0909341SAndroid Build Coastguard Worker    pavgb                m5, m9
6305*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m10, m5  ; reorder dwords using the deint_shuf4 indices
6306*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6307*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
6308*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
6309*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
6310*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
6311*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6312*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
6313*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
6314*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
6315*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
6316*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m5  ; 32 mask bytes for the 16 rows
6317*c0909341SAndroid Build Coastguard Worker    RET
; ---- 8 pixels wide: 4 rows, 32*2 bytes of each tmp buffer and 16 mask bytes per iteration ----
6318*c0909341SAndroid Build Coastguard Worker.w8_loop:
6319*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*2
6320*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*2
6321*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6322*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6323*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
6324*c0909341SAndroid Build Coastguard Worker.w8:
6325*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
6326*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6327*c0909341SAndroid Build Coastguard Worker    packuswb            xm4, xm5
6328*c0909341SAndroid Build Coastguard Worker    psubb               xm5, xm8, xm4
6329*c0909341SAndroid Build Coastguard Worker    pavgb               xm5, xm9
6330*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm5, q3120
6331*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
6332*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
6333*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
6334*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
6335*c0909341SAndroid Build Coastguard Worker    mova            [maskq], xm5
6336*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6337*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
6338*c0909341SAndroid Build Coastguard Worker    RET
; ---- 16 pixels wide: 4 rows, 32*4 bytes of each tmp buffer and 32 mask bytes per iteration ----
6339*c0909341SAndroid Build Coastguard Worker.w16_loop:
6340*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
6341*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
6342*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6343*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6344*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6345*c0909341SAndroid Build Coastguard Worker.w16:
6346*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120  ; qword order 0,2,1,3: each 128-bit half is stored as one row
6347*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
6348*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
6349*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6350*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6351*c0909341SAndroid Build Coastguard Worker    psubb                m5, m8, m4
6352*c0909341SAndroid Build Coastguard Worker    pavgb                m5, m9
6353*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6354*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m10, m5
6355*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm0
6356*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m0, 1
6357*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m5
6358*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6359*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
6360*c0909341SAndroid Build Coastguard Worker    RET
; ---- 32 pixels wide: 2 rows, 32*4 bytes of each tmp buffer and 32 mask bytes per iteration ----
6361*c0909341SAndroid Build Coastguard Worker.w32_loop:
6362*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
6363*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
6364*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6365*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
6366*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6367*c0909341SAndroid Build Coastguard Worker.w32:
6368*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6369*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
6370*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6371*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6372*c0909341SAndroid Build Coastguard Worker    psubb                m5, m8, m4
6373*c0909341SAndroid Build Coastguard Worker    pavgb                m5, m9
6374*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6375*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m10, m5
6376*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
6377*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m5
6378*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6379*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
6380*c0909341SAndroid Build Coastguard Worker    RET
; ---- 64 pixels wide: 1 row, 32*4 bytes of each tmp buffer and 32 mask bytes per iteration ----
6381*c0909341SAndroid Build Coastguard Worker.w64_loop:
6382*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
6383*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
6384*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6385*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6386*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6387*c0909341SAndroid Build Coastguard Worker.w64:
6388*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6389*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6390*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6391*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6392*c0909341SAndroid Build Coastguard Worker    psubb                m5, m8, m4
6393*c0909341SAndroid Build Coastguard Worker    pavgb                m5, m9
6394*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6395*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m10, m5
6396*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
6397*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m5
6398*c0909341SAndroid Build Coastguard Worker    dec                  hd
6399*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
6400*c0909341SAndroid Build Coastguard Worker    RET
; ---- 128 pixels wide: 1 row, 32*8 bytes of each tmp buffer and 64 mask bytes per iteration ----
6401*c0909341SAndroid Build Coastguard Worker.w128_loop:
6402*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
6403*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*8
6404*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1
6405*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6406*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*2
6407*c0909341SAndroid Build Coastguard Worker.w128:
6408*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6409*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6410*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 2, 3
6411*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6412*c0909341SAndroid Build Coastguard Worker    psubb                m5, m8, m4
6413*c0909341SAndroid Build Coastguard Worker    pavgb                m5, m9
6414*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6415*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m10, m5
6416*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
6417*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m5  ; mask bytes for the first 64 columns
; Second half of the row: chunks 4..7 of the same tmp rows.
6418*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 4, 5
6419*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6420*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
6421*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 5, 6, 7
6422*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
6423*c0909341SAndroid Build Coastguard Worker    psubb                m5, m8, m4
6424*c0909341SAndroid Build Coastguard Worker    pavgb                m5, m9
6425*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6426*c0909341SAndroid Build Coastguard Worker    vpermd               m5, m10, m5
6427*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m0
6428*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m5  ; mask bytes for the last 64 columns
6429*c0909341SAndroid Build Coastguard Worker    dec                  hd
6430*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
6431*c0909341SAndroid Build Coastguard Worker    RET
6432*c0909341SAndroid Build Coastguard Worker
6433*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
6434*c0909341SAndroid Build Coastguard Worker%define base r7-w_mask_444_avx2_table
6435*c0909341SAndroid Build Coastguard Worker    lea                  r7, [w_mask_444_avx2_table]
6436*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
6437*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
6438*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
6439*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r7+wq*4]
6440*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
6441*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pb_64]
6442*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_2048]
6443*c0909341SAndroid Build Coastguard Worker    add                  wq, r7
6444*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
6445*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
6446*c0909341SAndroid Build Coastguard Worker    jmp                  wq
6447*c0909341SAndroid Build Coastguard Worker.w4:
6448*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6449*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
6450*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
6451*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
6452*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
6453*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m4
6454*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
6455*c0909341SAndroid Build Coastguard Worker    jl .w4_end
6456*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6457*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
6458*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
6459*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
6460*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
6461*c0909341SAndroid Build Coastguard Worker    je .w4_end
6462*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 2, 3, 1
6463*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6464*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
6465*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
6466*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
6467*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm1
6468*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 1
6469*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6470*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
6471*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 3
6472*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
6473*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+stride3q ], xm1, 3
6474*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m4
6475*c0909341SAndroid Build Coastguard Worker.w4_end:
6476*c0909341SAndroid Build Coastguard Worker    RET
; 8-pixel-wide path: each pass writes four rows of 8 bytes to dst and 32 bytes
; to the mask buffer. Control enters at .w8 (below), so the first pass skips
; the pointer advance; presumably the code ahead of this view has already run
; W_MASK once so that m0/m4 are valid on entry -- confirm against the dispatcher.
6477*c0909341SAndroid Build Coastguard Worker.w8_loop:
; Step both intermediate buffers forward by 64 bytes for the next four rows.
6478*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*2
6479*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*2
; W_MASK is a macro defined outside this view; judging by the stores below it
; presumably leaves the packed pixels in m0 and the mask bytes in m4.
6480*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
; Move dst down four rows and the mask pointer past the 32 bytes just written.
6481*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
6482*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6483*c0909341SAndroid Build Coastguard Worker.w8:
; xm1 = upper 128-bit lane of m0, so all four 8-byte groups can be reached.
6484*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
; Row order is interleaved across lanes:
;   row 0 = low  qword of the low  lane, row 1 = low  qword of the high lane,
;   row 2 = high qword of the low  lane, row 3 = high qword of the high lane.
; stride3q is set outside this view (presumably stride*3).
6485*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
6486*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
6487*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
6488*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
; Store the full 32-byte m4 to the mask buffer (one byte per pixel written).
6489*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
; Four rows consumed; keep looping while the remaining row count is positive.
6490*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
6491*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
; RET is a macro defined outside this view (presumably the function epilogue).
6492*c0909341SAndroid Build Coastguard Worker    RET
; 16-pixel-wide path: each pass writes two rows of 16 bytes to dst and 32 bytes
; to the mask buffer. Control enters at .w16 (below), so the first pass skips
; the pointer advance; presumably m0/m4 were already computed before the jump
; into this path -- confirm against the code ahead of this view.
6493*c0909341SAndroid Build Coastguard Worker.w16_loop:
; Step both intermediate buffers forward by 64 bytes for the next two rows.
6494*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*2
6495*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*2
; W_MASK is a macro defined outside this view; judging by the stores below it
; presumably leaves the packed pixels in m0 and the mask bytes in m4.
6496*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
; Move dst down two rows and the mask pointer past the 32 bytes just written.
6497*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
6498*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6499*c0909341SAndroid Build Coastguard Worker.w16:
; Reorder the four qwords of m0. q3120 is a constant defined outside this view;
; presumably it swaps the two middle qwords so that each 128-bit lane holds one
; contiguous 16-pixel row -- TODO confirm against the macro definitions.
6500*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
; Low 128 bits go to row 0, high 128 bits go to row 1.
6501*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
6502*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
; Store the full 32-byte m4 to the mask buffer. m4 is stored as-is, without the
; vpermq applied to m0.
6503*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
; Two rows consumed; keep looping while the remaining row count is positive.
6504*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6505*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
; RET is a macro defined outside this view (presumably the function epilogue).
6506*c0909341SAndroid Build Coastguard Worker    RET
; 32-pixel-wide path: each pass writes one 32-byte row to dst and 32 bytes to
; the mask buffer. Control enters at .w32 (below), so the first pass skips the
; pointer advance; presumably m0/m4 were already computed before the jump into
; this path -- confirm against the code ahead of this view.
6507*c0909341SAndroid Build Coastguard Worker.w32_loop:
; Step both intermediate buffers forward by 64 bytes for the next row.
6508*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*2
6509*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*2
; W_MASK is a macro defined outside this view; judging by the stores below it
; presumably leaves the packed pixels in m0 and the mask bytes in m4.
6510*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
; Move dst down one row and the mask pointer past the 32 bytes just written.
6511*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6512*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
6513*c0909341SAndroid Build Coastguard Worker.w32:
; Reorder the four qwords of m0 before the store. q3120 is defined outside this
; view; presumably it swaps the two middle qwords to put the pixels back in
; linear order -- TODO confirm.
6514*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
; One full 32-byte row of pixels, then the matching 32 mask bytes.
6515*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
6516*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m4
; One row consumed; keep looping while the remaining row count is positive.
6517*c0909341SAndroid Build Coastguard Worker    dec                  hd
6518*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
; RET is a macro defined outside this view (presumably the function epilogue).
6519*c0909341SAndroid Build Coastguard Worker    RET
; 64-pixel-wide path: each pass writes one row as two 32-byte halves to dst and
; 64 bytes to the mask buffer. Control enters at .w64 (below), so the first
; pass skips the pointer advance; presumably m0/m4 for the first half were
; already computed before the jump into this path -- confirm against the code
; ahead of this view.
6520*c0909341SAndroid Build Coastguard Worker.w64_loop:
; Step both intermediate buffers forward by 128 bytes for the next row.
6521*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*4
6522*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*4
; First half of the row. W_MASK is a macro defined outside this view; its 3rd
; and 4th arguments (0, 1 here; 2, 3 below) presumably select which 32-byte
; units of tmp1/tmp2 are read -- TODO confirm against the macro definition.
6523*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
; Move dst down one row and the mask pointer past the 64 bytes just written.
6524*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6525*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*2
6526*c0909341SAndroid Build Coastguard Worker.w64:
; Store pixels 0..31 and their mask bytes. q3120 is defined outside this view;
; presumably it swaps the two middle qwords of m0 -- TODO confirm.
6527*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6528*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6529*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m4
; Second half of the same row: recompute m0/m4, then store pixels 32..63 and
; their mask bytes.
6530*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 2, 3, 1
6531*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6532*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
6533*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m4
; One row consumed; keep looping while the remaining row count is positive.
6534*c0909341SAndroid Build Coastguard Worker    dec                  hd
6535*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
; RET is a macro defined outside this view (presumably the function epilogue).
6536*c0909341SAndroid Build Coastguard Worker    RET
; 128-pixel-wide path: each pass writes one row as four 32-byte quarters to dst
; and 128 bytes to the mask buffer. Control enters at .w128 (below), so the
; first pass skips the pointer advance; presumably m0/m4 for the first quarter
; were already computed before the jump into this path -- confirm against the
; code ahead of this view.
6537*c0909341SAndroid Build Coastguard Worker.w128_loop:
; Step both intermediate buffers forward by 256 bytes for the next row.
6538*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
6539*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*8
; First quarter of the row. W_MASK is a macro defined outside this view; its
; 3rd and 4th arguments step through (0,1), (2,3), (4,5), (6,7) across the
; row and presumably select which 32-byte units of tmp1/tmp2 are read --
; TODO confirm against the macro definition.
6540*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 0, 1, 1
; Move dst down one row and the mask pointer past the 128 bytes just written.
6541*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6542*c0909341SAndroid Build Coastguard Worker    add               maskq, 32*4
6543*c0909341SAndroid Build Coastguard Worker.w128:
; Quarter 0: pixels 0..31 and their mask bytes. q3120 is defined outside this
; view; presumably it swaps the two middle qwords of m0 -- TODO confirm.
6544*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6545*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
6546*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*0], m4
; Quarter 1: pixels 32..63.
6547*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 2, 3, 1
6548*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6549*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
6550*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*1], m4
; Quarter 2: pixels 64..95.
6551*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 4, 5, 1
6552*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6553*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
6554*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*2], m4
; Quarter 3: pixels 96..127.
6555*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 4, 6, 7, 1
6556*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
6557*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m0
6558*c0909341SAndroid Build Coastguard Worker    mova       [maskq+32*3], m4
; One row consumed; keep looping while the remaining row count is positive.
6559*c0909341SAndroid Build Coastguard Worker    dec                  hd
6560*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
; RET is a macro defined outside this view (presumably the function epilogue).
6561*c0909341SAndroid Build Coastguard Worker    RET
6562*c0909341SAndroid Build Coastguard Worker
6563*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
6564