; xref: /aosp_15_r20/external/libdav1d/src/x86/mc16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; Read-only tables for the 16 bpc SSSE3 motion-compensation code.

; dav1d_obmc_masks[] << 9
obmc_masks:     dw     0,     0,  9728,     0, 12800,  7168,  2560,     0
                dw 14336, 11264,  8192,  5632,  3584,  1536,     0,     0
                dw 15360, 13824, 12288, 10752,  9216,  7680,  6144,  5120
                dw  4096,  3072,  2048,  1536,     0,     0,     0,     0
                dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
                dw  9728,  8704,  8192,  7168,  6656,  6144,  5632,  4608
                dw  4096,  3584,  3072,  2560,  2048,  2048,  1536,  1024

; Byte-index tables, 16 bytes per row. Each byte pair (2k, 2k+1) names one
; 16-bit lane of the source, so the rows are described in word lanes below.
; NOTE(review): presumably pshufb control masks; their users are outside the
; part of the file reviewed here - confirm before changing.
;   blend_shuf:   word 0 four times, then word 1 four times
;   spel_h_shufA: word pairs (0,1) (1,2) (2,3) (3,4)
;   spel_h_shufB: word pairs (2,3) (3,4) (4,5) (5,6)
;   spel_h_shuf2: words 0-3, then words 1-4
;   spel_s_shuf2: words 0-3, twice
;   spel_s_shuf8: words 0-3 interleaved with words 4-7
;   unpckw:       even words (0,2,4,6), then odd words (1,3,5,7)
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
spel_h_shufA:   db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
spel_h_shufB:   db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
spel_h_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
spel_s_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
spel_s_shuf8:   db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
; Four dwords 0..3.
rescale_mul:    dd 0,  1,  2,  3
; Two rows: word 0 five times followed by words 1-3, then words 4-7 followed
; by word 7 four more times.
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
; 32 bytes: the values 0, 4, 8 and 12, each repeated eight times.
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

; Broadcast constants. The prefix matches the element size of the directive:
; pw_ = words (dw), pd_ = dwords (dd), pq_ = qwords (dq). Entries declared
; with fewer than 16 bytes are not complete vectors on their own.
;
; pw_16 is only 4 words long, yet the .h path of put_bilin_16bpc loads a full
; 16 bytes from it with mova. The first row of prep_mul (also 4 x 16) supplies
; the upper half, so these two labels must stay adjacent and in this order.
; NOTE(review): pw_2048/bidir_mul look like the same arrangement - confirm
; against their users before reordering anything in this group.
pw_2:             times 8 dw 2
pw_16:            times 4 dw 16
prep_mul:         times 4 dw 16
                  times 8 dw 4
pw_64:            times 8 dw 64
pw_256:           times 8 dw 256
pw_2048:          times 4 dw 2048
bidir_mul:        times 4 dw 2048
pw_8192:          times 8 dw 8192
pw_27615:         times 8 dw 27615
pw_32766:         times 8 dw 32766
pw_m512:          times 8 dw -512
pd_63:            times 4 dd 63
pd_64:            times 4 dd 64
pd_512:           times 4 dd 512
pd_2560:          times 2 dd 2560
pd_8704:          times 2 dd 8704
pd_m524256:       times 4 dd -524256 ; -8192 << 6 + 32
pd_0x3ff:         times 4 dd 0x3ff
pd_0x4000:        times 4 dd 0x4000
pq_0x400000:      times 2 dq 0x400000
pq_0x40000000:    times 2 dq 0x40000000
pd_65538:         times 2 dd 65538

; Rounding and shift constants.
;
; put_bilin_h_rnd holds two 8-byte rows (4 x 8, then 4 x 10). The .h path of
; put_bilin_16bpc picks one with (bitdepth_max >> 11) * 8 and broadcasts it
; with movddup.
; NOTE(review): the other two-entry tables here (s_8tap_h_rnd,
; put_s_8tap_v_rnd, s_8tap_h_sh, put_s_8tap_v_sh, bidir_rnd, put_8tap_h_rnd)
; presumably use the same per-bitdepth row selection; their users are not in
; the part of the file reviewed here.
put_bilin_h_rnd:  times 4 dw 8
                  times 4 dw 10
s_8tap_h_rnd:     times 2 dd 2
                  times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
                  times 2 dd 128
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_sh:  dd 10, 8
bidir_rnd:        times 4 dw -16400
                  times 4 dw -16388
put_8tap_h_rnd:   dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: times 4 dd    32 - (8192 <<  5)

; Shift counts and rounding terms whose names tie them to the warp8x8 code.
; NOTE(review): the code that reads them is outside the part of the file
; reviewed here; the two-entry layout presumably mirrors the per-bitdepth
; tables above - confirm.
warp8x8_shift:    dd 11, 13
warp8x8_rnd1:     dd 1024, 1024, 4096, 4096
warp8x8_rnd2:     times 4 dw 4096
                  times 4 dw 16384
warp8x8t_rnd:     times 2 dd 16384 - (8192 << 15)

; BIDIR_JMP_TABLE name, isa, w0, w1, ...
;
; Emits one dword per listed width. Each entry is the offset of the label
; <private_prefix>_<name>_16bpc_<isa>.w<N> from the symbol <name>_<isa>_table.
; That symbol is defined as the start of the emitted data minus 2*w0, so the
; offsets are relative to the biased symbol, not to the first entry.
; NOTE(review): the bias presumably lets users index with tzcnt(w)*4 directly
; (2*w0 equals 4*log2(w0) for w0 = 2 or 4) - confirm against the users.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    ; %0 - 2 = number of widths; %rotate 1 brings the next width into %3.
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Width dispatch tables for the bidirectional/blend functions. The width lists
; differ per function: blend stops at 32, blend_v covers 2..32, and blend_h
; covers 2..128.
BIDIR_JMP_TABLE avg,        ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 32, 64, 128

; BASE_JMP_TABLE name, isa, w0, w1, ...
;
; Emits one word per listed width. Each entry is the offset of the symbol
; <name>_<isa>_w<N> from <name>_<isa>; the caller must define <name>_<isa>
; before invoking the macro. <name>_<isa>_table is the start of the emitted
; data minus w0, which matches the tzcnt(w)*2 indexing used by the .put
; dispatch in put_bilin_16bpc (for w0 = 2 or 4, log2(w0)*2 == w0).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    ; %0 - 2 = number of widths; %rotate 1 brings the next width into %3.
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Base symbols for the tables below. Each is a local label inside a bilin
; function, so with BASE_JMP_TABLE's "<base>_w<N>" naming the entries resolve
; to that function's .put_w<N> / .prep_w<N> labels.
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

; SCALED_JMP_TABLE name, isa, w0, w1, ...
;
; Emits three consecutive word tables for <private_prefix>_<name>_16bpc_<isa>,
; with one entry per listed width in each:
;   <name>_<isa>_table      -> .w<N>
;   <name>_<isa>_dy1_table  -> .dy1_w<N>
;   <name>_<isa>_dy2_table  -> .dy2_w<N>
; Every entry is the label's offset from the function symbol, and every table
; symbol is biased by -w0 relative to its first entry.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    ; The loop rotated the parameters %0 - 2 times; two more rotations complete
    ; a full cycle, so %1/%2/%3 are name/isa/w0 again.
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    ; Restore the original parameter order once more for the third table.
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Width dispatch tables (plain, dy1, dy2) for the scaled 8-tap functions.
SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128

; Tables defined outside this file.
cextern mc_subpel_filters
; subpel_filters is the external table's address minus 8 bytes.
; NOTE(review): presumably so that a 1-based index into 8-byte entries can be
; used without a separate decrement - confirm against the users.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

; Choose the register number behind the t0 scratch alias used by the functions
; below: 7 on UNIX64 builds, 5 on every other target.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

INIT_XMM ssse3
; put_bilin_16bpc - entry point and dispatch for the unfiltered copy.
; Named arguments: dst, ds, src, ss, w, h, mxy. mx is read from r6m and my
; from r7m into the same mxy register.
;   mx != 0           -> .h
;   mx == 0, my != 0  -> .v
;   both zero         -> .put, a plain copy dispatched on the block width
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
; t0 holds the address of put_ssse3 (the .put label below), so [base+sym]
; addresses sym relative to that register.
%define base t0-put_ssse3
    mov                mxyd, r6m ; mx
    LEA                  t0, put_ssse3
    movifnidn            wd, wm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    ; wd = log2(w); fetch the 16-bit offset of .put_w<w> relative to .put
    ; (put_ssse3_table is pre-biased for this index), add the address of .put
    ; and jump.
    tzcnt                wd, wd
    movzx                wd, word [base+put_ssse3_table+wq*2]
    add                  wq, t0
    movifnidn            hd, hm
    jmp                  wq
; Copy, w == 2: one dword (two 16-bit pixels) per row via general-purpose
; registers, two rows per iteration, until h is used up.
.put_w2:
    mov                 r4d, [srcq+ssq*0]
    mov                 r6d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
; Copy, w == 4: one qword (8 bytes) per row, two rows per iteration.
.put_w4:
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
; Copy, w == 8: one 16-byte vector per row, two rows per iteration.
; Loads are unaligned (movu); stores use the aligned form (mova).
.put_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
; Copy, w == 16: two 16-byte vectors per row, two rows per iteration.
.put_w16:
    movu                 m0, [srcq+ssq*0+16*0]
    movu                 m1, [srcq+ssq*0+16*1]
    movu                 m2, [srcq+ssq*1+16*0]
    movu                 m3, [srcq+ssq*1+16*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+16*0], m0
    mova  [dstq+dsq*0+16*1], m1
    mova  [dstq+dsq*1+16*0], m2
    mova  [dstq+dsq*1+16*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
; Copy, w == 32: four 16-byte vectors (64 bytes) per row, one row per
; iteration.
.put_w32:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, ssq
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w32
    RET
; Copy, w == 64: eight 16-byte vectors (128 bytes) per row, moved in two
; groups of four through m0-m3, one row per iteration.
.put_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    ; srcq advances before the last group is stored; the loads above are
    ; already done, and the stores only use dstq.
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
; Copy, w == 128: sixteen 16-byte vectors (256 bytes) per row, one row per
; iteration. Both pointers are first advanced by 128 bytes so the row is
; addressed with displacements -16*8 .. +16*7.
; NOTE(review): presumably done to keep every displacement within a signed
; byte - confirm.
.put_w128:
    add                srcq, 16*8
    add                dstq, 16*8
.put_w128_loop:
    movu                 m0, [srcq-16*8]
    movu                 m1, [srcq-16*7]
    movu                 m2, [srcq-16*6]
    movu                 m3, [srcq-16*5]
    mova        [dstq-16*8], m0
    mova        [dstq-16*7], m1
    mova        [dstq-16*6], m2
    mova        [dstq-16*5], m3
    movu                 m0, [srcq-16*4]
    movu                 m1, [srcq-16*3]
    movu                 m2, [srcq-16*2]
    movu                 m3, [srcq-16*1]
    mova        [dstq-16*4], m0
    mova        [dstq-16*3], m1
    mova        [dstq-16*2], m2
    mova        [dstq-16*1], m3
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    ; All loads for this row are done; step to the next source row.
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128_loop
    RET
; Horizontal path (mx != 0): set up the weights, then dispatch on width.
;   m5 = mx in every word lane (pshufb with pw_256 replicates the low word)
;   m4 = 16 - mx in every word lane
;   m3 = rounding term from put_bilin_h_rnd, chosen by bitdepth_max >> 11
; If my is also non-zero, control moves to .hv before m3 is loaded.
.h:
    movd                 m5, mxyd
    mov                mxyd, r7m ; my
    ; 16-byte load: pw_16 (4 words) plus the first 4 words of prep_mul.
    mova                 m4, [base+pw_16]
    pshufb               m5, [base+pw_256]
    psubw                m4, m5
    test               mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    mov                 r6d, r8m ; bitdepth_max
    shr                 r6d, 11
    movddup              m3, [base+put_bilin_h_rnd+r6*8]
    movifnidn            hd, hm
    ; wd = w - 8: positive -> .h_w16 (which expects this biased value),
    ; zero -> .h_w8, -4 (w == 4) -> .h_w4, anything else falls into .h_w2.
    sub                  wd, 8
    jg .h_w16
    je .h_w8
    cmp                  wd, -4
    je .h_w4
; Horizontal filter, w == 2, two rows per iteration.
; Expects m4 = 16 - mx, m5 = mx and m3 = rounding term in every word lane.
; Row 0 occupies the low qword of m1 and row 1 the high qword.
.h_w2:
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4, m1      ; src[x] * (16 - mx)
    psrlq                m1, 16          ; shift each row down one pixel
    pmullw               m1, m5          ; src[x+1] * mx
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    ; Only the low dword (two pixels) of each qword is stored.
    movd       [dstq+dsq*0], m0
    punpckhqdq           m0, m0
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
; Horizontal filter, w == 4, two rows per iteration.
; Expects m4 = 16 - mx, m5 = mx and m3 = rounding term in every word lane.
; m0 holds the pixels at x and m1 the pixels at x+1 (loaded 2 bytes further
; along); row 0 is in the low qword and row 1 in the high qword.
.h_w4:
    movq                 m0, [srcq+ssq*0]
    movhps               m0, [srcq+ssq*1]
    movq                 m1, [srcq+ssq*0+2]
    movhps               m1, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
; Horizontal filter, w == 8, two rows per iteration, one vector per row.
; Expects m4 = 16 - mx, m5 = mx and m3 = rounding term in every word lane.
; Per row: (src[x]*(16-mx) + src[x+1]*mx + rnd) >> 4, where the x+1 operand
; comes from a second load 2 bytes further along.
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
; Horizontal filter, w >= 16, one row per outer iteration.
; Expects m4 = 16 - mx, m5 = mx and m3 = rounding term in every word lane.
; On entry wq = w - 8 (biased by the dispatch in .h). Both pointers are
; advanced by wq*2 bytes and wq is negated, so the inner loop counts r6 up
; from -(w - 8) in steps of 16 pixels and stops once r6 is no longer negative.
.h_w16:
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq          ; restart the column counter per row
.h_w16_loop:
    ; 16 pixels per pass: two vectors, each combined with the same data
    ; loaded 2 bytes (one pixel) further along.
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova   [dstq+r6*2+16*0], m0
    mova   [dstq+r6*2+16*1], m1
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w16_loop0
    RET
; Vertical-only setup and width dispatch.
; The fraction in mxyd is scaled by 2^11 and placed in m5 so that pmulhrsw,
; which computes (a*b + 0x4000) >> 15, yields (diff*frac + 8) >> 4 in the
; row loops that follow.
; NOTE(review): mxyd presumably holds my here -- the load is outside this view.
419*c0909341SAndroid Build Coastguard Worker.v:
420*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
421*c0909341SAndroid Build Coastguard Worker    movd                 m5, mxyd
422*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; assumes pw_256 is words of 0x0100, i.e. broadcasts the low word -- confirm
423*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
424*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
425*c0909341SAndroid Build Coastguard Worker    jg .v_w8 ; w > 4
426*c0909341SAndroid Build Coastguard Worker    je .v_w4 ; w == 4; anything smaller falls through to .v_w2
; Vertical filter for rows of 4 bytes (two 16-bit pixels), producing two
; output rows per iteration. Each output is prev + pmulhrsw(next - prev, m5).
; m0 carries the most recently loaded source row into the next iteration.
427*c0909341SAndroid Build Coastguard Worker.v_w2:
428*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 0
429*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
430*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*1] ; row 1
431*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
432*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m0, m1 ; low qword = row 0, high qword = row 1
433*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 2 (srcq already advanced)
434*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m0 ; low qword = row 1, high qword = row 2
435*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2 ; next row - previous row
436*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
437*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
438*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m1
439*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1 ; move the second output row into the low dword
440*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
441*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
442*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
443*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
444*c0909341SAndroid Build Coastguard Worker    RET
; Vertical filter for rows of 8 bytes (four 16-bit pixels), producing two
; output rows per iteration. Same arithmetic as the 2-pixel variant: each
; output is prev + pmulhrsw(next - prev, m5), with two rows packed per vector.
; m0 carries the most recently loaded source row into the next iteration.
445*c0909341SAndroid Build Coastguard Worker.v_w4:
446*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; row 0
447*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
448*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1] ; row 1
449*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
450*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m0, m1 ; low qword = row 0, high qword = row 1
451*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; row 2 (srcq already advanced)
452*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m0 ; low qword = row 1, high qword = row 2
453*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
454*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
455*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
456*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m1 ; low qword -> first output row
457*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m1 ; high qword -> second output row
458*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
459*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
460*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
461*c0909341SAndroid Build Coastguard Worker    RET
; Vertical filter for w > 4, processed as 16-byte (8-pixel) wide columns.
; The inner loop walks one column top to bottom, two output rows at a time;
; the outer loop (.v_w8_loop0) then moves 16 bytes to the right.
;
; x86-64: r7 = source base of the current column, r4 = destination base.
;   r6d = w*32 + h - 256, so its low byte holds h and the bits above it count
;   the columns still to do; "sub r6d, 1<<8" both decrements that count and
;   sets the flags consumed by the final jg.
;   NOTE(review): this packing relies on w being a multiple of 8 and
;   h < 256 -- presumably guaranteed by callers, confirm.
;   r7 is pushed/popped by hand on WIN64 only.
; x86-32: r6 = source base, the destination base lives in dstmp, h is
;   reloaded from hm and wd is decremented by 8 per column.
462*c0909341SAndroid Build Coastguard Worker.v_w8:
463*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
464*c0909341SAndroid Build Coastguard Worker%if WIN64
465*c0909341SAndroid Build Coastguard Worker    push                 r7
466*c0909341SAndroid Build Coastguard Worker%endif
467*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
468*c0909341SAndroid Build Coastguard Worker    mov                  r7, srcq
469*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq+hq-256] ; packed counter: columns left above bit 8, h in the low byte
470*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq
471*c0909341SAndroid Build Coastguard Worker%else
472*c0909341SAndroid Build Coastguard Worker    mov                  r6, srcq
473*c0909341SAndroid Build Coastguard Worker%endif
474*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
475*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row 0 of this column
476*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
477*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1] ; row 1
478*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
479*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m0
480*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
481*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; first output row = row0 + scaled (row1 - row0)
482*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row 2, kept in m0 for the next iteration
483*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m3
484*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
485*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3 ; second output row = row1 + scaled (row2 - row1)
486*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
487*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
488*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
489*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
490*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
491*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
492*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
493*c0909341SAndroid Build Coastguard Worker    add                  r4, 16
494*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; restore h from the low byte of the packed counter
495*c0909341SAndroid Build Coastguard Worker    mov                srcq, r7
496*c0909341SAndroid Build Coastguard Worker    mov                dstq, r4
497*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one column done; flags feed the jg below
498*c0909341SAndroid Build Coastguard Worker%else
499*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
500*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
501*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
502*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
503*c0909341SAndroid Build Coastguard Worker    mov                srcq, r6
504*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq
505*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8 ; flags feed the jg below (the movs in between leave them intact)
506*c0909341SAndroid Build Coastguard Worker%endif
507*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
508*c0909341SAndroid Build Coastguard Worker%if WIN64
509*c0909341SAndroid Build Coastguard Worker    pop                 r7
510*c0909341SAndroid Build Coastguard Worker%endif
511*c0909341SAndroid Build Coastguard Worker    RET
; Combined horizontal+vertical setup and width dispatch.
;   m3 = [base+pw_2], added to the horizontal sums before their >>2
;   m6 = mxyd << 11 replicated to every word (vertical multiplier)
;   m7 = multiplier for the final pmulhrsw: pw_8192, or pw_2048 when bit 11
;        of r8m is clear, in which case m4/m5 are also multiplied by 4.
; NOTE(review): r8m is presumably bitdepth_max, so bit 11 separates 12-bit
; from 10-bit content as the .hv_12bpc label suggests; m4/m5 are presumably
; the horizontal weights set up before this section -- confirm, both are
; outside this view.
; NOTE(review): WIN64_SPILL_XMM is a macro defined elsewhere; presumably it
; preserves the callee-saved xmm registers now in use on Win64 -- confirm.
512*c0909341SAndroid Build Coastguard Worker.hv:
513*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
514*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
515*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+pw_2]
516*c0909341SAndroid Build Coastguard Worker    movd                 m6, mxyd
517*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192]
518*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+pw_256] ; assumes pw_256 is words of 0x0100, i.e. broadcasts the low word -- confirm
519*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
520*c0909341SAndroid Build Coastguard Worker    jnz .hv_12bpc
521*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
522*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
523*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_2048]
524*c0909341SAndroid Build Coastguard Worker.hv_12bpc:
525*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
526*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
527*c0909341SAndroid Build Coastguard Worker    jg .hv_w8 ; w > 4
528*c0909341SAndroid Build Coastguard Worker    je .hv_w4 ; w == 4; anything smaller falls through to .hv_w2
; Horizontal+vertical filter, 2 pixels wide, two output rows per iteration.
; Each source row is filtered horizontally as (a*m4 + b*m5 + m3) >> 2, where
; b is the row shifted by one 16-bit element; the vertical step then computes
; prev + pmulhw(2*(next - prev), m6) and the result is scaled with
; pmulhrsw by m7.
; Layout: the loop keeps the previous filtered row in the HIGH qword of m0
; (shufpd with 0x01 takes m0's high qword and m1's low qword), which is why
; the pre-loop code only shifts the high half (pshufhw) of the duplicated
; row 0. Only the first two words of each qword are meaningful; the "_"
; in the existing lane comments marks the unused part.
529*c0909341SAndroid Build Coastguard Worker.hv_w2:
530*c0909341SAndroid Build Coastguard Worker    movddup              m0, [srcq+ssq*0] ; row 0 in both qwords
531*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m0, q0321 ; NOTE(review): q0321 presumably rotates the high four words down by one -- confirm
532*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
533*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
534*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
535*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
536*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
537*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
538*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1]
539*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
540*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+ssq*0]
541*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, m2
542*c0909341SAndroid Build Coastguard Worker    psrlq                m2, 16
543*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
544*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
545*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
546*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2            ; 1 _ 2 _
547*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m0, m1, 0x01 ; 0 _ 1 _
548*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
549*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
550*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
551*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
552*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
553*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
554*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m1
555*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
556*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
557*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
558*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
559*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
560*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal+vertical filter, 4 pixels wide, two output rows per iteration.
; Horizontal step per row: (row*m4 + row_shifted_by_2_bytes*m5 + m3) >> 2.
; Vertical step: prev + pmulhw(2*(next - prev), m6), then pmulhrsw by m7.
; Two rows are packed per vector (one per qword). m0 holds the previous
; filtered rows; shufpd with 0x01 pairs m0's high qword with m1's low qword
; so that m2 lines up "previous row" against m1's "next row" in both halves.
561*c0909341SAndroid Build Coastguard Worker.hv_w4:
562*c0909341SAndroid Build Coastguard Worker    movddup              m0, [srcq+ssq*0] ; row 0 in both qwords
563*c0909341SAndroid Build Coastguard Worker    movddup              m1, [srcq+ssq*0+2] ; row 0, one pixel to the right
564*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
565*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
566*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
567*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
568*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
569*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
570*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1]
571*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1+2]
572*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
573*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ssq*0]
574*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+ssq*0+2]
575*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
576*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
577*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
578*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
579*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2            ; 1 2
580*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m0, m1, 0x01 ; 0 1
581*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
582*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
583*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
584*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
585*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
586*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
587*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m1
588*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m1
589*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
590*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
591*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
592*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal+vertical filter for w > 4, processed as 16-byte (8-pixel) wide
; columns. Each row is filtered horizontally as
; (row*m4 + row_shifted_by_2_bytes*m5 + m3) >> 2; consecutive filtered rows
; are then blended as prev + pmulhw(2*(next - prev), m6) and scaled with
; pmulhrsw by m7. m0 and m1 alternate as "previous row" within an iteration.
;
; x86-64: r4 = source base of the current column, r7 = destination base
;   (note: the opposite assignment from .v_w8). r6d = w*32 + h - 256 packs
;   h in the low byte with the remaining-column count above it.
;   NOTE(review): relies on w being a multiple of 8 and h < 256 --
;   presumably guaranteed by callers, confirm.
;   r7 is pushed/popped by hand on WIN64 only.
; x86-32: r6 = source base, the destination base lives in dstmp, h is
;   reloaded from hm and wd is decremented by 8 per column.
593*c0909341SAndroid Build Coastguard Worker.hv_w8:
594*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
595*c0909341SAndroid Build Coastguard Worker%if WIN64
596*c0909341SAndroid Build Coastguard Worker    push                 r7
597*c0909341SAndroid Build Coastguard Worker%endif
598*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
599*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq+hq-256] ; packed counter: columns left above bit 8, h in the low byte
600*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
601*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
602*c0909341SAndroid Build Coastguard Worker%else
603*c0909341SAndroid Build Coastguard Worker    mov                  r6, srcq
604*c0909341SAndroid Build Coastguard Worker%endif
605*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
606*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
607*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+2]
608*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
609*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
610*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
611*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
612*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2 ; m0 = horizontally filtered row 0 of this column
613*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
614*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
615*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+2]
616*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
617*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
618*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
619*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
620*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
621*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2 ; m1 = horizontally filtered row 1
622*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, m0
623*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
624*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m6
625*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
626*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7
627*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m2
628*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
629*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+2]
630*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
631*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
632*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
633*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
634*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2 ; m0 = horizontally filtered row 2, reused by the next iteration
635*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
636*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2
637*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m6
638*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
639*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7
640*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
641*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
642*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
643*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
644*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
645*c0909341SAndroid Build Coastguard Worker    add                  r4, 16
646*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
647*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; restore h from the low byte of the packed counter
648*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
649*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
650*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one column done; flags feed the jg below
651*c0909341SAndroid Build Coastguard Worker%else
652*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
653*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
654*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
655*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
656*c0909341SAndroid Build Coastguard Worker    mov                srcq, r6
657*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq
658*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8 ; flags feed the jg below (the movs in between leave them intact)
659*c0909341SAndroid Build Coastguard Worker%endif
660*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
661*c0909341SAndroid Build Coastguard Worker%if WIN64
662*c0909341SAndroid Build Coastguard Worker    pop                  r7
663*c0909341SAndroid Build Coastguard Worker%endif
664*c0909341SAndroid Build Coastguard Worker    RET
665*c0909341SAndroid Build Coastguard Worker
; prep_bilin_16bpc entry point and unfiltered (.prep) dispatch.
; Named arguments declared through cglobal: tmp, src, stride, w, h, mxy,
; plus stride3 as an extra named register.
; NOTE(review): per x86inc conventions the "4, 7, 0" are presumably the
; number of arguments loaded, GPRs used and XMM registers used -- confirm.
; Control flow:
;   mx != 0            -> .h
;   mx == 0, my != 0   -> .v
;   mx == 0, my == 0   -> .prep: jump through prep_ssse3_table, indexed by
;                         the trailing-zero count of w, to a per-width loop.
; Register state handed to the .prep_w* loops:
;   m4 = 8-byte entry of prep_mul selected by (bitdepth_max >> 11), duplicated
;   m5 = [base+pw_8192]
;   stride3q = stride*3
666*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
667*c0909341SAndroid Build Coastguard Worker%define base r6-prep_ssse3
668*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r5m ; mx
669*c0909341SAndroid Build Coastguard Worker    LEA                  r6, prep_ssse3
670*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
671*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
672*c0909341SAndroid Build Coastguard Worker    jnz .h
673*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
674*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
675*c0909341SAndroid Build Coastguard Worker    jnz .v
676*c0909341SAndroid Build Coastguard Worker.prep:
677*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
678*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+prep_ssse3_table+wq*2]
679*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r7m ; bitdepth_max
680*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_8192]
681*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
682*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11
683*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+prep_mul+r5*8]
684*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
685*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; Unfiltered prep for 8-byte (4-pixel) rows: four source rows per iteration
; are packed into two vectors, transformed as x*m4 - m5, and written to 32
; consecutive bytes of tmp.
686*c0909341SAndroid Build Coastguard Worker.prep_w4:
687*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
688*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+strideq*1]
689*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*2]
690*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+stride3q ]
691*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
692*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
693*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
694*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
695*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
696*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
697*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
698*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
699*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
700*c0909341SAndroid Build Coastguard Worker    jg .prep_w4
701*c0909341SAndroid Build Coastguard Worker    RET
; Unfiltered prep for 16-byte (8-pixel) rows: four source rows per iteration,
; each transformed as x*m4 - m5 and written to 64 consecutive bytes of tmp.
; NOTE(review): REPX is a macro defined elsewhere; it presumably repeats the
; braced instruction once per listed register, substituting it for x -- confirm.
702*c0909341SAndroid Build Coastguard Worker.prep_w8:
703*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
704*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
705*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*2]
706*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+stride3q ]
707*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
708*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
709*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
710*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
711*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
712*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
713*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
714*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
715*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
716*c0909341SAndroid Build Coastguard Worker    jg .prep_w8
717*c0909341SAndroid Build Coastguard Worker    RET
; Unfiltered prep for 32-byte (16-pixel) rows: two source rows per iteration
; (two vectors each), transformed as x*m4 - m5 and written to 64 consecutive
; bytes of tmp.
; NOTE(review): REPX is a macro defined elsewhere; it presumably repeats the
; braced instruction once per listed register, substituting it for x -- confirm.
718*c0909341SAndroid Build Coastguard Worker.prep_w16:
719*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0+16*0]
720*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+16*1]
721*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+16*0]
722*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+strideq*1+16*1]
723*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
724*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
725*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
726*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
727*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
728*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
729*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
730*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
731*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
732*c0909341SAndroid Build Coastguard Worker    jg .prep_w16
733*c0909341SAndroid Build Coastguard Worker    RET
; Unfiltered prep for 64-byte (32-pixel) rows: one source row per iteration
; (four vectors), transformed as x*m4 - m5 and written to 64 consecutive
; bytes of tmp.
; NOTE(review): REPX is a macro defined elsewhere; it presumably repeats the
; braced instruction once per listed register, substituting it for x -- confirm.
734*c0909341SAndroid Build Coastguard Worker.prep_w32:
735*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*0]
736*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*1]
737*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*2]
738*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*3]
739*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
740*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
741*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
742*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
743*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
744*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
745*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
746*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
747*c0909341SAndroid Build Coastguard Worker    dec                  hd
748*c0909341SAndroid Build Coastguard Worker    jg .prep_w32
749*c0909341SAndroid Build Coastguard Worker    RET
; Unfiltered prep for 128-byte (64-pixel) rows: one source row per iteration,
; handled as two groups of four vectors. Each vector is transformed as
; x*m4 - m5 and written to tmp; srcq is advanced to the next row once the
; second group has been loaded.
; NOTE(review): REPX is a macro defined elsewhere; it presumably repeats the
; braced instruction once per listed register, substituting it for x -- confirm.
750*c0909341SAndroid Build Coastguard Worker.prep_w64:
751*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*0]
752*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*1]
753*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*2]
754*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*3]
755*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
756*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
757*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
758*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
759*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
760*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
761*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*4]
762*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*5]
763*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*6]
764*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*7]
765*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
766*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
767*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
768*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*4], m0
769*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*5], m1
770*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*6], m2
771*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*7], m3
772*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*8
773*c0909341SAndroid Build Coastguard Worker    dec                  hd
774*c0909341SAndroid Build Coastguard Worker    jg .prep_w64
775*c0909341SAndroid Build Coastguard Worker    RET
; Unfiltered prep for 256-byte (128-pixel) rows: one source row per
; iteration, handled as four groups of four vectors, each transformed as
; x*m4 - m5 and written to tmp.
; tmpq is advanced by the full 16*16 bytes while the third group is being
; processed, so the third and fourth groups are stored with negative offsets
; (-16*8 .. -16*1) relative to the already-advanced pointer.
; NOTE(review): REPX is a macro defined elsewhere; it presumably repeats the
; braced instruction once per listed register, substituting it for x -- confirm.
776*c0909341SAndroid Build Coastguard Worker.prep_w128:
777*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16* 0]
778*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16* 1]
779*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16* 2]
780*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16* 3]
781*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
782*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
783*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
784*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
785*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
786*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
787*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16* 4]
788*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16* 5]
789*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16* 6]
790*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16* 7]
791*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
792*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
793*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*4], m0
794*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*5], m1
795*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*6], m2
796*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*7], m3
797*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16* 8]
798*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16* 9]
799*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*10]
800*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*11]
801*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*16 ; advance early; remaining stores use negative offsets
802*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
803*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
804*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*8], m0
805*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*7], m1
806*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*6], m2
807*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*5], m3
808*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*12]
809*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*13]
810*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*14]
811*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*15]
812*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
813*c0909341SAndroid Build Coastguard Worker    REPX     {pmullw x, m4}, m0, m1, m2, m3
814*c0909341SAndroid Build Coastguard Worker    REPX     {psubw  x, m5}, m0, m1, m2, m3
815*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*4], m0
816*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*3], m1
817*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*2], m2
818*c0909341SAndroid Build Coastguard Worker    mova        [tmpq-16*1], m3
819*c0909341SAndroid Build Coastguard Worker    dec                  hd
820*c0909341SAndroid Build Coastguard Worker    jg .prep_w128
821*c0909341SAndroid Build Coastguard Worker    RET
; Horizontal filter setup for prep_bilin_16bpc (entered with mxyd = mx != 0).
;   m4 = mx replicated to every word
;   m3 = [base+pw_16] - m4 (i.e. 16 - mx, assuming pw_16 is words of 16 -- confirm)
;   m5 = [base+pw_32766], subtracted from the weighted sums in the row loops
; When bit 11 of r7m (bitdepth_max, per the comment at the .prep load of r7m)
; is clear, both weights are multiplied by 4.
; mxyd is reloaded with my: non-zero goes to .hv, otherwise dispatch on w:
;   w == 8 -> .h_w8, w > 8 -> .h_w16, smaller falls through to .h_w4.
822*c0909341SAndroid Build Coastguard Worker.h:
823*c0909341SAndroid Build Coastguard Worker    movd                 m4, mxyd
824*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
825*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+pw_16]
826*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256] ; assumes pw_256 is words of 0x0100, i.e. broadcasts the low word -- confirm
827*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_32766]
828*c0909341SAndroid Build Coastguard Worker    psubw                m3, m4
829*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
830*c0909341SAndroid Build Coastguard Worker    jnz .h_12bpc
831*c0909341SAndroid Build Coastguard Worker    psllw                m3, 2
832*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
833*c0909341SAndroid Build Coastguard Worker.h_12bpc:
834*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
835*c0909341SAndroid Build Coastguard Worker    jnz .hv
836*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8 ; wd now holds w - 8; the flags drive the two branches below
837*c0909341SAndroid Build Coastguard Worker    je .h_w8
838*c0909341SAndroid Build Coastguard Worker    jg .h_w16
839*c0909341SAndroid Build Coastguard Worker.h_w4:
840*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
841*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+strideq*1]
842*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*0+2]
843*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+strideq*1+2]
844*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
845*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
846*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
847*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
848*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
849*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
850*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
851*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
852*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
853*c0909341SAndroid Build Coastguard Worker    jg .h_w4
854*c0909341SAndroid Build Coastguard Worker    RET
855*c0909341SAndroid Build Coastguard Worker.h_w8:
856*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
857*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+2]
858*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
859*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
860*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
861*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
862*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
863*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+2]
864*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
865*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3
866*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
867*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
868*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
869*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
870*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
871*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
872*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
873*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
874*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
875*c0909341SAndroid Build Coastguard Worker    jg .h_w8
876*c0909341SAndroid Build Coastguard Worker    RET
877*c0909341SAndroid Build Coastguard Worker.h_w16:
878*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2]
879*c0909341SAndroid Build Coastguard Worker    neg                  wq
880*c0909341SAndroid Build Coastguard Worker.h_w16_loop0:
881*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq
882*c0909341SAndroid Build Coastguard Worker.h_w16_loop:
883*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*2+ 0]
884*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+ 2]
885*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
886*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
887*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
888*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
889*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+16]
890*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+18]
891*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3
892*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
893*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
894*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
895*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
896*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
897*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
898*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
899*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
900*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
901*c0909341SAndroid Build Coastguard Worker    jl .h_w16_loop
902*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
903*c0909341SAndroid Build Coastguard Worker    dec                  hd
904*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop0
905*c0909341SAndroid Build Coastguard Worker    RET
906*c0909341SAndroid Build Coastguard Worker.v:
907*c0909341SAndroid Build Coastguard Worker    movd                 m4, mxyd
908*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+pw_16]
909*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256]
910*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_32766]
911*c0909341SAndroid Build Coastguard Worker    psubw                m3, m4
912*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
913*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
914*c0909341SAndroid Build Coastguard Worker    psllw                m3, 2
915*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2
916*c0909341SAndroid Build Coastguard Worker.v_12bpc:
917*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
918*c0909341SAndroid Build Coastguard Worker    je .v_w8
919*c0909341SAndroid Build Coastguard Worker    jg .v_w16
920*c0909341SAndroid Build Coastguard Worker.v_w4:
921*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
922*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
923*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+strideq*1]
924*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
925*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m0, m2 ; 0 1
926*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
927*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m0     ; 1 2
928*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3
929*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
930*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
931*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
932*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
933*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
934*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
935*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
936*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
937*c0909341SAndroid Build Coastguard Worker    RET
938*c0909341SAndroid Build Coastguard Worker.v_w8:
939*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
940*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
941*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1]
942*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
943*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
944*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, m2
945*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
946*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
947*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
948*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
949*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m3
950*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m1
951*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, m0
952*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
953*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
954*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
955*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
956*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
957*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
958*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
959*c0909341SAndroid Build Coastguard Worker    RET
960*c0909341SAndroid Build Coastguard Worker.v_w16:
961*c0909341SAndroid Build Coastguard Worker%if WIN64
962*c0909341SAndroid Build Coastguard Worker    push                 r7
963*c0909341SAndroid Build Coastguard Worker%endif
964*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
965*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
966*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*4-32]
967*c0909341SAndroid Build Coastguard Worker    mov                  wd, wd
968*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
969*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
970*c0909341SAndroid Build Coastguard Worker%else
971*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
972*c0909341SAndroid Build Coastguard Worker%endif
973*c0909341SAndroid Build Coastguard Worker.v_w16_loop0:
974*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
975*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
976*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1]
977*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
978*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
979*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, m2
980*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
981*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
982*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
983*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
984*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m3
985*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+wq*0], m1
986*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, m0
987*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
988*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
989*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
990*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+wq*2], m1
991*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
992*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
993*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
994*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
995*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
996*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
997*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
998*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
999*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
1000*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1001*c0909341SAndroid Build Coastguard Worker%else
1002*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
1003*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
1004*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
1005*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
1006*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
1007*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
1008*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 8
1009*c0909341SAndroid Build Coastguard Worker%endif
1010*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop0
1011*c0909341SAndroid Build Coastguard Worker%if WIN64
1012*c0909341SAndroid Build Coastguard Worker    pop                  r7
1013*c0909341SAndroid Build Coastguard Worker%endif
1014*c0909341SAndroid Build Coastguard Worker    RET
1015*c0909341SAndroid Build Coastguard Worker.hv:
1016*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
1017*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11
1018*c0909341SAndroid Build Coastguard Worker    movd                 m6, mxyd
1019*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+pw_256]
1020*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
1021*c0909341SAndroid Build Coastguard Worker    je .hv_w8
1022*c0909341SAndroid Build Coastguard Worker    jg .hv_w16
1023*c0909341SAndroid Build Coastguard Worker.hv_w4:
1024*c0909341SAndroid Build Coastguard Worker    movddup              m0, [srcq+strideq*0]
1025*c0909341SAndroid Build Coastguard Worker    movddup              m1, [srcq+strideq*0+2]
1026*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
1027*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1028*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
1029*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1030*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1031*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1032*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*1]
1033*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+strideq*1+2]
1034*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1035*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+strideq*0]
1036*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+strideq*0+2]
1037*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3
1038*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1039*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
1040*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1041*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2            ; 1 2
1042*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m0, m1, 0x01 ; 0 1
1043*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
1044*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
1045*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1046*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1047*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
1048*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
1049*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1050*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1051*c0909341SAndroid Build Coastguard Worker    RET
1052*c0909341SAndroid Build Coastguard Worker.hv_w8:
1053*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1054*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+2]
1055*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
1056*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1057*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
1058*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1059*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1060*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1061*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
1062*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+2]
1063*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1064*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3
1065*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1066*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
1067*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1068*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1069*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, m0
1070*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1071*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1072*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m2
1073*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1074*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*0+2]
1075*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
1076*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1077*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
1078*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1079*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1080*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
1081*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1082*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1083*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
1084*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
1085*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1086*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1087*c0909341SAndroid Build Coastguard Worker    RET
1088*c0909341SAndroid Build Coastguard Worker.hv_w16:
1089*c0909341SAndroid Build Coastguard Worker%if WIN64
1090*c0909341SAndroid Build Coastguard Worker    push                 r7
1091*c0909341SAndroid Build Coastguard Worker%endif
1092*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
1093*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1094*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*4-32]
1095*c0909341SAndroid Build Coastguard Worker    mov                  wd, wd
1096*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
1097*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
1098*c0909341SAndroid Build Coastguard Worker%else
1099*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
1100*c0909341SAndroid Build Coastguard Worker%endif
1101*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
1102*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1103*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+2]
1104*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
1105*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4
1106*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
1107*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1108*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1109*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
1110*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
1111*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+2]
1112*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1113*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3
1114*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1115*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
1116*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1117*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
1118*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, m0
1119*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1120*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1121*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+wq*0], m2
1122*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1123*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*0+2]
1124*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3
1125*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4
1126*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
1127*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1128*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1129*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m1
1130*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1131*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1132*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+wq*2], m2
1133*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
1134*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1135*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
1136*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1137*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
1138*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
1139*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1140*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
1141*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
1142*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1143*c0909341SAndroid Build Coastguard Worker%else
1144*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
1145*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
1146*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
1147*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
1148*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
1149*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
1150*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 8
1151*c0909341SAndroid Build Coastguard Worker%endif
1152*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
1153*c0909341SAndroid Build Coastguard Worker%if WIN64
1154*c0909341SAndroid Build Coastguard Worker    pop                  r7
1155*c0909341SAndroid Build Coastguard Worker%endif
1156*c0909341SAndroid Build Coastguard Worker    RET
1157*c0909341SAndroid Build Coastguard Worker
1158*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8]
; Row offsets into subpel_filters for each filter type, packed as two fields
; of one 32-bit constant so that a single add can bias two indices at once:
;   bits 16 and up : (0|1|2)*15 -> first row of filter set 0, 1 or 2
;   low bits       : (3|4)*15   -> first row of filter set 3 or 4
; 15 is the row count per set (the middle dimension of the array above).
; put_6tap_16bpc adds one of these to mx*0x010101 (resp. my*0x010101) and
; labels the resulting byte lanes "6tap, mx/my, 4tap", so the upper field
; presumably selects the full-length filter and the lower field its 4-tap
; counterpart - confirm against the subpel_filters table definition.
1159*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15
1160*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1161*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP   (2*15 << 16) | 3*15
1162*c0909341SAndroid Build Coastguard Worker
; FN prefix, type, type_h, type_v [, jmp_to]
; Declares the entry point <prefix>_<type>_16bpc (via cglobal), loads the
; packed FILTER_<type_h> constant into t0d and FILTER_<type_v> into t1d, and
; then either jumps to the shared implementation <jmp_to> or, when the fifth
; argument is omitted, emits no jump at all so that execution falls through
; into whatever is assembled immediately after the macro invocation.
1163*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
1164*c0909341SAndroid Build Coastguard Workercglobal %1_%2_16bpc
1165*c0909341SAndroid Build Coastguard Worker    mov                 t0d, FILTER_%3 ; horizontal filter-type offsets
1166*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4
1167*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d ; same type in both directions: reuse t0d (presumably for the shorter encoding)
1168*c0909341SAndroid Build Coastguard Worker%else
1169*c0909341SAndroid Build Coastguard Worker    mov                 t1d, FILTER_%4 ; vertical filter-type offsets
1170*c0909341SAndroid Build Coastguard Worker%endif
1171*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; skip the jump in the last filter
1172*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1173*c0909341SAndroid Build Coastguard Worker%endif
1174*c0909341SAndroid Build Coastguard Worker%endmacro
1175*c0909341SAndroid Build Coastguard Worker
; Select which registers back the temporaries t0, t1 and t2 on each target.
; DECLARE_REG_TMP is defined outside this excerpt; presumably its arguments
; are register numbers given in the order t0, t1, t2 - confirm against x86inc.
; The FN entry points write t0d/t1d, and put_6tap_16bpc uses t2 as the base
; pointer for its table/constant addressing.
; NOTE(review): on the final (64-bit, non-Windows) line t1 and t2 are given
; the same number. put_6tap_16bpc adds t1d into myd before it loads t2, so the
; sharing does not conflict there; keep that ordering in mind when editing.
1176*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1177*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 1, 2, 6
1178*c0909341SAndroid Build Coastguard Worker%elif WIN64
1179*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5, 8
1180*c0909341SAndroid Build Coastguard Worker%else
1181*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8, 8
1182*c0909341SAndroid Build Coastguard Worker%endif
1183*c0909341SAndroid Build Coastguard Worker
; Entry points put_8tap_<type>_16bpc for the filter-type combinations that
; are routed to put_6tap_16bpc. PUT_8TAP_FN is FN with the prefix argument
; pre-filled; the remaining arguments are: name suffix, horizontal filter
; type, vertical filter type, and optionally the function to jump to.
; The first three entries jump to put_6tap_16bpc explicitly. "regular" is
; listed last with no jump target, so it falls straight through into the
; put_6tap_16bpc body that follows; that function must therefore remain the
; next code assembled after this list.
1184*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap,
1185*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
1186*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
1187*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
1188*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular,        REGULAR, REGULAR
1189*c0909341SAndroid Build Coastguard Worker
1190*c0909341SAndroid Build Coastguard Workercglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
1191*c0909341SAndroid Build Coastguard Worker    %define            base  t2-put_ssse3
1192*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1193*c0909341SAndroid Build Coastguard Worker    %define             mxb  r0b
1194*c0909341SAndroid Build Coastguard Worker    %define             mxd  r0
1195*c0909341SAndroid Build Coastguard Worker    %define             mxq  r0
1196*c0909341SAndroid Build Coastguard Worker    %define             myb  r1b
1197*c0909341SAndroid Build Coastguard Worker    %define             myd  r1
1198*c0909341SAndroid Build Coastguard Worker    %define             myq  r1
1199*c0909341SAndroid Build Coastguard Worker%endif
1200*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1201*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1202*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1203*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
1204*c0909341SAndroid Build Coastguard Worker    LEA                  t2, put_ssse3
1205*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
1206*c0909341SAndroid Build Coastguard Worker    movifnidn          srcq, srcmp
1207*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
1208*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1209*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1210*c0909341SAndroid Build Coastguard Worker    jnz .h
1211*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1212*c0909341SAndroid Build Coastguard Worker    jnz .v
1213*c0909341SAndroid Build Coastguard Worker.put:
1214*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1215*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+put_ssse3_table+wq*2]
1216*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
1217*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
1218*c0909341SAndroid Build Coastguard Worker    add                  wq, t2
1219*c0909341SAndroid Build Coastguard Worker%if WIN64
1220*c0909341SAndroid Build Coastguard Worker    pop                  r8
1221*c0909341SAndroid Build Coastguard Worker    pop                  r7
1222*c0909341SAndroid Build Coastguard Worker%endif
1223*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1224*c0909341SAndroid Build Coastguard Worker.h_w2:
1225*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+spel_h_shuf2]
1226*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q2121
1227*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
1228*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
1229*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
1230*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1231*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
1232*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2
1233*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
1234*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3
1235*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
1236*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
1237*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
1238*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m0
1239*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
1240*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5
1241*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m1
1242*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0
1243*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q3232
1244*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m0
1245*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1246*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1247*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop
1248*c0909341SAndroid Build Coastguard Worker    RET
1249*c0909341SAndroid Build Coastguard Worker.h_w4: ; horizontal path for w <= 4 (w < 4 is diverted to .h_w2 below), expects m4 = rounding bias and m5 = upper clamp already loaded
1250*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; filter index = low byte of mx
1251*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq-2] ; back up 2 bytes (one 16-bit sample), lea leaves the flags untouched
1252*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+mxq*8] ; 8 coefficient bytes for this index
1253*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
1254*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3 ; duplicate every byte into both halves of a word
1255*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8 ; sign-extend: arithmetic shift leaves each coefficient as a signed word
1256*c0909341SAndroid Build Coastguard Worker    jl .h_w2 ; assuming entry via the jle in .h, flags are still those of its sub wd, 4 (nothing above writes flags): w < 4 takes the 2-wide path
1257*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       9
1258*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+spel_h_shufA] ; shuffle mask used below to form sample pairs 0 1 1 2 2 3 3 4
1259*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1260*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+spel_h_shufB]
1261*c0909341SAndroid Build Coastguard Worker%else
1262*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+spel_h_shufB] ; shuffle mask used below to form sample pairs 2 3 3 4 4 5 5 6
1263*c0909341SAndroid Build Coastguard Worker%endif
1264*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1111 ; broadcast coefficient words 2,3 (first pair of the 4 taps used here)
1265*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q2222 ; broadcast coefficient words 4,5 (second pair)
1266*c0909341SAndroid Build Coastguard Worker.h_w4_loop: ; each iteration filters two rows of 4 samples
1267*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; first row
1268*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1] ; second row
1269*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance src by two rows
1270*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0, m7 ; 0 1 1 2 2 3 3 4
1271*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2 ; first row, first tap pair
1272*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8     ; 2 3 3 4 4 5 5 6
1273*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3 ; first row, second tap pair
1274*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6 ; first row: sum of both tap pairs
1275*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m1, m7 ; second row, same two-step pattern
1276*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2
1277*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
1278*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3
1279*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4 ; add the rounding bias to the first row
1280*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4 ; bias folded into one half of the second row sum
1281*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6 ; second row: sum of both tap pairs plus bias
1282*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6 ; scale the 32-bit sums back down
1283*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
1284*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1 ; first row in the low 8 bytes, second row in the high 8 bytes
1285*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1 ; zero, lower clamp bound
1286*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5 ; clamp to the upper bound in m5
1287*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m1 ; clamp negatives to zero
1288*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0 ; store 4 samples of the first row
1289*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0 ; store 4 samples of the second row
1290*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance dst by two rows
1291*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows done
1292*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
1293*c0909341SAndroid Build Coastguard Worker    RET
1294*c0909341SAndroid Build Coastguard Worker.h: ; horizontal filter path: sets up rounding (m4) and clamp (m5), then handles w > 4 with a 6-tap kernel, 8 samples per step
1295*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
1296*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; any of bits 8-11 of my set?
1297*c0909341SAndroid Build Coastguard Worker    jnz .hv ; yes: take the .hv path instead
1298*c0909341SAndroid Build Coastguard Worker    mov                 myd, r8m ; my is no longer needed here, reuse myd as scratch for the r8m argument
1299*c0909341SAndroid Build Coastguard Worker    movd                 m5, r8m ; same argument into a vector register (used as upper clamp below)
1300*c0909341SAndroid Build Coastguard Worker    shr                 myd, 11 ; bits 11 and up of r8m select the rounding table entry (presumably 10-bit vs 12-bit content, TODO confirm)
1301*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+put_8tap_h_rnd+myq*8] ; m4 = 8-byte table entry duplicated into both halves, added before the shift by 6
1302*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
1303*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; presumably broadcasts the low word of m5 to all lanes (pw_256 used as a shuffle mask), verify against the constant
1304*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4 ; wd = w - 4, the flags are also consumed again inside .h_w4
1305*c0909341SAndroid Build Coastguard Worker    jle .h_w4 ; w <= 4 is handled by the narrow path
1306*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
1307*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16 ; filter index = bits 16 and up of mx
1308*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+mxq*8] ; load coefficient bytes starting at byte 1 of the entry
1309*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
1310*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_h_shufA] ; pair-forming shuffle masks, see lane comments in the loop
1311*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+spel_h_shufB]
1312*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2] ; bias src by (w - 4) samples so a negative index can count up to zero
1313*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2 ; duplicate every coefficient byte into a word
1314*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+wq*2] ; same bias for dst
1315*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8 ; sign-extend the coefficients to words
1316*c0909341SAndroid Build Coastguard Worker    neg                  wq ; wq = -(w - 4), start value of the column index
1317*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1318*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*3
1319*c0909341SAndroid Build Coastguard Worker    %define              m8  [rsp+16*0]
1320*c0909341SAndroid Build Coastguard Worker    %define              m9  [rsp+16*1]
1321*c0909341SAndroid Build Coastguard Worker    %define             m10  [rsp+16*2]
1322*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q0000 ; first coefficient pair broadcast
1323*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1111 ; second coefficient pair broadcast
1324*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q2222 ; third coefficient pair broadcast
1325*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0 ; m8-m10 are stack slots on x86-32 (defined just above)
1326*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
1327*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
1328*c0909341SAndroid Build Coastguard Worker%else
1329*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q0000 ; first coefficient pair broadcast
1330*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q1111 ; second coefficient pair broadcast
1331*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m2, q2222 ; third coefficient pair broadcast
1332*c0909341SAndroid Build Coastguard Worker%endif
1333*c0909341SAndroid Build Coastguard Worker.h_w8_loop0: ; once per row
1334*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq ; r6 = negative column index, reset for every row
1335*c0909341SAndroid Build Coastguard Worker.h_w8_loop: ; once per group of 8 output samples (abcd = first four, efgh = last four)
1336*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6*2-4] ; 8 samples starting 2 samples left of the output position
1337*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+8] ; next 8 samples, starting 6 samples after the first load
1338*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, m6   ; 01 12 23 34
1339*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8       ; abcd0
1340*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7       ; 23 34 45 56
1341*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m9, m3   ; abcd1
1342*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
1343*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m6   ; 67 78 89 9a
1344*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m1, 0x01 ; 45 56 67 78
1345*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m9       ; efgh1
1346*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7       ; 89 9a ab bc
1347*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m10      ; efgh2
1348*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
1349*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m10, m3  ; abcd2
1350*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8       ; efgh0
1351*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4 ; add the rounding bias to both halves
1352*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
1353*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2 ; abcd: all three tap pairs summed
1354*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3 ; efgh: all three tap pairs summed
1355*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6 ; scale the 32-bit sums back down
1356*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
1357*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1 ; 8 results as signed words
1358*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1 ; zero, lower clamp bound
1359*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5 ; clamp to the upper bound in m5
1360*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m1 ; clamp negatives to zero
1361*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r6*2], m0 ; store 8 samples (16 bytes)
1362*c0909341SAndroid Build Coastguard Worker    add                  r6, 8 ; next group of 8 samples
1363*c0909341SAndroid Build Coastguard Worker    jl .h_w8_loop ; keep going while the index is still negative
1364*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; next source row
1365*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; next destination row
1366*c0909341SAndroid Build Coastguard Worker    dec                  hd
1367*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop0
1368*c0909341SAndroid Build Coastguard Worker    RET
1369*c0909341SAndroid Build Coastguard Worker.v: ; vertical filter path: picks the coefficients, dispatches on width, and falls through into the 2-wide case
1370*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; mxd = low byte of my (alternate filter index)
1371*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; myd = bits 16 and up of my
1372*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1373*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd ; h < 6: use the low-byte index instead
1374*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+myq*8] ; load coefficient bytes starting at byte 1 of the entry
1375*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11, 16
1376*c0909341SAndroid Build Coastguard Worker    movd                 m5, r8m ; r8m argument, used as upper clamp below
1377*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
1378*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
1379*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2 ; duplicate every coefficient byte into a word
1380*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; presumably broadcasts the low word of m5 to all lanes, verify against the constant
1381*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8 ; sign-extend
1382*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1383*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*4
1384*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q0000 ; first coefficient pair broadcast
1385*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1386*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1111 ; second coefficient pair broadcast
1387*c0909341SAndroid Build Coastguard Worker    neg                  r6 ; r6 = -stride, used to reach the rows above src
1388*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q2222 ; third coefficient pair broadcast
1389*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0 ; keep the three pairs in m8-m10 for the loops
1390*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
1391*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
1392*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 2
1393*c0909341SAndroid Build Coastguard Worker    jne .v_w4 ; on x86-32 every width other than 2 goes through .v_w4
1394*c0909341SAndroid Build Coastguard Worker%else
1395*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1396*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q0000 ; first coefficient pair broadcast
1397*c0909341SAndroid Build Coastguard Worker    neg                  r6 ; r6 = -stride, used to reach the rows above src
1398*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1399*c0909341SAndroid Build Coastguard Worker    jg .v_w8 ; w > 4: wide path, which derives the other two pairs from m2 itself
1400*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q1111 ; second coefficient pair broadcast
1401*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m2, q2222 ; third coefficient pair broadcast
1402*c0909341SAndroid Build Coastguard Worker    je .v_w4 ; flags still come from cmp wd, 4 (pshufd does not write flags)
1403*c0909341SAndroid Build Coastguard Worker%endif
1404*c0909341SAndroid Build Coastguard Worker.v_w2: ; 2-wide case: preload rows 0-4 of the window (row 0 = src - 2*stride), two samples each
1405*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+r6 *2] ; row 0 (two rows above src)
1406*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+r6 *1] ; row 1
1407*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0] ; row 2
1408*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1] ; row 3
1409*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1410*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 4
1411*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3      ; 0 1
1412*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m2      ; 1 2
1413*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4      ; 2 3
1414*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0      ; 3 4
1415*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3      ; 01 12
1416*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4      ; 23 34
1417*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6 ; zero, used for the lower clamp and the rounding average
1418*c0909341SAndroid Build Coastguard Worker.v_w2_loop: ; each iteration produces two output rows (a and b) of 2 samples
1419*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1] ; row 5
1420*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1421*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8, m1  ; a0 b0
1422*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2 ; slide the window: 23 34 becomes 01 12 of the next iteration
1423*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9      ; a1 b1
1424*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
1425*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m0, m3  ; 4 5
1426*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 6, kept in m0 as row 4 of the next iteration
1427*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0      ; 5 6
1428*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3      ; 45 56
1429*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10, m2 ; a2 b2
1430*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3 ; all three tap pairs summed
1431*c0909341SAndroid Build Coastguard Worker    psrad                m4, 5 ; shift by one less than the full amount, the last bit is handled by pavgw
1432*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4 ; words: a0 a1 b0 b1 (repeated in the high half)
1433*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m6 ; clamp negatives to zero
1434*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m6 ; average with zero = (x + 1) >> 1, the final rounding shift
1435*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m5 ; clamp to the upper bound in m5
1436*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m4 ; store output row a
1437*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m4, q3232 ; bring words 2,3 (row b) down to the low dword
1438*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m4 ; store output row b
1439*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1440*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1441*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1442*c0909341SAndroid Build Coastguard Worker    RET
1443*c0909341SAndroid Build Coastguard Worker.v_w4: ; vertical filter in 4-sample-wide columns, on x86-32 this also loops over columns for wider blocks
1444*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1445*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14 ; w << 14 = (w / 4) << 16
1446*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+r6*2] ; r6 is -stride on entry (set in .v): move src up two rows
1447*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<16)] ; wd = ((w / 4 - 1) << 16) + h: column counter in the high half, h in the low 16 bits
1448*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
1449*c0909341SAndroid Build Coastguard Worker    %define           dstmp  [esp+16*3]
1450*c0909341SAndroid Build Coastguard Worker%endif
1451*c0909341SAndroid Build Coastguard Worker.v_w4_loop0: ; once per 4-wide column (x86-32 only)
1452*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq ; remember the top of this dst column
1453*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0] ; row 0 of the window
1454*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1] ; row 1
1455*c0909341SAndroid Build Coastguard Worker    lea                  r6, [srcq+ssq*2] ; r6 becomes the running row pointer
1456*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r6  +ssq*0] ; row 2
1457*c0909341SAndroid Build Coastguard Worker    movq                 m4, [r6  +ssq*1] ; row 3
1458*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6  +ssq*2]
1459*c0909341SAndroid Build Coastguard Worker%else
1460*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+r6 *2] ; row 0 of the window (r6 is still -stride here)
1461*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+r6 *1] ; row 1
1462*c0909341SAndroid Build Coastguard Worker    lea                  r6, [srcq+ssq*2] ; r6 is repurposed as the running row pointer
1463*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0] ; row 2
1464*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1] ; row 3
1465*c0909341SAndroid Build Coastguard Worker%endif
1466*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r6  +ssq*0] ; row 4
1467*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2      ; 01
1468*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3      ; 12
1469*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 23
1470*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0      ; 34
1471*c0909341SAndroid Build Coastguard Worker.v_w4_loop: ; each iteration produces two output rows (a and b) of 4 samples
1472*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m8, m1  ; a0
1473*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m8, m2  ; b0
1474*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; slide the window: 23 becomes 01 of the next iteration
1475*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
1476*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4 ; slide the window: 34 becomes 12 of the next iteration
1477*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
1478*c0909341SAndroid Build Coastguard Worker    paddd                m6, m3
1479*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r6+ssq*0] ; row 4 again (same address as the last load of the previous step)
1480*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4
1481*c0909341SAndroid Build Coastguard Worker    movq                 m4, [r6+ssq*1] ; row 5
1482*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+ssq*2]
1483*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r6+ssq*0] ; row 6
1484*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 45
1485*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0      ; 56
1486*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m3 ; a2
1487*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0
1488*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m4 ; b2
1489*c0909341SAndroid Build Coastguard Worker    paddd                m7, m0
1490*c0909341SAndroid Build Coastguard Worker    psrad                m6, 5 ; shift by one less than the full amount, the last bit is handled by pavgw
1491*c0909341SAndroid Build Coastguard Worker    psrad                m7, 5
1492*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7 ; row a in the low 8 bytes, row b in the high 8 bytes
1493*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7 ; zero
1494*c0909341SAndroid Build Coastguard Worker    pmaxsw               m6, m7 ; clamp negatives to zero
1495*c0909341SAndroid Build Coastguard Worker    pavgw                m6, m7 ; average with zero = (x + 1) >> 1, the final rounding shift
1496*c0909341SAndroid Build Coastguard Worker    pminsw               m6, m5 ; clamp to the upper bound in m5
1497*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m6 ; store output row a
1498*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m6 ; store output row b
1499*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1500*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1501*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1502*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1503*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp ; back to the top of the column just written
1504*c0909341SAndroid Build Coastguard Worker    add                srcq, 8 ; next column: 4 samples = 8 bytes to the right
1505*c0909341SAndroid Build Coastguard Worker    movzx                hd, ww ; restore h from the low 16 bits of the packed counter
1506*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
1507*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<16 ; one column done
1508*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
1509*c0909341SAndroid Build Coastguard Worker    RET
1510*c0909341SAndroid Build Coastguard Worker%else
1511*c0909341SAndroid Build Coastguard Worker    RET
1512*c0909341SAndroid Build Coastguard Worker.v_w8: ; x86-64 only: vertical filter in 8-sample-wide columns
1513*c0909341SAndroid Build Coastguard Worker    mova                r6m, m8 ; park the first coefficient pair in the r6m slot because m8 is reused for row data below
1514*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5 ; w << 5 = (w / 8) << 8
1515*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m2, q1111 ; second coefficient pair broadcast
1516*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<8)] ; wd = ((w / 8 - 1) << 8) + h: column counter above bit 8, h in the low byte
1517*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m2, q2222 ; third coefficient pair broadcast
1518*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16
1519*c0909341SAndroid Build Coastguard Worker.v_w8_loop0: ; once per 8-wide column
1520*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+ r6*2] ; row 0 of the window (r6 is -stride)
1521*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ r6*1] ; row 1
1522*c0909341SAndroid Build Coastguard Worker    lea                  r7, [srcq+ssq*2] ; r7 = running row pointer for this column
1523*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ssq*0] ; row 2
1524*c0909341SAndroid Build Coastguard Worker    movu                m15, [srcq+ssq*1] ; row 3
1525*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq ; r8 = running dst pointer, dstq keeps the column top
1526*c0909341SAndroid Build Coastguard Worker    movu                 m4, [r7  +ssq*0] ; row 4
1527*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m11  ; 01
1528*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11 ; 01, high four samples
1529*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m13 ; 12
1530*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m13 ; 12, high four samples
1531*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 23
1532*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15 ; 23, high four samples
1533*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m4  ; 34
1534*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4 ; 34, high four samples
1535*c0909341SAndroid Build Coastguard Worker.v_w8_loop: ; each iteration produces two output rows (a and b) of 8 samples, primed names are the high four samples
1536*c0909341SAndroid Build Coastguard Worker    mova                 m3, r6m ; reload the first coefficient pair
1537*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8, m3   ; a0
1538*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3   ; a0'
1539*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m10, m3  ; b0
1540*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11      ; b0'
1541*c0909341SAndroid Build Coastguard Worker    mova                 m8, m12 ; slide the window: 23/34 become 01/12 of the next iteration
1542*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m6       ; a1
1543*c0909341SAndroid Build Coastguard Worker    mova                 m9, m13
1544*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m6       ; a1'
1545*c0909341SAndroid Build Coastguard Worker    mova                m10, m14
1546*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m6       ; b1
1547*c0909341SAndroid Build Coastguard Worker    mova                m11, m15
1548*c0909341SAndroid Build Coastguard Worker    pmaddwd             m15, m6       ; b1'
1549*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
1550*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
1551*c0909341SAndroid Build Coastguard Worker    movu                m13, [r7+ssq*0] ; row 4 again (same address as the last load of the previous step)
1552*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
1553*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
1554*c0909341SAndroid Build Coastguard Worker    movu                m15, [r7+ssq*1] ; row 5
1555*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+ssq*2]
1556*c0909341SAndroid Build Coastguard Worker    movu                 m4, [r7+ssq*0] ; row 6
1557*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 45
1558*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15 ; 45, high four samples
1559*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m4  ; 56
1560*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4 ; 56, high four samples
1561*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m12  ; a2
1562*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
1563*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m13  ; a2'
1564*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
1565*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m14  ; b2
1566*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
1567*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m15  ; b2'
1568*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4
1569*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 5}, m0, m2, m1, m3
1570*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2 ; output row a, 8 samples
1571*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3 ; output row b, 8 samples
1572*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2 ; zero
1573*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m2 ; clamp negatives to zero
1574*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m2
1575*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m2 ; average with zero = (x + 1) >> 1, the final rounding shift
1576*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m2
1577*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5 ; clamp to the upper bound in m5
1578*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m5
1579*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*0], m0 ; store output row a
1580*c0909341SAndroid Build Coastguard Worker    mova         [r8+dsq*1], m1 ; store output row b
1581*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+dsq*2]
1582*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1583*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1584*c0909341SAndroid Build Coastguard Worker    add                srcq, 16 ; next column: 8 samples = 16 bytes to the right
1585*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1586*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb ; restore h from the low byte of the packed counter
1587*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8 ; one column done
1588*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
1589*c0909341SAndroid Build Coastguard Worker    RET
1590*c0909341SAndroid Build Coastguard Worker%endif
1591*c0909341SAndroid Build Coastguard Worker.hv:
1592*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1593*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
1594*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12, 16
1595*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1596*c0909341SAndroid Build Coastguard Worker    movd                 m3, r8m
1597*c0909341SAndroid Build Coastguard Worker    pshufb               m3, [base+pw_256]
1598*c0909341SAndroid Build Coastguard Worker%else
1599*c0909341SAndroid Build Coastguard Worker    movd                m11, r8m
1600*c0909341SAndroid Build Coastguard Worker    pshufb              m11, [base+pw_256]
1601*c0909341SAndroid Build Coastguard Worker%endif
1602*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1603*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+mxq*8]
1604*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1605*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1606*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1607*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
1608*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+myq*8]
1609*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+pd_8704]
1610*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
1611*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q2121
1612*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1613*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0
1614*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
1615*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8 ; sign-extend
1616*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
1617*c0909341SAndroid Build Coastguard Worker    jz .hv_w2_10bpc
1618*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+pd_2560]
1619*c0909341SAndroid Build Coastguard Worker    psraw                m6, 2
1620*c0909341SAndroid Build Coastguard Worker    psllw                m2, 2
1621*c0909341SAndroid Build Coastguard Worker.hv_w2_10bpc:
1622*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1623*c0909341SAndroid Build Coastguard Worker%assign regs_used 2
1624*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*7
1625*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
1626*c0909341SAndroid Build Coastguard Worker    mov                dstq, r0mp
1627*c0909341SAndroid Build Coastguard Worker    mov                 dsq, r1mp
1628*c0909341SAndroid Build Coastguard Worker    %define             m11  [esp+16*4]
1629*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q0000
1630*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1111
1631*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q2222
1632*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
1633*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
1634*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
1635*c0909341SAndroid Build Coastguard Worker    mova                m11, m3
1636*c0909341SAndroid Build Coastguard Worker    neg                 ssq
1637*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
1638*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]
1639*c0909341SAndroid Build Coastguard Worker    neg                 ssq
1640*c0909341SAndroid Build Coastguard Worker%else
1641*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q0000
1642*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1643*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q1111
1644*c0909341SAndroid Build Coastguard Worker    neg                  r6
1645*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m2, q2222
1646*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6 *2]
1647*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r6 *1]
1648*c0909341SAndroid Build Coastguard Worker%endif
1649*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
1650*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1]
1651*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1652*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
1653*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1654*c0909341SAndroid Build Coastguard Worker    je .hv_w4
1655*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+spel_h_shuf2]
1656*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb  x, m5}, m3, m4, m0, m1, m2
1657*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m6}, m3, m0, m4, m1, m2
1658*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m0        ; 0 3
1659*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m1        ; 1 2
1660*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m2        ; 3 4
1661*c0909341SAndroid Build Coastguard Worker    REPX    {paddd   x, m7}, m3, m4, m0
1662*c0909341SAndroid Build Coastguard Worker    REPX    {psrad   x, 10}, m3, m4, m0
1663*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4        ; 0 3 1 2
1664*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0        ; 1 2 3 4
1665*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1320 ; 0 1 2 3
1666*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4    ; 01 12
1667*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4        ; 23 34
1668*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
1669*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
1670*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1671*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
1672*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5
1673*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5
1674*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6
1675*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6
1676*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m4
1677*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8, m1    ; a0 b0
1678*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1679*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9        ; a1 b1
1680*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
1681*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7
1682*c0909341SAndroid Build Coastguard Worker    psrad                m3, 10        ; 5 6
1683*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3
1684*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q2103
1685*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0        ; 45 56
1686*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3
1687*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10, m2   ; a2 b2
1688*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
1689*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10
1690*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
1691*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
1692*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m11
1693*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m3
1694*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m4
1695*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m4, q1032
1696*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m4
1697*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1698*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1699*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
1700*c0909341SAndroid Build Coastguard Worker    RET
1701*c0909341SAndroid Build Coastguard Worker.hv_w4:
1702*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1703*c0909341SAndroid Build Coastguard Worker    %define             m12  [esp+16*5]
1704*c0909341SAndroid Build Coastguard Worker    %define             m13  [esp+16*6]
1705*c0909341SAndroid Build Coastguard Worker    %define             m14  [base+spel_h_shufA]
1706*c0909341SAndroid Build Coastguard Worker    %define             m15  [base+spel_h_shufB]
1707*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q0000
1708*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1111
1709*c0909341SAndroid Build Coastguard Worker    mova                m12, m5
1710*c0909341SAndroid Build Coastguard Worker    mova                m13, m6
1711*c0909341SAndroid Build Coastguard Worker%else
1712*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16
1713*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+spel_h_shufA]
1714*c0909341SAndroid Build Coastguard Worker    mova                m15, [base+spel_h_shufB]
1715*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m6, q0000
1716*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m6, q1111
1717*c0909341SAndroid Build Coastguard Worker%endif
; HV_H_W4_6TAP dst, src, tmp[, shufB]
;   Horizontal filter step used by the .hv_w4 path. Computes
;       dst = pmaddwd(pshufb(src, m14), m12)
;           + pmaddwd(pshufb(src, shufB), m13)
;           + m7
;   as four dword sums. No shift is applied here; the call sites follow up
;   with `psrad ..., 10`.
;   Fixed operands, set up just before this macro is defined:
;     m14 / m15 = spel_h_shufA / spel_h_shufB shuffle masks
;     m12 / m13 = dword 0 / dword 1 of m6, broadcast with pshufd
;     m7        = constant added to every sum
;                 (NOTE(review): presumably the rounding term - it is loaded
;                 outside this excerpt, confirm)
;   Clobbers src and tmp. The invocations in this file alias dst with src or
;   with tmp, which is safe because dst is only written by the last
;   instruction.
1718*c0909341SAndroid Build Coastguard Worker%macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB
1719*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %2, m14 ; tmp = src rearranged by shufA
1720*c0909341SAndroid Build Coastguard Worker    pmaddwd              %3, m12 ; tmp = products with the first coefficient pair
1721*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %4 ; src rearranged in place by shufB
1722*c0909341SAndroid Build Coastguard Worker    pmaddwd              %2, m13 ; src = products with the second coefficient pair
1723*c0909341SAndroid Build Coastguard Worker    paddd                %3, m7 ; fold the m7 constant into the first partial sum
1724*c0909341SAndroid Build Coastguard Worker    paddd                %1, %2, %3 ; dst = both partial sums combined
1725*c0909341SAndroid Build Coastguard Worker%endmacro
1726*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m3, m3, m5
1727*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m4, m4, m5
1728*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m5, m1, m5
1729*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m0, m0, m1
1730*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m2, m2, m1
1731*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 10}, m3, m5, m4, m0, m2
1732*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m5      ; 0 2
1733*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0      ; 1 3
1734*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m2      ; 2 4
1735*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4  ; 01
1736*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4      ; 23
1737*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m5  ; 12
1738*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
1739*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1740*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1]
1741*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0
1742*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1743*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m8, m2  ; b0
1744*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1745*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
1746*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
1747*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
1748*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
1749*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0]
1750*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
1751*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m0, m0, m4
1752*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m3, m3, m4
1753*c0909341SAndroid Build Coastguard Worker    psrad                m4, m2, 16
1754*c0909341SAndroid Build Coastguard Worker    psrad                m0, 10
1755*c0909341SAndroid Build Coastguard Worker    psrad                m3, 10
1756*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0      ; 4 5
1757*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3      ; 5 6
1758*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0  ; 45
1759*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 56
1760*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m3 ; a2
1761*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0
1762*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m4 ; b2
1763*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0
1764*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
1765*c0909341SAndroid Build Coastguard Worker    psrad                m6, 10
1766*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
1767*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1768*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m11
1769*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6
1770*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m5
1771*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m5
1772*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1773*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1774*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1775*c0909341SAndroid Build Coastguard Worker    RET
1776*c0909341SAndroid Build Coastguard Worker.hv_w8:
1777*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
1778*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1779*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+mxq*8]
1780*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1781*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1782*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1783*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
1784*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+1+myq*8]
1785*c0909341SAndroid Build Coastguard Worker    movd                 m3, r8m
1786*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+pd_8704]
1787*c0909341SAndroid Build Coastguard Worker    pshufb               m3, [base+pw_256]
1788*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
1789*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
1790*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
1791*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
1792*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8 ; sign-extend
1793*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
1794*c0909341SAndroid Build Coastguard Worker    jz .hv_w8_10bpc
1795*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+pd_2560]
1796*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1797*c0909341SAndroid Build Coastguard Worker    psllw                m1, 2
1798*c0909341SAndroid Build Coastguard Worker.hv_w8_10bpc:
1799*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1800*c0909341SAndroid Build Coastguard Worker%assign regs_used 2
1801*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*9
1802*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
1803*c0909341SAndroid Build Coastguard Worker    mov                dstq, r0mp
1804*c0909341SAndroid Build Coastguard Worker    mov                 dsq, r1mp
1805*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*7], m4
1806*c0909341SAndroid Build Coastguard Worker%else
1807*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*7, 16
1808*c0909341SAndroid Build Coastguard Worker%endif
1809*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*6], m3
1810*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
1811*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m2
1812*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
1813*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1814*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q2222
1815*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
1816*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q0000
1817*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m2
1818*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q1111
1819*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m2
1820*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q2222
1821*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m1
1822*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1823*c0909341SAndroid Build Coastguard Worker    neg                  r6
1824*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1825*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14
1826*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+hq-(1<<16)]
1827*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
1828*c0909341SAndroid Build Coastguard Worker    %define           srcmp  [esp+16*8+4*0]
1829*c0909341SAndroid Build Coastguard Worker    %define           dstmp  [esp+16*8+4*1]
1830*c0909341SAndroid Build Coastguard Worker%endif
; HV_H_6TAP dst, src1, src2[, mul1, mul2, mul3]          (ARCH_X86_32 variant)
;   Horizontal 3-pair multiply-accumulate producing four dword sums in dst:
;       dst = mul1 * {01 12 23 34} + mul2 * {23 34 45 56} + mul3 * {45 56 67 78}
;   where "ab" is the word pair (src1[a], src2[a]). The call sites load src2
;   from the same row as src1, 2 bytes further on.
;   mul1..mul3 default to [rsp+16*0], [rsp+16*1], [rsp+16*2], the three
;   pshufd broadcasts (q0000/q1111/q2222) spilled to the stack before
;   .hv_w8_loop0. Callers may pass registers instead when they have them live.
;   No constant is added and no shift is performed here; the call sites add
;   [rsp+16*7] and apply `psrad ..., 10` afterwards.
;   Clobbers src1 and src2. dst must be distinct from both, since it is
;   written first while both are still needed.
1831*c0909341SAndroid Build Coastguard Worker%macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3]
1832*c0909341SAndroid Build Coastguard Worker    punpcklwd            %1, %2, %3   ; 01 12 23 34
1833*c0909341SAndroid Build Coastguard Worker    punpckhwd            %2, %3       ; 45 56 67 78
1834*c0909341SAndroid Build Coastguard Worker    pmaddwd              %3, %4, %1   ; a0
1835*c0909341SAndroid Build Coastguard Worker    shufpd               %1, %2, 0x01 ; 23 34 45 56
1836*c0909341SAndroid Build Coastguard Worker    pmaddwd              %2, %6       ; a2
1837*c0909341SAndroid Build Coastguard Worker    pmaddwd              %1, %5       ; a1
1838*c0909341SAndroid Build Coastguard Worker    paddd                %2, %3 ; a2 + a0
1839*c0909341SAndroid Build Coastguard Worker    paddd                %1, %2 ; dst = a1 + a2 + a0
1840*c0909341SAndroid Build Coastguard Worker%endmacro
1841*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
1842*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
1843*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq
1844*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+r6*2+0]
1845*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r6*2+2]
1846*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*0]
1847*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*1]
1848*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*2]
1849*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m2, m5, m6, m7, m1, m0
1850*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+r6*1+0]
1851*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r6*1+2]
1852*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m3, m5, m6, m7, m1, m0
1853*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+0]
1854*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0+2]
1855*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m4, m5, m6, m7, m1, m0
1856*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+0]
1857*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1+2]
1858*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1859*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m0, m5, m6, m7, m1
1860*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+0]
1861*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0+2]
1862*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m1, m5, m6, m7
1863*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*7]
1864*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
1865*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 10}, m2, m4, m3, m0, m1
1866*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m4     ; 0 2
1867*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m0     ; 1 3
1868*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m1     ; 2 4
1869*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2, m3 ; 01
1870*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3     ; 23
1871*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4 ; 12
1872*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; 34
1873*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1874*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*3]
1875*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*4]
1876*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m5 ; a0
1877*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1     ; b0
1878*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
1879*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6     ; a1
1880*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
1881*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6     ; b1
1882*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
1883*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+0]
1884*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
1885*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+2]
1886*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1887*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m6, m2, m3
1888*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+0]
1889*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0+2]
1890*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m7, m2, m3
1891*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+16*7]
1892*c0909341SAndroid Build Coastguard Worker    psrad                m3, m1, 16
1893*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2
1894*c0909341SAndroid Build Coastguard Worker    paddd                m7, m2
1895*c0909341SAndroid Build Coastguard Worker    psrad                m6, 10
1896*c0909341SAndroid Build Coastguard Worker    psrad                m7, 10
1897*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m6     ; 4 5
1898*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7     ; 5 6
1899*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*5]
1900*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m6 ; 45
1901*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m6     ; 56
1902*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m7 ; a2
1903*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3     ; b2
1904*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
1905*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
1906*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10
1907*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
1908*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
1909*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
1910*c0909341SAndroid Build Coastguard Worker    pminsw               m4, [rsp+16*6]
1911*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m5
1912*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m4
1913*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m4
1914*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1915*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1916*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1917*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcmp
1918*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
1919*c0909341SAndroid Build Coastguard Worker    movzx                hd, r4w
1920*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
1921*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
1922*c0909341SAndroid Build Coastguard Worker    sub                 r4d, 1<<16
1923*c0909341SAndroid Build Coastguard Worker%else
1924*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
1925*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [wq+hq-256]
; HV_H_6TAP dst, src1, src2, src3, shift[, shuf, mul1, mul2, mul3]
;                                                        (64-bit variant)
;   Horizontal 3-pair multiply-accumulate over one row, producing eight
;   saturated words in dst:
;     low  4 words: (mul1*{01..34} + mul2*{23..56} + mul3*{45..78} + m4) >> shift
;     high 4 words: (mul1*{45..78} + mul2*{67..9a} + mul3*{89..bc} + m4) >> shift
;   The pair layouts come from applying `shuf` to src1/src2/src3, which the
;   call sites load from byte offsets +0, +8 and +16 of the same row.
;   shuf defaults to the memory operand [spel_h_shufA]; mul1..mul3 default to
;   [rsp+16*0..2]. When shuf is passed as a register (%ifid), it is used
;   directly; otherwise it is first loaded into dst, which serves as a
;   temporary until the first pmaddwd overwrites it. Because all three
;   shuffles finish before dst is written, dst may be the same register as
;   shuf (the last call before .hv_w8_loop does exactly that with m5).
;   m4 is read as the constant added to every dword sum before the shift; it
;   is loaded from pd_8704 or pd_2560 ahead of the loop.
;   `shift` is an arithmetic right shift (psrad); the final packssdw
;   saturates to signed 16 bits.
;   Clobbers src1, src2 and src3.
1926*c0909341SAndroid Build Coastguard Worker%macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3]
1927*c0909341SAndroid Build Coastguard Worker%ifid %6
1928*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, %6}, %2, %3, %4
1929*c0909341SAndroid Build Coastguard Worker%else
1930*c0909341SAndroid Build Coastguard Worker    mova                 %1, %6
1931*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1       ; 01 12 23 34
1932*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1       ; 45 56 67 78
1933*c0909341SAndroid Build Coastguard Worker    pshufb               %4, %1       ; 89 9a ab bc
1934*c0909341SAndroid Build Coastguard Worker%endif
1935*c0909341SAndroid Build Coastguard Worker    pmaddwd              %1, %7, %2
1936*c0909341SAndroid Build Coastguard Worker    shufpd               %2, %3, 0x01 ; 23 34 45 56
1937*c0909341SAndroid Build Coastguard Worker    pmaddwd              %2, %8
1938*c0909341SAndroid Build Coastguard Worker    paddd                %1, %2
1939*c0909341SAndroid Build Coastguard Worker    pmaddwd              %2, %9, %3
1940*c0909341SAndroid Build Coastguard Worker    paddd                %1, %2
1941*c0909341SAndroid Build Coastguard Worker    pmaddwd              %2, %7, %3
1942*c0909341SAndroid Build Coastguard Worker    shufpd               %3, %4, 0x01 ; 67 78 89 9a
1943*c0909341SAndroid Build Coastguard Worker    pmaddwd              %4, %9
1944*c0909341SAndroid Build Coastguard Worker    pmaddwd              %3, %8
1945*c0909341SAndroid Build Coastguard Worker    paddd                %1, m4
1946*c0909341SAndroid Build Coastguard Worker    paddd                %2, m4
1947*c0909341SAndroid Build Coastguard Worker    paddd                %3, %4
1948*c0909341SAndroid Build Coastguard Worker    paddd                %2, %3
1949*c0909341SAndroid Build Coastguard Worker    psrad                %1, %5
1950*c0909341SAndroid Build Coastguard Worker    psrad                %2, %5
1951*c0909341SAndroid Build Coastguard Worker    packssdw             %1, %2
1952*c0909341SAndroid Build Coastguard Worker%endmacro
1953*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
1954*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_shufA]
1955*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*2+ 0]
1956*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*0]
1957*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+ 8]
1958*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*1]
1959*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+16]
1960*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+16*2]
1961*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m9, m0, m1, m2, 10, m5, m6, m7, m8
1962*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*1+ 0]
1963*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*1+ 8]
1964*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*1+16]
1965*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2]
1966*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m11, m0, m1, m2, 10, m5, m6, m7, m8
1967*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+ 0]
1968*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+ 8]
1969*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+16]
1970*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
1971*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m13, m0, m1, m2, 10, m5, m6, m7, m8
1972*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1+ 0]
1973*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1+ 8]
1974*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+16]
1975*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m15, m0, m1, m2, 10, m5, m6, m7, m8
1976*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r4+ssq*0+ 0]
1977*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r4+ssq*0+ 8]
1978*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r4+ssq*0+16]
1979*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m5, m0, m1, m2, 10, m5, m6, m7, m8
1980*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m11  ; 01
1981*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11
1982*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m13 ; 12
1983*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m13
1984*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 23
1985*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15
1986*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m5  ; 34
1987*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m5
1988*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1989*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*3]
1990*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*4]
1991*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8, m3   ; a0
1992*c0909341SAndroid Build Coastguard Worker    mova                 m8, m12
1993*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3   ; a0'
1994*c0909341SAndroid Build Coastguard Worker    mova                 m9, m13
1995*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m10, m3  ; b0
1996*c0909341SAndroid Build Coastguard Worker    mova                m10, m14
1997*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11      ; b0'
1998*c0909341SAndroid Build Coastguard Worker    mova                m11, m15
1999*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
2000*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r4+ssq*1+ 0]
2001*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
2002*c0909341SAndroid Build Coastguard Worker    movu                 m7, [r4+ssq*1+ 8]
2003*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
2004*c0909341SAndroid Build Coastguard Worker    movu                m12, [r4+ssq*1+16]
2005*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
2006*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
2007*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
2008*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m15, m6, m7, m12, 10
2009*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r4+ssq*0+ 0]
2010*c0909341SAndroid Build Coastguard Worker    movu                 m7, [r4+ssq*0+ 8]
2011*c0909341SAndroid Build Coastguard Worker    movu                m14, [r4+ssq*0+16]
2012*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m5, m15 ; 45
2013*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m5, m15
2014*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m5, m6, m7, m14, 10
2015*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*5]
2016*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m5  ; 56
2017*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m5
2018*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m12, m7  ; a2
2019*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6
2020*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13, m7  ; a2'
2021*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
2022*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14, m7  ; b2
2023*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15      ; b2'
2024*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6
2025*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*6]
2026*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7
2027*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 10}, m0, m2, m1, m3
2028*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
2029*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
2030*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2031*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m6
2032*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m6
2033*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m2
2034*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m2
2035*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*0], m0
2036*c0909341SAndroid Build Coastguard Worker    mova         [r7+dsq*1], m1
2037*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2038*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2039*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
2040*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
2041*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2042*c0909341SAndroid Build Coastguard Worker    movzx                hd, r8b
2043*c0909341SAndroid Build Coastguard Worker    sub                 r8d, 1<<8
2044*c0909341SAndroid Build Coastguard Worker%endif
2045*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
2046*c0909341SAndroid Build Coastguard Worker    RET
2047*c0909341SAndroid Build Coastguard Worker
; Per-filter-type entry points for the combinations that involve SHARP.
; NOTE(review): PUT_8TAP_FN is defined outside this excerpt. Presumably the
; optional last argument names the function each stub jumps to, and the final
; `sharp` entry omits it because put_8tap_16bpc is defined directly below -
; confirm against the macro definition before reordering these lines.
2048*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
2049*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
2050*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
2051*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
2052*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp,          SHARP,   SHARP
2053*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; put_8tap_16bpc -- shared body of the 8-tap "put" filters for 16-bit
; pixels.
; Named arguments: dst, ds, src, ss, w, h, mx, my. One more stack
; argument is read further down as r8m.
; cglobal is asked to auto-load 0 arguments and to provide 9 GPRs, so
; every argument is fetched explicitly (movifnidn) where it is needed.
; t0d / t1d are read here without being written: they are expected to
; hold the horizontal / vertical filter-type values on entry
; (NOTE(review): presumably set by the PUT_8TAP_FN stubs -- confirm).
; Dispatch performed by this prologue:
;   mx bits set             -> .h (which goes on to .hv if my bits set)
;   mx clear and my clear   -> .put inside put_6tap_16bpc
;   mx clear and my set     -> falls through into .v
;-----------------------------------------------------------------------
2054*c0909341SAndroid Build Coastguard Workercglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
2055*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: mx/my are aliased onto r0/r1, and the names m8-m15 are
    ; redirected to 16-byte stack slots at [esp+16*0] .. [esp+16*7].
2056*c0909341SAndroid Build Coastguard Worker    %define             mxb  r0b
2057*c0909341SAndroid Build Coastguard Worker    %define             mxd  r0
2058*c0909341SAndroid Build Coastguard Worker    %define             mxq  r0
2059*c0909341SAndroid Build Coastguard Worker    %define             myb  r1b
2060*c0909341SAndroid Build Coastguard Worker    %define             myd  r1
2061*c0909341SAndroid Build Coastguard Worker    %define             myq  r1
2062*c0909341SAndroid Build Coastguard Worker    %define              m8  [esp+16*0]
2063*c0909341SAndroid Build Coastguard Worker    %define              m9  [esp+16*1]
2064*c0909341SAndroid Build Coastguard Worker    %define             m10  [esp+16*2]
2065*c0909341SAndroid Build Coastguard Worker    %define             m11  [esp+16*3]
2066*c0909341SAndroid Build Coastguard Worker    %define             m12  [esp+16*4]
2067*c0909341SAndroid Build Coastguard Worker    %define             m13  [esp+16*5]
2068*c0909341SAndroid Build Coastguard Worker    %define             m14  [esp+16*6]
2069*c0909341SAndroid Build Coastguard Worker    %define             m15  [esp+16*7]
2070*c0909341SAndroid Build Coastguard Worker%endif
    ; Multiplying by 0x010101 copies mx (resp. my) into bytes 0, 1 and 2;
    ; adding t0d (resp. t1d) then yields the three-field layout named in
    ; the trailing comments. The outer fields are used later as indices
    ; into subpel_filters; the middle byte keeps the raw fraction.
2071*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2072*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2073*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2074*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
    ; t2 = address of put_ssse3. NOTE(review): the `base` symbol used by
    ; the table loads below is presumably defined relative to t2 -- confirm.
2075*c0909341SAndroid Build Coastguard Worker    LEA                  t2, put_ssse3
    ; Fetch the arguments that every path needs.
2076*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
2077*c0909341SAndroid Build Coastguard Worker    movifnidn          srcq, srcmp
2078*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
2079*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; Bits 8-11 sit in the middle ("mx" / "my") byte of the layout above:
    ; any bit set there means that direction has to be filtered.
2080*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2081*c0909341SAndroid Build Coastguard Worker    jnz .h
2082*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
    ; Neither direction needs filtering: reuse the .put path that lives
    ; in put_6tap_16bpc.
2083*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put
; ---- .v: vertical-only filtering -------------------------------------
; Reached by fall-through from the dispatch above (mx bits clear, my bits
; set). Loads and widens the vertical taps, broadcasts them as four word
; pairs into m8-m11, rewinds src by three rows and branches on the width.
2084*c0909341SAndroid Build Coastguard Worker.v:
    ; Select the filter-table row: myd>>16 is the "8tap_v" field, the low
    ; byte the "4tap_v" field (see the layout comment at the top of the
    ; function). Blocks with h < 6 take the 4tap field (unsigned compare).
2085*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2086*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2087*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2088*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
    ; m3 = the 8 bytes of the selected subpel_filters row.
2089*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+myq*8]
2090*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      15
    ; m7 = r8m pushed through pshufb with pw_256 as the shuffle mask; it
    ; is used below as the pminsw upper clamp.
    ; NOTE(review): presumably this broadcasts the low 16 bits of r8m to
    ; every word, and r8m is presumably bitdepth_max -- confirm.
2091*c0909341SAndroid Build Coastguard Worker    movd                 m7, r8m
2092*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
2093*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
    ; Widen the byte taps to words: duplicate every byte into a word,
    ; then shift arithmetically right by 8 to sign-extend it.
2094*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
2095*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [base+pw_256]
2096*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8 ; sign-extend
    ; Broadcast the tap pairs (0,1) (2,3) (4,5) (6,7) into m8 .. m11 so
    ; that each pmaddwd multiplies one interleaved pair of rows.
2097*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: m8-m11 are stack slots here, so build the broadcasts in
    ; m0-m3 and store them. 16*7 bytes cover the slots m8 .. m14.
2098*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*7
2099*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
2100*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
2101*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
2102*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
2103*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
2104*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
2105*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
2106*c0909341SAndroid Build Coastguard Worker    mova                m11, m3
2107*c0909341SAndroid Build Coastguard Worker%else
2108*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m3, q0000
2109*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m3, q1111
2110*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m3, q2222
2111*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m3, q3333
2112*c0909341SAndroid Build Coastguard Worker%endif
    ; r6 = 3 * source stride; rewind src by three rows so the first load
    ; below is the top row of the 8-row filter window.
2113*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2114*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
    ; Width 2 is handled by .v_w2 (fall-through); every other width by
    ; .v_w4.
2115*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 2
2116*c0909341SAndroid Build Coastguard Worker    jne .v_w4
; ---- .v_w2: vertical filter, 2 pixels wide ---------------------------
; Each row is a single dword (two 16-bit pixels). Two neighbouring row
; pairs share one register, so one pmaddwd chain yields two output rows
; ("a" = first row, "b" = second row in the comments below).
; Register roles inside the loop:
;   m1 / m2 / m3 = interleaved row pairs 01|12, 23|34, 45|56
;   m0           = most recently loaded row
;   m6           = zero, m7 = upper clamp, m8-m11 = tap pairs
2117*c0909341SAndroid Build Coastguard Worker.v_w2:
    ; Preload rows 0 .. 6 of the window (r6 = 3 * stride).
2118*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*0]
2119*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1]
2120*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*2]
2121*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2122*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+ssq*0]
2123*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1]
2124*c0909341SAndroid Build Coastguard Worker    movd                 m6, [srcq+ssq*2]
2125*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2126*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
    ; Put row n next to row n+1 (dwords), then interleave their words so
    ; each dword holds (row n, row n+1) for one pixel.
2127*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m4      ; 0 1
2128*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m2      ; 1 2
2129*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m5      ; 2 3
2130*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m3      ; 3 4
2131*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m6      ; 4 5
2132*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m0      ; 5 6
2133*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4      ; 01 12
2134*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5      ; 23 34
2135*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6      ; 45 56
    ; m6 is free again: keep it as the all-zero constant for the loop.
2136*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
2137*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
    ; Each iteration loads two new rows, emits two output rows and slides
    ; the row-pair window down by two (m1 <- m2, m2 <- m3, m3 <- new).
2138*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1]
2139*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2140*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0 b0
2141*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2142*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9      ; a1 b1
2143*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
2144*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2145*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10     ; a2 b2
2146*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
2147*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0, m4  ; 6 7
2148*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
2149*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0      ; 7 8
2150*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
2151*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m11, m3 ; a3 b3
2152*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
    ; Scale and clamp: >>5, saturate to words, clamp below at 0, pavgw
    ; against zero (computes (x + 1) >> 1), then clamp above with m7.
2153*c0909341SAndroid Build Coastguard Worker    psrad                m5, 5
2154*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
2155*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6
2156*c0909341SAndroid Build Coastguard Worker    pavgw                m5, m6
2157*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m7
    ; Low dword = output row "a"; words 2-3 (moved down by the pshuflw)
    ; = output row "b".
2158*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5
2159*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q3232
2160*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5
2161*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2162*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2163*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
2164*c0909341SAndroid Build Coastguard Worker    RET
; ---- .v_w4: vertical filter, every width other than 2 ----------------
; The block is processed in columns of 4 pixels (8 bytes), two output
; rows per inner iteration. Both loop counters are packed into wd:
;   x86-32: wd = ((w/4 - 1) << 16) + h   (h reloaded with movzx hd, ww)
;   x86-64: wd = ((w/4 - 1) <<  8) + h   (h reloaded with movzx hd, wb)
; (this reading assumes w is a multiple of 4 and h fits the low field).
; The column base pointers are kept in r7/r8 on x86-64 and in memory on
; x86-32.
2165*c0909341SAndroid Build Coastguard Worker.v_w4:
2166*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2167*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14
2168*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
    ; Column base pointers go to two explicit stack slots.
2169*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*29], srcq
2170*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*30], dstq
2171*c0909341SAndroid Build Coastguard Worker%else
    ; Only src is written back; the dst base is re-read from dstmp when
    ; the column advances (dstq was loaded from dstmp in .v).
2172*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
2173*c0909341SAndroid Build Coastguard Worker%endif
2174*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<16)]
2175*c0909341SAndroid Build Coastguard Worker%else
2176*c0909341SAndroid Build Coastguard Worker    shl                  wd, 6
2177*c0909341SAndroid Build Coastguard Worker    mov                  r7, srcq
2178*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
2179*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<8)]
2180*c0909341SAndroid Build Coastguard Worker%endif
2181*c0909341SAndroid Build Coastguard Worker.v_w4_loop0:
    ; Column set-up: load rows 0 .. 6 (4 pixels each, r6 = 3 * stride) and
    ; interleave neighbouring rows word by word.
2182*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
2183*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1]
2184*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*2]
2185*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2186*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*0]
2187*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+ssq*1]
2188*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*2]
2189*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2190*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
2191*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2      ; 01
2192*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3      ; 12
2193*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 23
2194*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5      ; 34
2195*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6      ; 45
2196*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0      ; 56
2197*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 variant: only m0-m7 are registers, so the row pairs needed by
    ; the next iteration are parked in the stack slots m12-m14 and
    ; reloaded at .v_w4_loop. The first pass already has them in m1-m3,
    ; hence the jump over the reload.
2198*c0909341SAndroid Build Coastguard Worker    jmp .v_w4_loop_start
2199*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2200*c0909341SAndroid Build Coastguard Worker    mova                 m1, m12
2201*c0909341SAndroid Build Coastguard Worker    mova                 m2, m13
2202*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
2203*c0909341SAndroid Build Coastguard Worker.v_w4_loop_start:
    ; m1 accumulates output row "a", m2 output row "b".
2204*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8      ; a0
2205*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8      ; b0
2206*c0909341SAndroid Build Coastguard Worker    mova                m12, m3
2207*c0909341SAndroid Build Coastguard Worker    mova                m13, m4
2208*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
2209*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
2210*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
2211*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
2212*c0909341SAndroid Build Coastguard Worker    mova                m14, m5
2213*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2214*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
2215*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
2216*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
2217*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
    ; Load the next two rows and form the newest row pairs in m5 / m6;
    ; m0 always holds the most recently loaded row.
2218*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*1]
2219*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2220*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m0, m6  ; 67
2221*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
2222*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11, m5 ; a3
2223*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0      ; 78
2224*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
2225*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11, m6 ; b3
2226*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
    ; Scale and clamp: >>5, saturate to words, clamp below at 0, pavgw
    ; against zero (computes (x + 1) >> 1), then clamp above with m7.
2227*c0909341SAndroid Build Coastguard Worker    psrad                m1, 5
2228*c0909341SAndroid Build Coastguard Worker    psrad                m2, 5
2229*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
2230*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2231*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m2
2232*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m2
2233*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m7
    ; Low half = output row "a", high half = output row "b".
2234*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m1
2235*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m1
2236*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2237*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2238*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
    ; Column finished: reload the saved base pointers, step 8 bytes
    ; (4 pixels) to the right, save them again, and restore h from the
    ; low 16 bits of wd.
2239*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
2240*c0909341SAndroid Build Coastguard Worker    mov                srcq, [esp+4*29]
2241*c0909341SAndroid Build Coastguard Worker    mov                dstq, [esp+4*30]
2242*c0909341SAndroid Build Coastguard Worker    movzx                hd, ww
2243*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2244*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2245*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*29], srcq
2246*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*30], dstq
2247*c0909341SAndroid Build Coastguard Worker%else
2248*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcmp
2249*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
2250*c0909341SAndroid Build Coastguard Worker    movzx                hd, ww
2251*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2252*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2253*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
2254*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq
2255*c0909341SAndroid Build Coastguard Worker%endif
    ; One column done; the flags from this sub feed the shared
    ; "jg .v_w4_loop0" after the %endif.
2256*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<16
2257*c0909341SAndroid Build Coastguard Worker%else
    ; x86-64 variant: m12/m13 accumulate output rows "a"/"b", m14 is a
    ; scratch register, and the row-pair window stays in m1-m6.
2258*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2259*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m8, m1  ; a0
2260*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m8, m2  ; b0
2261*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2262*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2263*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
2264*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
2265*c0909341SAndroid Build Coastguard Worker    paddd               m12, m3
2266*c0909341SAndroid Build Coastguard Worker    paddd               m13, m4
2267*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2268*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2269*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
2270*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
2271*c0909341SAndroid Build Coastguard Worker    paddd               m12, m5
2272*c0909341SAndroid Build Coastguard Worker    paddd               m13, m6
    ; Load the next two rows and form the newest row pairs in m5 / m6;
    ; m0 always holds the most recently loaded row.
2273*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*1]
2274*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2275*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m0, m6  ; 67
2276*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
2277*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m5 ; a3
2278*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0      ; 78
2279*c0909341SAndroid Build Coastguard Worker    paddd               m12, m14
2280*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m6 ; b3
2281*c0909341SAndroid Build Coastguard Worker    paddd               m13, m14
    ; Same scale/clamp sequence as the x86-32 variant.
2282*c0909341SAndroid Build Coastguard Worker    psrad               m12, 5
2283*c0909341SAndroid Build Coastguard Worker    psrad               m13, 5
2284*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
2285*c0909341SAndroid Build Coastguard Worker    pxor                m13, m13
2286*c0909341SAndroid Build Coastguard Worker    pmaxsw              m12, m13
2287*c0909341SAndroid Build Coastguard Worker    pavgw               m12, m13
2288*c0909341SAndroid Build Coastguard Worker    pminsw              m12, m7
2289*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m12
2290*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m12
2291*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2292*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2293*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
    ; Column finished: advance the saved base pointers by 8 bytes
    ; (4 pixels), restore h from the low byte of wd and count one column.
2294*c0909341SAndroid Build Coastguard Worker    add                  r7, 8
2295*c0909341SAndroid Build Coastguard Worker    add                  r8, 8
2296*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2297*c0909341SAndroid Build Coastguard Worker    mov                srcq, r7
2298*c0909341SAndroid Build Coastguard Worker    mov                dstq, r8
2299*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2300*c0909341SAndroid Build Coastguard Worker%endif
    ; Flags come from the "sub wd, ..." above: keep going while the packed
    ; counter is still positive, i.e. while columns remain.
2301*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
2302*c0909341SAndroid Build Coastguard Worker    RET
; ---- .h: horizontal filtering ----------------------------------------
; Entered from the dispatch at the top when the mx bits are set.
; If the my bits are set as well, control moves on to .hv. Widths <= 4
; are handed to the .h_w4 path inside put_6tap_16bpc. The code below
; handles the remaining widths, 8 output pixels per inner iteration.
; Register roles in the loop:
;   m4      = rounding constant (added before the >>6)
;   m5      = upper clamp, m6 / m7 = pshufb masks spel_h_shufA / shufB
;   m8-m11  = tap pairs (0,1) (2,3) (4,5) (6,7)
;   r6      = pixel index, runs from -w up to 0 in steps of 8
2303*c0909341SAndroid Build Coastguard Worker.h:
    ; NOTE(review): RESET_STACK_STATE is an external macro; presumably it
    ; resets the assembler-side stack bookkeeping because this label is
    ; reached from before the allocations made in .v -- confirm.
2304*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2305*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2306*c0909341SAndroid Build Coastguard Worker    jnz .hv
    ; myd is free from here on: reuse it as r8m >> 11, the index of an
    ; 8-byte entry in put_8tap_h_rnd. The .hv code tests the same bit
    ; (0x800) of r8m to tell 10 bpc apart.
    ; m5 = r8m pushed through pshufb with pw_256 (NOTE(review): presumably
    ; a word broadcast of bitdepth_max -- confirm).
2307*c0909341SAndroid Build Coastguard Worker    mov                 myd, r8m
2308*c0909341SAndroid Build Coastguard Worker    movd                 m5, r8m
2309*c0909341SAndroid Build Coastguard Worker    shr                 myd, 11
2310*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+put_8tap_h_rnd+myq*8]
2311*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
2312*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256]
2313*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2314*c0909341SAndroid Build Coastguard Worker    jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4
2315*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12
    ; mxd >> 16 is the "8tap_h" field of the layout built at function
    ; entry; use it to fetch the 8 byte-sized horizontal taps.
2316*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2317*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+mxq*8]
2318*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
2319*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_h_shufA]
2320*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+spel_h_shufB]
2321*c0909341SAndroid Build Coastguard Worker%if UNIX64
    ; A 32-bit register write clears bits 32-63, so wq is a clean
    ; zero-extended value before it is used in 64-bit address arithmetic.
2322*c0909341SAndroid Build Coastguard Worker    mov                  wd, wd
2323*c0909341SAndroid Build Coastguard Worker%endif
    ; Point src/dst at the END of the row (w pixels * 2 bytes) and negate
    ; w, so the loop can index with a counter that rises towards zero.
    ; The tap widening (duplicate bytes, arithmetic >>8) is interleaved.
2324*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+wq*2]
2325*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
2326*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+wq*2]
2327*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
2328*c0909341SAndroid Build Coastguard Worker    neg                  wq
    ; Broadcast the four tap pairs into m8 .. m11.
2329*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: m8-m11 are stack slots (16*4 bytes), filled via m0-m3.
2330*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*4
2331*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
2332*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
2333*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
2334*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
2335*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
2336*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
2337*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
2338*c0909341SAndroid Build Coastguard Worker    mova                m11, m3
2339*c0909341SAndroid Build Coastguard Worker%else
2340*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m3, q0000
2341*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m3, q1111
2342*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m3, q2222
2343*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m3, q3333
2344*c0909341SAndroid Build Coastguard Worker%endif
    ; Outer loop: one row per pass. r6 restarts at -w.
2345*c0909341SAndroid Build Coastguard Worker.h_w8_loop0:
2346*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq
    ; Inner loop: 8 output pixels per pass ("abcd" = first four, "efgh" =
    ; last four). Three unaligned loads start 3 pixels to the left of the
    ; current output position, each 4 pixels (8 bytes) after the previous
    ; one. The shuffles build the neighbouring-pixel pairs listed in the
    ; trailing comments, and each pmaddwd applies one tap pair.
2347*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
2348*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*2- 6]
2349*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+ 2]
2350*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m6   ; 0 1 1 2 2 3 3 4
2351*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7       ; 2 3 3 4 4 5 5 6
2352*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8       ; abcd0
2353*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m9       ; abcd1
2354*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1, m6   ; 4 5 5 6 6 7 7 8
2355*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7       ; 6 7 7 8 8 9 9 a
2356*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
2357*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
2358*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m10, m3  ; abcd2
2359*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8       ; efgh0
2360*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
2361*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11, m1  ; abcd3
2362*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m9       ; efgh1
2363*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
2364*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+10]
2365*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4
2366*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
2367*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2, m6   ; 8 9 9 a a b b c
2368*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7       ; a b b c c d d e
2369*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10      ; efgh2
2370*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m11      ; efgh3
2371*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
2372*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
    ; The rounding constant m4 is already included: shift by 6, saturate
    ; to words, then clamp to the range [0, m5].
2373*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
2374*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
2375*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
2376*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
2377*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5
2378*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m1
    ; Aligned 16-byte store of 8 pixels.
2379*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r6*2], m0
2380*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
2381*c0909341SAndroid Build Coastguard Worker    jl .h_w8_loop
    ; Row finished: step both pointers down one row.
2382*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
2383*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
2384*c0909341SAndroid Build Coastguard Worker    dec                  hd
2385*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop0
2386*c0909341SAndroid Build Coastguard Worker    RET
2387*c0909341SAndroid Build Coastguard Worker.hv:
2388*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2389*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2390*c0909341SAndroid Build Coastguard Worker    movd                 m4, r8m
2391*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256]
2392*c0909341SAndroid Build Coastguard Worker%else
2393*c0909341SAndroid Build Coastguard Worker%if WIN64
2394*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*6, 16
2395*c0909341SAndroid Build Coastguard Worker%endif
2396*c0909341SAndroid Build Coastguard Worker    movd                m15, r8m
2397*c0909341SAndroid Build Coastguard Worker    pshufb              m15, [base+pw_256]
2398*c0909341SAndroid Build Coastguard Worker%endif
2399*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2400*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
2401*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2402*c0909341SAndroid Build Coastguard Worker    je .hv_w4
2403*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+mxq*8]
2404*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2405*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2406*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2407*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
2408*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+myq*8]
2409*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+pd_8704]
2410*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q2121
2411*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2412*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m0
2413*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
2414*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8 ; sign-extend
2415*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
2416*c0909341SAndroid Build Coastguard Worker    jz .hv_w2_10bpc
2417*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+pd_2560]
2418*c0909341SAndroid Build Coastguard Worker    psraw                m7, 2
2419*c0909341SAndroid Build Coastguard Worker    psllw                m3, 2
2420*c0909341SAndroid Build Coastguard Worker.hv_w2_10bpc:
2421*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2422*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
2423*c0909341SAndroid Build Coastguard Worker    mov                 dsq, dsmp
2424*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+spel_h_shuf2]
2425*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*8
2426*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
2427*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
2428*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
2429*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
2430*c0909341SAndroid Build Coastguard Worker    mova                 m9, m5
2431*c0909341SAndroid Build Coastguard Worker    mova                m11, m0
2432*c0909341SAndroid Build Coastguard Worker    mova                m12, m1
2433*c0909341SAndroid Build Coastguard Worker    mova                m13, m2
2434*c0909341SAndroid Build Coastguard Worker    mova                m14, m3
2435*c0909341SAndroid Build Coastguard Worker    mova                m15, m4
2436*c0909341SAndroid Build Coastguard Worker%else
2437*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+spel_h_shuf2]
2438*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m3, q0000
2439*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m3, q1111
2440*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m3, q2222
2441*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m3, q3333
2442*c0909341SAndroid Build Coastguard Worker%endif
2443*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2444*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
2445*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
2446*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
2447*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
2448*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*2]
2449*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2450*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
2451*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2452*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb  x, m5}, m2, m3, m1, m4
2453*c0909341SAndroid Build Coastguard Worker%else
2454*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb  x, m9}, m2, m3, m1, m4
2455*c0909341SAndroid Build Coastguard Worker%endif
2456*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m7}, m2, m3, m1, m4
2457*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3        ; 0 1
2458*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m4        ; 2 3
2459*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
2460*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*2]
2461*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2462*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
2463*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2464*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb  x, m5}, m3, m4, m0
2465*c0909341SAndroid Build Coastguard Worker%else
2466*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb  x, m9}, m3, m4, m0
2467*c0909341SAndroid Build Coastguard Worker%endif
2468*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m7}, m3, m4, m0
2469*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m4        ; 4 5
2470*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m0        ; 6 6
2471*c0909341SAndroid Build Coastguard Worker    REPX    {paddd   x, m6}, m2, m1, m3, m0
2472*c0909341SAndroid Build Coastguard Worker    REPX    {psrad   x, 10}, m2, m1, m3, m0
2473*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1        ; 0 1 2 3
2474*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m0        ; 4 5 6 _
2475*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4 ; 1 2 3 4
2476*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m3, q0321 ; 5 6 _ _
2477*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4    ; 01 12
2478*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4        ; 23 34
2479*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5        ; 45 56
; .hv_w2_loop: steady-state loop of the 2-pixel-wide hv path. Each pass loads
; two more source rows, filters them horizontally (pshufb with m9, pmaddwd
; with m7, phaddd), then combines them with the interleaved row pairs carried
; in m1/m2/m3 against the four multiplier registers m11..m14 and writes two
; destination rows. m6, m7, m9, m11..m15 are set up before this loop.
2480*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
2481*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]
2482*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance source by two rows
2483*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0]
2484*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m9
2485*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m9
2486*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
2487*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7
2488*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5 ; horizontal sums of both new rows in one register
2489*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m11, m1   ; a0 b0
2490*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2 ; slide the row-pair window down for the next pass
2491*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m12       ; a1 b1
2492*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
2493*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2494*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m13       ; a2 b2
2495*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
2496*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6 ; m6 is loaded outside this view - presumably the rounding bias for the shift below
2497*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10        ; 7 8
2498*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4 ; m0 still holds the previous rows as dwords
2499*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m0, q2103
2500*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0        ; 67 78
2501*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4 ; keep the new rows (dwords) for the next pass's packssdw
2502*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m14, m3   ; a3 b3
2503*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
; NOTE(review): unlike the horizontal sum above (paddd m4, m6), nothing in this
; loop adds a rounding term to m5 before the shift - confirm that rounding of
; the vertical sum is accounted for elsewhere.
2504*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
2505*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
2506*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
2507*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m15 ; clamp to the upper bound held in m15 (set outside this view)
2508*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m4 ; clamp to >= 0
2509*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5 ; low dword = two 16-bit pixels of the first output row
2510*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q3232 ; bring words 2-3 down to the low dword
2511*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5 ; second output row
2512*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2513*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows are produced per pass
2514*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
2515*c0909341SAndroid Build Coastguard Worker    RET
2516*c0909341SAndroid Build Coastguard Worker.hv_w8:
2517*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2518*c0909341SAndroid Build Coastguard Worker.hv_w4:
2519*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+mxq*8]
2520*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2521*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2522*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2523*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
2524*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+myq*8]
2525*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2526*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2527*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
2528*c0909341SAndroid Build Coastguard Worker    mov                 dsq, dsmp
2529*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+spel_h_shufA]
2530*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+spel_h_shufB]
2531*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+pd_512]
2532*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK      -16*15
2533*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
2534*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
2535*c0909341SAndroid Build Coastguard Worker    mova                m14, m6
2536*c0909341SAndroid Build Coastguard Worker%else
2537*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+spel_h_shufA]
2538*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+spel_h_shufB]
2539*c0909341SAndroid Build Coastguard Worker%endif
2540*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2541*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
2542*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
2543*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
2544*c0909341SAndroid Build Coastguard Worker    test          dword r8m, 0x800
2545*c0909341SAndroid Build Coastguard Worker    jz .hv_w4_10bpc
2546*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
2547*c0909341SAndroid Build Coastguard Worker    psllw                m3, 2
2548*c0909341SAndroid Build Coastguard Worker.hv_w4_10bpc:
2549*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2550*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
2551*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
2552*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2553*c0909341SAndroid Build Coastguard Worker    %define tmp esp+16*8
2554*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14
2555*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
2556*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*61], srcq
2557*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*62], dstq
2558*c0909341SAndroid Build Coastguard Worker%else
2559*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
2560*c0909341SAndroid Build Coastguard Worker%endif
2561*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*5], m4
2562*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<16)]
2563*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q0000
2564*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
2565*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m0, q2222
2566*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q3333
2567*c0909341SAndroid Build Coastguard Worker    mova                m10, m1
2568*c0909341SAndroid Build Coastguard Worker    mova                m11, m2
2569*c0909341SAndroid Build Coastguard Worker    mova                m12, m5
2570*c0909341SAndroid Build Coastguard Worker    mova                m13, m0
2571*c0909341SAndroid Build Coastguard Worker%else
2572*c0909341SAndroid Build Coastguard Worker%if WIN64
2573*c0909341SAndroid Build Coastguard Worker    %define tmp rsp
2574*c0909341SAndroid Build Coastguard Worker%else
2575*c0909341SAndroid Build Coastguard Worker    %define tmp rsp-104 ; red zone
2576*c0909341SAndroid Build Coastguard Worker%endif
2577*c0909341SAndroid Build Coastguard Worker    shl                  wd, 6
2578*c0909341SAndroid Build Coastguard Worker    mov                  r7, srcq
2579*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
2580*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<8)]
2581*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000
2582*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111
2583*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222
2584*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q3333
2585*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*5], m15
2586*c0909341SAndroid Build Coastguard Worker%endif
2587*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
2588*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
2589*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
2590*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
2591*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*1], m0
2592*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*2], m1
2593*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*3], m2
2594*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*4], m3
; PUT_8TAP_HV_H: horizontal filter step for one row of the hv_w4/hv_w8 path.
;   %1 = register number holding the row at offset +0; receives the result
;   %2 = register number holding the row at offset +8; clobbered
;   %3 = scratch register number; clobbered
;   %4 = arithmetic right shift applied to the final sum
;   %5 = operand added into the sum before the shift (defaults to m14)
; Uses m8/m9 as the pshufb masks and m10..m13 as the pmaddwd multipliers,
; all of which must be set up by the code that invokes the macro.
2595*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
2596*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
2597*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m9      ; 2 3 3 4 4 5 5 6
2598*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m10
2599*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m11
2600*c0909341SAndroid Build Coastguard Worker    paddd               m%3, %5 ; fold the additive term in once, on the first partial sum
2601*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3
2602*c0909341SAndroid Build Coastguard Worker    pshufb              m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
2603*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m9      ; 6 7 7 8 8 9 9 a
2604*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m12
2605*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m13
2606*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3
2607*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%2 ; m%1 = sum of all four pmaddwd products plus %5
2608*c0909341SAndroid Build Coastguard Worker    psrad               m%1, %4
2609*c0909341SAndroid Build Coastguard Worker%endmacro
2610*c0909341SAndroid Build Coastguard Worker.hv_w4_loop0:
2611*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2612*c0909341SAndroid Build Coastguard Worker    mova                m14, [pd_512]
2613*c0909341SAndroid Build Coastguard Worker%endif
2614*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0+0]
2615*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+8]
2616*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+0]
2617*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+8]
2618*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2+0]
2619*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2+8]
2620*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2621*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         4, 1, 0, 10
2622*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         5, 2, 0, 10
2623*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         6, 3, 0, 10
2624*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*0+0]
2625*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+8]
2626*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1+0]
2627*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+8]
2628*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         7, 2, 0, 10
2629*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         1, 3, 0, 10
2630*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*2+0]
2631*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2+8]
2632*c0909341SAndroid Build Coastguard Worker    add                srcq, r6
2633*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         2, 3, 0, 10
2634*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m7      ; 0 3
2635*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m1      ; 1 4
2636*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+0]
2637*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+8]
2638*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 1, 3, 10
2639*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m2      ; 2 5
2640*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0      ; 3 6
2641*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5  ; 01
2642*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
2643*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6  ; 12
2644*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6      ; 45
2645*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m7  ; 23
2646*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 56
2647*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2648*c0909341SAndroid Build Coastguard Worker    jmp .hv_w4_loop_start
2649*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2650*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp+16*6]
2651*c0909341SAndroid Build Coastguard Worker    mova                 m2, m15
2652*c0909341SAndroid Build Coastguard Worker.hv_w4_loop_start:
2653*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*1]
2654*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m7      ; a0
2655*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7      ; b0
2656*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*2]
2657*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*6], m3
2658*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; a1
2659*c0909341SAndroid Build Coastguard Worker    mova                m15, m4
2660*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7      ; b1
2661*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*3]
2662*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
2663*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
2664*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2665*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7      ; a2
2666*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2667*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m7      ; b2
2668*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
2669*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
2670*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1+0]
2671*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+8]
2672*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2673*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         7, 5, 6, 10
2674*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m7      ; 6 7
2675*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*0], m0
2676*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+0]
2677*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+8]
2678*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 5, 6, 10
2679*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmp+16*0]
2680*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0      ; 7 8
2681*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7  ; 67
2682*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 78
2683*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, [tmp+16*4]
2684*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7      ; a3
2685*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m6, [tmp+16*4]
2686*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7      ; b3
2687*c0909341SAndroid Build Coastguard Worker    psrad                m1, 9
2688*c0909341SAndroid Build Coastguard Worker    psrad                m2, 9
2689*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
2690*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2691*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m7
2692*c0909341SAndroid Build Coastguard Worker    pavgw                m7, m1
2693*c0909341SAndroid Build Coastguard Worker    pminsw               m7, [tmp+16*5]
2694*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m7
2695*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m7
2696*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2697*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2698*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2699*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
2700*c0909341SAndroid Build Coastguard Worker    mov                srcq, [esp+4*61]
2701*c0909341SAndroid Build Coastguard Worker    mov                dstq, [esp+4*62]
2702*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2703*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2704*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*61], srcq
2705*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*62], dstq
2706*c0909341SAndroid Build Coastguard Worker%else
2707*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcmp
2708*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp
2709*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2710*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2711*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
2712*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq
2713*c0909341SAndroid Build Coastguard Worker%endif
2714*c0909341SAndroid Build Coastguard Worker    movzx                hd, ww
2715*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<16
2716*c0909341SAndroid Build Coastguard Worker%else
2717*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2718*c0909341SAndroid Build Coastguard Worker    mova                m15, [tmp+16*1]
2719*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m15, m1 ; a0
2720*c0909341SAndroid Build Coastguard Worker    pmaddwd             m15, m2      ; b0
2721*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*2]
2722*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2723*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; a1
2724*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2725*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7      ; b1
2726*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*3]
2727*c0909341SAndroid Build Coastguard Worker    paddd               m14, m3
2728*c0909341SAndroid Build Coastguard Worker    paddd               m15, m4
2729*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
2730*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7      ; a2
2731*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2732*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m7      ; b2
2733*c0909341SAndroid Build Coastguard Worker    paddd               m14, m5
2734*c0909341SAndroid Build Coastguard Worker    paddd               m15, m6
2735*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1+0]
2736*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+8]
2737*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2738*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         7, 5, 6, 10, [pd_512]
2739*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m7      ; 6 7
2740*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*0], m0
2741*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+0]
2742*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+8]
2743*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 5, 6, 10, [pd_512]
2744*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmp+16*0]
2745*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0      ; 7 8
2746*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7  ; 67
2747*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 78
2748*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, [tmp+16*4]
2749*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7      ; a3
2750*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m6, [tmp+16*4]
2751*c0909341SAndroid Build Coastguard Worker    paddd               m15, m7      ; b3
2752*c0909341SAndroid Build Coastguard Worker    psrad               m14, 9
2753*c0909341SAndroid Build Coastguard Worker    psrad               m15, 9
2754*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
2755*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2756*c0909341SAndroid Build Coastguard Worker    pmaxsw              m14, m7
2757*c0909341SAndroid Build Coastguard Worker    pavgw                m7, m14
2758*c0909341SAndroid Build Coastguard Worker    pminsw               m7, [tmp+16*5]
2759*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m7
2760*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m7
2761*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2762*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2763*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2764*c0909341SAndroid Build Coastguard Worker    add                  r7, 8
2765*c0909341SAndroid Build Coastguard Worker    add                  r8, 8
2766*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
2767*c0909341SAndroid Build Coastguard Worker    mov                srcq, r7
2768*c0909341SAndroid Build Coastguard Worker    mov                dstq, r8
2769*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
2770*c0909341SAndroid Build Coastguard Worker%endif
2771*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop0
2772*c0909341SAndroid Build Coastguard Worker    RET
2773*c0909341SAndroid Build Coastguard Worker%undef tmp
2774*c0909341SAndroid Build Coastguard Worker
; Per-target choice of temporary registers for the prep functions that follow.
; DECLARE_REG_TMP is defined outside this view; presumably it binds t0..t3 to
; the listed rN registers in order - TODO confirm. Under that reading some
; indices repeat within a row (4 on WIN64, 7 otherwise), so those temporaries
; would share a register on that target.
2775*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2776*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 2, 1, 6, 4
2777*c0909341SAndroid Build Coastguard Worker%elif WIN64
2778*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4, 7, 4
2779*c0909341SAndroid Build Coastguard Worker%else
2780*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7, 7, 8
2781*c0909341SAndroid Build Coastguard Worker%endif
2782*c0909341SAndroid Build Coastguard Worker
; PREP_8TAP_FN is shorthand for "FN prep_8tap, ...": each line below expands
; to one FN invocation for a (horizontal, vertical) filter-type combination.
; The three combinations involving SMOOTH pass prep_6tap_16bpc as an extra
; argument, while regular/regular passes none. FN is defined outside this
; view; presumably the extra argument names the function the entry point
; forwards to - TODO confirm against the FN macro.
2783*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap,
2784*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
2785*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
2786*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
2787*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular,        REGULAR, REGULAR
2788*c0909341SAndroid Build Coastguard Worker
2789*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
2790*c0909341SAndroid Build Coastguard Worker    %define            base  t2-prep_ssse3
2791*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2792*c0909341SAndroid Build Coastguard Worker    %define             mxb  r0b
2793*c0909341SAndroid Build Coastguard Worker    %define             mxd  r0
2794*c0909341SAndroid Build Coastguard Worker    %define             mxq  r0
2795*c0909341SAndroid Build Coastguard Worker    %define             myb  r2b
2796*c0909341SAndroid Build Coastguard Worker    %define             myd  r2
2797*c0909341SAndroid Build Coastguard Worker    %define             myq  r2
2798*c0909341SAndroid Build Coastguard Worker%endif
2799*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2800*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
2801*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2802*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 6tap_v, my, 4tap_v
2803*c0909341SAndroid Build Coastguard Worker    LEA                  t2, prep_ssse3
2804*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
2805*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2806*c0909341SAndroid Build Coastguard Worker    movifnidn          srcq, srcmp
2807*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2808*c0909341SAndroid Build Coastguard Worker    jnz .h
2809*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2810*c0909341SAndroid Build Coastguard Worker    jnz .v
2811*c0909341SAndroid Build Coastguard Worker.prep:
2812*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2813*c0909341SAndroid Build Coastguard Worker    mov                 myd, r7m ; bitdepth_max
2814*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+prep_ssse3_table+wq*2]
2815*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_8192]
2816*c0909341SAndroid Build Coastguard Worker    shr                 myd, 11
2817*c0909341SAndroid Build Coastguard Worker    add                  wq, t2
2818*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+prep_mul+myq*8]
2819*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
2820*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, tmpmp
2821*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2822*c0909341SAndroid Build Coastguard Worker%if WIN64
2823*c0909341SAndroid Build Coastguard Worker    pop                  r7
2824*c0909341SAndroid Build Coastguard Worker%endif
2825*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2826*c0909341SAndroid Build Coastguard Worker.h:
2827*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
2828*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2829*c0909341SAndroid Build Coastguard Worker    jnz .hv
2830*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, r2mp
2831*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+prep_8tap_1d_rnd]
2832*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2833*c0909341SAndroid Build Coastguard Worker    je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4
2834*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10
2835*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2836*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+mxq*8]
2837*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, r0mp
2838*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+spel_h_shufA]
2839*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
2840*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_h_shufB]
2841*c0909341SAndroid Build Coastguard Worker    add                srcq, wq
2842*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
2843*c0909341SAndroid Build Coastguard Worker    add                tmpq, wq
2844*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
2845*c0909341SAndroid Build Coastguard Worker    neg                  wq
2846*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2847*c0909341SAndroid Build Coastguard Worker    jnz .h_w8_12bpc
2848*c0909341SAndroid Build Coastguard Worker    psllw                m2, 2
2849*c0909341SAndroid Build Coastguard Worker.h_w8_12bpc:
2850*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m2, q0000
2851*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2852*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*2
2853*c0909341SAndroid Build Coastguard Worker    %define              m8  [rsp+16*0]
2854*c0909341SAndroid Build Coastguard Worker    %define              m9  [rsp+16*1]
2855*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q1111
2856*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q2222
2857*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
2858*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
2859*c0909341SAndroid Build Coastguard Worker%else
2860*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q1111
2861*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q2222
2862*c0909341SAndroid Build Coastguard Worker%endif
; .h_w8_loop0 / .h_w8_loop: horizontal-only prep path for widths other than 4.
; The outer loop walks rows; the inner loop walks one row in 16-byte output
; steps. Before the loop srcq and tmpq were advanced by the row size in bytes
; and wq was negated, so r6 runs from wq (negative) up to 0.
; m4/m6 are the pshufb masks, m7/m8/m9 the three pmaddwd multipliers, and m5
; the value loaded from prep_8tap_1d_rnd that is added before the shift.
2863*c0909341SAndroid Build Coastguard Worker.h_w8_loop0:
2864*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq ; restart the negative column offset for this row
2865*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
2866*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6-4]
2867*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6+8]
2868*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, m4  ; 01 12 23 34
2869*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m7      ; abcd0
2870*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6      ; 23 34 45 56
2871*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8, m3  ; abcd1
2872*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
2873*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m4  ; 67 78 89 9a
2874*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m1, 0x01; 45 56 67 78
2875*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8      ; efgh1
2876*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6      ; 89 9a ab bc
2877*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9      ; efgh2
2878*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
2879*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9 , m3 ; abcd2
2880*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; efgh0
2881*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
2882*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
2883*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
2884*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
2885*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
2886*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
2887*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1 ; abcd | efgh as eight 16-bit results
2888*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+r6], m0
2889*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
2890*c0909341SAndroid Build Coastguard Worker    jl .h_w8_loop ; continue until the offset reaches 0 (end of row)
2891*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
2892*c0909341SAndroid Build Coastguard Worker    sub                tmpq, wq ; wq is negative here, so this moves tmpq forward by one row of output
2893*c0909341SAndroid Build Coastguard Worker    dec                  hd
2894*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop0
2895*c0909341SAndroid Build Coastguard Worker    RET
; Vertical-only path: pick the vertical filter, widen its coefficients to
; words, and prime registers m1-m4 with interleaved row pairs 01/12/23/34 for
; the 4-pixel-wide column loop (.v_w4_loop). On x86-64, widths with any bit
; >= 8 set branch off to .v_w8 instead.
2896*c0909341SAndroid Build Coastguard Worker.v:
; my packs two filter indices: the low byte and bits 16 and up. The low-byte
; index is used when h < 6 (unsigned compare), otherwise the upper one.
2897*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2898*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2899*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2900*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
; m5 = 64-bit rounding constant duplicated into both halves; it is added to
; every accumulator before the final shift in the loops below.
2901*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+prep_8tap_1d_rnd]
; Table entries are 8 bytes apart; loading from byte offset 1 puts the six
; taps used here (entry bytes 1..6) into the low six bytes of m2.
2902*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+myq*8]
2903*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11, 16
2904*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, r2mp
2905*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, r0mp
; Duplicate each coefficient byte into a word, then shift arithmetically by 8
; so that each word holds the sign-extended coefficient.
2906*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
; srcq is moved up by two rows in total (this sub and the one after .v_12bpc).
2907*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
2908*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8 ; sign-extend
; Coefficients are scaled by 4 unless bit 0x800 of r7m is set.
; NOTE(review): the label name suggests the set bit marks 12-bit content
; (r7m presumably carries bitdepth_max) - confirm against the prototype.
2909*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
2910*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
2911*c0909341SAndroid Build Coastguard Worker    psllw                m2, 2
2912*c0909341SAndroid Build Coastguard Worker.v_12bpc:
2913*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
2914*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: every width is processed as 4-pixel columns by the same loop.
; m8/m9/m10 receive the three coefficient pairs (taps 0-1, 2-3, 4-5) through
; xmm temporaries. NOTE(review): m8-m10 are presumably %defined as stack
; slots outside this excerpt - confirm.
2915*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*4
2916*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q0000
; r6d = (w << 14) + h - (1 << 16): h sits in the low 16 bits and the bits
; above count the remaining 4-pixel columns (w/4 - 1).
2917*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
2918*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1111
2919*c0909341SAndroid Build Coastguard Worker    shl                 r6d, 14
2920*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q2222
2921*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [r6+hq-(1<<16)]
2922*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
2923*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
2924*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
; With a stack alignment below 16 the src/tmp save slots are redirected into
; the block allocated above.
2925*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
2926*c0909341SAndroid Build Coastguard Worker    %define           srcmp  [esp+16*3+4*0]
2927*c0909341SAndroid Build Coastguard Worker    %define           tmpmp  [esp+16*3+4*1]
2928*c0909341SAndroid Build Coastguard Worker%endif
; Start of one column pass: remember the column's src/tmp base pointers so
; they can be restored once the row loop has consumed h.
2929*c0909341SAndroid Build Coastguard Worker.v_w4_loop0:
2930*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
2931*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
2932*c0909341SAndroid Build Coastguard Worker%else
; x86-64: m8 = coefficient pair 0. Clearing the low three bits of w leaves
; non-zero only for w >= 8, which is handled by .v_w8; m9/m10 (pairs 1 and 2)
; are only needed on the narrow path that falls through.
2933*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q0000
2934*c0909341SAndroid Build Coastguard Worker    and                  wd, -8
2935*c0909341SAndroid Build Coastguard Worker    jnz .v_w8
2936*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q1111
2937*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m2, q2222
2938*c0909341SAndroid Build Coastguard Worker%endif
; Load 8 bytes (four words) from each of five consecutive rows and interleave
; neighbouring rows word-wise, so that one pmaddwd applies two taps at once.
2939*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
2940*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1]
2941*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2942*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
2943*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
2944*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2945*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
2946*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2      ; 01
2947*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3      ; 12
2948*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 23
2949*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0      ; 34
; Row loop of the 4-wide vertical filter. Each iteration produces two output
; rows (a and b). Every output is the sum of three pmaddwd products
; (coefficient pairs m8/m9/m10 applied to interleaved row pairs) plus the
; rounding constant m5, shifted right arithmetically by 4 and packed to words
; with signed saturation.
2950*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
2951*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m8, m1  ; a0
2952*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m8, m2  ; b0
; The old 23/34 pairs are copied to m1/m2 before being multiplied in place,
; so they serve as the 01/12 pairs of the next iteration.
2953*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2954*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
2955*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
2956*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
2957*c0909341SAndroid Build Coastguard Worker    paddd                m6, m3
; srcq still points at the newest row loaded so far; it is reloaded into m3
; here, followed by the next two rows.
2958*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
2959*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4
2960*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
2961*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2962*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
2963*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 45
2964*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0      ; 56
2965*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m3 ; a2
2966*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5
2967*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0
2968*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m4 ; b2
2969*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5
2970*c0909341SAndroid Build Coastguard Worker    paddd                m7, m0
2971*c0909341SAndroid Build Coastguard Worker    psrad                m6, 4
2972*c0909341SAndroid Build Coastguard Worker    psrad                m7, 4
2973*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
2974*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: row a goes to tmpq, row b to tmpq + wq*2 bytes; tmpq then advances
; by wq*4 bytes (two output rows).
2975*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+wq*0], m6
2976*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+wq*2], m6
2977*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
2978*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2979*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
; Column finished: restore the saved column base pointers, reload h from the
; low 16 bits of r6, step 8 bytes to the right in both buffers, and repeat
; while the column counter in the upper bits of r6d stays positive.
2980*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcmp
2981*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
2982*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w
2983*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2984*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
2985*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16
2986*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
2987*c0909341SAndroid Build Coastguard Worker    RET
2988*c0909341SAndroid Build Coastguard Worker%else
; x86-64 narrow path: both output rows are written back to back with one
; aligned 16-byte store.
2989*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m6
2990*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
2991*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2992*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2993*c0909341SAndroid Build Coastguard Worker    RET
; x86-64 wide path (entered from .v when w & -8 is non-zero): processes the
; block in 8-pixel columns, two output rows per inner iteration.
2994*c0909341SAndroid Build Coastguard Worker.v_w8:
; m8 is reused for row data below, so coefficient pair 0 is parked in the
; r6m slot and reloaded into m3 at the top of every inner iteration.
2995*c0909341SAndroid Build Coastguard Worker    mova                r6m, m8
; r6d = (w*4 - 32)*8 + h = ((w/8 - 1) << 8) + h: h in the low byte, the
; number of remaining 8-pixel columns above it.
2996*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*4-(1<<5)]
2997*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m2, q1111
2998*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*8]
2999*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m2, q2222
; NOTE(review): WIN64_PUSH_XMM is defined outside this excerpt; presumably
; it saves the callee-saved xmm registers used below on Win64 - confirm.
3000*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       16
; Column setup: r5 walks the source rows and r7 the output rows, so srcq and
; tmpq keep the column base. Five rows of 16 bytes are loaded and adjacent
; rows interleaved into low (m8/m10/m12/m14) and high (m9/m11/m13/m15) halves.
3001*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
3002*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+ssq*0]
3003*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
3004*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ssq*1]
3005*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3006*c0909341SAndroid Build Coastguard Worker    movu                m13, [r5+ssq*0]
3007*c0909341SAndroid Build Coastguard Worker    movu                m15, [r5+ssq*1]
3008*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3009*c0909341SAndroid Build Coastguard Worker    movu                 m4, [r5+ssq*0]
3010*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m11  ; 01
3011*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11
3012*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m13 ; 12
3013*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m13
3014*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 23
3015*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15
3016*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m4  ; 34
3017*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4
; Inner row loop: same three-product accumulation as the 4-wide loop, done
; for the low half and the high (primed) half of two output rows.
3018*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
3019*c0909341SAndroid Build Coastguard Worker    mova                 m3, r6m
3020*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8, m3   ; a0
3021*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3   ; a0'
3022*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m10, m3  ; b0
3023*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11      ; b0'
; Shift the row window: the 23/34 pairs become the next iteration's 01/12.
3024*c0909341SAndroid Build Coastguard Worker    mova                 m8, m12
3025*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m6       ; a1
3026*c0909341SAndroid Build Coastguard Worker    mova                 m9, m13
3027*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m6       ; a1'
3028*c0909341SAndroid Build Coastguard Worker    mova                m10, m14
3029*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m6       ; b1
3030*c0909341SAndroid Build Coastguard Worker    mova                m11, m15
3031*c0909341SAndroid Build Coastguard Worker    pmaddwd             m15, m6       ; b1'
3032*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
3033*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
3034*c0909341SAndroid Build Coastguard Worker    movu                m13, [r5+ssq*0]
3035*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
3036*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
3037*c0909341SAndroid Build Coastguard Worker    movu                m15, [r5+ssq*1]
3038*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3039*c0909341SAndroid Build Coastguard Worker    movu                 m4, [r5+ssq*0]
3040*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m2, m1, m3
3041*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 45
3042*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15
3043*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m4  ; 56
3044*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4
3045*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m12  ; a2
3046*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3047*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m13  ; a2'
3048*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
3049*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m14  ; b2
3050*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
3051*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, m15  ; b2'
3052*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4
3053*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 4}, m0, m2, m1, m3
3054*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
3055*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
; Row a at r7, row b at r7 + wq*2 bytes; r7 advances by two output rows.
3056*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], m0
3057*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*2], m1
3058*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*4]
3059*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3060*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
; Next 8-pixel column: move both base pointers 16 bytes right, reload h from
; the low byte of r6 and decrement the column counter held above it.
3061*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
3062*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3063*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
3064*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
3065*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
3066*c0909341SAndroid Build Coastguard Worker    RET
3067*c0909341SAndroid Build Coastguard Worker%endif
; Combined horizontal+vertical path. Widths with any bit >= 8 set go to
; .hv_w8; the code below handles the narrow case: it prepares both coefficient
; sets, horizontally filters the first five rows and interleaves them into the
; row pairs 01/12/23/34 consumed by .hv_w4_loop.
3068*c0909341SAndroid Build Coastguard Worker.hv:
3069*c0909341SAndroid Build Coastguard Worker    and                  wd, -8
3070*c0909341SAndroid Build Coastguard Worker    jnz .hv_w8
; Horizontal filter: indexed by the low byte of mx, 8 bytes per table entry.
3071*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
3072*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+mxq*8]
; Vertical filter index: low byte of my when h < 6, otherwise my >> 16.
3073*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3074*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3075*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3076*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
3077*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+myq*8]
3078*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      15
3079*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, r2mp
3080*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, r0mp
; m7 = rounding constant added to the vertical sums in .hv_w4_loop.
3081*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+prep_8tap_2d_rnd]
3082*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
; Keep only words 1 and 2 of the horizontal entry (its bytes 2..5, i.e. four
; taps), then unpack them into the high byte of each word and shift right
; arithmetically: m6 = coefficient << 4, sign preserved.
3083*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q2121
3084*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
3085*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0
; Vertical coefficients: byte duplicated into a word, then sign-extended.
3086*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
3087*c0909341SAndroid Build Coastguard Worker    psraw                m6, 4
3088*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
; When bit 0x800 of r7m is set the horizontal coefficients are scaled down by
; a further factor of 4. NOTE(review): the label name suggests the clear bit
; means 10-bit content (r7m presumably bitdepth_max) - confirm.
3089*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3090*c0909341SAndroid Build Coastguard Worker    jz .hv_w4_10bpc
3091*c0909341SAndroid Build Coastguard Worker    psraw                m6, 2
3092*c0909341SAndroid Build Coastguard Worker.hv_w4_10bpc:
3093*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: regs_used is overridden around ALLOC_STACK and then set to 7.
; m10/m12/m13 are mapped to stack slots and m14/m11 directly to the shuffle
; tables in memory. NOTE(review): m8/m9 are presumably %defined as stack
; slots outside this excerpt - confirm.
3094*c0909341SAndroid Build Coastguard Worker%assign regs_used 4
3095*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*7
3096*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
3097*c0909341SAndroid Build Coastguard Worker    %define             m10  [esp+16*3]
3098*c0909341SAndroid Build Coastguard Worker    %define             m12  [esp+16*5]
3099*c0909341SAndroid Build Coastguard Worker    %define             m13  [esp+16*6]
3100*c0909341SAndroid Build Coastguard Worker    %define             m14  [base+spel_h_shufA]
3101*c0909341SAndroid Build Coastguard Worker    %define             m11  [base+spel_h_shufB]
; m8/m9/m10 = vertical coefficient pairs 0/1/2, m12/m13 = horizontal
; coefficient pairs 0/1, all staged through xmm temporaries.
3102*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q0000
3103*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1111
3104*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q2222
3105*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q0000
3106*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1111
3107*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
3108*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
3109*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
3110*c0909341SAndroid Build Coastguard Worker    mova                m12, m5
3111*c0909341SAndroid Build Coastguard Worker    mova                m13, m6
; ssq is negated temporarily to fetch the two rows above srcq, then restored.
3112*c0909341SAndroid Build Coastguard Worker    neg                 ssq
3113*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
3114*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]
3115*c0909341SAndroid Build Coastguard Worker    neg                 ssq
3116*c0909341SAndroid Build Coastguard Worker%else
; x86-64: same register roles, with r6 = -ssq used to reach the two rows
; above srcq and the shuffle masks held in m14/m11.
3117*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
3118*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q0000
3119*c0909341SAndroid Build Coastguard Worker    neg                  r6
3120*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q1111
3121*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6 *2]
3122*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m2, q2222
3123*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r6 *1]
3124*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m6, q0000
3125*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+spel_h_shufA]
3126*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m6, q1111
3127*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+spel_h_shufB]
3128*c0909341SAndroid Build Coastguard Worker%endif
; Remaining three rows of the initial five: srcq, srcq+ss, srcq+2*ss. srcq is
; left pointing at the last of them.
3129*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
3130*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1]
3131*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3132*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
; NOTE(review): HV_H_W4_6TAP is defined outside this excerpt; from its use
; here it appears to take (dst, src, tmp, shuffle) and to produce the
; horizontally filtered row as dwords - confirm against the macro.
3133*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m3, m3, m5, m11
3134*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m4, m4, m5, m11
3135*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m5, m1, m5, m11
3136*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m0, m0, m1, m11
3137*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m2, m2, m1, m11
; Scale the five filtered rows down by 6 bits, pack them pairwise to words and
; interleave neighbouring rows for the vertical pmaddwd stage.
3138*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 6}, m3, m5, m4, m0, m2
3139*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m5      ; 0 2
3140*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0      ; 1 3
3141*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m2      ; 2 4
3142*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4  ; 01
3143*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4      ; 23
3144*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m5  ; 12
3145*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
; Row loop of the narrow 2-D filter. Each iteration fetches two new source
; rows, filters them horizontally, and emits two output rows (a and b) as the
; sum of three vertical pmaddwd products plus the rounding constant m7,
; shifted right by 6 and packed to words with signed saturation.
3146*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3147*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1]
3148*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0
3149*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3150*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m8, m2  ; b0
; The old 23/34 pairs are copied to m1/m2 before being multiplied in place,
; so they serve as the 01/12 pairs of the next iteration.
3151*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3152*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
3153*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3154*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
3155*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3156*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0]
3157*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
; Horizontal pass over the two new rows (macro defined outside this excerpt;
; m4 is passed in the position used for a scratch register above).
3158*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m0, m0, m4, m11
3159*c0909341SAndroid Build Coastguard Worker    HV_H_W4_6TAP         m3, m3, m4, m11
; m2 now holds the interleaved 34 pair; an arithmetic shift by 16 leaves the
; high word of each dword (row 4) sign-extended, ready to be packed with the
; new row 5.
3160*c0909341SAndroid Build Coastguard Worker    psrad                m4, m2, 16
3161*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
3162*c0909341SAndroid Build Coastguard Worker    psrad                m3, 6
3163*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0      ; 4 5
3164*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3      ; 5 6
3165*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0  ; 45
3166*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 56
3167*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m3 ; a2
3168*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
3169*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0
3170*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m10, m4 ; b2
3171*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
3172*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0
3173*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3174*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
3175*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
; Both output rows are stored back to back with one aligned 16-byte store.
3176*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m5
3177*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3178*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3179*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
3180*c0909341SAndroid Build Coastguard Worker    RET
3181*c0909341SAndroid Build Coastguard Worker.hv_w8:
3182*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
3183*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3184*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+1+mxq*8]
3185*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3186*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3187*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3188*c0909341SAndroid Build Coastguard Worker    cmovb               myd, mxd
3189*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+1+myq*8]
3190*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, r2mp
3191*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+prep_8tap_2d_rnd]
3192*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3193*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
3194*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
3195*c0909341SAndroid Build Coastguard Worker    sub                srcq, 4
3196*c0909341SAndroid Build Coastguard Worker    psraw                m0, 4
3197*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
3198*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3199*c0909341SAndroid Build Coastguard Worker    jz .hv_w8_10bpc
3200*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3201*c0909341SAndroid Build Coastguard Worker.hv_w8_10bpc:
3202*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3203*c0909341SAndroid Build Coastguard Worker%assign regs_used 1
3204*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*9
3205*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
3206*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r0mp
3207*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*7], m4
3208*c0909341SAndroid Build Coastguard Worker%else
3209*c0909341SAndroid Build Coastguard Worker%if WIN64
3210*c0909341SAndroid Build Coastguard Worker    PUSH                 r8
3211*c0909341SAndroid Build Coastguard Worker%assign regs_used 9
3212*c0909341SAndroid Build Coastguard Worker%endif
3213*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*6, 16
3214*c0909341SAndroid Build Coastguard Worker%endif
3215*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
3216*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m2
3217*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
3218*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
3219*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q2222
3220*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
3221*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q0000
3222*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m2
3223*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q1111
3224*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m2
3225*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q2222
3226*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m1
3227*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
3228*c0909341SAndroid Build Coastguard Worker    neg                  r6
3229*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3230*c0909341SAndroid Build Coastguard Worker    mov                 r5d, wd
3231*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 14
3232*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r5+hq-(1<<16)]
3233*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
3234*c0909341SAndroid Build Coastguard Worker    %define           srcmp  [esp+16*8+4*0]
3235*c0909341SAndroid Build Coastguard Worker    %define           tmpmp  [esp+16*8+4*1]
3236*c0909341SAndroid Build Coastguard Worker%endif
3237*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
3238*c0909341SAndroid Build Coastguard Worker    mov               srcmp, srcq
3239*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
3240*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+r6*2+0]
3241*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r6*2+2]
3242*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*0]
3243*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*1]
3244*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*2]
3245*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m2, m5, m6, m7, m1, m0
3246*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+r6*1+0]
3247*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r6*1+2]
3248*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m3, m5, m6, m7, m1, m0
3249*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+0]
3250*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0+2]
3251*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m4, m5, m6, m7, m1, m0
3252*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+0]
3253*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1+2]
3254*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3255*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m0, m5, m6, m7, m1
3256*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+0]
3257*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0+2]
3258*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m1, m5, m6, m7
3259*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*7]
3260*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
3261*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 6 }, m2, m4, m3, m0, m1
3262*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m4     ; 0 2
3263*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m0     ; 1 3
3264*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m1     ; 2 4
3265*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2, m3 ; 01
3266*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3     ; 23
3267*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4 ; 12
3268*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; 34
3269*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3270*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*3]
3271*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*4]
3272*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m5 ; a0
3273*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1     ; b0
3274*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
3275*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6     ; a1
3276*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3277*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6     ; b1
3278*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3279*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+0]
3280*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3281*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+2]
3282*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3283*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m6, m2, m3
3284*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+0]
3285*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0+2]
3286*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m7, m2, m3
3287*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+16*7]
3288*c0909341SAndroid Build Coastguard Worker    psrad                m3, m1, 16
3289*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m2}, m6, m7, m4, m5
3290*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
3291*c0909341SAndroid Build Coastguard Worker    psrad                m7, 6
3292*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m6     ; 4 5
3293*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7     ; 5 6
3294*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*5]
3295*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m6 ; 45
3296*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m6     ; 56
3297*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m7 ; a2
3298*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3     ; b2
3299*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
3300*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
3301*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
3302*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3303*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3304*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+wq*0], m4
3305*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+wq*2], m4
3306*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
3307*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3308*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
3309*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcmp
3310*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
3311*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5w
3312*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
3313*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3314*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<16
3315*c0909341SAndroid Build Coastguard Worker%else
3316*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [wq*4-(1<<5)]
3317*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [hq+r8*8]
3318*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
3319*c0909341SAndroid Build Coastguard Worker    mova                 m5, [spel_h_shufA]
3320*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*2+ 0]
3321*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*0]
3322*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*2+ 8]
3323*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*1]
3324*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*2+16]
3325*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+16*2]
3326*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m9, m0, m1, m2, 6, m5, m6, m7, m8
3327*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6*1+ 0]
3328*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6*1+ 8]
3329*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6*1+16]
3330*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
3331*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m11, m0, m1, m2, 6, m5, m6, m7, m8
3332*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+ 0]
3333*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+ 8]
3334*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0+16]
3335*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3336*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m13, m0, m1, m2, 6, m5, m6, m7, m8
3337*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1+ 0]
3338*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1+ 8]
3339*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+16]
3340*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m15, m0, m1, m2, 6, m5, m6, m7, m8
3341*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r5+ssq*0+ 0]
3342*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r5+ssq*0+ 8]
3343*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r5+ssq*0+16]
3344*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m5, m0, m1, m2, 6, m5, m6, m7, m8
3345*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m11  ; 01
3346*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11
3347*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m13 ; 12
3348*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m13
3349*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 23
3350*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15
3351*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m5  ; 34
3352*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m5
3353*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3354*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*3]
3355*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*4]
3356*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8, m3   ; a0
3357*c0909341SAndroid Build Coastguard Worker    mova                 m8, m12
3358*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3   ; a0'
3359*c0909341SAndroid Build Coastguard Worker    mova                 m9, m13
3360*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m10, m3  ; b0
3361*c0909341SAndroid Build Coastguard Worker    mova                m10, m14
3362*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11      ; b0'
3363*c0909341SAndroid Build Coastguard Worker    mova                m11, m15
3364*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
3365*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r5+ssq*1+ 0]
3366*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
3367*c0909341SAndroid Build Coastguard Worker    movu                 m7, [r5+ssq*1+ 8]
3368*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
3369*c0909341SAndroid Build Coastguard Worker    movu                m12, [r5+ssq*1+16]
3370*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
3371*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3372*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
3373*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m15, m6, m7, m12, 6
3374*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r5+ssq*0+ 0]
3375*c0909341SAndroid Build Coastguard Worker    movu                 m7, [r5+ssq*0+ 8]
3376*c0909341SAndroid Build Coastguard Worker    movu                m14, [r5+ssq*0+16]
3377*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m5, m15 ; 45
3378*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m5, m15
3379*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m5, m6, m7, m14, 6
3380*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*5]
3381*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m4}, m0, m2, m1, m3
3382*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m5  ; 56
3383*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m5
3384*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m12, m7  ; a2
3385*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6
3386*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13, m7  ; a2'
3387*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
3388*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m14, m7  ; b2
3389*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15      ; b2'
3390*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6
3391*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7
3392*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 6}, m0, m2, m1, m3
3393*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
3394*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
3395*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*0], m0
3396*c0909341SAndroid Build Coastguard Worker    mova          [r7+wq*2], m1
3397*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+wq*4]
3398*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3399*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
3400*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
3401*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3402*c0909341SAndroid Build Coastguard Worker    movzx                hd, r8b
3403*c0909341SAndroid Build Coastguard Worker    sub                 r8d, 1<<8
3404*c0909341SAndroid Build Coastguard Worker%endif
3405*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
3406*c0909341SAndroid Build Coastguard Worker    RET
3407*c0909341SAndroid Build Coastguard Worker
; Instantiate PREP_8TAP_FN (macro defined outside this excerpt) once for each
; filter-type pairing that involves SHARP. The arguments are: entry-point name
; suffix, first filter type, second filter type, and - for the first four -
; the symbol prep_8tap_16bpc.
; NOTE(review): the final "sharp" instantiation omits the fourth argument;
; presumably that entry point falls through into the cglobal that follows
; directly below instead of jumping to it - confirm against the macro.
3408*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
3409*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
3410*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
3411*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
3412*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp,          SHARP,   SHARP
3413*c0909341SAndroid Build Coastguard Worker
; prep_8tap_16bpc - shared entry point and dispatcher.
; Named arguments (from the cglobal line): tmp, src, ss, w, h, mx, my.
; Dispatch performed by this section:
;   (mxd & 0xf00) != 0                -> .h
;   else (myd & 0xf00) != 0           -> falls through into .v
;   else                              -> jumps to the .prep label of the
;                                        prep_6tap_16bpc ssse3 function
; NOTE(review): t0d/t1d/t2 are set up outside this excerpt (presumably by the
; PREP_8TAP_FN entry stubs) - confirm their contents there.
3414*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
3415*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; On x86-32, mx/my share r0/r2 and the names m8-m15 are redirected to eight
; 16-byte stack slots at [esp+16*0] .. [esp+16*7].
3416*c0909341SAndroid Build Coastguard Worker    %define             mxb  r0b
3417*c0909341SAndroid Build Coastguard Worker    %define             mxd  r0
3418*c0909341SAndroid Build Coastguard Worker    %define             mxq  r0
3419*c0909341SAndroid Build Coastguard Worker    %define             myb  r2b
3420*c0909341SAndroid Build Coastguard Worker    %define             myd  r2
3421*c0909341SAndroid Build Coastguard Worker    %define             myq  r2
3422*c0909341SAndroid Build Coastguard Worker    %define              m8  [esp+16*0]
3423*c0909341SAndroid Build Coastguard Worker    %define              m9  [esp+16*1]
3424*c0909341SAndroid Build Coastguard Worker    %define             m10  [esp+16*2]
3425*c0909341SAndroid Build Coastguard Worker    %define             m11  [esp+16*3]
3426*c0909341SAndroid Build Coastguard Worker    %define             m12  [esp+16*4]
3427*c0909341SAndroid Build Coastguard Worker    %define             m13  [esp+16*5]
3428*c0909341SAndroid Build Coastguard Worker    %define             m14  [esp+16*6]
3429*c0909341SAndroid Build Coastguard Worker    %define             m15  [esp+16*7]
3430*c0909341SAndroid Build Coastguard Worker%endif
; Replicate mx / my into three bytes (x * 0x010101) and add t0d / t1d, giving
; the per-byte layout named in the trailing comments below.
3431*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
3432*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3433*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
3434*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
; NOTE(review): t2 presumably serves as the base for the "base+..." constant
; addressing used further down - confirm where "base" is defined.
3435*c0909341SAndroid Build Coastguard Worker    LEA                  t2, prep_ssse3
3436*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
3437*c0909341SAndroid Build Coastguard Worker    movifnidn          srcq, srcmp
; Bits 8-11 sit in the middle ("mx" / "my") byte of the layout above.
3438*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
3439*c0909341SAndroid Build Coastguard Worker    jnz .h
3440*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3441*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3442*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep
; .v - vertical-only 8-tap path of prep_8tap_16bpc.
; Works on columns of 4 pixels (8 bytes per movq) and produces two output rows
; per inner-loop iteration. Each output dword is
;   (sum of four pmaddwd tap-pair products + m7) >> 4
; and the results are packed to signed words and written to tmp with a row
; pitch of w*2 bytes.
3443*c0909341SAndroid Build Coastguard Worker.v:
; Filter selection: mxd = low byte of myd, myd = myd >> 16; when h == 4 the
; low-byte index is used instead (cmove).
3444*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3445*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3446*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3447*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
; Load the 8 one-byte coefficients of the selected filter.
3448*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+myq*8]
3449*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      15
; m7 = rounding constant added to every accumulator before the final shift.
3450*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+prep_8tap_1d_rnd]
3451*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, r2mp
3452*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, r0mp
3453*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
3454*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8 ; sign-extend
; Coefficients are multiplied by 4 unless bit 0x800 of r7m is set.
; NOTE(review): r7m is presumably bitdepth_max (0x800 set => 12-bit, as the
; label name suggests) - confirm against the C prototype.
3455*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3456*c0909341SAndroid Build Coastguard Worker    jnz .v_12bpc
3457*c0909341SAndroid Build Coastguard Worker    psllw                m3, 2
3458*c0909341SAndroid Build Coastguard Worker.v_12bpc:
; Broadcast each coefficient pair (taps 01, 23, 45, 67) into m8..m11.
; On x86-32 m8-m11 are stack slots, so the values go through m0-m3 first.
3459*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3460*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*7
3461*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
3462*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
3463*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
3464*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
3465*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
3466*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
3467*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
3468*c0909341SAndroid Build Coastguard Worker    mova                m11, m3
3469*c0909341SAndroid Build Coastguard Worker%else
3470*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m3, q0000
3471*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m3, q1111
3472*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m3, q2222
3473*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m3, q3333
3474*c0909341SAndroid Build Coastguard Worker%endif
; Start three rows above the block: src -= 3*ss.
3475*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
3476*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
; r6 keeps the original w (used as the tmp pitch: r6*2 bytes per row).
; wd is repacked as (w*64 - 256) + h: the low byte holds h and the bits above
; it count the remaining 4-pixel columns (restored by "movzx hd, wb" and
; decremented by "sub wd, 1<<8" at the bottom of .v_loop0).
3477*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
3478*c0909341SAndroid Build Coastguard Worker    shl                  wd, 6
; r5 = src address at the top of the current column.
3479*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
; The tmp address at the top of the current column is kept in r7 on x86-64;
; on x86-32 with STACK_ALIGNMENT < 16 it is kept in a stack dword instead.
3480*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3481*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3482*c0909341SAndroid Build Coastguard Worker%elif STACK_ALIGNMENT < 16
3483*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*29], tmpq
3484*c0909341SAndroid Build Coastguard Worker%endif
3485*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<8)]
; Per-column prologue: load rows 0-6 and interleave vertically adjacent rows
; into word pairs so that one pmaddwd applies two taps at once.
3486*c0909341SAndroid Build Coastguard Worker.v_loop0:
3487*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
3488*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1]
3489*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3490*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
3491*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
3492*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3493*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+ssq*0]
3494*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*1]
3495*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3496*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
3497*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2      ; 01
3498*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3      ; 12
3499*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 23
3500*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5      ; 34
3501*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6      ; 45
3502*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0      ; 56
3503*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32 variant: accumulates in m1/m2. The row pairs needed by the next
; iteration (23, 34, 45) are parked in the m12-m14 stack slots and reloaded
; at .v_loop; the first iteration skips the reload.
3504*c0909341SAndroid Build Coastguard Worker    jmp .v_loop_start
3505*c0909341SAndroid Build Coastguard Worker.v_loop:
3506*c0909341SAndroid Build Coastguard Worker    mova                 m1, m12
3507*c0909341SAndroid Build Coastguard Worker    mova                 m2, m13
3508*c0909341SAndroid Build Coastguard Worker    mova                 m3, m14
3509*c0909341SAndroid Build Coastguard Worker.v_loop_start:
3510*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8      ; a0
3511*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8      ; b0
3512*c0909341SAndroid Build Coastguard Worker    mova                m12, m3
3513*c0909341SAndroid Build Coastguard Worker    mova                m13, m4
3514*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
3515*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
3516*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3517*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
3518*c0909341SAndroid Build Coastguard Worker    mova                m14, m5
3519*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3520*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
3521*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
3522*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
3523*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
; Fetch the next two source rows; m0 always holds the newest row loaded.
3524*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*1]
3525*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3526*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m0, m6  ; 67
3527*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
3528*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11, m5 ; a3
3529*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0      ; 78
3530*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
3531*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3532*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11, m6 ; b3
3533*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7
3534*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
3535*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
3536*c0909341SAndroid Build Coastguard Worker    psrad                m2, 4
; Low half = output row a, high half = output row b.
3537*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
3538*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+r6*0], m1
3539*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+r6*2], m1
3540*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r6*4]
3541*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3542*c0909341SAndroid Build Coastguard Worker    jg .v_loop
; Step to the next 4-pixel column: both column-top addresses advance by
; 8 bytes. The tmp column top lives in the stack dword or in tmpmp, depending
; on STACK_ALIGNMENT.
3543*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
3544*c0909341SAndroid Build Coastguard Worker    mov                tmpq, [esp+4*29]
3545*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3546*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3547*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
3548*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*29], tmpq
3549*c0909341SAndroid Build Coastguard Worker%else
3550*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
3551*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3552*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3553*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
3554*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
3555*c0909341SAndroid Build Coastguard Worker%endif
3556*c0909341SAndroid Build Coastguard Worker%else
; x86-64 variant: accumulates in m12/m13 with m14 as scratch, so the row
; pairs simply shift down in place (m1/m2 <- 23/34, m3/m4 <- 45/56).
3557*c0909341SAndroid Build Coastguard Worker.v_loop:
3558*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m8, m1  ; a0
3559*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, m8, m2  ; b0
3560*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3561*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3562*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
3563*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
3564*c0909341SAndroid Build Coastguard Worker    paddd               m12, m3
3565*c0909341SAndroid Build Coastguard Worker    paddd               m13, m4
3566*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3567*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3568*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m10     ; a2
3569*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10     ; b2
3570*c0909341SAndroid Build Coastguard Worker    paddd               m12, m5
3571*c0909341SAndroid Build Coastguard Worker    paddd               m13, m6
; Fetch the next two source rows; m0 always holds the newest row loaded.
3572*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*1]
3573*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3574*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m0, m6  ; 67
3575*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
3576*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m5 ; a3
3577*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0      ; 78
3578*c0909341SAndroid Build Coastguard Worker    paddd               m12, m7
3579*c0909341SAndroid Build Coastguard Worker    paddd               m12, m14
3580*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m11, m6 ; b3
3581*c0909341SAndroid Build Coastguard Worker    paddd               m13, m7
3582*c0909341SAndroid Build Coastguard Worker    paddd               m13, m14
3583*c0909341SAndroid Build Coastguard Worker    psrad               m12, 4
3584*c0909341SAndroid Build Coastguard Worker    psrad               m13, 4
; Low half = output row a, high half = output row b.
3585*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
3586*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+r6*0], m12
3587*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+r6*2], m12
3588*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r6*4]
3589*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3590*c0909341SAndroid Build Coastguard Worker    jg .v_loop
; Step to the next 4-pixel column (8 bytes in both src and tmp).
3591*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3592*c0909341SAndroid Build Coastguard Worker    add                  r7, 8
3593*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
3594*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
3595*c0909341SAndroid Build Coastguard Worker%endif
; Reload h from the low byte of wd and count down the column field above it.
3596*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
3597*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
3598*c0909341SAndroid Build Coastguard Worker    jg .v_loop0
3599*c0909341SAndroid Build Coastguard Worker    RET
; .h - reached when (mxd & 0xf00) != 0. Hands off to .hv when the same bits
; are also set in myd; otherwise runs the horizontal-only filter, choosing
; .h_w4 for w == 4 and .h_w8 for every other width.
3600*c0909341SAndroid Build Coastguard Worker.h:
; NOTE(review): RESET_STACK_STATE is an x86inc macro; presumably it discards
; the assembler-side stack bookkeeping left by the .v path above, which is
; assembled earlier but not executed on this path - confirm in x86inc.
3601*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
3602*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3603*c0909341SAndroid Build Coastguard Worker    jnz .hv
3604*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, r2mp
3605*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, r4m
; m5 = rounding constant added to each accumulator before the >> 4.
3606*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+prep_8tap_1d_rnd]
3607*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3608*c0909341SAndroid Build Coastguard Worker    jne .h_w8
; .h_w4 - 4-pixel-wide case. Only the two middle coefficient pairs of the
; filter (dwords 1 and 2 of the 8-coefficient row) are applied, i.e. a 4-tap
; filter, to two rows per iteration.
3609*c0909341SAndroid Build Coastguard Worker.h_w4:
; Filter index = low byte of mxd ("4tap_h" per the layout comment at the
; function entry).
3610*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
3611*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+mxq*8]
3612*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+spel_h_shufA]
3613*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+spel_h_shufB]
3614*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, tmpmp
; Back up one 16-bit pixel so the 4-tap window starts one pixel to the left.
3615*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
3616*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
; Sign-extend the byte coefficients to words.
3617*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3618*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
; Coefficients are multiplied by 4 unless bit 0x800 of r7m is set.
; NOTE(review): r7m is presumably bitdepth_max (0x800 set => 12-bit, as the
; label name suggests) - confirm against the C prototype.
3619*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3620*c0909341SAndroid Build Coastguard Worker    jnz .h_w4_12bpc
3621*c0909341SAndroid Build Coastguard Worker    psllw                m0, 2
3622*c0909341SAndroid Build Coastguard Worker.h_w4_12bpc:
; m6 = coefficient pair 1 (taps 2,3), m7 = coefficient pair 2 (taps 4,5).
3623*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q1111
3624*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m0, q2222
; Each iteration filters two source rows and writes 16 bytes (2 rows x 4
; words) contiguously to tmp.
3625*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
3626*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
3627*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
3628*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3629*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
3630*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
3631*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
3632*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m7
3633*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
3634*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
; Same shuffle / multiply-accumulate sequence for the second row.
3635*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m3
3636*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
3637*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6
3638*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7
3639*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
3640*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
3641*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
3642*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
; Low half = first row, high half = second row.
3643*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3644*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
3645*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3646*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3647*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
3648*c0909341SAndroid Build Coastguard Worker    RET
3649*c0909341SAndroid Build Coastguard Worker.h_w8:
3650*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
3651*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3652*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+mxq*8]
3653*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+spel_h_shufA]
3654*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_h_shufB]
3655*c0909341SAndroid Build Coastguard Worker    movifnidn          tmpq, r0mp
3656*c0909341SAndroid Build Coastguard Worker    add                  wd, wd
3657*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
3658*c0909341SAndroid Build Coastguard Worker    add                srcq, wq
3659*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
3660*c0909341SAndroid Build Coastguard Worker    add                tmpq, wq
3661*c0909341SAndroid Build Coastguard Worker    neg                  wq
3662*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3663*c0909341SAndroid Build Coastguard Worker    jnz .h_w8_12bpc
3664*c0909341SAndroid Build Coastguard Worker    psllw                m2, 2
3665*c0909341SAndroid Build Coastguard Worker.h_w8_12bpc:
3666*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m2, q0000
3667*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3668*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*3
3669*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m2, q1111
3670*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q2222
3671*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q3333
3672*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
3673*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
3674*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
3675*c0909341SAndroid Build Coastguard Worker%else
3676*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m2, q1111
3677*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m2, q2222
3678*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m2, q3333
3679*c0909341SAndroid Build Coastguard Worker%endif
; Horizontal-only filter, 8 output words (one 16-byte store) per inner
; iteration, one source row per outer iteration.
;   r6      byte offset within the row; starts at wq and counts up by 16 until
;           it is no longer negative (wq is negated just before this loop).
;   m4, m6  byte-shuffle masks producing the overlapping pixel pairs annotated
;           on the pshufb lines below.
;   m7..m10 the four broadcast dwords of the coefficient vector (q0000..q3333).
;   m5      added once to each accumulator before the shift; it is set up
;           before this view - presumably the rounding/bias term, TODO confirm.
; The "abcdN"/"efghN" annotations name the partial sum for outputs 0-3 / 4-7
; using coefficient dword N.
3680*c0909341SAndroid Build Coastguard Worker.h_w8_loop0:
3681*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq
3682*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
3683*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6- 6]
3684*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+ 2]
3685*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m4  ; 0 1 1 2 2 3 3 4
3686*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6      ; 2 3 3 4 4 5 5 6
3687*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7      ; abcd0
3688*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8      ; abcd1
3689*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1, m4  ; 4 5 5 6 6 7 7 8
3690*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6      ; 6 7 7 8 8 9 9 a
3691*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
3692*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
3693*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3  ; abcd2
3694*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; efgh0
3695*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
3696*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m10, m1 ; abcd3
3697*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8      ; efgh1
3698*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
; Third load starts 8 bytes after the second one, so the same two masks yield
; pairs starting at pixel 8 (m4) and pixel a (m6).
3699*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6+10]
3700*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
3701*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3702*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2, m4  ; 8 9 9 a a b b c
3703*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6      ; a b b c c d d e
3704*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; efgh2
3705*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m10     ; efgh3
3706*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3707*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
; Scale both halves down by 4 bits and narrow to 8 signed words.
3708*c0909341SAndroid Build Coastguard Worker    psrad                m0, 4
3709*c0909341SAndroid Build Coastguard Worker    psrad                m1, 4
3710*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3711*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+r6], m0
3712*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
3713*c0909341SAndroid Build Coastguard Worker    jl .h_w8_loop
; Next row: step the source by one stride; subtracting the (negative) wq
; moves tmpq forward by one row of output.
3714*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
3715*c0909341SAndroid Build Coastguard Worker    sub                tmpq, wq
3716*c0909341SAndroid Build Coastguard Worker    dec                  hd
3717*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop0
3718*c0909341SAndroid Build Coastguard Worker    RET
; .hv: one-time setup for the combined horizontal + vertical path. It selects
; the two filter rows, widens their coefficients to words, parks the broadcast
; coefficient pairs where the loops expect them, and packs the loop counters.
3719*c0909341SAndroid Build Coastguard Worker.hv:
3720*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
; Horizontal filter index: low byte of mx when w == 4, otherwise mx >> 16.
3721*c0909341SAndroid Build Coastguard Worker    movzx               t3d, mxb
3722*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3723*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3724*c0909341SAndroid Build Coastguard Worker    cmove               mxd, t3d
3725*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, r4m
3726*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+mxq*8]
; Vertical filter index, same scheme keyed on h == 4; mxd is free by now and
; is reused as the scratch register for the low byte of my.
3727*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3728*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3729*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3730*c0909341SAndroid Build Coastguard Worker    cmove               myd, mxd
3731*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+myq*8]
3732*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: ssq and tmpq are (re)loaded from their argument slots. The constants
; are read into m0/m1/m4 before ALLOC_STACK and only then copied to m8/m9/m14.
; NOTE(review): m8 and up are presumably stack-backed aliases on this target
; (their definition is outside this view) - confirm.
3733*c0909341SAndroid Build Coastguard Worker    mov                 ssq, r2mp
3734*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r0mp
3735*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+spel_h_shufA]
3736*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+spel_h_shufB]
3737*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+prep_8tap_2d_rnd]
3738*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK      -16*14
3739*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
3740*c0909341SAndroid Build Coastguard Worker    mova                 m9, m1
3741*c0909341SAndroid Build Coastguard Worker    mova                m14, m4
3742*c0909341SAndroid Build Coastguard Worker%else
3743*c0909341SAndroid Build Coastguard Worker%if WIN64
3744*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*6, 16
3745*c0909341SAndroid Build Coastguard Worker%endif
3746*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+spel_h_shufA]
3747*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+spel_h_shufB]
3748*c0909341SAndroid Build Coastguard Worker%endif
; Widen the 8-bit coefficients to signed words.
;   m0 (horizontal): each byte lands in the high half of a word (x256) and is
;                    shifted back arithmetically by 4, i.e. coef * 16.
;   m3 (vertical):   byte duplicated into both halves, then >> 8, i.e. coef.
3749*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3750*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
3751*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
3752*c0909341SAndroid Build Coastguard Worker    psraw                m0, 4
3753*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
; When bit 0x800 of argument 7 is set, the horizontal coefficients lose two
; more bits (coef * 4). The label name indicates the fall-through-skipping
; case is 10 bpc.
3754*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
3755*c0909341SAndroid Build Coastguard Worker    jz .hv_10bpc
3756*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
3757*c0909341SAndroid Build Coastguard Worker.hv_10bpc:
; Rewind the source pointer by 3 rows and 6 bytes.
3758*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
3759*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
3760*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
; r6 keeps w (used to scale output addressing in the row loop), wd becomes
; w << 6 for the packed counter built below, and r5 remembers the source base
; of the current column.
3761*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
3762*c0909341SAndroid Build Coastguard Worker    shl                  wd, 6
3763*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
3764*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3765*c0909341SAndroid Build Coastguard Worker    %define             tmp  esp+16*8
3766*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
; Keep a copy of the output base in a fixed stack slot; the column loop
; reloads and advances it from there.
3767*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*61], tmpq
3768*c0909341SAndroid Build Coastguard Worker%endif
; Broadcast the four horizontal coefficient dwords into m10..m13 (staged
; through m1/m2/m5/m0 on this target).
3769*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q0000
3770*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
3771*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m0, q2222
3772*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q3333
3773*c0909341SAndroid Build Coastguard Worker    mova                m10, m1
3774*c0909341SAndroid Build Coastguard Worker    mova                m11, m2
3775*c0909341SAndroid Build Coastguard Worker    mova                m12, m5
3776*c0909341SAndroid Build Coastguard Worker    mova                m13, m0
3777*c0909341SAndroid Build Coastguard Worker%else
3778*c0909341SAndroid Build Coastguard Worker%if WIN64
3779*c0909341SAndroid Build Coastguard Worker    %define             tmp  rsp
3780*c0909341SAndroid Build Coastguard Worker%else
3781*c0909341SAndroid Build Coastguard Worker    %define             tmp  rsp-88 ; red zone
3782*c0909341SAndroid Build Coastguard Worker%endif
; r7 remembers the output base of the current column.
3783*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
3784*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000
3785*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111
3786*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222
3787*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q3333
3788*c0909341SAndroid Build Coastguard Worker%endif
; Packed counter: wd = (w << 6) + h - 256. The low byte carries h (restored
; with movzx hd, wb after each column) and the bits above it count the
; remaining columns (decremented by 1<<8 per column).
3789*c0909341SAndroid Build Coastguard Worker    lea                  wd, [wq+hq-(1<<8)]
; Broadcast the four vertical coefficient dwords and store them in the
; scratch slots tmp+16*1 .. tmp+16*4.
3790*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
3791*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
3792*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
3793*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
3794*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*1], m0
3795*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*2], m1
3796*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*3], m2
3797*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*4], m3
; .hv_loop0: one pass per output column; the source base advances by 8 bytes
; and the output base by 8 bytes between passes.
; Prologue: run PUT_8TAP_HV_H on the first seven source rows (0..6) and
; interleave them into the row pairs 01 12 23 34 45 56; row 6 is left in m0.
; PUT_8TAP_HV_H is defined outside this view; from its use here it filters the
; row held in its first two register arguments into the first one.
3798*c0909341SAndroid Build Coastguard Worker.hv_loop0:
3799*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m14 is overwritten as an accumulator in .hv_loop, so it is reloaded for
; every column. NOTE(review): presumably the default rounding operand of
; PUT_8TAP_HV_H - confirm against the macro.
3800*c0909341SAndroid Build Coastguard Worker    mova                m14, [prep_8tap_2d_rnd]
3801*c0909341SAndroid Build Coastguard Worker%endif
3802*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0+0]
3803*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+8]
3804*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+0]
3805*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+8]
3806*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3807*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0+0]
3808*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0+8]
3809*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         4, 1, 0, 6
3810*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         5, 2, 0, 6
3811*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         6, 3, 0, 6
3812*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1+0]
3813*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+8]
3814*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3815*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+0]
3816*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*0+8]
3817*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         7, 2, 0, 6
3818*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         1, 3, 0, 6
3819*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+0]
3820*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+8]
3821*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3822*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         2, 3, 0, 6
3823*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m7      ; 0 3
3824*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m1      ; 1 4
3825*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+0]
3826*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+8]
3827*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 1, 3, 6
3828*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m2      ; 2 5
3829*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0      ; 3 6
3830*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5  ; 01
3831*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; 34
3832*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6  ; 12
3833*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6      ; 45
3834*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m7  ; 23
3835*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 56
; Row loop: two output rows ("a" and "b") per iteration. Each iteration
; multiplies the six carried row pairs by the vertical coefficients in
; tmp+16*1..3, filters two new source rows horizontally to form pairs 67/78,
; adds the tmp+16*4 term, shifts right by 6, and stores 4 words per row.
; While doing so every pair is shifted down by two rows for the next pass.
3836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32 variant: accumulates in m1/m2. The first iteration enters at
; .hv_loop_start with pairs 01/12 already in m1/m2; later iterations reload
; them from the places they were spilled to below (tmp+16*5 and m15).
3837*c0909341SAndroid Build Coastguard Worker    jmp .hv_loop_start
3838*c0909341SAndroid Build Coastguard Worker.hv_loop:
3839*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp+16*5]
3840*c0909341SAndroid Build Coastguard Worker    mova                 m2, m15
3841*c0909341SAndroid Build Coastguard Worker.hv_loop_start:
3842*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*1]
3843*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m7      ; a0
3844*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m7      ; b0
3845*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*2]
3846*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*5], m3
3847*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; a1
3848*c0909341SAndroid Build Coastguard Worker    mova                m15, m4
3849*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7      ; b1
3850*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*3]
3851*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
3852*c0909341SAndroid Build Coastguard Worker    paddd                m2, m14
3853*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3854*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
3855*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3856*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7      ; a2
3857*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3858*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m7      ; b2
3859*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
3860*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
3861*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1+0]
3862*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+8]
3863*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3864*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         7, 5, 6, 6
3865*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m7      ; 6 7
; Park "6 7" in tmp+16*0 because m0 is needed for the next source row.
3866*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*0], m0
3867*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+0]
3868*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+8]
3869*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 5, 6, 6
3870*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmp+16*0]
3871*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0      ; 7 8
3872*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7  ; 67
3873*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 78
3874*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, [tmp+16*4]
3875*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7      ; a3
3876*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m6, [tmp+16*4]
3877*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7      ; b3
3878*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
3879*c0909341SAndroid Build Coastguard Worker    psrad                m2, 6
3880*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
; Low 8 bytes are row "a", high 8 bytes row "b"; r6 holds w, so r6*2 bytes is
; the distance to the next output row and r6*4 skips both rows.
3881*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+r6*0], m1
3882*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+r6*2], m1
3883*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r6*4]
3884*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3885*c0909341SAndroid Build Coastguard Worker    jg .hv_loop
; Column done: advance the saved source base (r5) and the saved output base
; by 8 bytes each and restart from them. The output base lives either in the
; private stack slot or in the tmp argument slot, depending on stack alignment.
3886*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
3887*c0909341SAndroid Build Coastguard Worker    mov                tmpq, [esp+4*61]
3888*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3889*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3890*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
3891*c0909341SAndroid Build Coastguard Worker    mov          [esp+4*61], tmpq
3892*c0909341SAndroid Build Coastguard Worker%else
3893*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
3894*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3895*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3896*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
3897*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
3898*c0909341SAndroid Build Coastguard Worker%endif
3899*c0909341SAndroid Build Coastguard Worker%else
; x86-64 variant: accumulates in m14/m15, so the row pairs stay in m1..m6 and
; the rounding constant is read from memory where needed.
3900*c0909341SAndroid Build Coastguard Worker.hv_loop:
3901*c0909341SAndroid Build Coastguard Worker    mova                m15, [tmp+16*1]
3902*c0909341SAndroid Build Coastguard Worker    mova                 m7, [prep_8tap_2d_rnd]
3903*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, m15, m1 ; a0
3904*c0909341SAndroid Build Coastguard Worker    pmaddwd             m15, m2      ; b0
3905*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7
3906*c0909341SAndroid Build Coastguard Worker    paddd               m15, m7
3907*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*2]
3908*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3909*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7      ; a1
3910*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3911*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7      ; b1
3912*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp+16*3]
3913*c0909341SAndroid Build Coastguard Worker    paddd               m14, m3
3914*c0909341SAndroid Build Coastguard Worker    paddd               m15, m4
3915*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
3916*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m7      ; a2
3917*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
3918*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m7      ; b2
3919*c0909341SAndroid Build Coastguard Worker    paddd               m14, m5
3920*c0909341SAndroid Build Coastguard Worker    paddd               m15, m6
3921*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1+0]
3922*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1+8]
3923*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3924*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         7, 5, 6, 6, [prep_8tap_2d_rnd]
3925*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m7      ; 6 7
; Park "6 7" in tmp+16*0 because m0 is needed for the next source row.
3926*c0909341SAndroid Build Coastguard Worker    mova         [tmp+16*0], m0
3927*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+0]
3928*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0+8]
3929*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_HV_H         0, 5, 6, 6, [prep_8tap_2d_rnd]
3930*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmp+16*0]
3931*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0      ; 7 8
3932*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7  ; 67
3933*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7      ; 78
3934*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, [tmp+16*4]
3935*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7      ; a3
3936*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m6, [tmp+16*4]
3937*c0909341SAndroid Build Coastguard Worker    paddd               m15, m7      ; b3
3938*c0909341SAndroid Build Coastguard Worker    psrad               m14, 6
3939*c0909341SAndroid Build Coastguard Worker    psrad               m15, 6
3940*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
; Low 8 bytes are row "a", high 8 bytes row "b"; r6 holds w, so r6*2 bytes is
; the distance to the next output row and r6*4 skips both rows.
3941*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+r6*0], m14
3942*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+r6*2], m14
3943*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+r6*4]
3944*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3945*c0909341SAndroid Build Coastguard Worker    jg .hv_loop
; Column done: advance the saved source (r5) and output (r7) bases by 8 bytes.
3946*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3947*c0909341SAndroid Build Coastguard Worker    add                  r7, 8
3948*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
3949*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
3950*c0909341SAndroid Build Coastguard Worker%endif
; Restore h from the low byte of the packed counter, then count one column
; off the bits above it.
3951*c0909341SAndroid Build Coastguard Worker    movzx                hd, wb
3952*c0909341SAndroid Build Coastguard Worker    sub                  wd, 1<<8
3953*c0909341SAndroid Build Coastguard Worker    jg .hv_loop0
3954*c0909341SAndroid Build Coastguard Worker    RET
3955*c0909341SAndroid Build Coastguard Worker%undef tmp
3956*c0909341SAndroid Build Coastguard Worker
; movifprep dst, src
; Emits "mov dst, src" only when the assembly-time flag isprep is non-zero;
; otherwise expands to nothing.
3957*c0909341SAndroid Build Coastguard Worker%macro movifprep 2
3958*c0909341SAndroid Build Coastguard Worker %if isprep
3959*c0909341SAndroid Build Coastguard Worker    mov %1, %2
3960*c0909341SAndroid Build Coastguard Worker %endif
3961*c0909341SAndroid Build Coastguard Worker%endmacro
3962*c0909341SAndroid Build Coastguard Worker
; SAVE_REG n
; Snapshots the current preprocessor meaning of the register aliases r<n>,
; r<n>q and r<n>d into r<n>_save, r<n>q_save and r<n>d_save. %xdefine expands
; the right-hand side immediately, so later redefinitions of r<n> do not
; affect the saved copies. Emits no instructions.
; On x86-32, r<n>m_save is additionally defined as the stack operand
; [rstk+stack_offset+(n+1)*4]; note that this is a fixed expression, not a
; copy of whatever r<n>m currently expands to.
3963*c0909341SAndroid Build Coastguard Worker%macro SAVE_REG 1
3964*c0909341SAndroid Build Coastguard Worker %xdefine r%1_save  r%1
3965*c0909341SAndroid Build Coastguard Worker %xdefine r%1q_save r%1q
3966*c0909341SAndroid Build Coastguard Worker %xdefine r%1d_save r%1d
3967*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
3968*c0909341SAndroid Build Coastguard Worker  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
3969*c0909341SAndroid Build Coastguard Worker %endif
3970*c0909341SAndroid Build Coastguard Worker%endmacro
3971*c0909341SAndroid Build Coastguard Worker
; LOAD_REG n
; Counterpart of SAVE_REG: points r<n>, r<n>q and r<n>d (and, on x86-32,
; r<n>m) back at the saved definitions, then removes the r<n>_save,
; r<n>q_save and r<n>d_save symbols. r<n>m_save is not undefined here.
; Emits no instructions.
3972*c0909341SAndroid Build Coastguard Worker%macro LOAD_REG 1
3973*c0909341SAndroid Build Coastguard Worker %xdefine r%1  r%1_save
3974*c0909341SAndroid Build Coastguard Worker %xdefine r%1q r%1q_save
3975*c0909341SAndroid Build Coastguard Worker %xdefine r%1d r%1d_save
3976*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
3977*c0909341SAndroid Build Coastguard Worker  %define r%1m r%1m_save
3978*c0909341SAndroid Build Coastguard Worker %endif
3979*c0909341SAndroid Build Coastguard Worker %undef r%1d_save
3980*c0909341SAndroid Build Coastguard Worker %undef r%1q_save
3981*c0909341SAndroid Build Coastguard Worker %undef r%1_save
3982*c0909341SAndroid Build Coastguard Worker%endmacro
3983*c0909341SAndroid Build Coastguard Worker
; REMAP_REG dst, src [, reset_mem]
; Makes the aliases r<dst>, r<dst>q and r<dst>d expand to what r<src>,
; r<src>q and r<src>d expand to right now. Emits no instructions.
; The third argument is only examined on x86-32:
;   0        r<dst>m becomes an alias of the current r<src>m;
;   non-zero r<dst>m is set to the stack operand [rstk+stack_offset+(dst+1)*4].
; NOTE(review): the two-argument form is only used from the x86-64 branches in
; the macros visible nearby; on x86-32 the "%3 == 0" test needs the third
; argument to be present.
3984*c0909341SAndroid Build Coastguard Worker%macro REMAP_REG 2-3
3985*c0909341SAndroid Build Coastguard Worker %xdefine r%1  r%2
3986*c0909341SAndroid Build Coastguard Worker %xdefine r%1q r%2q
3987*c0909341SAndroid Build Coastguard Worker %xdefine r%1d r%2d
3988*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
3989*c0909341SAndroid Build Coastguard Worker  %if %3 == 0
3990*c0909341SAndroid Build Coastguard Worker   %xdefine r%1m r%2m
3991*c0909341SAndroid Build Coastguard Worker  %else
3992*c0909341SAndroid Build Coastguard Worker   %define r%1m [rstk+stack_offset+(%1+1)*4]
3993*c0909341SAndroid Build Coastguard Worker  %endif
3994*c0909341SAndroid Build Coastguard Worker %endif
3995*c0909341SAndroid Build Coastguard Worker%endmacro
3996*c0909341SAndroid Build Coastguard Worker
; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
; Only active when isprep is non-zero. Shifts the register aliases down by
; one: after expansion r<i> names what r<i-1> named before, for i = 14..1 on
; x86-64 and i = 5..1 on x86-32. The walk runs from the highest index
; downwards so every alias picks up its neighbour's not-yet-modified mapping.
; The original mapping of the top register (r14 / r5) is saved first with
; SAVE_REG so MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT can restore it. r0 itself
; is left untouched. Emits no instructions.
; NOTE(review): presumably this lets code written against one argument
; numbering be shared by the prep variant - confirm against the callers.
3997*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
3998*c0909341SAndroid Build Coastguard Worker %if isprep
3999*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
4000*c0909341SAndroid Build Coastguard Worker   SAVE_REG 14
4001*c0909341SAndroid Build Coastguard Worker   %assign %%i 14
4002*c0909341SAndroid Build Coastguard Worker   %rep 14
4003*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i-1
4004*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j
4005*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i-1
4006*c0909341SAndroid Build Coastguard Worker   %endrep
4007*c0909341SAndroid Build Coastguard Worker  %else
4008*c0909341SAndroid Build Coastguard Worker   SAVE_REG 5
4009*c0909341SAndroid Build Coastguard Worker   %assign %%i 5
4010*c0909341SAndroid Build Coastguard Worker   %rep 5
4011*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i-1
; Third argument 0: the memory-operand alias r<i>m follows r<i-1>m as well.
4012*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j, 0
4013*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i-1
4014*c0909341SAndroid Build Coastguard Worker   %endrep
4015*c0909341SAndroid Build Coastguard Worker  %endif
4016*c0909341SAndroid Build Coastguard Worker %endif
4017*c0909341SAndroid Build Coastguard Worker%endmacro
4018*c0909341SAndroid Build Coastguard Worker
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
; Only active when isprep is non-zero. Undoes
; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV: walking upwards from r1, each r<i> is
; pointed at what r<i+1> currently names (i = 1..13 on x86-64, 1..4 on
; x86-32), and the top register (r14 / r5) is restored from its SAVE_REG
; snapshot via LOAD_REG. Emits no instructions.
4019*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
4020*c0909341SAndroid Build Coastguard Worker %if isprep
4021*c0909341SAndroid Build Coastguard Worker  %assign %%i 1
4022*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
4023*c0909341SAndroid Build Coastguard Worker   %rep 13
4024*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i+1
4025*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j
4026*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i+1
4027*c0909341SAndroid Build Coastguard Worker   %endrep
4028*c0909341SAndroid Build Coastguard Worker   LOAD_REG 14
4029*c0909341SAndroid Build Coastguard Worker  %else
4030*c0909341SAndroid Build Coastguard Worker   %rep 4
4031*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i+1
; Third argument 1: r<i>m is reset to the register's own stack operand rather
; than copied from r<i+1>m.
4032*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j, 1
4033*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i+1
4034*c0909341SAndroid Build Coastguard Worker   %endrep
4035*c0909341SAndroid Build Coastguard Worker   LOAD_REG 5
4036*c0909341SAndroid Build Coastguard Worker  %endif
4037*c0909341SAndroid Build Coastguard Worker %endif
4038*c0909341SAndroid Build Coastguard Worker%endmacro
4039*c0909341SAndroid Build Coastguard Worker
; MC_8TAP_SCALED_RET [leave_mapping_unchanged = 1]
; Return helper for code assembled under the shifted alias mapping: switches
; the aliases back to their default mapping, expands RET under that mapping,
; and then, unless the argument is 0, re-applies the shifted mapping so the
; code that follows this return site is assembled with the same aliases as
; the code before it.
4040*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
4041*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4042*c0909341SAndroid Build Coastguard Worker    RET
4043*c0909341SAndroid Build Coastguard Worker %if %1
4044*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4045*c0909341SAndroid Build Coastguard Worker %endif
4046*c0909341SAndroid Build Coastguard Worker%endmacro
4047*c0909341SAndroid Build Coastguard Worker
; MC_4TAP_SCALED_H dst_mem   (x86-32 only)
; Processes two source rows and stores one packed result vector at [stk+dst_mem].
;   inputs   srcq and r4: two source pointers, each read at rows ssq*0 and
;            ssq*1 and then advanced by two rows;
;            m12/m13: shuffle mask and multiplier for the srcq loads;
;            m14/m15: shuffle mask and multiplier for the r4 loads;
;            [esp+0x00]: vector added before the shift;
;            [esp+0x10]: dword shift count for psrad.
;   clobbers m2, m5, m6, m7.
4048*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4049*c0909341SAndroid Build Coastguard Worker %macro MC_4TAP_SCALED_H 1 ; dst_mem
4050*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*0]
4051*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
4052*c0909341SAndroid Build Coastguard Worker    movu                 m5, [r4  +ssq*0]
4053*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r4  +ssq*1]
4054*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4055*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4  +ssq*2]
4056*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m7, m2
4057*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m7, m2
4058*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m5, m6
4059*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m5, m6
; Pairwise-add the products: m7 combines the row-0 data of both pointers,
; m2 the row-1 data.
4060*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m5
4061*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m6
4062*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x00]
4063*c0909341SAndroid Build Coastguard Worker    movd                 m6, [esp+0x10]
4064*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5
4065*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
4066*c0909341SAndroid Build Coastguard Worker    psrad                m7, m6
4067*c0909341SAndroid Build Coastguard Worker    psrad                m2, m6
; Row 0 ends up in the low four words, row 1 in the high four words.
4068*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m2
4069*c0909341SAndroid Build Coastguard Worker    mova           [stk+%1], m7
4070*c0909341SAndroid Build Coastguard Worker %endmacro
4071*c0909341SAndroid Build Coastguard Worker%endif
4072*c0909341SAndroid Build Coastguard Worker
4073*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; MC_8TAP_SCALED_H dst, tmp0..tmp6   (x86-64 variant)
; Produces one row of eight horizontally filtered words in m<dst>.
;   - loads eight unaligned 16-byte vectors from srcq at the element offsets
;     held in r4, r6, r7, r9, r10, r11, r13 and rX (scaled by 2 bytes), one
;     per output, then advances srcq by ssq;
;   - multiplies each vector by its own coefficient vector at
;     [stk+0x10] .. [stk+0x80];
;   - two rounds of phaddd collapse every product vector to a single dword,
;     leaving outputs 0-3 in m<dst> and outputs 4-7 in m<tmp3>;
;   - adds hround, shifts right arithmetically by the count in m12, and packs
;     both halves into eight signed words.
; All eight named vector registers are clobbered; only m<dst> holds a result.
4074*c0909341SAndroid Build Coastguard Worker %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
4075*c0909341SAndroid Build Coastguard Worker    movu                m%1, [srcq+ r4*2]
4076*c0909341SAndroid Build Coastguard Worker    movu                m%2, [srcq+ r6*2]
4077*c0909341SAndroid Build Coastguard Worker    movu                m%3, [srcq+ r7*2]
4078*c0909341SAndroid Build Coastguard Worker    movu                m%4, [srcq+ r9*2]
4079*c0909341SAndroid Build Coastguard Worker    movu                m%5, [srcq+r10*2]
4080*c0909341SAndroid Build Coastguard Worker    movu                m%6, [srcq+r11*2]
4081*c0909341SAndroid Build Coastguard Worker    movu                m%7, [srcq+r13*2]
4082*c0909341SAndroid Build Coastguard Worker    movu                m%8, [srcq+ rX*2]
4083*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4084*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, [stk+0x10]
4085*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, [stk+0x20]
4086*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, [stk+0x30]
4087*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, [stk+0x40]
4088*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, [stk+0x50]
4089*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%6, [stk+0x60]
4090*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%7, [stk+0x70]
4091*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%8, [stk+0x80]
4092*c0909341SAndroid Build Coastguard Worker    phaddd              m%1, m%2
4093*c0909341SAndroid Build Coastguard Worker    phaddd              m%3, m%4
4094*c0909341SAndroid Build Coastguard Worker    phaddd              m%5, m%6
4095*c0909341SAndroid Build Coastguard Worker    phaddd              m%7, m%8
4096*c0909341SAndroid Build Coastguard Worker    phaddd              m%1, m%3
4097*c0909341SAndroid Build Coastguard Worker    phaddd              m%5, m%7
4098*c0909341SAndroid Build Coastguard Worker    paddd               m%1, hround
4099*c0909341SAndroid Build Coastguard Worker    paddd               m%5, hround
4100*c0909341SAndroid Build Coastguard Worker    psrad               m%1, m12
4101*c0909341SAndroid Build Coastguard Worker    psrad               m%5, m12
4102*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%5
4103*c0909341SAndroid Build Coastguard Worker %endmacro
4104*c0909341SAndroid Build Coastguard Worker%else
; MC_8TAP_SCALED_H weights_mem_start, h_mem[, load_fh_offsets = 1]
; Horizontal 8-tap filter pass over one source row, producing 8 output words.
; This is the stack-based variant that only uses m0-m7; the conditional that
; selects it opens above this chunk (presumably the x86-32 path -- confirm).
;   weights_mem_start: offset from stk of the per-output filter weights;
;                      outputs 0-3 use +0x00..+0x30, outputs 4-7 use
;                      +0xa0..+0xd0
;   h_mem:             when nonzero, the packed result is also stored to
;                      [stk+h_mem]
;   load_fh_offsets:   when 1 (default), r0/rX/r4/r5 are loaded from
;                      [stk+0..12]; otherwise the caller must already hold
;                      the source offsets for outputs 0-3 in those registers
; Eight dword source offsets are read from [stk+0..28]; each one is scaled by
; 2 when indexing srcq.
; Result: m0 = 8 packed signed words. Clobbers r0, rX, r4, r5, m1-m7 and the
; flags, and advances srcq by ssq. On exit r0/rX/r4/r5 hold the offsets read
; from [stk+16..28]. hround and m12 are defined outside this macro.
4105*c0909341SAndroid Build Coastguard Worker %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
4106*c0909341SAndroid Build Coastguard Worker  %if %3 == 1
4107*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+ 0] ; source offsets for outputs 0-3
4108*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+ 4]
4109*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+ 8]
4110*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+12]
4111*c0909341SAndroid Build Coastguard Worker  %endif
4112*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r0*2] ; 8 source words per output (unaligned)
4113*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+rX*2]
4114*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r4*2]
4115*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r5*2]
4116*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+16] ; source offsets for outputs 4-7 (always loaded)
4117*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+20]
4118*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+24]
4119*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+28]
4120*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, [stk+%1+0x00] ; multiply by weights, summing adjacent pairs
4121*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, [stk+%1+0x10]
4122*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [stk+%1+0x20]
4123*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [stk+%1+0x30]
4124*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1 ; first reduction level for outputs 0-1
4125*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3 ; first reduction level for outputs 2-3
4126*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r0*2] ; source words for outputs 4-7
4127*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+rX*2]
4128*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+r4*2]
4129*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+r5*2]
4130*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; all loads done: step to the next source row
4131*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [stk+%1+0xa0] ; weights for outputs 4-7 sit 0xa0 past the first group
4132*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [stk+%1+0xb0]
4133*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [stk+%1+0xc0]
4134*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [stk+%1+0xd0]
4135*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
4136*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
4137*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m2 ; each dword = full 8-tap sum, outputs 0-3
4138*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m6 ; each dword = full 8-tap sum, outputs 4-7
4139*c0909341SAndroid Build Coastguard Worker    paddd                m0, hround ; add the rounding term
4140*c0909341SAndroid Build Coastguard Worker    paddd                m4, hround
4141*c0909341SAndroid Build Coastguard Worker    psrad                m0, m12 ; arithmetic shift right by the count in m12
4142*c0909341SAndroid Build Coastguard Worker    psrad                m4, m12
4143*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4 ; saturate to 8 signed words: outputs 0-7
4144*c0909341SAndroid Build Coastguard Worker  %if %2 != 0
4145*c0909341SAndroid Build Coastguard Worker    mova           [stk+%2], m0 ; optional spill of the filtered row
4146*c0909341SAndroid Build Coastguard Worker  %endif
4147*c0909341SAndroid Build Coastguard Worker %endmacro
4148*c0909341SAndroid Build Coastguard Worker%endif
4149*c0909341SAndroid Build Coastguard Worker
4150*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED 1
4151*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4152*c0909341SAndroid Build Coastguard Worker %assign isput  1
4153*c0909341SAndroid Build Coastguard Worker %assign isprep 0
4154*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4155*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4156*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4157*c0909341SAndroid Build Coastguard Worker  %else
4158*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4159*c0909341SAndroid Build Coastguard Worker  %endif
4160*c0909341SAndroid Build Coastguard Worker %else ; ARCH_X86_32
4161*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4162*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4163*c0909341SAndroid Build Coastguard Worker  %else
4164*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
4165*c0909341SAndroid Build Coastguard Worker  %endif
4166*c0909341SAndroid Build Coastguard Worker %endif
4167*c0909341SAndroid Build Coastguard Worker %xdefine base_reg r12
4168*c0909341SAndroid Build Coastguard Worker%else ; prep
4169*c0909341SAndroid Build Coastguard Worker %assign isput  0
4170*c0909341SAndroid Build Coastguard Worker %assign isprep 1
4171*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4172*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4173*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4174*c0909341SAndroid Build Coastguard Worker   %xdefine tmp_stridem r14q
4175*c0909341SAndroid Build Coastguard Worker  %else
4176*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4177*c0909341SAndroid Build Coastguard Worker   %define tmp_stridem qword [stk+0x138]
4178*c0909341SAndroid Build Coastguard Worker  %endif
4179*c0909341SAndroid Build Coastguard Worker  %xdefine base_reg r11
4180*c0909341SAndroid Build Coastguard Worker %else ; ARCH_X86_32
4181*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4182*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4183*c0909341SAndroid Build Coastguard Worker  %else
4184*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
4185*c0909341SAndroid Build Coastguard Worker  %endif
4186*c0909341SAndroid Build Coastguard Worker  %define tmp_stridem dword [stk+0x138]
4187*c0909341SAndroid Build Coastguard Worker %endif
4188*c0909341SAndroid Build Coastguard Worker%endif
4189*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4190*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x1f0], t0d
4191*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x1f4], t1d
4192*c0909341SAndroid Build Coastguard Worker %if isput && required_stack_alignment > STACK_ALIGNMENT
4193*c0909341SAndroid Build Coastguard Worker    mov                dstd, dstm
4194*c0909341SAndroid Build Coastguard Worker    mov                 dsd, dsm
4195*c0909341SAndroid Build Coastguard Worker    mov                srcd, srcm
4196*c0909341SAndroid Build Coastguard Worker    mov                 ssd, ssm
4197*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4198*c0909341SAndroid Build Coastguard Worker    mov                  r4, mxm
4199*c0909341SAndroid Build Coastguard Worker  %define r0m  [esp+0x200]
4200*c0909341SAndroid Build Coastguard Worker  %define dsm  [esp+0x204]
4201*c0909341SAndroid Build Coastguard Worker  %define dsmp dsm
4202*c0909341SAndroid Build Coastguard Worker  %define r1m  dsm
4203*c0909341SAndroid Build Coastguard Worker  %define r2m  [esp+0x208]
4204*c0909341SAndroid Build Coastguard Worker  %define ssm  [esp+0x20c]
4205*c0909341SAndroid Build Coastguard Worker  %define r3m  ssm
4206*c0909341SAndroid Build Coastguard Worker  %define hm   [esp+0x210]
4207*c0909341SAndroid Build Coastguard Worker  %define mxm  [esp+0x214]
4208*c0909341SAndroid Build Coastguard Worker    mov                 r0m, dstd
4209*c0909341SAndroid Build Coastguard Worker    mov                 dsm, dsd
4210*c0909341SAndroid Build Coastguard Worker    mov                 r2m, srcd
4211*c0909341SAndroid Build Coastguard Worker    mov                 ssm, ssd
4212*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
4213*c0909341SAndroid Build Coastguard Worker    mov                  r0, mym
4214*c0909341SAndroid Build Coastguard Worker    mov                  r1, dxm
4215*c0909341SAndroid Build Coastguard Worker    mov                  r2, dym
4216*c0909341SAndroid Build Coastguard Worker  %define mym    [esp+0x218]
4217*c0909341SAndroid Build Coastguard Worker  %define dxm    [esp+0x21c]
4218*c0909341SAndroid Build Coastguard Worker  %define dym    [esp+0x220]
4219*c0909341SAndroid Build Coastguard Worker    mov                 mxm, r4
4220*c0909341SAndroid Build Coastguard Worker    mov                 mym, r0
4221*c0909341SAndroid Build Coastguard Worker    mov                 dxm, r1
4222*c0909341SAndroid Build Coastguard Worker    mov                 dym, r2
4223*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4224*c0909341SAndroid Build Coastguard Worker %endif
4225*c0909341SAndroid Build Coastguard Worker %if isput
4226*c0909341SAndroid Build Coastguard Worker    mov                  r3, pxmaxm
4227*c0909341SAndroid Build Coastguard Worker  %define pxmaxm r3
4228*c0909341SAndroid Build Coastguard Worker %else
4229*c0909341SAndroid Build Coastguard Worker    mov                  r2, pxmaxm
4230*c0909341SAndroid Build Coastguard Worker %endif
4231*c0909341SAndroid Build Coastguard Worker %if isprep && required_stack_alignment > STACK_ALIGNMENT
4232*c0909341SAndroid Build Coastguard Worker  %xdefine base_reg r5
4233*c0909341SAndroid Build Coastguard Worker %else
4234*c0909341SAndroid Build Coastguard Worker  %xdefine base_reg r6
4235*c0909341SAndroid Build Coastguard Worker %endif
4236*c0909341SAndroid Build Coastguard Worker%endif
4237*c0909341SAndroid Build Coastguard Worker    LEA            base_reg, %1_8tap_scaled_16bpc_ssse3
4238*c0909341SAndroid Build Coastguard Worker%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
4239*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
4240*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4241*c0909341SAndroid Build Coastguard Worker%endif
4242*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4243*c0909341SAndroid Build Coastguard Worker %if isput
4244*c0909341SAndroid Build Coastguard Worker    mov                 r7d, pxmaxm
4245*c0909341SAndroid Build Coastguard Worker %endif
4246*c0909341SAndroid Build Coastguard Worker%else
4247*c0909341SAndroid Build Coastguard Worker %define m8  m0
4248*c0909341SAndroid Build Coastguard Worker %define m9  m1
4249*c0909341SAndroid Build Coastguard Worker %define m14 m4
4250*c0909341SAndroid Build Coastguard Worker %define m15 m3
4251*c0909341SAndroid Build Coastguard Worker%endif
4252*c0909341SAndroid Build Coastguard Worker    movd                 m8, dxm
4253*c0909341SAndroid Build Coastguard Worker    movd                m14, mxm
4254*c0909341SAndroid Build Coastguard Worker%if isput
4255*c0909341SAndroid Build Coastguard Worker    movd                m15, pxmaxm
4256*c0909341SAndroid Build Coastguard Worker%endif
4257*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m8, q0000
4258*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m14, q0000
4259*c0909341SAndroid Build Coastguard Worker%if isput
4260*c0909341SAndroid Build Coastguard Worker    pshuflw             m15, m15, q0000
4261*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m15
4262*c0909341SAndroid Build Coastguard Worker%endif
4263*c0909341SAndroid Build Coastguard Worker%if isprep
4264*c0909341SAndroid Build Coastguard Worker %if UNIX64
4265*c0909341SAndroid Build Coastguard Worker    mov                 r5d, t0d
4266*c0909341SAndroid Build Coastguard Worker  DECLARE_REG_TMP 5, 7
4267*c0909341SAndroid Build Coastguard Worker %endif
4268*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4269*c0909341SAndroid Build Coastguard Worker    mov                 r6d, pxmaxm
4270*c0909341SAndroid Build Coastguard Worker %endif
4271*c0909341SAndroid Build Coastguard Worker%endif
4272*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4273*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
4274*c0909341SAndroid Build Coastguard Worker%endif
4275*c0909341SAndroid Build Coastguard Worker%if isput
4276*c0909341SAndroid Build Coastguard Worker %if WIN64
4277*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hm
4278*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
4279*c0909341SAndroid Build Coastguard Worker  %define hm r5m
4280*c0909341SAndroid Build Coastguard Worker  %define dxm r8m
4281*c0909341SAndroid Build Coastguard Worker %elif ARCH_X86_64
4282*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
4283*c0909341SAndroid Build Coastguard Worker  %define hm r6m
4284*c0909341SAndroid Build Coastguard Worker %else
4285*c0909341SAndroid Build Coastguard Worker %endif
4286*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4287*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment > STACK_ALIGNMENT
4288*c0909341SAndroid Build Coastguard Worker   %define dsm [rsp+0x138]
4289*c0909341SAndroid Build Coastguard Worker   %define rX r1
4290*c0909341SAndroid Build Coastguard Worker   %define rXd r1d
4291*c0909341SAndroid Build Coastguard Worker  %else
4292*c0909341SAndroid Build Coastguard Worker   %define dsm dsq
4293*c0909341SAndroid Build Coastguard Worker   %define rX r14
4294*c0909341SAndroid Build Coastguard Worker   %define rXd r14d
4295*c0909341SAndroid Build Coastguard Worker  %endif
4296*c0909341SAndroid Build Coastguard Worker %else
4297*c0909341SAndroid Build Coastguard Worker  %define rX r1
4298*c0909341SAndroid Build Coastguard Worker %endif
4299*c0909341SAndroid Build Coastguard Worker%else ; prep
4300*c0909341SAndroid Build Coastguard Worker %if WIN64
4301*c0909341SAndroid Build Coastguard Worker    mov                 r7d, hm
4302*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
4303*c0909341SAndroid Build Coastguard Worker  %define hm r4m
4304*c0909341SAndroid Build Coastguard Worker  %define dxm r7m
4305*c0909341SAndroid Build Coastguard Worker %elif ARCH_X86_64
4306*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
4307*c0909341SAndroid Build Coastguard Worker  %xdefine hm r7m
4308*c0909341SAndroid Build Coastguard Worker %endif
4309*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4310*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4311*c0909341SAndroid Build Coastguard Worker  %define rX r14
4312*c0909341SAndroid Build Coastguard Worker  %define rXd r14d
4313*c0909341SAndroid Build Coastguard Worker %else
4314*c0909341SAndroid Build Coastguard Worker  %define rX r3
4315*c0909341SAndroid Build Coastguard Worker %endif
4316*c0909341SAndroid Build Coastguard Worker%endif
4317*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4318*c0909341SAndroid Build Coastguard Worker    shr                 r7d, 11
4319*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
4320*c0909341SAndroid Build Coastguard Worker    movddup             m11, [base+s_8tap_h_rnd+r7*8]
4321*c0909341SAndroid Build Coastguard Worker    movd                m12, [base+s_8tap_h_sh+r7*4]
4322*c0909341SAndroid Build Coastguard Worker %if isput
4323*c0909341SAndroid Build Coastguard Worker    movddup             m13, [base+put_s_8tap_v_rnd+r7*8]
4324*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+put_s_8tap_v_sh+r7*4]
4325*c0909341SAndroid Build Coastguard Worker  %define pxmaxm [rsp]
4326*c0909341SAndroid Build Coastguard Worker    mova             pxmaxm, m15
4327*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m12, m7
4328*c0909341SAndroid Build Coastguard Worker %endif
4329*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
4330*c0909341SAndroid Build Coastguard Worker    movzx               r7d, t1b
4331*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 16
4332*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4333*c0909341SAndroid Build Coastguard Worker    cmovs               t1d, r7d
4334*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
4335*c0909341SAndroid Build Coastguard Worker%else
4336*c0909341SAndroid Build Coastguard Worker %define m10    [base+pd_0x3ff]
4337*c0909341SAndroid Build Coastguard Worker %define m11    [esp+0x00]
4338*c0909341SAndroid Build Coastguard Worker %define m12    [esp+0x10]
4339*c0909341SAndroid Build Coastguard Worker    shr                  r3, 11
4340*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+s_8tap_h_rnd+r3*8]
4341*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+s_8tap_h_sh+r3*4]
4342*c0909341SAndroid Build Coastguard Worker %if isput
4343*c0909341SAndroid Build Coastguard Worker  %define m13    [esp+0x20]
4344*c0909341SAndroid Build Coastguard Worker  %define pxmaxm [esp+0x30]
4345*c0909341SAndroid Build Coastguard Worker  %define stk esp+0x40
4346*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+put_s_8tap_v_rnd+r3*8]
4347*c0909341SAndroid Build Coastguard Worker    movd                 m6, [base+put_s_8tap_v_sh+r3*4]
4348*c0909341SAndroid Build Coastguard Worker    mova             pxmaxm, m15
4349*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m6
4350*c0909341SAndroid Build Coastguard Worker    mova                m13, m5
4351*c0909341SAndroid Build Coastguard Worker %else
4352*c0909341SAndroid Build Coastguard Worker  %define m13 [base+pd_m524256]
4353*c0909341SAndroid Build Coastguard Worker %endif
4354*c0909341SAndroid Build Coastguard Worker    mov                 ssd, ssm
4355*c0909341SAndroid Build Coastguard Worker    mova                m11, m1
4356*c0909341SAndroid Build Coastguard Worker    mova                m12, m2
4357*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4358*c0909341SAndroid Build Coastguard Worker    mov                  r1, [esp+0x1f4]
4359*c0909341SAndroid Build Coastguard Worker    lea                  r0, [ssd*3]
4360*c0909341SAndroid Build Coastguard Worker    movzx                r2, r1b
4361*c0909341SAndroid Build Coastguard Worker    shr                  r1, 16
4362*c0909341SAndroid Build Coastguard Worker    cmp            dword hm, 6
4363*c0909341SAndroid Build Coastguard Worker    cmovs                r1, r2
4364*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x1f4], r1
4365*c0909341SAndroid Build Coastguard Worker %if isprep
4366*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
4367*c0909341SAndroid Build Coastguard Worker %endif
4368*c0909341SAndroid Build Coastguard Worker    mov                  r2, r2m
4369*c0909341SAndroid Build Coastguard Worker    sub                srcq, r0
4370*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4371*c0909341SAndroid Build Coastguard Worker %define ss3q r0
4372*c0909341SAndroid Build Coastguard Worker %define myd r4
4373*c0909341SAndroid Build Coastguard Worker %define dyd dword dym
4374*c0909341SAndroid Build Coastguard Worker %define hd  dword hm
4375*c0909341SAndroid Build Coastguard Worker%endif
4376*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 1024
4377*c0909341SAndroid Build Coastguard Worker    je .dy1
4378*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 2048
4379*c0909341SAndroid Build Coastguard Worker    je .dy2
4380*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
4381*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
4382*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4383*c0909341SAndroid Build Coastguard Worker%if isput
4384*c0909341SAndroid Build Coastguard Worker.w2:
4385*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4386*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4387*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
4388*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
4389*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
4390*c0909341SAndroid Build Coastguard Worker %else
4391*c0909341SAndroid Build Coastguard Worker    movzx                r4, byte [esp+0x1f0]
4392*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
4393*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
4394*c0909341SAndroid Build Coastguard Worker %endif
4395*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4396*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8
4397*c0909341SAndroid Build Coastguard Worker    paddd               m14, m9 ; mx+dx*[0-1]
4398*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4399*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_0x4000]
4400*c0909341SAndroid Build Coastguard Worker %endif
4401*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
4402*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
4403*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
4404*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8
4405*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
4406*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
4407*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4408*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
4409*c0909341SAndroid Build Coastguard Worker %else
4410*c0909341SAndroid Build Coastguard Worker    movd                r3d, m15
4411*c0909341SAndroid Build Coastguard Worker %endif
4412*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_q]
4413*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_s_shuf2]
4414*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r4*8+2]
4415*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4416*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r6*8+2]
4417*c0909341SAndroid Build Coastguard Worker %else
4418*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r3*8+2]
4419*c0909341SAndroid Build Coastguard Worker %endif
4420*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
4421*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m2
4422*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
4423*c0909341SAndroid Build Coastguard Worker    paddd               m14, m14
4424*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4425*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
4426*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
4427*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
4428*c0909341SAndroid Build Coastguard Worker    mova              [stk], m14
4429*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
4430*c0909341SAndroid Build Coastguard Worker    SWAP                 m6, m3
4431*c0909341SAndroid Build Coastguard Worker  %define m15 m6
4432*c0909341SAndroid Build Coastguard Worker %endif
4433*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
4434*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
4435*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*2]
4436*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ss3q ]
4437*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4438*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m7
4439*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4440*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
4441*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
4442*c0909341SAndroid Build Coastguard Worker    pand                 m9, m8
4443*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
4444*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
4445*c0909341SAndroid Build Coastguard Worker    por                 m15, m9
4446*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
4447*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
4448*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
4449*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ss3q ]
4450*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4451*c0909341SAndroid Build Coastguard Worker %else
4452*c0909341SAndroid Build Coastguard Worker    pand                 m7, m5, [base+pd_0x4000]
4453*c0909341SAndroid Build Coastguard Worker    pandn                m5, m15
4454*c0909341SAndroid Build Coastguard Worker    por                  m5, m7
4455*c0909341SAndroid Build Coastguard Worker  %define m15 m5
4456*c0909341SAndroid Build Coastguard Worker %endif
4457*c0909341SAndroid Build Coastguard Worker    punpcklbw           m15, m15
4458*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
4459*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m0, m1, m2, m3
4460*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
4461*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4462*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m4, m5, m6, m7
4463*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m4, m5, m6, m7
4464*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
4465*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3
4466*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
4467*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
4468*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m2, m4, m6
4469*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, m12}, m0, m2, m4, m6
4470*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2 ; 0 1 2 3
4471*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6 ; 4 5 6 7
4472*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m4
4473*c0909341SAndroid Build Coastguard Worker %else
4474*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m15
4475*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
4476*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3
4477*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
4478*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
4479*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
4480*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ss3q ]
4481*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4482*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m1, m7, m6, m3
4483*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m1, m7, m6, m3
4484*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m7
4485*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m3
4486*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m2, m1, m6
4487*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, m12}, m0, m2, m1, m6
4488*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
4489*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m6
4490*c0909341SAndroid Build Coastguard Worker  %define m14 [stk+0x00]
4491*c0909341SAndroid Build Coastguard Worker  %define m15 [stk+0x10]
4492*c0909341SAndroid Build Coastguard Worker %endif
4493*c0909341SAndroid Build Coastguard Worker    palignr              m2, m1, m0, 4 ; 1 2 3 4
4494*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0, m2    ; 01 12
4495*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2        ; 23 34
4496*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q0321 ; 5 6 7 _
4497*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m5    ; 45 56
4498*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1, m5    ; 67 __
4499*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4500*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4501*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
4502*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m3
4503*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m0
4504*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
4505*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m4
4506*c0909341SAndroid Build Coastguard Worker %endif
4507*c0909341SAndroid Build Coastguard Worker.w2_loop:
4508*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
4509*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4510*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24
4511*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
4512*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
4513*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]
4514*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]
4515*c0909341SAndroid Build Coastguard Worker    movq                m10, r6q
4516*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
4517*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
4518*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000
4519*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111
4520*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3, m7
4521*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m0, m8
4522*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222
4523*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
4524*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, m9
4525*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m4, m10
4526*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
4527*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8
4528*c0909341SAndroid Build Coastguard Worker %else
4529*c0909341SAndroid Build Coastguard Worker    mov                  r1, [esp+0x1f4]
4530*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
4531*c0909341SAndroid Build Coastguard Worker    mov                  r5, myd
4532*c0909341SAndroid Build Coastguard Worker    shr                  r5, 6
4533*c0909341SAndroid Build Coastguard Worker    lea                  r1, [r1+r5]
4534*c0909341SAndroid Build Coastguard Worker    mov                  r5, 64 << 24
4535*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r1*8+4]
4536*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r1*8+0]
4537*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3
4538*c0909341SAndroid Build Coastguard Worker    movd                 m7, r5
4539*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m6
4540*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m7
4541*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
4542*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q0000
4543*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
4544*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
4545*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
4546*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q2222
4547*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3333
4548*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
4549*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
4550*c0909341SAndroid Build Coastguard Worker    paddd                m3, m0
4551*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
4552*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m3
4553*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m2
4554*c0909341SAndroid Build Coastguard Worker  %define m8 m3
4555*c0909341SAndroid Build Coastguard Worker %endif
4556*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
4557*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m12, q1032
4558*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4559*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
4560*c0909341SAndroid Build Coastguard Worker    psrad                m5, m6
4561*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
4562*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m8
4563*c0909341SAndroid Build Coastguard Worker    pminsw               m5, pxmaxm
4564*c0909341SAndroid Build Coastguard Worker    movd             [dstq], m5
4565*c0909341SAndroid Build Coastguard Worker    add                dstq, dsmp
4566*c0909341SAndroid Build Coastguard Worker    dec                  hd
4567*c0909341SAndroid Build Coastguard Worker    jz .ret
4568*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4569*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
4570*c0909341SAndroid Build Coastguard Worker %else
4571*c0909341SAndroid Build Coastguard Worker    add                 myd, dym
4572*c0909341SAndroid Build Coastguard Worker %endif
4573*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
4574*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4575*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m5
4576*c0909341SAndroid Build Coastguard Worker    SWAP                 m2, m7
4577*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x20]
4578*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x30]
4579*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x40]
4580*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x50]
4581*c0909341SAndroid Build Coastguard Worker %endif
4582*c0909341SAndroid Build Coastguard Worker    jz .w2_loop
4583*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4584*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
4585*c0909341SAndroid Build Coastguard Worker %endif
4586*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq]
4587*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
4588*c0909341SAndroid Build Coastguard Worker    jz .w2_skip_line
4589*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4590*c0909341SAndroid Build Coastguard Worker    shufps               m3, m0, q1032      ; 01 12
4591*c0909341SAndroid Build Coastguard Worker    shufps               m0, m2, q1032      ; 23 34
4592*c0909341SAndroid Build Coastguard Worker    shufps               m2, m4, q1032      ; 45 56
4593*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
4594*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
4595*c0909341SAndroid Build Coastguard Worker    phaddd               m5, m5
4596*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11
4597*c0909341SAndroid Build Coastguard Worker    psrad                m5, m12
4598*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
4599*c0909341SAndroid Build Coastguard Worker    palignr              m4, m5, m1, 12
4600*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m4, m4         ; 6 7 6 7
4601*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m1, m5         ; 67 __
4602*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4603*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m3
4604*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m0
4605*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
4606*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m4
4607*c0909341SAndroid Build Coastguard Worker %endif
4608*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
4609*c0909341SAndroid Build Coastguard Worker.w2_skip_line:
    ; Two-row advance of the w2 vertical window. Reached when the
    ; `test myd, 0x400` just before the branch finds that bit clear.
    ; On entry m5 already holds the unfiltered source row loaded from [srcq];
    ; a second row is loaded here and srcq moves down by two rows.
    ; m0/m2 hold the previous "23 34"/"45 56" row pairs and m1 holds the
    ; newest filtered rows (per the lane comments below).
4610*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
4611*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
    ; Slide the row-pair window down by two rows (labels use the new numbering).
4612*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0             ; 01 12
4613*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2             ; 23 34
    ; Horizontal filter of both new rows: m14 is the pshufb control and m15 the
    ; pmaddwd multiplier (presumably the horizontal taps -- both are set up
    ; before this block).
4614*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
4615*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
4616*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
4617*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m15
4618*c0909341SAndroid Build Coastguard Worker    phaddd               m5, m6
    ; Add m11 and arithmetic-shift by the low qword of m12 (presumably the
    ; horizontal rounding constant and shift), then narrow to 16 bit.
4619*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11
4620*c0909341SAndroid Build Coastguard Worker    psrad                m5, m12
4621*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5             ; 6 7 6 7
    ; Keep the high half of m1 (rows 4 5) and append the two new rows.
4622*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m5             ; 4 5 6 7
4623*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q0321      ; 5 6 7 _
4624*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m5         ; 45 56
4625*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1, m5         ; 67 __
4626*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
    ; x86-32: spill the four row-pair registers; the code after the
    ; `test myd, ~0x3ff` in the w2 loop reloads them from these same slots.
4627*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m3
4628*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m0
4629*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
4630*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m4
4631*c0909341SAndroid Build Coastguard Worker %endif
4632*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
4633*c0909341SAndroid Build Coastguard Worker%endif
4634*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
4635*c0909341SAndroid Build Coastguard Worker.w4:
    ; Setup for the 4-column scaled path. Derives a per-column horizontal
    ; 4-tap filter and source byte offset from mx+dx*[0-3], horizontally
    ; filters the first 8 source rows, and leaves them interleaved as row
    ; pairs (01 12 23 34 45 56 67) for the vertical loop at .w4_loop.
    ;   x86-64: m0-m3 = 01 23 45 67, [rsp+0x40..0x60] = 12 34 56, m10 = row 7.
    ;   x86-32: all seven pairs are stored at [stk+0x40..0xa0].
4636*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4637*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
    ; Spill m11/m12 (and m13 for put). They are reloaded below as the addend
    ; and shift count applied after the horizontal filter, and .w4_loop reads
    ; [rsp+0x28] / [rsp+0x30] from the same slots.
4638*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m11
4639*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m12
4640*c0909341SAndroid Build Coastguard Worker %if isput
4641*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m13
4642*c0909341SAndroid Build Coastguard Worker %endif
4643*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
4644*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
4645*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
4646*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: alias the high register names used by the shared code below
    ; onto m0-m7.
4647*c0909341SAndroid Build Coastguard Worker %define m8  m0
4648*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
4649*c0909341SAndroid Build Coastguard Worker %define m15 m3
4650*c0909341SAndroid Build Coastguard Worker    movzx                r4, byte [esp+0x1f0]
4651*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
4652*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
4653*c0909341SAndroid Build Coastguard Worker%endif
4654*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
4655*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4656*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_0x4000]
4657*c0909341SAndroid Build Coastguard Worker%else
4658*c0909341SAndroid Build Coastguard Worker %define m9 [base+pd_0x4000]
4659*c0909341SAndroid Build Coastguard Worker%endif
    ; m15 = byte value loaded above, broadcast to all four lanes.
4660*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
4661*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
    ; Per-column filter index = broadcast base + ((mx & m10) >> 6).
    ; m10 is set up before this block -- presumably the 0x3ff fraction mask.
4662*c0909341SAndroid Build Coastguard Worker    pand                 m0, m14, m10
4663*c0909341SAndroid Build Coastguard Worker    psrld                m0, 6
4664*c0909341SAndroid Build Coastguard Worker    paddd               m15, m0
4665*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m15, q1032
    ; Move the four indices to GPRs and fetch 4 bytes per column from offset
    ; +2 of each 8-byte subpel_filters entry (the middle four taps).
4666*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4667*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
4668*c0909341SAndroid Build Coastguard Worker    movd               r11d, m7
4669*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
4670*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0321
4671*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
4672*c0909341SAndroid Build Coastguard Worker    movd               r13d, m7
4673*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+bdct_lb_q+ 0]
4674*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+bdct_lb_q+16]
4675*c0909341SAndroid Build Coastguard Worker    movd                m13, [base+subpel_filters+ r4*8+2]
4676*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+ r6*8+2]
4677*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r11*8+2]
4678*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r13*8+2]
4679*c0909341SAndroid Build Coastguard Worker%else
4680*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
4681*c0909341SAndroid Build Coastguard Worker    movd                 r4, m7
4682*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
4683*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0321
4684*c0909341SAndroid Build Coastguard Worker    movd                 rX, m15
4685*c0909341SAndroid Build Coastguard Worker    movd                 r5, m7
4686*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_q+ 0]
4687*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+bdct_lb_q+16]
4688*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base+subpel_filters+r0*8+2]
4689*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+rX*8+2]
4690*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+r4*8+2]
4691*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r5*8+2]
4692*c0909341SAndroid Build Coastguard Worker    movifprep            r3, r3m
4693*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
    ; From here on m10-m13 name the low registers that hold the same values
    ; as in the x86-64 branch above.
4694*c0909341SAndroid Build Coastguard Worker %define m10 m5
4695*c0909341SAndroid Build Coastguard Worker %define m11 m6
4696*c0909341SAndroid Build Coastguard Worker %define m12 m1
4697*c0909341SAndroid Build Coastguard Worker %define m13 m1
4698*c0909341SAndroid Build Coastguard Worker%endif
    ; m14 = (mx >> 10) * 2: integer x position doubled (presumably a byte
    ; offset for 2-byte pixels).
4699*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
4700*c0909341SAndroid Build Coastguard Worker    paddd               m14, m14
    ; Gather the four 4-byte filters into one register (m13).
4701*c0909341SAndroid Build Coastguard Worker    punpckldq           m13, m2
4702*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m4
4703*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m13, m15
    ; m0 still holds (mx & m10) >> 6 per column. Build a mask of the columns
    ; where it is zero and replace their table filter with the pd_0x4000
    ; constant (per its name 0x00004000, i.e. tap bytes 0, 64, 0, 0).
4704*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
4705*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m2
4706*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4707*c0909341SAndroid Build Coastguard Worker    pand                 m9, m0
4708*c0909341SAndroid Build Coastguard Worker%else
4709*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9, m0
4710*c0909341SAndroid Build Coastguard Worker %define m9 m2
4711*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m4
4712*c0909341SAndroid Build Coastguard Worker%endif
4713*c0909341SAndroid Build Coastguard Worker    pandn                m0, m13
4714*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4715*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m0
4716*c0909341SAndroid Build Coastguard Worker%else
4717*c0909341SAndroid Build Coastguard Worker %define m13 m0
4718*c0909341SAndroid Build Coastguard Worker%endif
4719*c0909341SAndroid Build Coastguard Worker    por                 m13, m9
    ; Sign-extend the tap bytes to words (duplicate each byte, then psraw 8):
    ; m13 = taps for columns 0-1, m15 = taps for columns 2-3.
4720*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m13, m13
4721*c0909341SAndroid Build Coastguard Worker    punpcklbw           m13, m13
4722*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
4723*c0909341SAndroid Build Coastguard Worker    psraw               m13, 8
    ; Turn the per-column offsets into two pshufb controls: m12 for columns
    ; 0-1 and m14 for columns 2-3. NOTE(review): the bdct_lb_q tables are
    ; defined elsewhere; they appear to broadcast each column's low offset
    ; byte -- confirm.
4724*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m14, m10
4725*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m11
4726*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+spel_s_shuf2]
    ; r4 = byte 3 of the shuffled m14; used below as the source offset of the
    ; second (columns 2-3) load.
4727*c0909341SAndroid Build Coastguard Worker    movd                r4d, m14
4728*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 24
4729*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4730*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m13
4731*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m15
4732*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
4733*c0909341SAndroid Build Coastguard Worker%endif
    ; m2 is zero here, so pshufb broadcasts byte 0 of m14; subtracting it
    ; makes the columns 2-3 control relative to that byte. Both controls are
    ; then biased by the spel_s_shuf2 pattern.
4734*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14, m2
4735*c0909341SAndroid Build Coastguard Worker    psubb               m14, m7
4736*c0909341SAndroid Build Coastguard Worker    paddb               m12, m10
4737*c0909341SAndroid Build Coastguard Worker    paddb               m14, m10
4738*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; r6/r11/r13 = r4 plus 1/2/3 row strides: offsets of the second load for
    ; rows 1-3.
4739*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+ssq*1]
4740*c0909341SAndroid Build Coastguard Worker    lea                 r11, [r4+ssq*2]
4741*c0909341SAndroid Build Coastguard Worker    lea                 r13, [r4+ss3q ]
    ; Rows 0-3: first load (columns 0-1) into m7/m9/m8/m10, second load
    ; (columns 2-3) into m1/m3/m2/m4.
4742*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*0]
4743*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+ssq*1]
4744*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+ssq*2]
4745*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+ss3q ]
4746*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r4   ]
4747*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6   ]
4748*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r11  ]
4749*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r13  ]
4750*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4751*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m7, m9, m8, m10
4752*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
4753*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m1, m2, m3, m4
4754*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
    ; m5 / xm6 = the values spilled from m11 / m12 at the top of .w4: added to
    ; the sums and used as the shift count below.
4755*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x10]
4756*c0909341SAndroid Build Coastguard Worker    movd                xm6, [rsp+0x20]
    ; Merge both column halves of each row into four 32-bit sums.
4757*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m1
4758*c0909341SAndroid Build Coastguard Worker    phaddd               m9, m3
4759*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m2
4760*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m4
    ; Start loading rows 4-7 while rows 0-3 are finished.
4761*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
4762*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
4763*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
4764*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ss3q ]
4765*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m7, m9, m8, m10
4766*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m7, m9, m8, m10
4767*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9  ; 0 1
4768*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m10 ; 2 3
4769*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r4   ]
4770*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+r6   ]
4771*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+r11  ]
4772*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+r13  ]
4773*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
4774*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m1, m2, m3, m4
4775*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m1, m2, m3, m4
4776*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m0, m9, m10, m11
4777*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m9, m10, m11
4778*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m0
4779*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m9
4780*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m10
4781*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m11
4782*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m1, m2, m3, m4
4783*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m1, m2, m3, m4
4784*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2 ; 4 5
4785*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4 ; 6 7
4786*c0909341SAndroid Build Coastguard Worker    SWAP                 m9, m1
    ; Build the row-shifted copies, then interleave adjacent rows word-wise
    ; so one pmaddwd in the vertical loop applies two taps at once.
4787*c0909341SAndroid Build Coastguard Worker    shufps               m4, m7, m8, q1032  ; 1 2
4788*c0909341SAndroid Build Coastguard Worker    shufps               m5, m8, m9, q1032  ; 3 4
4789*c0909341SAndroid Build Coastguard Worker    shufps               m6, m9, m3, q1032  ; 5 6
4790*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m3, q1032      ; 7 _
4791*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7, m4 ; 01
4792*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4     ; 12
4793*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m8, m5 ; 23
4794*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m5     ; 34
4795*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m9, m6 ; 45
4796*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m6     ; 56
4797*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m10    ; 67
    ; The odd-phase pairs (12 34 56) go to the stack; .w4_loop's one-row
    ; advance swaps them with m0-m2.
4798*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m7
4799*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m8
4800*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m9
4801*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: keep both shuffle controls on the stack, then let
    ; MC_4TAP_SCALED_H (macro defined elsewhere) filter the rows. Judging by
    ; the loads that follow, each invocation leaves two filtered rows at the
    ; given stk offset -- confirm against the macro.
4802*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x00], m12
4803*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m14
4804*c0909341SAndroid Build Coastguard Worker    add                  r4, srcq
4805*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x40 ; 0 1
4806*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x50 ; 2 3
4807*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x60 ; 4 5
4808*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x70 ; 6 7
4809*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x40]
4810*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x50]
4811*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x60]
4812*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x70]
    ; Save r4 (the second source pointer formed by `add r4, srcq` above).
4813*c0909341SAndroid Build Coastguard Worker    mov          [stk+0xc0], r4
4814*c0909341SAndroid Build Coastguard Worker    shufps               m1, m4, m5, q1032 ; 1 2
4815*c0909341SAndroid Build Coastguard Worker    shufps               m2, m5, m6, q1032 ; 3 4
4816*c0909341SAndroid Build Coastguard Worker    shufps               m3, m6, m7, q1032 ; 5 6
4817*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m7, q1032     ; 7 _
4818*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m0
4819*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4, m1         ; 01
4820*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1             ; 12
4821*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m2         ; 23
4822*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m2             ; 34
4823*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m3         ; 45
4824*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m3             ; 56
4825*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7, [stk+0xb0] ; 67
4826*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4827*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
    ; All seven row pairs live on the stack; m0-m3 also still hold 01 23 45 67
    ; on entry to .w4_loop.
4828*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m0 ; 01
4829*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m1 ; 23
4830*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m2 ; 45
4831*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m3 ; 67
4832*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m4 ; 12
4833*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5 ; 34
4834*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m6 ; 56
    ; From here on m12-m15 are memory operands (the slots written above), and
    ; the rounding/shift values are named stack locations.
4835*c0909341SAndroid Build Coastguard Worker %define m12 [stk+0x00]
4836*c0909341SAndroid Build Coastguard Worker %define m14 [stk+0x10]
4837*c0909341SAndroid Build Coastguard Worker %define m13 [stk+0x20]
4838*c0909341SAndroid Build Coastguard Worker %define m15 [stk+0x30]
4839*c0909341SAndroid Build Coastguard Worker %define hrnd_mem [esp+0x00]
4840*c0909341SAndroid Build Coastguard Worker %define hsh_mem  [esp+0x10]
4841*c0909341SAndroid Build Coastguard Worker %if isput
4842*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [esp+0x20]
4843*c0909341SAndroid Build Coastguard Worker %else
4844*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
4845*c0909341SAndroid Build Coastguard Worker %endif
4846*c0909341SAndroid Build Coastguard Worker%endif
4847*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; One output row of the 4-column path: pick the vertical 8-tap filter for
    ; the current my fraction, apply it to the row pairs 01 23 45 67 held in
    ; m0-m3, add vrnd_mem, shift, and store 4 pixels. Exits through .ret once
    ; hd reaches zero.
4848*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
4849*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; Default filter 64 << 24: as 8 tap bytes this is a single 64 at tap 3.
    ; It is kept when (my & 0x3ff) >> 6 is zero. The cmovnz tests ZF from the
    ; `shr`; the mov and lea in between do not modify flags.
4850*c0909341SAndroid Build Coastguard Worker    mov                r11d, 64 << 24
4851*c0909341SAndroid Build Coastguard Worker    mov                r13d, myd
4852*c0909341SAndroid Build Coastguard Worker    shr                r13d, 6
4853*c0909341SAndroid Build Coastguard Worker    lea                r13d, [t1+r13]
4854*c0909341SAndroid Build Coastguard Worker    cmovnz             r11q, [base+subpel_filters+r13*8]
    ; Sign-extend the 8 tap bytes to words, then broadcast each tap pair.
4855*c0909341SAndroid Build Coastguard Worker    movq                 m9, r11q
4856*c0909341SAndroid Build Coastguard Worker    punpcklbw            m9, m9
4857*c0909341SAndroid Build Coastguard Worker    psraw                m9, 8
4858*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m9, q0000
4859*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m9, q1111
4860*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m7
4861*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m8
4862*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m9, q2222
4863*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m9, q3333
4864*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m7
4865*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m3, m9
4866*c0909341SAndroid Build Coastguard Worker %if isput
    ; m9 = shift count from [rsp+0x28], the upper qword of the slot that .w4
    ; filled from m12.
4867*c0909341SAndroid Build Coastguard Worker    movd                 m9, [rsp+0x28]
4868*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [rsp+0x30]
4869*c0909341SAndroid Build Coastguard Worker %else
4870*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
4871*c0909341SAndroid Build Coastguard Worker %endif
4872*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4873*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8
4874*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
4875*c0909341SAndroid Build Coastguard Worker    paddd                m4, vrnd_mem
4876*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: same filter selection using two 32-bit halves (r4 = low four
    ; taps, r3 = high four taps; defaults 64 << 24 and 0). The `xor` comes
    ; before the `shr`, so the cmovnz pair still tests the shift result.
    ; NOTE(review): r4 is shifted directly after my is saved to mym, so it
    ; appears to be the register behind myd here -- confirm.
4877*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd
4878*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4]
4879*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
4880*c0909341SAndroid Build Coastguard Worker    shr                  r4, 6
4881*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+r4]
4882*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
4883*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0]
4884*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4]
4885*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
4886*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3
4887*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m6
4888*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m7
4889*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
4890*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q0000
4891*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q1111
4892*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q2222
4893*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3333
    ; The products overwrite m0-m3 in place; .w4 also stored the row pairs at
    ; [stk+0x40..0x70].
4894*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4
4895*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
4896*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6
4897*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7
4898*c0909341SAndroid Build Coastguard Worker %if isput
    ; Shift count from [esp+0x18], the upper qword of hsh_mem ([esp+0x10]).
4899*c0909341SAndroid Build Coastguard Worker    movd                 m4, [esp+0x18]
4900*c0909341SAndroid Build Coastguard Worker %endif
4901*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
4902*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
4903*c0909341SAndroid Build Coastguard Worker    paddd                m0, vrnd_mem
4904*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
    ; After the SWAP the name m4 refers to the sum and m0 (aliased as m9) to
    ; the shift count, matching the x86-64 register roles used below.
4905*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m0
4906*c0909341SAndroid Build Coastguard Worker %define m9 m0
4907*c0909341SAndroid Build Coastguard Worker%endif
4908*c0909341SAndroid Build Coastguard Worker%if isput
    ; put: shift by m9, narrow to 16 bit, clamp to [0, pxmaxm], store 4 pixels
    ; and step dst by dsmp.
4909*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
4910*c0909341SAndroid Build Coastguard Worker    psrad                m4, m9
4911*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
4912*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m5
4913*c0909341SAndroid Build Coastguard Worker    pminsw               m4, pxmaxm
4914*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m4
4915*c0909341SAndroid Build Coastguard Worker    add                dstq, dsmp
4916*c0909341SAndroid Build Coastguard Worker%else
    ; prep: fixed shift by 6, narrow, and append 4 words (8 bytes) to tmp.
4917*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
4918*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
4919*c0909341SAndroid Build Coastguard Worker    movq             [tmpq], m4
4920*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
4921*c0909341SAndroid Build Coastguard Worker%endif
    ; One row done; leave when the row counter hits zero.
4922*c0909341SAndroid Build Coastguard Worker    dec                  hd
4923*c0909341SAndroid Build Coastguard Worker    jz .ret
4924*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4925*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
4926*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
4927*c0909341SAndroid Build Coastguard Worker    jz .w4_loop
4928*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+0x10]
4929*c0909341SAndroid Build Coastguard Worker    movd                 m9, [rsp+0x20]
4930*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq]
4931*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+r4]
4932*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
4933*c0909341SAndroid Build Coastguard Worker    jz .w4_skip_line
4934*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x40]
4935*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m1
4936*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x50]
4937*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m2
4938*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x60]
4939*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m3
4940*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
4941*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
4942*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13
4943*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
4944*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
4945*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8
4946*c0909341SAndroid Build Coastguard Worker    psrad                m4, m9
4947*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
4948*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m10, m4
4949*c0909341SAndroid Build Coastguard Worker    mova                m10, m4
4950*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4951*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
4952*c0909341SAndroid Build Coastguard Worker.w4_skip_line:
4953*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
4954*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+r6]
4955*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x50]
4956*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x60]
4957*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
4958*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
4959*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
4960*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
4961*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13
4962*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13
4963*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
4964*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
4965*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m0
4966*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m11
4967*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
4968*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
4969*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8
4970*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8
4971*c0909341SAndroid Build Coastguard Worker    psrad                m4, m9
4972*c0909341SAndroid Build Coastguard Worker    psrad                m6, m9
4973*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6
4974*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m10, m4
4975*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m9
4976*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m4, q1032
4977*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
4978*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4979*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4980*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m10
4981*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
4982*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
4983*c0909341SAndroid Build Coastguard Worker%else
4984*c0909341SAndroid Build Coastguard Worker    SWAP                 m0, m4
4985*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
4986*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
4987*c0909341SAndroid Build Coastguard Worker    add                 myd, dym
4988*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
4989*c0909341SAndroid Build Coastguard Worker    jnz .w4_next_line
4990*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x40]
4991*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x50]
4992*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x60]
4993*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x70]
4994*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
4995*c0909341SAndroid Build Coastguard Worker.w4_next_line:
4996*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+0xc0]
4997*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq]
4998*c0909341SAndroid Build Coastguard Worker    movu                 m5, [r5]
4999*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
5000*c0909341SAndroid Build Coastguard Worker    jz .w4_skip_line
5001*c0909341SAndroid Build Coastguard Worker    add          [stk+0xc0], ssq
5002*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x80]
5003*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x50]
5004*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m0
5005*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m3
5006*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x90]
5007*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x60]
5008*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m1
5009*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m6
5010*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0xa0]
5011*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x70]
5012*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m2
5013*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m7
5014*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
5015*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5016*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13
5017*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
5018*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
5019*c0909341SAndroid Build Coastguard Worker    paddd                m4, hrnd_mem
5020*c0909341SAndroid Build Coastguard Worker    psrad                m4, hsh_mem
5021*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
5022*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, [stk+0xb0], m4
5023*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m4
5024*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m3
5025*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
5026*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
5027*c0909341SAndroid Build Coastguard Worker.w4_skip_line:
5028*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
5029*c0909341SAndroid Build Coastguard Worker    movu                 m7, [r5  +ssq*1]
5030*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5  +ssq*2]
5031*c0909341SAndroid Build Coastguard Worker    mov          [stk+0xc0], r5
5032*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x50]
5033*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x60]
5034*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x70]
5035*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x90]
5036*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
5037*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
5038*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5039*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
5040*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13
5041*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13
5042*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m15
5043*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
5044*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m0
5045*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m1
5046*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m2
5047*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m3
5048*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
5049*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
5050*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0xa0]
5051*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0xb0]
5052*c0909341SAndroid Build Coastguard Worker    paddd                m4, hrnd_mem
5053*c0909341SAndroid Build Coastguard Worker    paddd                m6, hrnd_mem
5054*c0909341SAndroid Build Coastguard Worker    psrad                m4, hsh_mem
5055*c0909341SAndroid Build Coastguard Worker    psrad                m6, hsh_mem
5056*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6
5057*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4
5058*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m4, q1032
5059*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5
5060*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m7
5061*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m6
5062*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m6
5063*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m3
5064*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
5065*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
5066*c0909341SAndroid Build Coastguard Worker%endif
; Entry points .w8/.w16/.w32/.w64/.w128 of the shared column loop
; (.w_start -> .hloop, repeated through .hloop_prep).
; Each entry stores a column counter in [stk+0xf0]; the stored value equals
; the width in the label name divided by 8. .hloop_prep decrements that
; counter once per column and advances the saved dst/tmp pointer by 16 bytes
; each time.
; INIT_XMM ssse3 is the x86inc directive that selects XMM registers and the
; ssse3 instruction set for the code that follows.
; On x86-64, stk is (re)defined here as rsp+0x20; the x86-32 definition of
; stk is not visible in this chunk.
; NOTE(review): movifprep is defined outside this chunk; presumably it writes
; tmp_stridem only in the prep variant. The value passed is always twice the
; width in the label name - confirm it is the tmp row stride in bytes.
5067*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
5068*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5069*c0909341SAndroid Build Coastguard Worker %define stk rsp+0x20
5070*c0909341SAndroid Build Coastguard Worker%endif
5071*c0909341SAndroid Build Coastguard Worker.w8:
5072*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 1  ; 1 column
5073*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
5074*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5075*c0909341SAndroid Build Coastguard Worker.w16:
5076*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 2  ; 2 columns
5077*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
5078*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5079*c0909341SAndroid Build Coastguard Worker.w32:
5080*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 4  ; 4 columns
5081*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
5082*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5083*c0909341SAndroid Build Coastguard Worker.w64:
5084*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 8  ; 8 columns
5085*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
5086*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5087*c0909341SAndroid Build Coastguard Worker.w128:
5088*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 16  ; 16 columns
5089*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256  ; no jmp: falls through into .w_start
; Common setup for the .w8-.w128 paths, run once before the first column.
; It builds the per-lane horizontal positions (m14 = mx+dx*[0-3]) and the
; per-column step (m7 = dx*4), and saves the state that .hloop_prep reloads
; for every further column:
;   [stk+0x0f8] = src pointer      [stk+0x100] = dx*4
;   [stk+0x120] = broadcast m15    [stk+0x130] = dst/tmp pointer
;   x86-32 only: [stk+0x0f4] = my, [stk+0x134] = h
; NOTE(review): the incoming contents of m8 (dx, per the existing comments),
; m14 (mx), m11 and t0d are established before this chunk - confirm there.
5090*c0909341SAndroid Build Coastguard Worker.w_start:
5091*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5092*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
5093*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq  ; put only: x86inc "mov if operands are not identical"
5094*c0909341SAndroid Build Coastguard Worker %endif
5095*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m11  ; saved so .hloop_prep can reload m11 for each new column
5096*c0909341SAndroid Build Coastguard Worker %define hround m11
5097*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
5098*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d  ; m15 lane 0 = t0d >> 16 (broadcast below)
5099*c0909341SAndroid Build Coastguard Worker %if isprep
5100*c0909341SAndroid Build Coastguard Worker    mova                m13, [base+pd_m524256]  ; prep only: m13 is added to the vertical sums in .vloop
5101*c0909341SAndroid Build Coastguard Worker %endif
5102*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: the names hround, m12 and m10 become memory operands, and
    ; m8/m14/m15 become aliases of m0/m4/m3 for the code that follows.
5103*c0909341SAndroid Build Coastguard Worker %define hround [esp+0x00]
5104*c0909341SAndroid Build Coastguard Worker %define m12    [esp+0x10]
5105*c0909341SAndroid Build Coastguard Worker %define m10    [base+pd_0x3ff]
5106*c0909341SAndroid Build Coastguard Worker %define m8  m0
5107*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
5108*c0909341SAndroid Build Coastguard Worker %define m15 m3
5109*c0909341SAndroid Build Coastguard Worker %if isprep
5110*c0909341SAndroid Build Coastguard Worker  %define ssq ssm
5111*c0909341SAndroid Build Coastguard Worker %endif
5112*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+0x1f0]  ; NOTE(review): slot is written outside this chunk; mirrors t0d in the 64-bit path
5113*c0909341SAndroid Build Coastguard Worker    shr                  r4, 16
5114*c0909341SAndroid Build Coastguard Worker    movd                m15, r4  ; m15 lane 0 = value >> 16 (broadcast below)
5115*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m  ; reload r0 from its stack slot
5116*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym  ; reload my from its stack slot
5117*c0909341SAndroid Build Coastguard Worker%endif
5118*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6  ; step src back 6 bytes; presumably 3 16-bit pixels for the left filter taps - confirm
5119*c0909341SAndroid Build Coastguard Worker    pslld                m7, m8, 2 ; m7 = dx*4 (x step between groups of four pixels)
5120*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul] ; m8 = dx*[0-3]
5121*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000  ; broadcast lane 0 of m15 to all four dwords
5122*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; m14 = mx+dx*[0-3]
5123*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x100], m7  ; dx*4, reloaded by .hloop_prep
5124*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x120], m15  ; reloaded by .hloop_prep (.hloop modifies m15)
5125*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x0f8], srcq  ; src pointer that .hloop_prep restores for each column
5126*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x130], r0q ; dstq / tmpq; .hloop_prep adds 16 to this slot per column
5127*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 && UNIX64
5128*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd  ; keep h in hm; .hloop_prep restores hd from it
5129*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
5130*c0909341SAndroid Build Coastguard Worker    mov                  r5, hm
5131*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x0f4], myd  ; stash my for .hloop_prep
5132*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x134], r5  ; stash h for .hloop_prep
5133*c0909341SAndroid Build Coastguard Worker%endif
5134*c0909341SAndroid Build Coastguard Worker    jmp .hloop  ; skip .hloop_prep for the first column
; Move on to the next column of the block.
; Decrements the column counter in [stk+0x0f0] (set at the .w8-.w128 entries)
; and jumps to .ret when it reaches zero. Otherwise it advances the saved
; dst/tmp pointer by 16 bytes, restores the state saved by .w_start and
; .hloop, steps the x positions in m14 by a further dx*4, and falls through
; into .hloop.
5135*c0909341SAndroid Build Coastguard Worker.hloop_prep:
5136*c0909341SAndroid Build Coastguard Worker    dec   dword [stk+0x0f0]  ; one fewer column remaining
5137*c0909341SAndroid Build Coastguard Worker    jz .ret  ; all columns done
5138*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5139*c0909341SAndroid Build Coastguard Worker    add   qword [stk+0x130], 16  ; saved dst/tmp pointer += 16 bytes
5140*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm  ; restore h from hm
5141*c0909341SAndroid Build Coastguard Worker%else
5142*c0909341SAndroid Build Coastguard Worker    add   dword [stk+0x130], 16  ; saved dst/tmp pointer += 16 bytes
5143*c0909341SAndroid Build Coastguard Worker    mov                 myd, [stk+0x0f4]  ; my as stashed by .w_start
5144*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+0x134]  ; h as stashed by .w_start
5145*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+0x130]  ; advanced dst/tmp pointer
5146*c0909341SAndroid Build Coastguard Worker%endif
5147*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x100]  ; dx*4 saved by .w_start
5148*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x110]  ; x positions of the previous column's last four pixels (stored by .hloop)
5149*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5150*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]  ; m10 and m11 are overwritten in .vloop, so reload them
5151*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x10]  ; value saved from m11 in .w_start
5152*c0909341SAndroid Build Coastguard Worker%endif
5153*c0909341SAndroid Build Coastguard Worker    mova                m15, [stk+0x120]  ; broadcast value saved by .w_start
5154*c0909341SAndroid Build Coastguard Worker    mov                srcq, [stk+0x0f8]  ; every column restarts from the saved src pointer
5155*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5156*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [stk+0x130] ; dstq / tmpq, already advanced above
5157*c0909341SAndroid Build Coastguard Worker%else
5158*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd  ; write the restored my, h and dst/tmp back to their stack slots
5159*c0909341SAndroid Build Coastguard Worker    mov                  hm, r5
5160*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
5161*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m  ; reload r3 from its stack slot
5162*c0909341SAndroid Build Coastguard Worker%endif
5163*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7  ; advance by dx*4 to the first four x positions of the new column
5164*c0909341SAndroid Build Coastguard Worker.hloop:
5165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5166*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pq_0x40000000]
5167*c0909341SAndroid Build Coastguard Worker%else
5168*c0909341SAndroid Build Coastguard Worker %define m9 [base+pq_0x40000000]
5169*c0909341SAndroid Build Coastguard Worker%endif
5170*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5171*c0909341SAndroid Build Coastguard Worker    psrld                m2, m14, 10
5172*c0909341SAndroid Build Coastguard Worker    mova              [stk], m2
5173*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
5174*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
5175*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15, m6
5176*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m1
5177*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m5, q1032
5178*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5179*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
5180*c0909341SAndroid Build Coastguard Worker    movd                r6d, m2
5181*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0321
5182*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0321
5183*c0909341SAndroid Build Coastguard Worker    movd                r7d, m5
5184*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
5185*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r4*8]
5186*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+r6*8]
5187*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r7*8]
5188*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r9*8]
5189*c0909341SAndroid Build Coastguard Worker%else
5190*c0909341SAndroid Build Coastguard Worker    movd                 r0, m5
5191*c0909341SAndroid Build Coastguard Worker    movd                 rX, m2
5192*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0321
5193*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0321
5194*c0909341SAndroid Build Coastguard Worker    movd                 r4, m5
5195*c0909341SAndroid Build Coastguard Worker    movd                 r5, m2
5196*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r0*8]
5197*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+rX*8]
5198*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r4*8]
5199*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r5*8]
5200*c0909341SAndroid Build Coastguard Worker%endif
5201*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7 ; mx+dx*[4-7]
5202*c0909341SAndroid Build Coastguard Worker    pand                 m5, m14, m10
5203*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
5204*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
5205*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5206*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m2
5207*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x110], m14
5208*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m15, q1032
5209*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5210*c0909341SAndroid Build Coastguard Worker    movd               r10d, m15
5211*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
5212*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
5213*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0321
5214*c0909341SAndroid Build Coastguard Worker    movd               r13d, m15
5215*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
5216*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r10*8]
5217*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+r11*8]
5218*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r13*8]
5219*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+ rX*8]
5220*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5221*c0909341SAndroid Build Coastguard Worker    movq                r11, m14
5222*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m14
5223*c0909341SAndroid Build Coastguard Worker    movq                 rX, m14
5224*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
5225*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
5226*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
5227*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
5228*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [stk+ 0]
5229*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [stk+ 4]
5230*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [stk+ 8]
5231*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [stk+12]
5232*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
5233*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
5234*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m5, q1100
5235*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
5236*c0909341SAndroid Build Coastguard Worker    pand                 m7, m9, m4
5237*c0909341SAndroid Build Coastguard Worker    pand                 m8, m9, m6
5238*c0909341SAndroid Build Coastguard Worker    pand                m15, m9, m14
5239*c0909341SAndroid Build Coastguard Worker    pand                 m9, m9, m5
5240*c0909341SAndroid Build Coastguard Worker    pandn                m4, m0
5241*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
5242*c0909341SAndroid Build Coastguard Worker    pandn               m14, m2
5243*c0909341SAndroid Build Coastguard Worker    pandn                m5, m3
5244*c0909341SAndroid Build Coastguard Worker    por                  m7, m4
5245*c0909341SAndroid Build Coastguard Worker    por                  m8, m6
5246*c0909341SAndroid Build Coastguard Worker    por                 m15, m14
5247*c0909341SAndroid Build Coastguard Worker    por                  m9, m5
5248*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m7, m7
5249*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m7
5250*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m8, m8
5251*c0909341SAndroid Build Coastguard Worker    punpckhbw            m8, m8
5252*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
5253*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
5254*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
5255*c0909341SAndroid Build Coastguard Worker    psraw                m8, 8
5256*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m15, m15
5257*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m15
5258*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m9, m9
5259*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m9
5260*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
5261*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
5262*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
5263*c0909341SAndroid Build Coastguard Worker    psraw                m9, 8
5264*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m0
5265*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m7
5266*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
5267*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m8
5268*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m2
5269*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m15
5270*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m3
5271*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m9
5272*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
5273*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m1
5274*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
5275*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m2
5276*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
5277*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m3
5278*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
5279*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m4
5280*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
5281*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xd0], m5
5282*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
5283*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
5284*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
5285*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0xd0]
5286*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x90]
5287*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0xa0]
5288*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0xb0]
5289*c0909341SAndroid Build Coastguard Worker    mova                 m9, [stk+0xc0]
5290*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5291*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
5292*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6 ; 45a
5293*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45b
5294*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m8 ; 67a
5295*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8     ; 67b
5296*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; 01a
5297*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2     ; 01b
5298*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m9 ; 23a
5299*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m9     ; 23b
5300*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m4
5301*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m5
5302*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m6
5303*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m7
5304*c0909341SAndroid Build Coastguard Worker %define hround [rsp+0x10]
5305*c0909341SAndroid Build Coastguard Worker.vloop:
5306*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
5307*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24
5308*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
5309*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
5310*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]
5311*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]
5312*c0909341SAndroid Build Coastguard Worker    movq                m11, r6q
5313*c0909341SAndroid Build Coastguard Worker    punpcklbw           m11, m11
5314*c0909341SAndroid Build Coastguard Worker    psraw               m11, 8
5315*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m11, q0000
5316*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m11, q1111
5317*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m11, q2222
5318*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m11, q3333
5319*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m5, m0
5320*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m5, m1
5321*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m7, m2
5322*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m7, m3
5323*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
5324*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
5325*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
5326*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5327*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [stk+0x90], m10
5328*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [stk+0xa0], m10
5329*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [stk+0xb0], m11
5330*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, [stk+0xc0], m11
5331*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
5332*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5333*c0909341SAndroid Build Coastguard Worker %if isput
5334*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m12, q1032
5335*c0909341SAndroid Build Coastguard Worker %endif
5336*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8
5337*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9
5338*c0909341SAndroid Build Coastguard Worker%else
5339*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
5340*c0909341SAndroid Build Coastguard Worker    movd                 rX, m4
5341*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
5342*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0321
5343*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
5344*c0909341SAndroid Build Coastguard Worker    movd                 r5, m4
5345*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x110]
5346*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r0*8]
5347*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+rX*8]
5348*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r4*8]
5349*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+r5*8]
5350*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5351*c0909341SAndroid Build Coastguard Worker    mova           [stk+16], m14
5352*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+ 0]
5353*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+ 4]
5354*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+ 8]
5355*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+12]
5356*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
5357*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
5358*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
5359*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
5360*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
5361*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
5362*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
5363*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
5364*c0909341SAndroid Build Coastguard Worker    pand                 m0, m9, m4
5365*c0909341SAndroid Build Coastguard Worker    pand                 m1, m9, m6
5366*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9, m7
5367*c0909341SAndroid Build Coastguard Worker    pand                 m3, m9, m5
5368*c0909341SAndroid Build Coastguard Worker    pandn                m4, [stk+0x20]
5369*c0909341SAndroid Build Coastguard Worker    pandn                m6, [stk+0x30]
5370*c0909341SAndroid Build Coastguard Worker    pandn                m7, [stk+0x40]
5371*c0909341SAndroid Build Coastguard Worker    pandn                m5, [stk+0x50]
5372*c0909341SAndroid Build Coastguard Worker    por                  m0, m4
5373*c0909341SAndroid Build Coastguard Worker    por                  m1, m6
5374*c0909341SAndroid Build Coastguard Worker    por                  m2, m7
5375*c0909341SAndroid Build Coastguard Worker    por                  m3, m5
5376*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m0
5377*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
5378*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m1, m1
5379*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m1
5380*c0909341SAndroid Build Coastguard Worker    psraw                m4, 8
5381*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
5382*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8
5383*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
5384*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m2, m2
5385*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m2
5386*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m3, m3
5387*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m3
5388*c0909341SAndroid Build Coastguard Worker    psraw                m6, 8
5389*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
5390*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
5391*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
5392*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0a0], m4
5393*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0b0], m0
5394*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0c0], m5
5395*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0d0], m1
5396*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x140], m6
5397*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x150], m2
5398*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x160], m7
5399*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x170], m3
5400*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
5401*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
5402*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
5403*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
5404*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
5405*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
5406*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
5407*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
5408*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x60]
5409*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x70]
5410*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x80]
5411*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x90]
5412*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5413*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6      ; 45a
5414*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6          ; 45b
5415*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0      ; 67a
5416*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0          ; 67b
5417*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m4
5418*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m5
5419*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m6
5420*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m7
5421*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x20]
5422*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x30]
5423*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x40]
5424*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x50]
5425*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2      ; 01a
5426*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2          ; 01b
5427*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4      ; 23a
5428*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23b
5429*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
5430*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
5431*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
5432*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
5433*c0909341SAndroid Build Coastguard Worker.vloop: ; x86-32 vertical pass: 8-tap filter over the stored row pairs
; On entry m0-m3 hold the pairs 01a, 01b, 23a, 23b; 45a, 45b, 67a, 67b are
; read from stk+0x60..0x90. The two 8-tap sums are left in m4 (a) and m5 (b).
5434*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m ; reload r0 from its r0m slot (written back at the bottom of the loop)
5435*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4] ; base filter index kept on the stack; my>>6 is added to it below
5436*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff ; keep only the low 10 bits of my
5437*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd ; write the masked value back to its memory slot
5438*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3 ; default upper 4 filter bytes = 0 (must precede shr: xor rewrites the flags)
5439*c0909341SAndroid Build Coastguard Worker    shr                  r4, 6 ; NOTE(review): r4 presumably aliases myd here - confirm; ZF of this shift drives both cmovnz
5440*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+r4] ; table index = base + shifted value (lea leaves the flags alone)
5441*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24 ; default lower 4 filter bytes: 0, 0, 0, 64 (mov leaves the flags alone)
5442*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0] ; shifted value non-zero: take the table entry's low dword ...
5443*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4] ; ... and its high dword
5444*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
5445*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3
5446*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m6 ; low qword of m7 = the 8 filter bytes
5447*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m7 ; put each byte in both halves of a word ...
5448*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8 ; ... then shift arithmetically: 8 sign-extended 16-bit coefficients
5449*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q0000 ; broadcast coefficient pair 0/1
5450*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q1111 ; broadcast coefficient pair 2/3
5451*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4 ; 01a * c0/c1
5452*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4 ; 01b * c0/c1
5453*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5 ; 23a * c2/c3
5454*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5 ; 23b * c2/c3
5455*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q2222 ; broadcast coefficient pair 4/5
5456*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3333 ; broadcast coefficient pair 6/7
5457*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2 ; a: taps 0-3
5458*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3 ; b: taps 0-3
5459*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [stk+0x60], m6 ; 45a * c4/c5
5460*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [stk+0x70], m6 ; 45b * c4/c5
5461*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [stk+0x80], m7 ; 67a * c6/c7
5462*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [stk+0x90], m7 ; 67b * c6/c7
5463*c0909341SAndroid Build Coastguard Worker %if isput
5464*c0909341SAndroid Build Coastguard Worker    movd                 m6, [esp+0x18] ; shift count used by the put-path psrad that follows this branch
5465*c0909341SAndroid Build Coastguard Worker %endif
5466*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2 ; a: taps 0-5
5467*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3 ; b: taps 0-5
5468*c0909341SAndroid Build Coastguard Worker    paddd                m0, vrnd_mem ; add vrnd_mem (presumably the vertical rounding term, going by its name)
5469*c0909341SAndroid Build Coastguard Worker    paddd                m1, vrnd_mem
5470*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0 ; m4 = full 8-tap sum, a half
5471*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1 ; m5 = full 8-tap sum, b half
5472*c0909341SAndroid Build Coastguard Worker%endif
5473*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
; put variant: scale the dword sums in m4/m5 down, clamp, and store one row to dst
5474*c0909341SAndroid Build Coastguard Worker    psrad                m4, m6 ; shift by the count in m6 (on x86-32 loaded from [esp+0x18] just above)
5475*c0909341SAndroid Build Coastguard Worker    psrad                m5, m6
5476*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5 ; 2 x 4 dwords -> 8 signed words (saturating)
5477*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
5478*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m7 ; clamp below at 0
5479*c0909341SAndroid Build Coastguard Worker    pminsw               m4, pxmaxm ; clamp above at pxmaxm
5480*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m4 ; store the 8 output words
5481*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm ; step dst to the next row
5482*c0909341SAndroid Build Coastguard Worker%else
; other variant: fixed shift by 6, no clamp, store one row to tmp
5483*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
5484*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
5485*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5 ; 2 x 4 dwords -> 8 signed words (saturating)
5486*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
5487*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem ; step tmp to the next row
5488*c0909341SAndroid Build Coastguard Worker%endif
5489*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one output row finished
5490*c0909341SAndroid Build Coastguard Worker    jz .hloop_prep ; no rows left: leave the row loop (.hloop_prep is defined outside this excerpt)
5491*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: step my by dy. If nothing above the 10-bit fraction is set the same
; rows are refiltered; otherwise new source rows are brought in. The code up to
; the jmp below handles the one-new-row case.
5492*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd ; advance the vertical position
5493*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff ; any bit above the low 10 set?
5494*c0909341SAndroid Build Coastguard Worker    jz .vloop ; no: filter the same rows again
5495*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400 ; ZF is consumed by jz .skip_line below (the movs in between keep flags)
5496*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x140], myd ; spill my; it is reloaded from this slot further down
5497*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [stk+ 0] ; four dword source offsets from the stack,
5498*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [stk+ 4] ; used below as srcq+reg*2
5499*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [stk+ 8]
5500*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [stk+12]
5501*c0909341SAndroid Build Coastguard Worker    jz .skip_line ; bit 10 clear: take .skip_line (which runs the H filter macro twice)
5502*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+unpckw] ; pshufb control taken from the unpckw constant
5503*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+r10*2] ; eight unaligned 16-byte loads from the current source row,
5504*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+r11*2] ; one per offset register
5505*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+r13*2]
5506*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ rX*2]
5507*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ r4*2]
5508*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ r6*2]
5509*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ r7*2]
5510*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ r9*2]
5511*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; step src to the next row
5512*c0909341SAndroid Build Coastguard Worker    mov                 myd, [stk+0x140] ; restore my
5513*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym ; reload dy from its memory slot
5514*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m14, q1032 ; same control with its two 64-bit halves swapped
5515*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14                ; 0a 1a
5516*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14                ; 0b 1b
5517*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m15                ; 3a 2a
5518*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m15                ; 3b 2b
5519*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [stk+0x50] ; multiply each load by its 16-byte operand at stk+0x10..0x80
5520*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, [stk+0x60] ; (presumably the per-column horizontal coefficients; they are
5521*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, [stk+0x70] ; written outside this excerpt - confirm)
5522*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, [stk+0x80]
5523*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [stk+0x10]
5524*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [stk+0x20]
5525*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [stk+0x30]
5526*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [stk+0x40]
5527*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m9 ; two rounds of pairwise adds reduce every product vector
5528*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m11 ; to a single dword
5529*c0909341SAndroid Build Coastguard Worker    mova                m11, hround ; m11 is free again: load the hround term
5530*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
5531*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
5532*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m10 ; m8 = 4 sums from the m8-m11 loads
5533*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m6 ; m4 = 4 sums from the m4-m7 loads
5534*c0909341SAndroid Build Coastguard Worker    paddd                m4, m11 ; add hround
5535*c0909341SAndroid Build Coastguard Worker    paddd                m8, m11
5536*c0909341SAndroid Build Coastguard Worker    psrad                m4, m12 ; shift by the count held in m12
5537*c0909341SAndroid Build Coastguard Worker    psrad                m8, m12
5538*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m8 ; m4 = the new row as 8 words
5539*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [stk+0x90], m14    ; 4a 5a
5540*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [stk+0xa0], m14    ; 4b 5b
5541*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [stk+0xb0], m15    ; 7a 6a
5542*c0909341SAndroid Build Coastguard Worker    pshufb               m8, [stk+0xc0], m15    ; 7b 6b
5543*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2 ; 12a
5544*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3 ; 12b
5545*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5 ; 34a
5546*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6 ; 34b
5547*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7 ; 56a
5548*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m8 ; 56b
5549*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4 ; 78a
5550*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4 ; move the new row's upper 4 words into the low half for the b pair
5551*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m4 ; 78b
5552*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5 ; write the shifted upper pairs back; the lower pairs stay in m0-m3
5553*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m6
5554*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m7
5555*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m8
5556*c0909341SAndroid Build Coastguard Worker    jmp .vloop
5557*c0909341SAndroid Build Coastguard Worker.skip_line: ; x86-64: the window moves on by a whole row pair
; NOTE(review): MC_8TAP_SCALED_H is defined outside this excerpt; judging by
; the unpacks below, the two invocations leave two new rows in m4 and m8.
5558*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
5559*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
5560*c0909341SAndroid Build Coastguard Worker    mov                 myd, [stk+0x140] ; restore my (spilled before the branch to this label)
5561*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym ; reload dy from its memory slot
; Every pair moves down one slot: the former 23 becomes 01, 45 becomes 23,
; 67 becomes 45, and the new 67 is built from m4/m8.
5562*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2         ; 01a
5563*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3         ; 01b
5564*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x90] ; 23a
5565*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0xa0] ; 23b
5566*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0xb0] ; 45a
5567*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0xc0] ; 45b
5568*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4, m8     ; 67a
5569*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m8         ; 67b
5570*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5 ; store the new 45a/45b/67a/67b
5571*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m6
5572*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m7
5573*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m4 ; execution continues at the jmp .vloop that follows the arch conditional
5574*c0909341SAndroid Build Coastguard Worker%else
; x86-32: step my by dy; stay on the same rows or branch to .next_line
5575*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0 ; save r0 to its slot (reloaded at the top of .vloop)
5576*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym ; reload my from memory
5577*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m ; restore r3, which .vloop used as scratch for the filter's high dword
5578*c0909341SAndroid Build Coastguard Worker    add                 myd, dym ; advance the vertical position
5579*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff ; any bit above the low 10 set?
5580*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd ; store the new my (mov keeps the flags from the test)
5581*c0909341SAndroid Build Coastguard Worker    jnz .next_line ; yes: new source rows are needed
5582*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x20] ; same rows: reload 01a/01b/23a/23b, since the pmaddwd
5583*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x30] ; instructions in .vloop overwrote m0-m3
5584*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x40]
5585*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x50]
5586*c0909341SAndroid Build Coastguard Worker    jmp .vloop
5587*c0909341SAndroid Build Coastguard Worker.next_line: ; x86-32: bring in new source rows; falls through for the one-row case
5588*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400 ; ZF is consumed by jz .skip_line below (the movs in between keep flags)
5589*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+ 0] ; four dwords from stk+0..12 (NOTE(review): presumably the source
5590*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+ 4] ; offsets that MC_8TAP_SCALED_H expects in these registers; the
5591*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+ 8] ; macro body is outside this excerpt)
5592*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+12]
5593*c0909341SAndroid Build Coastguard Worker    jz .skip_line ; bit 10 clear: take .skip_line instead
5594*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 (the new row is read back from stk+0xe0 below)
5595*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+unpckw] ; pshufb control taken from the unpckw constant
5596*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q1032 ; same control with its two 64-bit halves swapped
5597*c0909341SAndroid Build Coastguard Worker    pshufb               m0, [stk+0x20], m7 ; 0a 1a
5598*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [stk+0x30], m7 ; 0b 1b
5599*c0909341SAndroid Build Coastguard Worker    pshufb               m2, [stk+0x40], m4 ; 3a 2a
5600*c0909341SAndroid Build Coastguard Worker    pshufb               m3, [stk+0x50], m4 ; 3b 2b
5601*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [stk+0x60], m7 ; 4a 5a
5602*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [stk+0x70], m7 ; 4b 5b
5603*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [stk+0x80], m4 ; 7a 6a
5604*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2 ; 12a
5605*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3 ; 12b
5606*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5 ; 34a
5607*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6 ; 34b
5608*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0 ; store the shifted lower pairs; m0-m3 also stay live for .vloop
5609*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
5610*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
5611*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
5612*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7 ; 56a
5613*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m5 ; stored right away so m5 can be reused as a temporary
5614*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [stk+0x90], m4 ; 7b 6b
5615*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, [stk+0xe0] ; 78a
5616*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5 ; 56b
5617*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m6
5618*c0909341SAndroid Build Coastguard Worker    movq                 m6, [stk+0xe8] ; upper 4 words of the new row
5619*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m7
5620*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6 ; 78b
5621*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym ; reload my from memory before re-entering .vloop
5622*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5
5623*c0909341SAndroid Build Coastguard Worker    jmp .vloop
5624*c0909341SAndroid Build Coastguard Worker.skip_line: ; x86-32: the window moves on by a whole row pair
; NOTE(review): MC_8TAP_SCALED_H is defined outside this excerpt; judging by
; the code below, row 8 lands in stk+0xe0 and row 9 is left in m0.
5625*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
5626*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0xa0, 0       ; 9
5627*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0xe0] ; row 8
; Every pair moves down one slot: the former 23 becomes 01, 45 becomes 23,
; 67 becomes 45, and the new 67 is built from rows 8 and 9.
5628*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x60] ; 23a
5629*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x70] ; 23b
5630*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x80] ; 45a
5631*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x90] ; 45b
5632*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0     ; 67a
5633*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0         ; 67b
5634*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x40] ; 01a
5635*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x50] ; 01b
5636*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym ; reload my from memory before re-entering .vloop
5637*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2 ; write all eight pairs to their new slots; m0-m3 stay live for .vloop
5638*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
5639*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m4
5640*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m5
5641*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m6
5642*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m7
5643*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
5644*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
5645*c0909341SAndroid Build Coastguard Worker%endif
5646*c0909341SAndroid Build Coastguard Worker    jmp .vloop ; reached by falling through from .skip_line on both architectures
5647*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
5648*c0909341SAndroid Build Coastguard Worker.dy1: ; dispatch through the dy1 jump table (presumably the unit vertical step case, per the label name)
5649*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] ; zero-extended 16-bit table entry selected by wq
5650*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg ; the entry is an offset from base_reg: form the absolute target
5651*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; jump to the selected handler
5652*c0909341SAndroid Build Coastguard Worker%if isput
5653*c0909341SAndroid Build Coastguard Worker.dy1_w2:
5654*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5655*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5656*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
5657*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
5658*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
5659*c0909341SAndroid Build Coastguard Worker %else
5660*c0909341SAndroid Build Coastguard Worker  %define m8  m0
5661*c0909341SAndroid Build Coastguard Worker  %define m9  m1
5662*c0909341SAndroid Build Coastguard Worker  %define m14 m4
5663*c0909341SAndroid Build Coastguard Worker  %define m15 m3
5664*c0909341SAndroid Build Coastguard Worker  %define m11 [esp+0x00]
5665*c0909341SAndroid Build Coastguard Worker  %define m12 [esp+0x10]
5666*c0909341SAndroid Build Coastguard Worker  %define m13 [esp+0x20]
5667*c0909341SAndroid Build Coastguard Worker    movzx                r5, byte [esp+0x1f0]
5668*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
5669*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
5670*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
5671*c0909341SAndroid Build Coastguard Worker %endif
5672*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
5673*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8
5674*c0909341SAndroid Build Coastguard Worker    paddd               m14, m9 ; mx+dx*[0-1]
5675*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5676*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_0x4000]
5677*c0909341SAndroid Build Coastguard Worker %endif
5678*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
5679*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
5680*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
5681*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8
5682*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
5683*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
5684*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5685*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
5686*c0909341SAndroid Build Coastguard Worker %else
5687*c0909341SAndroid Build Coastguard Worker    movd                r3d, m15
5688*c0909341SAndroid Build Coastguard Worker %endif
5689*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_q]
5690*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_s_shuf2]
5691*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r4*8+2]
5692*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5693*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r6*8+2]
5694*c0909341SAndroid Build Coastguard Worker %else
5695*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r3*8+2]
5696*c0909341SAndroid Build Coastguard Worker %endif
5697*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5698*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m2
5699*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5700*c0909341SAndroid Build Coastguard Worker    paddd               m14, m14
5701*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5702*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5703*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
5704*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
5705*c0909341SAndroid Build Coastguard Worker    mova              [stk], m14
5706*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
5707*c0909341SAndroid Build Coastguard Worker    SWAP                 m6, m3
5708*c0909341SAndroid Build Coastguard Worker  %define m15 m6
5709*c0909341SAndroid Build Coastguard Worker %endif
5710*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
5711*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
5712*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*2]
5713*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ss3q ]
5714*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5715*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m7
5716*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5717*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
5718*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
5719*c0909341SAndroid Build Coastguard Worker    pand                 m9, m8
5720*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
5721*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
5722*c0909341SAndroid Build Coastguard Worker    por                 m15, m9
5723*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
5724*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
5725*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
5726*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
5727*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
5728*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
5729*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
5730*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
5731*c0909341SAndroid Build Coastguard Worker %else
5732*c0909341SAndroid Build Coastguard Worker    pand                 m7, m5, [base+pd_0x4000]
5733*c0909341SAndroid Build Coastguard Worker    pandn                m5, m15
5734*c0909341SAndroid Build Coastguard Worker    por                  m5, m7
5735*c0909341SAndroid Build Coastguard Worker  %define m15 m5
5736*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5737*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4]
5738*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
5739*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
5740*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+myd]
5741*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
5742*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0]
5743*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4]
5744*c0909341SAndroid Build Coastguard Worker    mov          [stk+0x20], r3
5745*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5746*c0909341SAndroid Build Coastguard Worker %endif
5747*c0909341SAndroid Build Coastguard Worker    punpcklbw           m15, m15
5748*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
5749*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m0, m1, m2, m3
5750*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
5751*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5752*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m4, m5, m6
5753*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m4, m5, m6
5754*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
5755*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3
5756*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
5757*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m6
5758*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m2, m4, m6
5759*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, m12}, m0, m2, m4, m6
5760*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2 ; 0 1 2 3
5761*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6 ; 4 5 6
5762*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m4
5763*c0909341SAndroid Build Coastguard Worker    movq                m10, r4
5764*c0909341SAndroid Build Coastguard Worker %else
5765*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m15
5766*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
5767*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3
5768*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
5769*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
5770*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
5771*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
5772*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m1, m7, m6
5773*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m1, m7, m6
5774*c0909341SAndroid Build Coastguard Worker  %define m14 [stk+0x00]
5775*c0909341SAndroid Build Coastguard Worker  %define m15 [stk+0x10]
5776*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m7
5777*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m6
5778*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m2, m1, m6
5779*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, m12}, m0, m2, m1, m6
5780*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
5781*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m6
5782*c0909341SAndroid Build Coastguard Worker  %define m8  m6
5783*c0909341SAndroid Build Coastguard Worker  %define m9  m4
5784*c0909341SAndroid Build Coastguard Worker  %define m10 m5
5785*c0909341SAndroid Build Coastguard Worker    movd                m10, r4
5786*c0909341SAndroid Build Coastguard Worker    movd                 m9, [stk+0x20]
5787*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m9
5788*c0909341SAndroid Build Coastguard Worker %endif
5789*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
5790*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
5791*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000
5792*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111
5793*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222
5794*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
5795*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5796*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m7
5797*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m8
5798*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m9
5799*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m10
5800*c0909341SAndroid Build Coastguard Worker  %define m7  [stk+0x50]
5801*c0909341SAndroid Build Coastguard Worker  %define m8  [stk+0x60]
5802*c0909341SAndroid Build Coastguard Worker  %define m9  [stk+0x70]
5803*c0909341SAndroid Build Coastguard Worker  %define m10 [stk+0x80]
5804*c0909341SAndroid Build Coastguard Worker %endif
5805*c0909341SAndroid Build Coastguard Worker    palignr              m2, m1, m0, 4 ; 1 2 3 4
5806*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0, m2    ; 01 12
5807*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2        ; 23 34
5808*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m1, q2121 ; 5 6 5 6
5809*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m4    ; 45 56
5810*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5811*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
5812*c0909341SAndroid Build Coastguard Worker %endif
5813*c0909341SAndroid Build Coastguard Worker.dy1_w2_loop: ; each pass: load 2 source rows, filter them, store 2 output rows
    ; State carried between iterations (established just before this label):
    ;   m3 = row pairs 01 12, m0 = 23 34, m2 = 45 56 (word-interleaved),
    ;   m4 = most recently packed rows; its top dword is the newest row,
    ;   m7..m10 = the four broadcast tap pairs multiplied against those row pairs,
    ;   m14 / m15 = pshufb mask / pmaddwd coefficients used on freshly loaded rows.
    ; The row numbers in the comments below are relative to the first iteration.
5814*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0] ; next source row (row 7)
5815*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1] ; row after it (row 8)
5816*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance source by the two rows just loaded
5817*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3, m7       ; accumulator = (01 12) * tap pair 0
5818*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0           ; rotate: 23 34 takes the 01 12 slot next pass
5819*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8           ; (23 34) * tap pair 1
5820*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14          ; arrange the new rows' samples for pmaddwd
5821*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
5822*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m15          ; multiply by the m15 coefficients
5823*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m15
5824*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m6           ; pairwise sums: low qword from m1's row, high qword from m6's row
5825*c0909341SAndroid Build Coastguard Worker    paddd                m1, m11          ; add the m11 rounding term
5826*c0909341SAndroid Build Coastguard Worker    psrad                m1, m12          ; shift by the count in the low qword of m12
5827*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m1           ; saturate to words: 7 8 7 8 (one dword per row)
5828*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0           ; accumulate tap pair 1 products
5829*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2           ; rotate: 45 56 takes the 23 34 slot next pass
5830*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9           ; (45 56) * tap pair 2
5831*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2           ; accumulate tap pair 2 products
5832*c0909341SAndroid Build Coastguard Worker    palignr              m2, m1, m4, 12   ; top dword of m4 (row 6) followed by low 12 bytes of m1: 6 7 8 7
5833*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1        ; 67 78 (stays in m2 as the 45 56 operand of the next pass)
5834*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m2, m10      ; (67 78) * tap pair 3
5835*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13          ; add the m13 term ahead of the final shift
5836*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4           ; accumulate tap pair 3 products
5837*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6           ; zero, used as the lower clamp bound
5838*c0909341SAndroid Build Coastguard Worker    mova                 m4, m1           ; keep rows 7 8 so the next palignr can fetch row 8
5839*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m12, q1032   ; swap qword halves of m12: second shift count into the low qword
5840*c0909341SAndroid Build Coastguard Worker    psrad                m5, m1           ; final shift of the accumulated sums
5841*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5           ; saturate to words: first row in dword 0, second row in dword 1
5842*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6           ; clamp below at 0
5843*c0909341SAndroid Build Coastguard Worker    pminsw               m5, pxmaxm       ; clamp above at pxmaxm
5844*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5           ; store 4 bytes (two words) of the first output row
5845*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q1032    ; swap the two dwords of the low qword: second row to the bottom
5846*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5           ; store 4 bytes of the second output row
5847*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance destination by two rows
5848*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2            ; two rows produced per pass
5849*c0909341SAndroid Build Coastguard Worker    jg .dy1_w2_loop                       ; repeat while rows remain (signed > 0)
5850*c0909341SAndroid Build Coastguard Worker    RET
5851*c0909341SAndroid Build Coastguard Worker%endif
5852*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
5853*c0909341SAndroid Build Coastguard Worker.dy1_w4:
5854*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5855*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5856*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m11
5857*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m12
5858*c0909341SAndroid Build Coastguard Worker %if isput
5859*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m13
5860*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [rsp+0x30]
5861*c0909341SAndroid Build Coastguard Worker  %define stk rsp+0x40
5862*c0909341SAndroid Build Coastguard Worker %else
5863*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
5864*c0909341SAndroid Build Coastguard Worker  %define stk rsp+0x30
5865*c0909341SAndroid Build Coastguard Worker %endif
5866*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
5867*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
5868*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
5869*c0909341SAndroid Build Coastguard Worker%else
5870*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
5871*c0909341SAndroid Build Coastguard Worker %define m9  [base+pd_0x4000]
5872*c0909341SAndroid Build Coastguard Worker %define m8  m0
5873*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
5874*c0909341SAndroid Build Coastguard Worker %define m15 m3
5875*c0909341SAndroid Build Coastguard Worker %if isprep
5876*c0909341SAndroid Build Coastguard Worker  %define ssq r3
5877*c0909341SAndroid Build Coastguard Worker %endif
5878*c0909341SAndroid Build Coastguard Worker    movzx                r5, byte [esp+0x1f0]
5879*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
5880*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
5881*c0909341SAndroid Build Coastguard Worker%endif
5882*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
5883*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5884*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_0x4000]
5885*c0909341SAndroid Build Coastguard Worker%endif
5886*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
5887*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
5888*c0909341SAndroid Build Coastguard Worker    pand                 m0, m14, m10
5889*c0909341SAndroid Build Coastguard Worker    psrld                m0, 6
5890*c0909341SAndroid Build Coastguard Worker    paddd               m15, m0
5891*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m15, q1032
5892*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5893*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
5894*c0909341SAndroid Build Coastguard Worker    movd               r11d, m7
5895*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
5896*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0321
5897*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
5898*c0909341SAndroid Build Coastguard Worker    movd               r13d, m7
5899*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+bdct_lb_q+ 0]
5900*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+bdct_lb_q+16]
5901*c0909341SAndroid Build Coastguard Worker    movd                m13, [base+subpel_filters+ r4*8+2]
5902*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+ r6*8+2]
5903*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r11*8+2]
5904*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r13*8+2]
5905*c0909341SAndroid Build Coastguard Worker%else
5906*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
5907*c0909341SAndroid Build Coastguard Worker    movd                 r4, m7
5908*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
5909*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0321
5910*c0909341SAndroid Build Coastguard Worker    movd                 rX, m15
5911*c0909341SAndroid Build Coastguard Worker    movd                 r5, m7
5912*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_q+ 0]
5913*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+bdct_lb_q+16]
5914*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base+subpel_filters+r0*8+2]
5915*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+rX*8+2]
5916*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+r4*8+2]
5917*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r5*8+2]
5918*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
5919*c0909341SAndroid Build Coastguard Worker %if isprep
5920*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5921*c0909341SAndroid Build Coastguard Worker %endif
5922*c0909341SAndroid Build Coastguard Worker %define m10 m5
5923*c0909341SAndroid Build Coastguard Worker %define m11 m6
5924*c0909341SAndroid Build Coastguard Worker %define m12 m1
5925*c0909341SAndroid Build Coastguard Worker %define m13 m1
5926*c0909341SAndroid Build Coastguard Worker%endif
5927*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5928*c0909341SAndroid Build Coastguard Worker    paddd               m14, m14
5929*c0909341SAndroid Build Coastguard Worker    punpckldq           m13, m2
5930*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m4
5931*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m13, m15
5932*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5933*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m2
5934*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5935*c0909341SAndroid Build Coastguard Worker    pand                 m9, m0
5936*c0909341SAndroid Build Coastguard Worker%else
5937*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9, m0
5938*c0909341SAndroid Build Coastguard Worker %define m9 m2
5939*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m4
5940*c0909341SAndroid Build Coastguard Worker%endif
5941*c0909341SAndroid Build Coastguard Worker    pandn                m0, m13
5942*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5943*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m0
5944*c0909341SAndroid Build Coastguard Worker%else
5945*c0909341SAndroid Build Coastguard Worker %define m13 m0
5946*c0909341SAndroid Build Coastguard Worker%endif
5947*c0909341SAndroid Build Coastguard Worker    por                 m13, m9
5948*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m13, m13
5949*c0909341SAndroid Build Coastguard Worker    punpcklbw           m13, m13
5950*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
5951*c0909341SAndroid Build Coastguard Worker    psraw               m13, 8
5952*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m14, m10
5953*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m11
5954*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+spel_s_shuf2]
5955*c0909341SAndroid Build Coastguard Worker    movd                r4d, m14
5956*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 24
5957*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5958*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m13
5959*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m15
5960*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5961*c0909341SAndroid Build Coastguard Worker%endif
5962*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14, m2
5963*c0909341SAndroid Build Coastguard Worker    psubb               m14, m7
5964*c0909341SAndroid Build Coastguard Worker    paddb               m12, m10
5965*c0909341SAndroid Build Coastguard Worker    paddb               m14, m10
5966*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5967*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+ssq*1]
5968*c0909341SAndroid Build Coastguard Worker    lea                 r11, [r4+ssq*2]
5969*c0909341SAndroid Build Coastguard Worker    lea                 r13, [r4+ss3q ]
5970*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*0]
5971*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+ssq*1]
5972*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+ssq*2]
5973*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+ss3q ]
5974*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r4   ]
5975*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6   ]
5976*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r11  ]
5977*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r13  ]
5978*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5979*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m7, m9, m8, m10
5980*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
5981*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m1, m3, m2, m4
5982*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m1, m3, m2, m4
5983*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x10]
5984*c0909341SAndroid Build Coastguard Worker    movd                xm6, [rsp+0x20]
5985*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m1
5986*c0909341SAndroid Build Coastguard Worker    phaddd               m9, m3
5987*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m2
5988*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m4
5989*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
5990*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
5991*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
5992*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m7, m9, m8, m10
5993*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m7, m9, m8, m10
5994*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9  ; 0 1
5995*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m10 ; 2 3
5996*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r4   ]
5997*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+r6   ]
5998*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+r11  ]
5999*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
6000*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m1, m2, m3
6001*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m1, m2, m3
6002*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m0, m9, m10
6003*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m9, m10
6004*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m0
6005*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m9
6006*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m10
6007*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6008*c0909341SAndroid Build Coastguard Worker    mov                r13d, 64 << 24
6009*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
6010*c0909341SAndroid Build Coastguard Worker    cmovnz             r13q, [base+subpel_filters+myq*8]
6011*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m1, m2, m3
6012*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m1, m2, m3
6013*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2 ; 4 5
6014*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m3 ; 6 6
6015*c0909341SAndroid Build Coastguard Worker    SWAP                 m9, m1
6016*c0909341SAndroid Build Coastguard Worker    shufps               m4, m7, m8, q1032  ; 1 2
6017*c0909341SAndroid Build Coastguard Worker    shufps               m5, m8, m9, q1032  ; 3 4
6018*c0909341SAndroid Build Coastguard Worker    shufps               m6, m9, m3, q1032  ; 5 6
6019*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7, m4 ; 01
6020*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4     ; 12
6021*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m8, m5 ; 23
6022*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m5     ; 34
6023*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m9, m6 ; 45
6024*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m6     ; 56
6025*c0909341SAndroid Build Coastguard Worker    movq                m10, r13
6026*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x00], m1
6027*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m8
6028*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m2
6029*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m9
6030*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m3
6031*c0909341SAndroid Build Coastguard Worker %define hrnd_mem [rsp+0x10]
6032*c0909341SAndroid Build Coastguard Worker %define hsh_mem  [rsp+0x20]
6033*c0909341SAndroid Build Coastguard Worker %define vsh_mem  [rsp+0x28]
6034*c0909341SAndroid Build Coastguard Worker %if isput
6035*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [rsp+0x30]
6036*c0909341SAndroid Build Coastguard Worker %else
6037*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
6038*c0909341SAndroid Build Coastguard Worker %endif
6039*c0909341SAndroid Build Coastguard Worker%else
6040*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m12
6041*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m14
6042*c0909341SAndroid Build Coastguard Worker    add                  r4, srcq
6043*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x60 ; 0 1
6044*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x70 ; 2 3
6045*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x80 ; 4 5
6046*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq]
6047*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r4]
6048*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
6049*c0909341SAndroid Build Coastguard Worker    add                  r4, ssq
6050*c0909341SAndroid Build Coastguard Worker    mov          [stk+0xb0], r4
6051*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m12
6052*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
6053*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m13
6054*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m15
6055*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m2
6056*c0909341SAndroid Build Coastguard Worker    paddd                m7, [esp+0x00]
6057*c0909341SAndroid Build Coastguard Worker    psrad                m7, [esp+0x10]
6058*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m7 ; 6 6
6059*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x60]
6060*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x70]
6061*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x80]
6062*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6063*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+0x1f4]
6064*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
6065*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6066*c0909341SAndroid Build Coastguard Worker    lea                  rX, [rX+myd]
6067*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
6068*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+rX*8+0]
6069*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+rX*8+4]
6070*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6071*c0909341SAndroid Build Coastguard Worker    shufps               m1, m4, m5, q1032 ; 1 2
6072*c0909341SAndroid Build Coastguard Worker    shufps               m2, m5, m6, q1032 ; 3 4
6073*c0909341SAndroid Build Coastguard Worker    shufps               m3, m6, m7, q1032 ; 5 6
6074*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m7
6075*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4, m1         ; 01
6076*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1             ; 12
6077*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m2         ; 23
6078*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m2             ; 34
6079*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m3         ; 45
6080*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m3             ; 56
6081*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
6082*c0909341SAndroid Build Coastguard Worker    movd                 m3, r5
6083*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
6084*c0909341SAndroid Build Coastguard Worker %if isput
6085*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
6086*c0909341SAndroid Build Coastguard Worker %endif
6087*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+0xb0]
6088*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m4 ; 12
6089*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m1 ; 23
6090*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m2 ; 45
6091*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m5 ; 34
6092*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m6 ; 56
6093*c0909341SAndroid Build Coastguard Worker %define m12 [stk+0x20]
6094*c0909341SAndroid Build Coastguard Worker %define m14 [stk+0x30]
6095*c0909341SAndroid Build Coastguard Worker %define m13 [stk+0x40]
6096*c0909341SAndroid Build Coastguard Worker %define m15 [stk+0x50]
6097*c0909341SAndroid Build Coastguard Worker %define hrnd_mem [esp+0x00]
6098*c0909341SAndroid Build Coastguard Worker %define hsh_mem  [esp+0x10]
6099*c0909341SAndroid Build Coastguard Worker %define vsh_mem  [esp+0x18]
6100*c0909341SAndroid Build Coastguard Worker %if isput
6101*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [esp+0x20]
6102*c0909341SAndroid Build Coastguard Worker %else
6103*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
6104*c0909341SAndroid Build Coastguard Worker %endif
6105*c0909341SAndroid Build Coastguard Worker %define m10 m7
6106*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m3
6107*c0909341SAndroid Build Coastguard Worker%endif
6108*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
6109*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
6110*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m10, q0000
6111*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m10, q1111
6112*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m10, q2222
6113*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
6114*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6115*c0909341SAndroid Build Coastguard Worker %xdefine m8  m3
6116*c0909341SAndroid Build Coastguard Worker %xdefine m9  m6
6117*c0909341SAndroid Build Coastguard Worker %xdefine m11 m5
6118*c0909341SAndroid Build Coastguard Worker %xdefine m6  m4
6119*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x100], m3
6120*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x110], m4
6121*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x120], m5
6122*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x130], m10
6123*c0909341SAndroid Build Coastguard Worker %define m3  [stk+0x100]
6124*c0909341SAndroid Build Coastguard Worker %define m4  [stk+0x110]
6125*c0909341SAndroid Build Coastguard Worker %define m5  [stk+0x120]
6126*c0909341SAndroid Build Coastguard Worker %define m10 [stk+0x130]
6127*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0xc0]
6128*c0909341SAndroid Build Coastguard Worker    mova                 m8, [stk+0x80]
6129*c0909341SAndroid Build Coastguard Worker%endif
6130*c0909341SAndroid Build Coastguard Worker.dy1_w4_loop:
6131*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ssq*0]
6132*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
6133*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
6134*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3
6135*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4
6136*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m4
6137*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
6138*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m5
6139*c0909341SAndroid Build Coastguard Worker    paddd                m1, m0
6140*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
6141*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6142*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r4]
6143*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+r6]
6144*c0909341SAndroid Build Coastguard Worker%else
6145*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r4+ssq*0]
6146*c0909341SAndroid Build Coastguard Worker    movu                 m7, [r4+ssq*1]
6147*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
6148*c0909341SAndroid Build Coastguard Worker%endif
6149*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
6150*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
6151*c0909341SAndroid Build Coastguard Worker    paddd                m8, m9
6152*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m12
6153*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
6154*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m13
6155*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13
6156*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
6157*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
6158*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m15
6159*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m15
6160*c0909341SAndroid Build Coastguard Worker    phaddd              m11, m0
6161*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
6162*c0909341SAndroid Build Coastguard Worker    paddd               m11, hrnd_mem
6163*c0909341SAndroid Build Coastguard Worker    paddd                m6, hrnd_mem
6164*c0909341SAndroid Build Coastguard Worker    psrad               m11, hsh_mem
6165*c0909341SAndroid Build Coastguard Worker    psrad                m6, hsh_mem
6166*c0909341SAndroid Build Coastguard Worker    packssdw            m11, m6                     ; 7 8
6167*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6168*c0909341SAndroid Build Coastguard Worker    shufps               m9, [stk+0x40], m11, q1032 ; 6 7
6169*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x00]
6170*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m11
6171*c0909341SAndroid Build Coastguard Worker%else
6172*c0909341SAndroid Build Coastguard Worker    shufps               m9, [stk+0xa0], m11, q1032 ; 6 7
6173*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x60]
6174*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m11
6175*c0909341SAndroid Build Coastguard Worker%endif
6176*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m9, m11 ; 67
6177*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11     ; 78
6178*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m10
6179*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m9, m10
6180*c0909341SAndroid Build Coastguard Worker%if isput
6181*c0909341SAndroid Build Coastguard Worker    movd                m11, vsh_mem
6182*c0909341SAndroid Build Coastguard Worker%endif
6183*c0909341SAndroid Build Coastguard Worker    paddd                m1, vrnd_mem
6184*c0909341SAndroid Build Coastguard Worker    paddd                m8, vrnd_mem
6185*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6
6186*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
6187*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6188*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x10]
6189*c0909341SAndroid Build Coastguard Worker%else
6190*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x80]
6191*c0909341SAndroid Build Coastguard Worker%endif
6192*c0909341SAndroid Build Coastguard Worker%if isput
6193*c0909341SAndroid Build Coastguard Worker    psrad                m1, m11
6194*c0909341SAndroid Build Coastguard Worker    psrad                m8, m11
6195*c0909341SAndroid Build Coastguard Worker%else
6196*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
6197*c0909341SAndroid Build Coastguard Worker    psrad                m8, 6
6198*c0909341SAndroid Build Coastguard Worker%endif
6199*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m8
6200*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6201*c0909341SAndroid Build Coastguard Worker    mova                 m8, [stk+0x30]
6202*c0909341SAndroid Build Coastguard Worker%else
6203*c0909341SAndroid Build Coastguard Worker    mova                 m8, [stk+0x90]
6204*c0909341SAndroid Build Coastguard Worker%endif
6205*c0909341SAndroid Build Coastguard Worker%if isput
6206*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
6207*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m6
6208*c0909341SAndroid Build Coastguard Worker    pminsw               m1, pxmaxm
6209*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m1
6210*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m1
6211*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6212*c0909341SAndroid Build Coastguard Worker%else
6213*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m1
6214*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
6215*c0909341SAndroid Build Coastguard Worker%endif
6216*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6217*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x20]
6218*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m8
6219*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x00], m1
6220*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m2
6221*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m9
6222*c0909341SAndroid Build Coastguard Worker%else
6223*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x70]
6224*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m8
6225*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m1
6226*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m2
6227*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m9
6228*c0909341SAndroid Build Coastguard Worker%endif
6229*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6230*c0909341SAndroid Build Coastguard Worker    jg .dy1_w4_loop
6231*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET ; why not jz .ret?
6232*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
; Width dispatch stubs for the .dy1 path (entered for w = 8/16/32/64/128).
; Each stub stores a per-width count into [stk+0xf0] and then continues at
; .dy1_w_start. .dy1_hloop_prep decrements that slot and advances the saved
; output pointer by 16 bytes each time, returning when it reaches zero, so the
; count is the number of 16-byte-wide output column strips to produce
; (1, 2, 4, 8, 16 = width in the label name / 8).
; movifprep is a macro defined outside this view; presumably it emits the
; tmp_stridem store only in the prep build - confirm against its definition.
; The values stored (16..256) are twice the width in the label name.
6233*c0909341SAndroid Build Coastguard Worker.dy1_w8:
6234*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 1
6235*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
6236*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
6237*c0909341SAndroid Build Coastguard Worker.dy1_w16:
6238*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 2
6239*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
6240*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
6241*c0909341SAndroid Build Coastguard Worker.dy1_w32:
6242*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 4
6243*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
6244*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
6245*c0909341SAndroid Build Coastguard Worker.dy1_w64:
6246*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 8
6247*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
6248*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
; The widest case needs no jump: it falls straight through into .dy1_w_start.
6249*c0909341SAndroid Build Coastguard Worker.dy1_w128:
6250*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 16
6251*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; Common setup for the w >= 8 .dy1 path. It
;   * selects the vertical 8-tap filter from my and broadcasts its four
;     coefficient pairs into stack slots,
;   * derives the per-column x positions (mx + dx*[0-3]) and the per-strip
;     step (dx*4),
;   * spills the values that .dy1_hloop_prep reloads for every further
;     16-byte column strip (source pointer, output pointer, height, m7, m15),
; and then jumps to .dy1_hloop for the first strip.
; On entry m8 carries dx and m14 carries mx (per the inline comments below);
; on x86-32 these names are aliases set up by the defines in the %else branch.
6252*c0909341SAndroid Build Coastguard Worker.dy1_w_start:
6253*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6254*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6255*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
6256*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
6257*c0909341SAndroid Build Coastguard Worker %endif
; Spill m11/m12 (and m13 for put); these slots are referenced further down as
; hround = [rsp+0x10], shift = [rsp+0x20] and vround = [rsp+0x30].
6258*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m11
6259*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m12
6260*c0909341SAndroid Build Coastguard Worker %define hround m11
6261*c0909341SAndroid Build Coastguard Worker %if isput
6262*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m13
6263*c0909341SAndroid Build Coastguard Worker %else
; prep keeps its vertical rounding constant in m13 instead of a stack slot.
6264*c0909341SAndroid Build Coastguard Worker    mova                m13, [base+pd_m524256]
6265*c0909341SAndroid Build Coastguard Worker %endif
; Filter selection. The zero flag produced by "shr myd, 6" survives the
; following mov and lea (neither instruction modifies flags) and is what the
; cmovnz tests: when my>>6 is zero, r4 keeps the constant 64<<24, i.e. eight
; byte coefficients that are all zero except a single 64 at index 3
; (writing r4d clears the upper half of r4q); otherwise the 8-byte filter at
; subpel_filters[t1 + (my>>6)] is loaded.
6266*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
6267*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6268*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
6269*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
6270*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
; m15 = t0 >> 16 in lane 0; broadcast to all lanes by the pshufd below.
6271*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
6272*c0909341SAndroid Build Coastguard Worker%else
; x86-32 has only eight xmm registers, so the high register names used by the
; shared code are mapped onto stack slots, a constant, or low registers.
6273*c0909341SAndroid Build Coastguard Worker %define hround [esp+0x00]
6274*c0909341SAndroid Build Coastguard Worker %define m12    [esp+0x10]
6275*c0909341SAndroid Build Coastguard Worker %define m10    [base+pd_0x3ff]
6276*c0909341SAndroid Build Coastguard Worker %define m8  m0
6277*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
6278*c0909341SAndroid Build Coastguard Worker %xdefine m15 m3
6279*c0909341SAndroid Build Coastguard Worker %if isprep
6280*c0909341SAndroid Build Coastguard Worker  %define ssq ssm
6281*c0909341SAndroid Build Coastguard Worker %endif
; NOTE(review): [esp+0x1f0] / [esp+0x1f4] are used exactly where the x86-64
; branch uses t0 / t1, so they presumably hold the spilled t0 / t1 - confirm
; against the code that writes these slots (outside this view).
6282*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f0]
6283*c0909341SAndroid Build Coastguard Worker    mov                  r3, [esp+0x1f4]
6284*c0909341SAndroid Build Coastguard Worker    shr                  r5, 16
6285*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
; r5 is cleared so the upper filter dword defaults to zero; r4/r5 then get the
; low/high halves of the 8-byte filter under the same my>>6 != 0 condition as
; the x86-64 branch (flags from shr survive the lea and mov).
6286*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
6287*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6288*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+myd]
6289*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
6290*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r3*8+0]
6291*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r3*8+4]
; r0 and r3 were used as scratch above; reload them from their memory copies.
6292*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
6293*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6294*c0909341SAndroid Build Coastguard Worker%endif
; Step the source pointer back by 6 bytes (3 pixels if pixels are 16-bit, as
; the file name suggests - presumably to centre the 8-tap window; confirm).
6295*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
6296*c0909341SAndroid Build Coastguard Worker    pslld                m7, m8, 2 ; dx*4
6297*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
6298*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
6299*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
; Move the 8 filter bytes into the low half of m3 (x86-32 assembles them from
; the two 32-bit halves first).
6300*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6301*c0909341SAndroid Build Coastguard Worker    movq                 m3, r4q
6302*c0909341SAndroid Build Coastguard Worker%else
6303*c0909341SAndroid Build Coastguard Worker    movd                 m5, r4
6304*c0909341SAndroid Build Coastguard Worker    movd                 m6, r5
6305*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m6
6306*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m5
6307*c0909341SAndroid Build Coastguard Worker%endif
; Sign-extend the coefficients from bytes to words: duplicate each byte into
; both halves of a word, then shift right arithmetically by 8.
6308*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
6309*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
; Save the per-strip state that .dy1_hloop_prep restores.
6310*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x100], m7
6311*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x120], m15
6312*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x0f8], srcq
6313*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x130], r0q ; dstq / tmpq
; Broadcast the coefficient pairs (taps 0-1, 2-3, 4-5, 6-7) across m0..m3.
6314*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
6315*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
6316*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
6317*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
6318*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Stored here and reloaded as m10/m11/m14/m15 right before .dy1_vloop.
6319*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x140], m0
6320*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x150], m1
6321*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x160], m2
6322*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x170], m3
6323*c0909341SAndroid Build Coastguard Worker %if UNIX64
; Keep the row count in hm; .dy1_hloop_prep reloads hd from it for each strip.
6324*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
6325*c0909341SAndroid Build Coastguard Worker %endif
6326*c0909341SAndroid Build Coastguard Worker%else
; The 32-bit build keeps the broadcast coefficient pairs in different slots.
6327*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x180], m0
6328*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x190], m1
6329*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x1a0], m2
6330*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x1b0], m3
6331*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m3
; Copy the row count into [stk+0x134], from where .dy1_hloop_prep restores it.
6332*c0909341SAndroid Build Coastguard Worker    mov                  r5, hm
6333*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x134], r5
6334*c0909341SAndroid Build Coastguard Worker%endif
; First strip: skip the "advance to next strip" code and start filtering.
6335*c0909341SAndroid Build Coastguard Worker    jmp .dy1_hloop
; Advance to the next 16-byte-wide output column strip, or return when the
; strip counter in [stk+0xf0] (set by the .dy1_w* stubs) reaches zero.
; Restores the state saved by .dy1_w_start / the previous .dy1_hloop pass and
; then falls through into .dy1_hloop.
6336*c0909341SAndroid Build Coastguard Worker.dy1_hloop_prep:
6337*c0909341SAndroid Build Coastguard Worker    dec   dword [stk+0x0f0]
6338*c0909341SAndroid Build Coastguard Worker    jz .ret
; Step the saved dst/tmp pointer 16 bytes to the right and reload the row
; count (from hm on x86-64, from the [stk+0x134] copy on x86-32).
6339*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6340*c0909341SAndroid Build Coastguard Worker    add   qword [stk+0x130], 16
6341*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
6342*c0909341SAndroid Build Coastguard Worker%else
6343*c0909341SAndroid Build Coastguard Worker    add   dword [stk+0x130], 16
6344*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+0x134]
6345*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+0x130]
6346*c0909341SAndroid Build Coastguard Worker%endif
; m7 = dx*4 (saved in .dy1_w_start); m14 = the x positions that .dy1_hloop
; stored to [stk+0x110] for the second half of the previous strip
; (mx+dx*[4-7]).
6347*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x100]
6348*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x110]
6349*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m10/m11 are real registers on x86-64 and are overwritten by the filtering
; code (m10/m11 are loaded with coefficients before .dy1_vloop), so reload the
; 0x3ff mask and the value spilled to [rsp+0x10] in .dy1_w_start.
6350*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
6351*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x10]
6352*c0909341SAndroid Build Coastguard Worker%endif
; Restore the broadcast value saved from m15 and the original source pointer.
6353*c0909341SAndroid Build Coastguard Worker    mova                m15, [stk+0x120]
6354*c0909341SAndroid Build Coastguard Worker    mov                srcq, [stk+0x0f8]
6355*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6356*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [stk+0x130] ; dstq / tmpq
6357*c0909341SAndroid Build Coastguard Worker%else
; x86-32: write the row count and advanced output pointer back to their
; memory homes, and reload r3 from r3m.
6358*c0909341SAndroid Build Coastguard Worker    mov                  hm, r5
6359*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
6360*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6361*c0909341SAndroid Build Coastguard Worker%endif
; Add dx*4 to get the x positions of the first four columns of the new strip.
6362*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7
6363*c0909341SAndroid Build Coastguard Worker.dy1_hloop:
6364*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6365*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pq_0x40000000]
6366*c0909341SAndroid Build Coastguard Worker%else
6367*c0909341SAndroid Build Coastguard Worker %define m9 [base+pq_0x40000000]
6368*c0909341SAndroid Build Coastguard Worker%endif
6369*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
6370*c0909341SAndroid Build Coastguard Worker    psrld                m2, m14, 10
6371*c0909341SAndroid Build Coastguard Worker    mova              [stk], m2
6372*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
6373*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
6374*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15, m6
6375*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m1
6376*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m5, q1032
6377*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6378*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
6379*c0909341SAndroid Build Coastguard Worker    movd                r6d, m2
6380*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0321
6381*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0321
6382*c0909341SAndroid Build Coastguard Worker    movd                r7d, m5
6383*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
6384*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r4*8]
6385*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+r6*8]
6386*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r7*8]
6387*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r9*8]
6388*c0909341SAndroid Build Coastguard Worker%else
6389*c0909341SAndroid Build Coastguard Worker    movd                 r0, m5
6390*c0909341SAndroid Build Coastguard Worker    movd                 rX, m2
6391*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0321
6392*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0321
6393*c0909341SAndroid Build Coastguard Worker    movd                 r4, m5
6394*c0909341SAndroid Build Coastguard Worker    movd                 r5, m2
6395*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r0*8]
6396*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+rX*8]
6397*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r4*8]
6398*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r5*8]
6399*c0909341SAndroid Build Coastguard Worker%endif
6400*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7 ; mx+dx*[4-7]
6401*c0909341SAndroid Build Coastguard Worker    pand                 m5, m14, m10
6402*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
6403*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
6404*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
6405*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m2
6406*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x110], m14
6407*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m15, q1032
6408*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6409*c0909341SAndroid Build Coastguard Worker    movd               r10d, m15
6410*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
6411*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
6412*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0321
6413*c0909341SAndroid Build Coastguard Worker    movd               r13d, m15
6414*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
6415*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r10*8]
6416*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+r11*8]
6417*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r13*8]
6418*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+ rX*8]
6419*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6420*c0909341SAndroid Build Coastguard Worker    movq                r11, m14
6421*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m14
6422*c0909341SAndroid Build Coastguard Worker    movq                 rX, m14
6423*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
6424*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
6425*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
6426*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
6427*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [stk+ 0]
6428*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [stk+ 4]
6429*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [stk+ 8]
6430*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [stk+12]
6431*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
6432*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
6433*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m5, q1100
6434*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
6435*c0909341SAndroid Build Coastguard Worker    pand                 m7, m9, m4
6436*c0909341SAndroid Build Coastguard Worker    pand                 m8, m9, m6
6437*c0909341SAndroid Build Coastguard Worker    pand                m15, m9, m14
6438*c0909341SAndroid Build Coastguard Worker    pand                 m9, m9, m5
6439*c0909341SAndroid Build Coastguard Worker    pandn                m4, m0
6440*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
6441*c0909341SAndroid Build Coastguard Worker    pandn               m14, m2
6442*c0909341SAndroid Build Coastguard Worker    pandn                m5, m3
6443*c0909341SAndroid Build Coastguard Worker    por                  m7, m4
6444*c0909341SAndroid Build Coastguard Worker    por                  m8, m6
6445*c0909341SAndroid Build Coastguard Worker    por                 m15, m14
6446*c0909341SAndroid Build Coastguard Worker    por                  m9, m5
6447*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m7, m7
6448*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m7
6449*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m8, m8
6450*c0909341SAndroid Build Coastguard Worker    punpckhbw            m8, m8
6451*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
6452*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
6453*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
6454*c0909341SAndroid Build Coastguard Worker    psraw                m8, 8
6455*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m15, m15
6456*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m15
6457*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m9, m9
6458*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m9
6459*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
6460*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
6461*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
6462*c0909341SAndroid Build Coastguard Worker    psraw                m9, 8
6463*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m0
6464*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m7
6465*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
6466*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m8
6467*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m2
6468*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m15
6469*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m3
6470*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m9
6471*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
6472*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m1
6473*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
6474*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m2
6475*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
6476*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m3
6477*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
6478*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m4
6479*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
6480*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xd0], m5
6481*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
6482*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
6483*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
6484*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0xd0]
6485*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x90]
6486*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0xa0]
6487*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0xb0]
6488*c0909341SAndroid Build Coastguard Worker    mova                 m9, [stk+0xc0]
6489*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6 ; 45a
6490*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45b
6491*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m8 ; 67a
6492*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8     ; 67b
6493*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; 01a
6494*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2     ; 01b
6495*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m9 ; 23a
6496*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m9     ; 23b
6497*c0909341SAndroid Build Coastguard Worker    mova                m10, [stk+0x140]
6498*c0909341SAndroid Build Coastguard Worker    mova                m11, [stk+0x150]
6499*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x160]
6500*c0909341SAndroid Build Coastguard Worker    mova                m15, [stk+0x170]
6501*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m4
6502*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m5
6503*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m6
6504*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m7
6505*c0909341SAndroid Build Coastguard Worker %define hround [rsp+0x10]
6506*c0909341SAndroid Build Coastguard Worker %define shift  [rsp+0x20]
6507*c0909341SAndroid Build Coastguard Worker %if isput
6508*c0909341SAndroid Build Coastguard Worker  %define vround [rsp+0x30]
6509*c0909341SAndroid Build Coastguard Worker %else
6510*c0909341SAndroid Build Coastguard Worker  %define vround [base+pd_m524256]
6511*c0909341SAndroid Build Coastguard Worker %endif
6512*c0909341SAndroid Build Coastguard Worker.dy1_vloop:
6513*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m10
6514*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m10
6515*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m11
6516*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11
6517*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
6518*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
6519*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
6520*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
6521*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [stk+0x90], m14
6522*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [stk+0xa0], m14
6523*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [stk+0xb0], m15
6524*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, [stk+0xc0], m15
6525*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
6526*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
6527*c0909341SAndroid Build Coastguard Worker %if isput
6528*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m12, q1032
6529*c0909341SAndroid Build Coastguard Worker %endif
6530*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8
6531*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9
6532*c0909341SAndroid Build Coastguard Worker%else
6533*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
6534*c0909341SAndroid Build Coastguard Worker    movd                 rX, m4
6535*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
6536*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0321
6537*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
6538*c0909341SAndroid Build Coastguard Worker    movd                 r5, m4
6539*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x110]
6540*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r0*8]
6541*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+rX*8]
6542*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r4*8]
6543*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+r5*8]
6544*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6545*c0909341SAndroid Build Coastguard Worker    mova           [stk+16], m14
6546*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+ 0]
6547*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+ 4]
6548*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+ 8]
6549*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+12]
6550*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
6551*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
6552*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
6553*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
6554*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
6555*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
6556*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
6557*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
6558*c0909341SAndroid Build Coastguard Worker    pand                 m0, m9, m4
6559*c0909341SAndroid Build Coastguard Worker    pand                 m1, m9, m6
6560*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9, m7
6561*c0909341SAndroid Build Coastguard Worker    pand                 m3, m9, m5
6562*c0909341SAndroid Build Coastguard Worker    pandn                m4, [stk+0x20]
6563*c0909341SAndroid Build Coastguard Worker    pandn                m6, [stk+0x30]
6564*c0909341SAndroid Build Coastguard Worker    pandn                m7, [stk+0x40]
6565*c0909341SAndroid Build Coastguard Worker    pandn                m5, [stk+0x50]
6566*c0909341SAndroid Build Coastguard Worker    por                  m0, m4
6567*c0909341SAndroid Build Coastguard Worker    por                  m1, m6
6568*c0909341SAndroid Build Coastguard Worker    por                  m2, m7
6569*c0909341SAndroid Build Coastguard Worker    por                  m3, m5
6570*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m0
6571*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
6572*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m1, m1
6573*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m1
6574*c0909341SAndroid Build Coastguard Worker    psraw                m4, 8
6575*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
6576*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8
6577*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
6578*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m2, m2
6579*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m2
6580*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m3, m3
6581*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m3
6582*c0909341SAndroid Build Coastguard Worker    psraw                m6, 8
6583*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
6584*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
6585*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
6586*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0a0], m4
6587*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0b0], m0
6588*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0c0], m5
6589*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0d0], m1
6590*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x140], m6
6591*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x150], m2
6592*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x160], m7
6593*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x170], m3
6594*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
6595*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
6596*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
6597*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
6598*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
6599*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
6600*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
6601*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
6602*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x60]
6603*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x70]
6604*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x80]
6605*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x90]
6606*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
6607*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6      ; 45a
6608*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6          ; 45b
6609*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0      ; 67a
6610*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0          ; 67b
6611*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m4
6612*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m5
6613*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m6
6614*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m7
6615*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x20]
6616*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x30]
6617*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x40]
6618*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x50]
6619*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2      ; 01a
6620*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2          ; 01b
6621*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4      ; 23a
6622*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23b
6623*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x180]
6624*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x190]
6625*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x1a0]
6626*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x1b0]
6627*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
6628*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
6629*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
6630*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
; .dy1_vloop: per-row vertical pass of the dy1 path. Each iteration multiplies
; the row-pair vectors (01/23 in m0-m3, 45/67 in [stk+0x60..0x90]) by the
; vectors in m4-m7, sums the products, adds vrnd_mem, shifts, packs and stores
; one 16-byte output row. Unless hd has reached zero it then filters one new
; source row horizontally and slides the row-pair window down by one row
; (01->12, 23->34, 45->56, 67->78).
; NOTE(review): the label sits inside a conditional opened before this chunk
; (closed by the first unmatched %endif below), so the multiply-accumulate
; lines that follow are presumably the x86-32 variant -- confirm against the
; part of the file above.
6631*c0909341SAndroid Build Coastguard Worker.dy1_vloop:
6632*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4
6633*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4
6634*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
6635*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
6636*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
6637*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
; Three-operand forms: the destination receives the stack-held row pair
; multiplied by m6/m7 (the non-destructive form is presumably expanded by the
; x86inc macro layer into a copy followed by the two-operand instruction).
6638*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [stk+0x60], m6
6639*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [stk+0x70], m6
6640*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [stk+0x80], m7
6641*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [stk+0x90], m7
; m6 is free again after the multiplies above; for put it is reloaded with the
; dword at [esp+0x18], which the put branch below uses as the psrad count.
6642*c0909341SAndroid Build Coastguard Worker %if isput
6643*c0909341SAndroid Build Coastguard Worker    movd                 m6, [esp+0x18]
6644*c0909341SAndroid Build Coastguard Worker %endif
6645*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
6646*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
; vrnd_mem is defined outside this block (presumably the vertical rounding
; constant -- confirm).
6647*c0909341SAndroid Build Coastguard Worker    paddd                m0, vrnd_mem
6648*c0909341SAndroid Build Coastguard Worker    paddd                m1, vrnd_mem
6649*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
6650*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
; Closes a conditional that was opened before .dy1_vloop (not visible here).
6651*c0909341SAndroid Build Coastguard Worker%endif
; m4/m5 now hold the two dword halves of the summed row.
; put: shift by the count in m6, pack to words, clamp to [0, pxmaxm], store
;      16 bytes at dstq and advance dstq by dsm.
; otherwise: shift by 6, pack to words, store 16 bytes at tmpq and advance
;      tmpq by tmp_stridem.
6652*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
6653*c0909341SAndroid Build Coastguard Worker    psrad                m4, m6
6654*c0909341SAndroid Build Coastguard Worker    psrad                m5, m6
6655*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
6656*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
6657*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m7
6658*c0909341SAndroid Build Coastguard Worker    pminsw               m4, pxmaxm
6659*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m4
6660*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
6661*c0909341SAndroid Build Coastguard Worker%else
6662*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
6663*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
6664*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
6665*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
6666*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
6667*c0909341SAndroid Build Coastguard Worker%endif
; One row done; leave the loop once the row counter reaches zero.
6668*c0909341SAndroid Build Coastguard Worker    dec                  hd
6669*c0909341SAndroid Build Coastguard Worker    jz .dy1_hloop_prep
; x86-64: filter the next source row inline. Eight unaligned 16-byte loads at
; the per-column offsets held in r10/r11/r13/rX/r4/r6/r7/r9 (scaled by 2) are
; each multiplied by a vector from [stk+0x10..0x80] and reduced with phaddd;
; srcq then advances by ssq.
6670*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6671*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+r10*2]
6672*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+r11*2]
6673*c0909341SAndroid Build Coastguard Worker    movu                m12, [srcq+r13*2]
6674*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ rX*2]
6675*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ r4*2]
6676*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ r6*2]
6677*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ r7*2]
6678*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ r9*2]
6679*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
6680*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [stk+0x50]
6681*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, [stk+0x60]
6682*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, [stk+0x70]
6683*c0909341SAndroid Build Coastguard Worker    pmaddwd             m13, [stk+0x80]
6684*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [stk+0x10]
6685*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [stk+0x20]
6686*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [stk+0x30]
6687*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [stk+0x40]
6688*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m9
6689*c0909341SAndroid Build Coastguard Worker    phaddd              m12, m13
; m9/m13 are dead after the phaddd above and are reused for the unpckw shuffle
; mask and the hround constant.
6690*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+unpckw]
6691*c0909341SAndroid Build Coastguard Worker    mova                m13, hround
6692*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
6693*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m7
6694*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m12
6695*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m6
; m5 = unpckw with its two 64-bit halves swapped; used for the pairs whose
; row order is reversed in the comments below (3a 2a, 7a 6a, ...).
6696*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m9, q1032
6697*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m9             ; 0a 1a
6698*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9             ; 0b 1b
6699*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5             ; 3a 2a
6700*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5             ; 3b 2b
; Round and shift the new row's horizontal sums, then pack them to words in m4.
6701*c0909341SAndroid Build Coastguard Worker    mova                m12, shift
6702*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
6703*c0909341SAndroid Build Coastguard Worker    paddd                m8, m13
6704*c0909341SAndroid Build Coastguard Worker    psrad                m4, m12
6705*c0909341SAndroid Build Coastguard Worker    psrad                m8, m12
6706*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m8
6707*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [stk+0x90], m9 ; 4a 5a
6708*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [stk+0xa0], m9 ; 4b 5b
6709*c0909341SAndroid Build Coastguard Worker    pshufb               m8, [stk+0xb0], m5 ; 7a 6a
6710*c0909341SAndroid Build Coastguard Worker    pshufb              m13, [stk+0xc0], m5 ; 7b 6b
; Re-interleave so every pair starts one row later; the new row (m4) becomes
; row 8 of the 78 pair.
6711*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2  ; 12a
6712*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3  ; 12b
6713*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6  ; 34a
6714*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7  ; 34b
6715*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m8  ; 56a
6716*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m13 ; 56b
6717*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m4  ; 78a
6718*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4
6719*c0909341SAndroid Build Coastguard Worker    punpcklwd           m13, m4  ; 78b
; Write the upper four pair vectors back to their stack slots.
6720*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m6
6721*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m7
6722*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m8
6723*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m13
; m13 was used as scratch above; reload it with vround (presumably consumed by
; the x86-64 copy of the loop head, which is not visible in this chunk).
6724*c0909341SAndroid Build Coastguard Worker    mova                m13, vround
; x86-32: park r0 in its r0m slot, reload r3 from r3m and r0/rX/r4/r5 from
; [stk+0..12], then run MC_8TAP_SCALED_H (defined elsewhere). Its result for
; the new row is read back from [stk+0xe0] / [stk+0xe8] further down.
6725*c0909341SAndroid Build Coastguard Worker%else
6726*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
6727*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6728*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+ 0]
6729*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+ 4]
6730*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+ 8]
6731*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+12]
6732*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
; m7 = unpckw mask, m4 = the same mask with its 64-bit halves swapped. The last
; pshufb in this group overwrites m7, so the mask is gone after it.
6733*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+unpckw]
6734*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q1032
6735*c0909341SAndroid Build Coastguard Worker    pshufb               m0, [stk+0x20], m7 ; 0a 1a
6736*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [stk+0x30], m7 ; 0b 1b
6737*c0909341SAndroid Build Coastguard Worker    pshufb               m2, [stk+0x40], m4 ; 3a 2a
6738*c0909341SAndroid Build Coastguard Worker    pshufb               m3, [stk+0x50], m4 ; 3b 2b
6739*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [stk+0x60], m7 ; 4a 5a
6740*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [stk+0x70], m7 ; 4b 5b
6741*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [stk+0x80], m4 ; 7a 6a
6742*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2 ; 12a
6743*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3 ; 12b
6744*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5 ; 34a
6745*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6 ; 34b
; The lower four pairs are stored for later iterations but also stay live in
; m0-m3, which the loop head consumes directly.
6746*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
6747*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
6748*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
6749*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
; With only eight XMM registers, building the upper pairs is interleaved with
; reloading m4-m7 from [stk+0x180..0x1b0] for the next iteration's multiplies.
6750*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7 ; 56a
6751*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m5
6752*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [stk+0x90], m4 ; 7b 6b
6753*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, [stk+0xe0] ; 78a
6754*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x180]
6755*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5 ; 56b
6756*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m6
; Upper 8 bytes of the new row, needed for the 78b pair.
6757*c0909341SAndroid Build Coastguard Worker    movq                 m6, [stk+0xe8]
6758*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m7
6759*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x1b0]
6760*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6
6761*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x1a0]
6762*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5
6763*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x190]
; Restore r0 from the slot it was parked in at the top of this branch.
6764*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
6765*c0909341SAndroid Build Coastguard Worker%endif
6766*c0909341SAndroid Build Coastguard Worker    jmp .dy1_vloop
; .dy2: entry point of the dy2 path. Reads a 16-bit entry from
; %1_8tap_scaled_ssse3_dy2_table indexed by wq, adds base_reg to it and jumps
; to the resulting address, i.e. the handler selected by wq.
; INIT_XMM ssse3 is an x86inc directive; it presumably resets the m# register
; naming after the SWAP/%define remapping used by the preceding code --
; confirm against x86inc. On x86-64, stk is redefined as rsp+0x20 for the code
; that follows.
6767*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
6768*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6769*c0909341SAndroid Build Coastguard Worker %define stk rsp+0x20
6770*c0909341SAndroid Build Coastguard Worker%endif
6771*c0909341SAndroid Build Coastguard Worker.dy2:
6772*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
6773*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
6774*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .dy2_w2: handler of the dy2 path that stores two 16-bit pixels per output
; row (one movd each); assembled only when isput is set.
; Setup filters six source rows horizontally and keeps them as two packed
; vectors, even rows (0 2 2 4) and odd rows (1 3 3 5). Each loop iteration
; then reads four more source rows and writes two output rows.
6775*c0909341SAndroid Build Coastguard Worker%if isput
6776*c0909341SAndroid Build Coastguard Worker.dy2_w2:
; x86-64: reload my, spill m13 to [rsp+0x10] so it can be addressed as
; vrnd_mem (m13 is reused as a temporary inside the loop), keep only the low
; byte of t0, step srcq back by 2 bytes and move t0d into m15.
6777*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6778*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6779*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m13
6780*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [rsp+0x10]
6781*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
6782*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
6783*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
; x86-32: only m0-m7 exist, so the higher register names are aliased onto low
; registers or onto stack slots. The byte that x86-64 keeps in t0b is read
; from [esp+0x1f0] instead.
6784*c0909341SAndroid Build Coastguard Worker %else
6785*c0909341SAndroid Build Coastguard Worker  %define m8  m0
6786*c0909341SAndroid Build Coastguard Worker  %define m9  m1
6787*c0909341SAndroid Build Coastguard Worker  %define m14 m4
6788*c0909341SAndroid Build Coastguard Worker  %define m15 m3
6789*c0909341SAndroid Build Coastguard Worker  %define m11 [esp+0x00]
6790*c0909341SAndroid Build Coastguard Worker  %define m12 [esp+0x10]
6791*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [esp+0x20]
6792*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
6793*c0909341SAndroid Build Coastguard Worker    movzx                r5, byte [esp+0x1f0]
6794*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
6795*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
6796*c0909341SAndroid Build Coastguard Worker %endif
; m9 = {0, m8[0], 0, m8[1]}: interleaving zero with m8 gives the per-column
; increments that are added to m14.
6797*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
6798*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8
6799*c0909341SAndroid Build Coastguard Worker    paddd               m14, m9 ; mx+dx*[0-1]
6800*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6801*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_0x4000]
6802*c0909341SAndroid Build Coastguard Worker %endif
; Per-column filter index: broadcast byte in m15 + ((m14 & m10) >> 6).
; The two indices are extracted to r4d and r6d (x86-64) / r3d (x86-32).
6803*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
6804*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
6805*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
6806*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8
6807*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
6808*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
6809*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6810*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
6811*c0909341SAndroid Build Coastguard Worker %else
6812*c0909341SAndroid Build Coastguard Worker    movd                r3d, m15
6813*c0909341SAndroid Build Coastguard Worker %endif
; Load the two shuffle constants and, for each column, the four bytes at
; offset 2 of its 8-byte subpel_filters entry.
6814*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_q]
6815*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+spel_s_shuf2]
6816*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r4*8+2]
6817*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6818*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r6*8+2]
6819*c0909341SAndroid Build Coastguard Worker %else
6820*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r3*8+2]
6821*c0909341SAndroid Build Coastguard Worker %endif
; m8 becomes an all-ones mask in lanes whose masked-and-shifted value was zero.
; m14 becomes (m14 >> 10) * 2 per lane, which is then turned into a pshufb
; control vector via bdct_lb_q and spel_s_shuf2.
6822*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
6823*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m2
6824*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6825*c0909341SAndroid Build Coastguard Worker    paddd               m14, m14
; x86-32: r3 was used as an index above, so restore it from r3m. The finished
; shuffle control vector is saved at [stk], and SWAP/%define re-point register
; names because m0-m2 are about to be loaded with source rows.
6826*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
6827*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6828*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
6829*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
6830*c0909341SAndroid Build Coastguard Worker    mova              [stk], m14
6831*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
6832*c0909341SAndroid Build Coastguard Worker    SWAP                 m6, m3
6833*c0909341SAndroid Build Coastguard Worker  %define m15 m6
6834*c0909341SAndroid Build Coastguard Worker %endif
; Source rows 0, 2 and 4; the two columns' filter dwords are combined in m15.
6835*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
6836*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*2]
6837*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*4]
6838*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m7
; x86-64: m15 = (mask & pd_0x4000) | (~mask & table filters), i.e. lanes
; flagged by the mask take the pd_0x4000 constant instead of the table entry.
; Then load rows 1, 3 and 5, leaving srcq advanced by six rows in total.
6839*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6840*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
6841*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
6842*c0909341SAndroid Build Coastguard Worker    pand                 m9, m8
6843*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
6844*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
6845*c0909341SAndroid Build Coastguard Worker    por                 m15, m9
6846*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]
6847*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ss3q ]
6848*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
6849*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
6850*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; Vertical filter: r4 defaults to 64 << 24 (the value 64 in byte 3). ZF from
; the shr survives the mov and lea, so when my >> 6 is non-zero cmovnz replaces
; it with the 8-byte subpel_filters entry at index t1 + (my >> 6).
6851*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6852*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
6853*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
6854*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
; x86-32: the same blend against pd_0x4000, written with the register names
; left by the SWAPs above (NOTE(review): m5 presumably names the mask register
; here -- confirm). The 8-byte vertical filter is fetched as two dwords: the
; low half stays in r4, the high half is parked at [stk+0x20] because r3 must
; be restored from r3m.
6855*c0909341SAndroid Build Coastguard Worker %else
6856*c0909341SAndroid Build Coastguard Worker    pand                 m7, m5, [base+pd_0x4000]
6857*c0909341SAndroid Build Coastguard Worker    pandn                m5, m15
6858*c0909341SAndroid Build Coastguard Worker    por                  m5, m7
6859*c0909341SAndroid Build Coastguard Worker  %define m15 m5
6860*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6861*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4]
6862*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
6863*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6864*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+myd]
6865*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
6866*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0]
6867*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4]
6868*c0909341SAndroid Build Coastguard Worker    mov          [stk+0x20], r3
6869*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6870*c0909341SAndroid Build Coastguard Worker %endif
; Sign-extend the horizontal filter bytes to words (duplicate each byte, then
; arithmetic shift right by 8). Then gather each row's pixels with the m14
; shuffle and multiply-accumulate them against the filter words.
6871*c0909341SAndroid Build Coastguard Worker    punpcklbw           m15, m15
6872*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
6873*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m0, m1, m2
6874*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m1, m2
; x86-64: do the same for rows 1/3/5, reduce with phaddd, add m11, shift by
; m12 and pack. The odd-row vector is then renamed to m2 and the vertical
; filter is moved into m10.
6875*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6876*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m4, m5, m6
6877*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m4, m5, m6
6878*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
6879*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m2
6880*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
6881*c0909341SAndroid Build Coastguard Worker    phaddd               m5, m6
6882*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m1, m4, m5
6883*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, m12}, m0, m1, m4, m5
6884*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1 ; 0 2 2 4
6885*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5 ; 1 3 3 5
6886*c0909341SAndroid Build Coastguard Worker    SWAP                 m2, m4
6887*c0909341SAndroid Build Coastguard Worker    movq                m10, r4
; x86-32: the filter words are saved at [stk+0x10] and the odd rows are loaded
; only now, once m2 is free after the phaddd. From here on m14/m15 name the
; stack copies. The vertical filter is rebuilt from r4 (low dword) and
; [stk+0x20] (high dword).
6888*c0909341SAndroid Build Coastguard Worker %else
6889*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m15
6890*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m1
6891*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m2
6892*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
6893*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ss3q ]
6894*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
6895*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
6896*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
6897*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m2, m7, m6
6898*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m2, m7, m6
6899*c0909341SAndroid Build Coastguard Worker  %define m14 [stk+0x00]
6900*c0909341SAndroid Build Coastguard Worker  %define m15 [stk+0x10]
6901*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m7
6902*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m6
6903*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m1, m2, m7
6904*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, m12}, m0, m1, m2, m7
6905*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
6906*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m7
6907*c0909341SAndroid Build Coastguard Worker  %define m8  m6
6908*c0909341SAndroid Build Coastguard Worker  %define m9  m4
6909*c0909341SAndroid Build Coastguard Worker  %define m10 m5
6910*c0909341SAndroid Build Coastguard Worker    movd                m10, r4
6911*c0909341SAndroid Build Coastguard Worker    movd                 m9, [stk+0x20]
6912*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m9
6913*c0909341SAndroid Build Coastguard Worker %endif
; Sign-extend the eight vertical filter bytes to words and broadcast each
; consecutive pair of words to its own vector (m7, m8, m9, m10).
6914*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
6915*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
6916*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000
6917*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111
6918*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222
6919*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
; x86-32: the four coefficient vectors live on the stack from here on. m13 is
; bound (via %xdefine, i.e. before m7 is redefined) to the physical register
; that held m7, so the loop can use it as a scratch register.
6920*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
6921*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m7
6922*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m8
6923*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m9
6924*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m10
6925*c0909341SAndroid Build Coastguard Worker  %xdefine m13 m7
6926*c0909341SAndroid Build Coastguard Worker  %define m7  [stk+0x50]
6927*c0909341SAndroid Build Coastguard Worker  %define m8  [stk+0x60]
6928*c0909341SAndroid Build Coastguard Worker  %define m9  [stk+0x70]
6929*c0909341SAndroid Build Coastguard Worker  %define m10 [stk+0x80]
6930*c0909341SAndroid Build Coastguard Worker %endif
; Interleave even and odd rows into the row pairs used by the vertical
; multiply-accumulate.
6931*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m2    ; 01 23
6932*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m2    ; 23 45
; x86-32: the destination pointer is loaded from r0m into r4, which is named
; dstq from here on.
6933*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
6934*c0909341SAndroid Build Coastguard Worker    mov                  r4, r0m
6935*c0909341SAndroid Build Coastguard Worker  %define dstq r4
6936*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m3
6937*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m0
6938*c0909341SAndroid Build Coastguard Worker %endif
; Loop state on entry: m0 = even rows, m2 = odd rows, m1 = pairs 01 23,
; m3 = pairs 23 45 (numbers relative to the current window).
6939*c0909341SAndroid Build Coastguard Worker.dy2_w2_loop:
; Load and horizontally filter the next four source rows (6..9).
6940*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
6941*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
6942*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
6943*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ss3q ]
6944*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Second coefficient pair applied to the 23 45 pairs.
6945*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m8
6946*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m4, m5, m6, m13
6947*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m4, m5, m6, m13
6948*c0909341SAndroid Build Coastguard Worker    phaddd               m4, m5
6949*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m13
; First coefficient pair applied to the 01 23 pairs (m5 is free after phaddd).
6950*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m7
6951*c0909341SAndroid Build Coastguard Worker    paddd                m4, m11
6952*c0909341SAndroid Build Coastguard Worker    paddd                m6, m11
6953*c0909341SAndroid Build Coastguard Worker    psrad                m4, m12
6954*c0909341SAndroid Build Coastguard Worker    psrad                m6, m12
6955*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6 ; 6 7 8 9
6956*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
; Split the new rows into even (6 8) and odd (7 9) and prepend the last row of
; the previous even/odd vectors with palignr.
6957*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m4, q2200
6958*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q3311
6959*c0909341SAndroid Build Coastguard Worker    palignr              m3, m0, 12 ; 4 6 6 8
6960*c0909341SAndroid Build Coastguard Worker    palignr              m4, m2, 12 ; 5 7 7 9
; These become the even/odd state for the next iteration.
6961*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3
6962*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
; New pairs: m1 = low interleave, m3 = high interleave. They feed the third
; and fourth coefficient pairs now and act as the first two pairs of the next
; iteration.
6963*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4
6964*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4
6965*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m1, m9
6966*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, m10
6967*c0909341SAndroid Build Coastguard Worker    paddd                m5, vrnd_mem
6968*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
6969*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
; The shift count for the vertical result is taken from the upper 64 bits of
; m12 (q1032 swaps the two halves).
6970*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m12, q1032
6971*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
6972*c0909341SAndroid Build Coastguard Worker    psrad                m5, m4
; Pack to words, clamp to [0, pxmaxm], and store the two output rows.
6973*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
6974*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6
6975*c0909341SAndroid Build Coastguard Worker    pminsw               m5, pxmaxm
6976*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5
6977*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q1032
6978*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5
6979*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
; Two output rows per iteration; loop while rows remain.
6980*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6981*c0909341SAndroid Build Coastguard Worker    jg .dy2_w2_loop
6982*c0909341SAndroid Build Coastguard Worker    RET
6983*c0909341SAndroid Build Coastguard Worker%endif
6984*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
6985*c0909341SAndroid Build Coastguard Worker.dy2_w4:
; Setup for the 4-pixel-wide case of the "dy2" path. It derives four
; per-column horizontal filters and two pshufb masks from the horizontal
; position, horizontally filters the first six source rows (kept as the row
; pairs 01/23/45), loads the vertical filter selected by my, broadcasts its
; four tap pairs, and then falls through into .dy2_w4_loop.
; NOTE(review): "dy2" presumably means a fixed vertical step of two source
; rows per output row (the loop consumes 4 rows per 2 outputs) - confirm
; against the dispatch code outside this view.
6986*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6987*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
; Spill m11/m12 (and m13 for put) to the stack slots that are later named
; hrnd_mem ([rsp+0x10]), hsh_mem/vsh_mem ([rsp+0x20]/[rsp+0x28]) and
; vrnd_mem ([rsp+0x30]).
6988*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m11
6989*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m12
6990*c0909341SAndroid Build Coastguard Worker %if isput
6991*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m13
6992*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [rsp+0x30]
6993*c0909341SAndroid Build Coastguard Worker  %define stk rsp+0x40
6994*c0909341SAndroid Build Coastguard Worker %else
; prep uses a constant vertical rounding value instead of a spilled register.
6995*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
6996*c0909341SAndroid Build Coastguard Worker  %define stk rsp+0x30
6997*c0909341SAndroid Build Coastguard Worker %endif
; Only the low byte of t0 is used here; it becomes the base row index into
; subpel_filters once broadcast into m15 below.
6998*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
; NOTE(review): backing srcq up by 2 bytes is presumably one 16-bit pixel of
; left context for the 4-tap window - confirm.
6999*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
7000*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
7001*c0909341SAndroid Build Coastguard Worker%else
; x86-32 has only 8 xmm registers: the high "registers" are aliases for
; constants in memory or for low registers.
7002*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
7003*c0909341SAndroid Build Coastguard Worker %define m9  [base+pd_0x4000]
7004*c0909341SAndroid Build Coastguard Worker %define m8  m0
7005*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
7006*c0909341SAndroid Build Coastguard Worker %define m15 m3
7007*c0909341SAndroid Build Coastguard Worker %if isprep
7008*c0909341SAndroid Build Coastguard Worker  %define ssq r3
7009*c0909341SAndroid Build Coastguard Worker %endif
; x86-32 counterpart of the movzx above: the byte is read from [esp+0x1f0].
7010*c0909341SAndroid Build Coastguard Worker    movzx                r5, byte [esp+0x1f0]
7011*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
7012*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
7013*c0909341SAndroid Build Coastguard Worker%endif
; m8 = dx scaled per column via rescale_mul; adding it to m14 gives the
; horizontal position of each of the four output columns.
7014*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
7015*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7016*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_0x4000]
7017*c0909341SAndroid Build Coastguard Worker%endif
7018*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
7019*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
; m0 = (position & m10) >> 6 is the per-column filter fraction; added to the
; broadcast base in m15 it yields one subpel_filters row index per column.
; m10 is pd_0x3ff on x86-32 (defined above); on x86-64 it is expected to hold
; the same constant, loaded before this label - TODO confirm.
7020*c0909341SAndroid Build Coastguard Worker    pand                 m0, m14, m10
7021*c0909341SAndroid Build Coastguard Worker    psrld                m0, 6
7022*c0909341SAndroid Build Coastguard Worker    paddd               m15, m0
; m7 = m15 with its 64-bit halves swapped, so lanes 2/3 can be extracted in
; parallel with lanes 0/1.
7023*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m15, q1032
7024*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Extract the four indices: r4 = lane 0, r11 = lane 2, then after rotating by
; one dword r6 = lane 1, r13 = lane 3.
7025*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
7026*c0909341SAndroid Build Coastguard Worker    movd               r11d, m7
7027*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
7028*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0321
7029*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
7030*c0909341SAndroid Build Coastguard Worker    movd               r13d, m7
7031*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+bdct_lb_q+ 0]
7032*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+bdct_lb_q+16]
; Each movd fetches 4 bytes starting at byte 2 of an 8-byte table row, i.e.
; four filter coefficients per column.
7033*c0909341SAndroid Build Coastguard Worker    movd                m13, [base+subpel_filters+ r4*8+2]
7034*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+ r6*8+2]
7035*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r11*8+2]
7036*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r13*8+2]
7037*c0909341SAndroid Build Coastguard Worker%else
; Same extraction with the x86-32 register set: r1 = lane 0, r4 = lane 2,
; r3 = lane 1, r5 = lane 3.
7038*c0909341SAndroid Build Coastguard Worker    movd                 r1, m15
7039*c0909341SAndroid Build Coastguard Worker    movd                 r4, m7
7040*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
7041*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0321
7042*c0909341SAndroid Build Coastguard Worker    movd                 r3, m15
7043*c0909341SAndroid Build Coastguard Worker    movd                 r5, m7
7044*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_q+ 0]
7045*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+bdct_lb_q+16]
7046*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base+subpel_filters+r1*8+2]
7047*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+r3*8+2]
7048*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+r4*8+2]
7049*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r5*8+2]
7050*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
; r3 was used as a scratch index above; reload it from its argument slot.
7051*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7052*c0909341SAndroid Build Coastguard Worker %if isprep
7053*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
7054*c0909341SAndroid Build Coastguard Worker %endif
7055*c0909341SAndroid Build Coastguard Worker %define m10 m5
7056*c0909341SAndroid Build Coastguard Worker %define m11 m6
7057*c0909341SAndroid Build Coastguard Worker %define m12 m1
7058*c0909341SAndroid Build Coastguard Worker %define m13 m1
7059*c0909341SAndroid Build Coastguard Worker%endif
; m14 = (position >> 10) * 2.
; NOTE(review): presumably the integer pixel offset converted to a byte
; offset for 16-bit pixels - confirm.
7060*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7061*c0909341SAndroid Build Coastguard Worker    paddd               m14, m14
; Gather the four 4-byte filters into one register: m13 = f0 f1 f2 f3.
7062*c0909341SAndroid Build Coastguard Worker    punpckldq           m13, m2
7063*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m4
7064*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m13, m15
; m0 becomes an all-ones mask in lanes whose filter fraction is zero. Those
; lanes take the value from m9 (pd_0x4000, i.e. bytes 0,64,0,0 per dword)
; instead of the table entry; all other lanes keep the loaded filter.
7065*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
7066*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m2
7067*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7068*c0909341SAndroid Build Coastguard Worker    pand                 m9, m0
7069*c0909341SAndroid Build Coastguard Worker%else
7070*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9, m0
7071*c0909341SAndroid Build Coastguard Worker %define m9 m2
7072*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m4
7073*c0909341SAndroid Build Coastguard Worker%endif
7074*c0909341SAndroid Build Coastguard Worker    pandn                m0, m13
7075*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7076*c0909341SAndroid Build Coastguard Worker    SWAP                m13, m0
7077*c0909341SAndroid Build Coastguard Worker%else
7078*c0909341SAndroid Build Coastguard Worker %define m13 m0
7079*c0909341SAndroid Build Coastguard Worker%endif
7080*c0909341SAndroid Build Coastguard Worker    por                 m13, m9
; Sign-extend the 16 coefficient bytes to words (duplicate each byte, then
; arithmetic shift right by 8): m13 = low 8 bytes, m15 = high 8 bytes.
7081*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m13, m13
7082*c0909341SAndroid Build Coastguard Worker    punpcklbw           m13, m13
7083*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
7084*c0909341SAndroid Build Coastguard Worker    psraw               m13, 8
; Rearrange the offsets in m14 with the two bdct_lb_q shuffle tables into
; m12 and m14.
; NOTE(review): presumably this broadcasts the low byte of each offset for
; columns 0-1 and 2-3 respectively - confirm against the table contents.
7085*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m14, m10
7086*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m11
7087*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+spel_s_shuf2]
; r4 = byte 3 of m14; it is used below as the byte offset (x86-64) or, after
; adding srcq, the pointer (x86-32) for the second load of every row.
7088*c0909341SAndroid Build Coastguard Worker    movd                r4d, m14
7089*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 24
7090*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7091*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m13
7092*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m15
; m2 was reused as m9 on x86-32, so it must be zeroed again for the pshufb.
7093*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
7094*c0909341SAndroid Build Coastguard Worker%endif
; pshufb with an all-zero mask broadcasts byte 0 of m14; subtracting it makes
; the m14 offsets relative to that byte. Adding spel_s_shuf2 then turns both
; offset vectors into the final pshufb masks (m12 for the first load of a
; row, m14 for the second).
7095*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14, m2
7096*c0909341SAndroid Build Coastguard Worker    psubb               m14, m7
7097*c0909341SAndroid Build Coastguard Worker    paddb               m12, m10
7098*c0909341SAndroid Build Coastguard Worker    paddb               m14, m10
7099*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; r6/r11/r13 = r4 plus one/two/three source strides, so [srcq+rN] is the
; second load position for rows 1/2/3.
7100*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+ssq*1]
7101*c0909341SAndroid Build Coastguard Worker    lea                 r11, [r4+ssq*2]
7102*c0909341SAndroid Build Coastguard Worker    lea                 r13, [r4+ss3q ]
; Rows 0-3: first loads go to m1/m9/m8/m10 (rows 0/1/2/3), second loads to
; m7/m3/m2/m4 (rows 0/1/2/3).
7103*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
7104*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+ssq*2]
7105*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+ssq*1]
7106*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+ss3q ]
7107*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+r4   ]
7108*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r11  ]
7109*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6   ]
7110*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+r13  ]
7111*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Shuffle the pixels into place and multiply by the per-column coefficients.
7112*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m1, m9, m8, m10
7113*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m1, m9, m8, m10
7114*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m7, m3, m2, m4
7115*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m7, m3, m2, m4
; m5 = horizontal rounding value and xm6 = horizontal shift count, read back
; from the m11/m12 spill slots written at the top.
7116*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x10]
7117*c0909341SAndroid Build Coastguard Worker    movd                xm6, [rsp+0x20]
; phaddd completes each 4-tap sum and merges both loads of a row into one
; register holding that row's four outputs.
7118*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m7
7119*c0909341SAndroid Build Coastguard Worker    phaddd               m8, m2
7120*c0909341SAndroid Build Coastguard Worker    phaddd               m9, m3
7121*c0909341SAndroid Build Coastguard Worker    phaddd              m10, m4
; Start loading rows 4 and 5 while rows 0-3 are rounded and shifted.
7122*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
7123*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
7124*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m1, m9, m8, m10
7125*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m1, m9, m8, m10
7126*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m8     ; 0 2
7127*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m10    ; 1 3
7128*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r4   ]
7129*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+r6   ]
7130*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
7131*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m2, m3
7132*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m13}, m2, m3
7133*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m14}, m0, m8
7134*c0909341SAndroid Build Coastguard Worker    REPX   {pmaddwd x, m15}, m0, m8
7135*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m0
7136*c0909341SAndroid Build Coastguard Worker    phaddd               m3, m8
; Vertical filter selection. ZF set by "shr myd, 6" survives the following
; mov and lea (neither writes flags) and drives the cmovnz: when my >> 6 is
; zero, r9 keeps the default 64 << 24 (a single coefficient of 64 in byte 3);
; otherwise the 8-byte row t1 + (my >> 6) of subpel_filters is loaded.
7137*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
7138*c0909341SAndroid Build Coastguard Worker    mov                 r9d, 64 << 24
7139*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
7140*c0909341SAndroid Build Coastguard Worker    cmovnz              r9q, [base+subpel_filters+myq*8]
7141*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m2, m3
7142*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, xm6}, m2, m3
7143*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3        ; 4 5
7144*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m2, q1032 ; 5 _
; Interleave adjacent rows so that one pmaddwd applies a pair of vertical taps.
7145*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m9    ; 01
7146*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m9        ; 23
7147*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3        ; 45
7148*c0909341SAndroid Build Coastguard Worker    movq                m10, r9
; Names for the spill slots used by the loop (see the mova stores at the top).
7149*c0909341SAndroid Build Coastguard Worker %define hrnd_mem [rsp+0x10]
7150*c0909341SAndroid Build Coastguard Worker %define hsh_mem  [rsp+0x20]
7151*c0909341SAndroid Build Coastguard Worker %define vsh_mem  [rsp+0x28]
7152*c0909341SAndroid Build Coastguard Worker %if isput
7153*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [rsp+0x30]
7154*c0909341SAndroid Build Coastguard Worker %else
7155*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
7156*c0909341SAndroid Build Coastguard Worker %endif
7157*c0909341SAndroid Build Coastguard Worker%else
; x86-32: keep both shuffle masks on the stack, turn r4 into an absolute
; pointer for the second load, and let MC_4TAP_SCALED_H (defined elsewhere)
; filter two rows per invocation into the given stack offset.
7158*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m12
7159*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m14
7160*c0909341SAndroid Build Coastguard Worker    add                  r4, srcq
7161*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x60 ; 0 1
7162*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x70 ; 2 3
7163*c0909341SAndroid Build Coastguard Worker    MC_4TAP_SCALED_H   0x80 ; 4 5
; r4 is needed as a scratch register below; park the pointer on the stack.
7164*c0909341SAndroid Build Coastguard Worker    mov          [stk+0xe0], r4
7165*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+spel_s_shuf8]
7166*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x60]
7167*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x70]
7168*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x80]
; Vertical filter selection, as two 32-bit halves: r4 defaults to 64 << 24
; and r5 to 0; both are replaced from subpel_filters only when my >> 6 is
; non-zero (ZF from the shr is preserved by the lea and mov in between).
7169*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7170*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+0x1f4]
7171*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
7172*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
7173*c0909341SAndroid Build Coastguard Worker    lea                  rX, [rX+myd]
7174*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
7175*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+rX*8+0]
7176*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+rX*8+4]
7177*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7178*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3 ; 01
7179*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3 ; 23
7180*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3 ; 45
7181*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
7182*c0909341SAndroid Build Coastguard Worker    movd                 m4, r5
; r5 takes over as the destination pointer (dstq/tmpq are aliased to it below).
7183*c0909341SAndroid Build Coastguard Worker    mov                  r5, r0m
7184*c0909341SAndroid Build Coastguard Worker %if isput
7185*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
7186*c0909341SAndroid Build Coastguard Worker %endif
7187*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+0xe0]
7188*c0909341SAndroid Build Coastguard Worker %define dstq r5
7189*c0909341SAndroid Build Coastguard Worker %define tmpq r5
7190*c0909341SAndroid Build Coastguard Worker %define m12 [stk+0x20]
7191*c0909341SAndroid Build Coastguard Worker %define m14 [stk+0x30]
7192*c0909341SAndroid Build Coastguard Worker %define m13 [stk+0x40]
7193*c0909341SAndroid Build Coastguard Worker %define m15 [stk+0x50]
7194*c0909341SAndroid Build Coastguard Worker %define hrnd_mem [esp+0x00]
7195*c0909341SAndroid Build Coastguard Worker %define hsh_mem  [esp+0x10]
7196*c0909341SAndroid Build Coastguard Worker %define vsh_mem  [esp+0x18]
7197*c0909341SAndroid Build Coastguard Worker %if isput
7198*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [esp+0x20]
7199*c0909341SAndroid Build Coastguard Worker %else
7200*c0909341SAndroid Build Coastguard Worker  %define vrnd_mem [base+pd_m524256]
7201*c0909341SAndroid Build Coastguard Worker %endif
7202*c0909341SAndroid Build Coastguard Worker %define m10 m7
; Join the two 32-bit halves into the 8-byte vertical filter.
7203*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m4
7204*c0909341SAndroid Build Coastguard Worker%endif
; Sign-extend the eight vertical coefficients to words, then broadcast the
; tap pairs: m3 = taps 0-1, m4 = taps 2-3, m5 = taps 4-5, m10 = taps 6-7.
7205*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
7206*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
7207*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m10, q0000
7208*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m10, q1111
7209*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m10, q2222
7210*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
7211*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: the %xdefine lines capture the current physical registers as the
; loop's scratch names before m3/m4/m5/m10 are redirected to the stack copies
; of the tap pairs stored here.
7212*c0909341SAndroid Build Coastguard Worker %xdefine m8  m3
7213*c0909341SAndroid Build Coastguard Worker %xdefine m9  m6
7214*c0909341SAndroid Build Coastguard Worker %xdefine m11 m5
7215*c0909341SAndroid Build Coastguard Worker %xdefine m6  m4
7216*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x100], m3
7217*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x110], m4
7218*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x120], m5
7219*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x130], m10
7220*c0909341SAndroid Build Coastguard Worker %define m3  [stk+0x100]
7221*c0909341SAndroid Build Coastguard Worker %define m4  [stk+0x110]
7222*c0909341SAndroid Build Coastguard Worker %define m5  [stk+0x120]
7223*c0909341SAndroid Build Coastguard Worker %define m10 [stk+0x130]
7224*c0909341SAndroid Build Coastguard Worker%endif
7225*c0909341SAndroid Build Coastguard Worker.dy2_w4_loop:
; Main loop of the 4-wide dy2 path. Each iteration emits two output rows
; (hd -= 2) and consumes four new source rows (srcq += ssq*4).
; On entry m0/m1/m2 hold the horizontally filtered, interleaved row pairs
; 01/23/45 and m3/m4/m5/m10 hold the broadcast vertical tap pairs.
; The two accumulators built below are
;   m8 = 01*m3 + 23*m4 + 45*m5 + 67*m10 + vrnd   (first output row)
;   m9 = 23*m3 + 45*m4 + 67*m5 + 89*m10 + vrnd   (second output row)
; and the pairs 45/67/89 are left in m0/m1/m2 for the next iteration.
7226*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m0, m3
7227*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m1, m3
; 45 becomes the next iteration's 01 before m2 is consumed below.
7228*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
7229*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4
7230*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m2, m4
7231*c0909341SAndroid Build Coastguard Worker    paddd                m8, vrnd_mem
7232*c0909341SAndroid Build Coastguard Worker    paddd                m9, vrnd_mem
7233*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
7234*c0909341SAndroid Build Coastguard Worker    paddd                m8, m1
7235*c0909341SAndroid Build Coastguard Worker    paddd                m9, m11
7236*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2
; Horizontal pass for the new rows at +0 and +2 strides (rows 6 and 8). Every
; row is loaded twice: once at srcq (shuffled by m12, weighted by m13) and
; once at the second position (shuffled by m14, weighted by m15).
7237*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0]
7238*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*2]
7239*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64 addresses the second position as srcq plus a precomputed offset.
7240*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+r4 ]
7241*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r11]
7242*c0909341SAndroid Build Coastguard Worker%else
; x86-32 uses r4 as a separate pointer that is advanced alongside srcq.
7243*c0909341SAndroid Build Coastguard Worker    movu                m11, [r4+ssq*0]
7244*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r4+ssq*2]
7245*c0909341SAndroid Build Coastguard Worker%endif
7246*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
7247*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
7248*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m13
7249*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m13
7250*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m14
7251*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
7252*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m15
7253*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m15
; phaddd finishes the 4-tap sums and merges both loads of a row.
7254*c0909341SAndroid Build Coastguard Worker    phaddd               m6, m11
7255*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m2
; Round and shift the horizontal result.
7256*c0909341SAndroid Build Coastguard Worker    paddd                m6, hrnd_mem
7257*c0909341SAndroid Build Coastguard Worker    paddd                m1, hrnd_mem
7258*c0909341SAndroid Build Coastguard Worker    psrad                m6, hsh_mem
7259*c0909341SAndroid Build Coastguard Worker    psrad                m1, hsh_mem
; Same horizontal pass for the rows at +1 and +3 strides (rows 7 and 9).
7260*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
7261*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ss3q ]
7262*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m1 ; 6 8
7263*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7264*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6 ]
7265*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r13]
7266*c0909341SAndroid Build Coastguard Worker%else
7267*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r4+ssq*1]
7268*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r4+ss3q ]
7269*c0909341SAndroid Build Coastguard Worker%endif
7270*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m12
7271*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m12
7272*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m13
7273*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m13
7274*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
7275*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
7276*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m15
7277*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m15
7278*c0909341SAndroid Build Coastguard Worker    phaddd               m7, m2
7279*c0909341SAndroid Build Coastguard Worker    phaddd              m11, m1
7280*c0909341SAndroid Build Coastguard Worker    paddd                m7, hrnd_mem
7281*c0909341SAndroid Build Coastguard Worker    paddd               m11, hrnd_mem
7282*c0909341SAndroid Build Coastguard Worker    psrad                m7, hsh_mem
7283*c0909341SAndroid Build Coastguard Worker    psrad               m11, hsh_mem
7284*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m11 ; 7 9
; Advance both source positions by four rows.
7285*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7286*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*4]
7287*c0909341SAndroid Build Coastguard Worker%endif
7288*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
; Interleave into row pairs and finish both vertical sums; m1 (67) and the
; copy of 89 in m2 are kept for the next iteration.
7289*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m6, m7 ; 67
7290*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7     ; 89
7291*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
7292*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m1, m5
7293*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m1, m10
7294*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m10
7295*c0909341SAndroid Build Coastguard Worker    paddd                m9, m11
7296*c0909341SAndroid Build Coastguard Worker%if isput
; put: m11 is free again, reuse it for the vertical shift count.
7297*c0909341SAndroid Build Coastguard Worker    movd                m11, vsh_mem
7298*c0909341SAndroid Build Coastguard Worker%endif
7299*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
7300*c0909341SAndroid Build Coastguard Worker    paddd                m9, m6
7301*c0909341SAndroid Build Coastguard Worker%if isput
; put: shift, pack to words, clamp to [0, pxmaxm] and store 4 pixels (8
; bytes) to each of the two destination rows.
7302*c0909341SAndroid Build Coastguard Worker    psrad                m8, m11
7303*c0909341SAndroid Build Coastguard Worker    psrad                m9, m11
7304*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
7305*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
7306*c0909341SAndroid Build Coastguard Worker    pmaxsw               m8, m7
7307*c0909341SAndroid Build Coastguard Worker    pminsw               m8, pxmaxm
7308*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m8
7309*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m8
7310*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
7311*c0909341SAndroid Build Coastguard Worker%else
; prep: fixed shift by 6, pack to words and store both rows (16 bytes)
; contiguously to the intermediate buffer; no clamping is applied.
7312*c0909341SAndroid Build Coastguard Worker    psrad                m8, 6
7313*c0909341SAndroid Build Coastguard Worker    psrad                m9, 6
7314*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
7315*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m8
7316*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
7317*c0909341SAndroid Build Coastguard Worker%endif
7318*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
7319*c0909341SAndroid Build Coastguard Worker    jg .dy2_w4_loop
; Function exit via the shared return macro (defined outside this view).
7320*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET ; why not jz .ret?
7321*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
; Entry points for block widths 8 to 128 of the dy2 path. Each one stores
; w/8 in [stk+0xf0] - the counter that .dy2_hloop_prep decrements once per
; 16-byte column step, returning when it reaches zero - and then continues at
; the shared setup in .dy2_w_start (.dy2_w128 simply falls through).
; tmp_stridem is set to 2*w.
; NOTE(review): movifprep is defined outside this view; by its name it emits
; the mov only in the prep variant, and 2*w is presumably the byte stride of
; one row of 16-bit intermediates - confirm both.
7322*c0909341SAndroid Build Coastguard Worker.dy2_w8:
7323*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 1
7324*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
7325*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7326*c0909341SAndroid Build Coastguard Worker.dy2_w16:
7327*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 2
7328*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
7329*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7330*c0909341SAndroid Build Coastguard Worker.dy2_w32:
7331*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 4
7332*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
7333*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7334*c0909341SAndroid Build Coastguard Worker.dy2_w64:
7335*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 8
7336*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
7337*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7338*c0909341SAndroid Build Coastguard Worker.dy2_w128:
7339*c0909341SAndroid Build Coastguard Worker    mov    dword [stk+0xf0], 16
7340*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
7341*c0909341SAndroid Build Coastguard Worker.dy2_w_start:
; Shared setup for the dy2 path at widths >= 8. It spills the rounding/shift
; registers, selects the 8-tap vertical filter from my, computes the
; per-column horizontal positions, saves everything that must be restored
; for each column strip (read back by .dy2_hloop_prep) and jumps to
; .dy2_hloop.
7342*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7343*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7344*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
7345*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
7346*c0909341SAndroid Build Coastguard Worker %endif
; m11 stays live as "hround" but is also spilled, because .dy2_hloop_prep
; reloads it from [rsp+0x10].
7347*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m11
7348*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m12
7349*c0909341SAndroid Build Coastguard Worker %define hround m11
7350*c0909341SAndroid Build Coastguard Worker %if isput
7351*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m13
7352*c0909341SAndroid Build Coastguard Worker %else
; prep: m13 holds a constant rounding value instead of a caller-provided one.
7353*c0909341SAndroid Build Coastguard Worker    mova                m13, [base+pd_m524256]
7354*c0909341SAndroid Build Coastguard Worker %endif
; The horizontal filter base index is taken from bits 16 and up of t0 here
; (the w4 path uses the low byte instead).
7355*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
; Vertical filter selection. ZF from "shr myd, 6" is preserved by the mov and
; lea and consumed by cmovnz: with my >> 6 == 0, r4 keeps the default
; 64 << 24 (a single coefficient of 64 in byte 3); otherwise the 8-byte row
; t1 + (my >> 6) of subpel_filters is loaded.
7356*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
7357*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
7358*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
7359*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
7360*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
7361*c0909341SAndroid Build Coastguard Worker%else
; x86-32: high registers are aliases for stack slots, constants or low
; registers, and dstq/tmpq live in r0.
7362*c0909341SAndroid Build Coastguard Worker %define hround [esp+0x00]
7363*c0909341SAndroid Build Coastguard Worker %define m12    [esp+0x10]
7364*c0909341SAndroid Build Coastguard Worker %define m10    [base+pd_0x3ff]
7365*c0909341SAndroid Build Coastguard Worker %define m8  m0
7366*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
7367*c0909341SAndroid Build Coastguard Worker %xdefine m15 m3
7368*c0909341SAndroid Build Coastguard Worker %if isput
7369*c0909341SAndroid Build Coastguard Worker  %define dstq r0
7370*c0909341SAndroid Build Coastguard Worker %else
7371*c0909341SAndroid Build Coastguard Worker  %define tmpq r0
7372*c0909341SAndroid Build Coastguard Worker  %define ssq ssm
7373*c0909341SAndroid Build Coastguard Worker %endif
; [esp+0x1f0] and [esp+0x1f4] play the roles that t0 and t1 have on x86-64.
7374*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f0]
7375*c0909341SAndroid Build Coastguard Worker    mov                  r3, [esp+0x1f4]
7376*c0909341SAndroid Build Coastguard Worker    shr                  r5, 16
7377*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
; Vertical filter as two 32-bit halves: defaults are r4 = 64 << 24, r5 = 0.
; The xor is placed before the shr so that ZF still reflects my >> 6 when the
; two cmovnz execute.
7378*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
7379*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
7380*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+myd]
7381*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
7382*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r3*8+0]
7383*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r3*8+4]
7384*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
7385*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7386*c0909341SAndroid Build Coastguard Worker%endif
; NOTE(review): 6 bytes is presumably three 16-bit pixels of left context for
; the 8-tap horizontal window - confirm.
7387*c0909341SAndroid Build Coastguard Worker    sub                srcq, 6
7388*c0909341SAndroid Build Coastguard Worker    pslld                m7, m8, 2 ; dx*4
7389*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
7390*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
7391*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
; Move the 8-byte vertical filter into m3.
7392*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7393*c0909341SAndroid Build Coastguard Worker    movq                 m3, r4q
7394*c0909341SAndroid Build Coastguard Worker%else
7395*c0909341SAndroid Build Coastguard Worker    movd                 m5, r4
7396*c0909341SAndroid Build Coastguard Worker    movd                 m6, r5
7397*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m6
7398*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m5
7399*c0909341SAndroid Build Coastguard Worker%endif
; Sign-extend the eight coefficients from bytes to words.
7400*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
7401*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
; Per-strip state, reloaded by .dy2_hloop_prep: dx*4, the broadcast filter
; base index, the source pointer and the destination pointer.
7402*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x100], m7
7403*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x120], m15
7404*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x0f8], srcq
7405*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x130], r0q ; dstq / tmpq
; Broadcast the vertical tap pairs (taps 0-1, 2-3, 4-5, 6-7) and keep them on
; the stack.
7406*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
7407*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
7408*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
7409*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
7410*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7411*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x140], m0
7412*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x150], m1
7413*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x160], m2
7414*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x170], m3
7415*c0909341SAndroid Build Coastguard Worker %if UNIX64
; Save the height so .dy2_hloop_prep can restore hd for every strip.
7416*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
7417*c0909341SAndroid Build Coastguard Worker %endif
7418*c0909341SAndroid Build Coastguard Worker%else
7419*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x180], m0
7420*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x190], m1
7421*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x1a0], m2
7422*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x1b0], m3
7423*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m3
; x86-32 keeps a copy of the height at [stk+0x134] for the same purpose.
7424*c0909341SAndroid Build Coastguard Worker    mov                  r5, hm
7425*c0909341SAndroid Build Coastguard Worker    mov         [stk+0x134], r5
7426*c0909341SAndroid Build Coastguard Worker%endif
7427*c0909341SAndroid Build Coastguard Worker    jmp .dy2_hloop
7428*c0909341SAndroid Build Coastguard Worker.dy2_hloop_prep:
7429*c0909341SAndroid Build Coastguard Worker    dec   dword [stk+0x0f0]
7430*c0909341SAndroid Build Coastguard Worker    jz .ret
7431*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7432*c0909341SAndroid Build Coastguard Worker    add   qword [stk+0x130], 16
7433*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
7434*c0909341SAndroid Build Coastguard Worker%else
7435*c0909341SAndroid Build Coastguard Worker    add   dword [stk+0x130], 16
7436*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+0x134]
7437*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+0x130]
7438*c0909341SAndroid Build Coastguard Worker%endif
7439*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x100]
7440*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x110]
7441*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7442*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
7443*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x10]
7444*c0909341SAndroid Build Coastguard Worker%endif
7445*c0909341SAndroid Build Coastguard Worker    mova                m15, [stk+0x120]
7446*c0909341SAndroid Build Coastguard Worker    mov                srcq, [stk+0x0f8]
7447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7448*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [stk+0x130] ; dstq / tmpq
7449*c0909341SAndroid Build Coastguard Worker%else
7450*c0909341SAndroid Build Coastguard Worker    mov                  hm, r5
7451*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
7452*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7453*c0909341SAndroid Build Coastguard Worker%endif
7454*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7
7455*c0909341SAndroid Build Coastguard Worker.dy2_hloop:
7456*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7457*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pq_0x40000000]
7458*c0909341SAndroid Build Coastguard Worker%else
7459*c0909341SAndroid Build Coastguard Worker %define m9 [base+pq_0x40000000]
7460*c0909341SAndroid Build Coastguard Worker%endif
7461*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
7462*c0909341SAndroid Build Coastguard Worker    psrld                m2, m14, 10
7463*c0909341SAndroid Build Coastguard Worker    mova              [stk], m2
7464*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
7465*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
7466*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15, m6
7467*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m1
7468*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m5, q1032
7469*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7470*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
7471*c0909341SAndroid Build Coastguard Worker    movd                r6d, m2
7472*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0321
7473*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0321
7474*c0909341SAndroid Build Coastguard Worker    movd                r7d, m5
7475*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
7476*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r4*8]
7477*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+r6*8]
7478*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r7*8]
7479*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r9*8]
7480*c0909341SAndroid Build Coastguard Worker%else
7481*c0909341SAndroid Build Coastguard Worker    movd                 r0, m5
7482*c0909341SAndroid Build Coastguard Worker    movd                 rX, m2
7483*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0321
7484*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0321
7485*c0909341SAndroid Build Coastguard Worker    movd                 r4, m5
7486*c0909341SAndroid Build Coastguard Worker    movd                 r5, m2
7487*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r0*8]
7488*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+rX*8]
7489*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r4*8]
7490*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r5*8]
7491*c0909341SAndroid Build Coastguard Worker%endif
7492*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7 ; mx+dx*[4-7]
7493*c0909341SAndroid Build Coastguard Worker    pand                 m5, m14, m10
7494*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
7495*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
7496*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
7497*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m2
7498*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x110], m14
7499*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m15, q1032
7500*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7501*c0909341SAndroid Build Coastguard Worker    movd               r10d, m15
7502*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
7503*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
7504*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0321
7505*c0909341SAndroid Build Coastguard Worker    movd               r13d, m15
7506*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
7507*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r10*8]
7508*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+r11*8]
7509*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r13*8]
7510*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+ rX*8]
7511*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7512*c0909341SAndroid Build Coastguard Worker    movq                r11, m14
7513*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m14
7514*c0909341SAndroid Build Coastguard Worker    movq                 rX, m14
7515*c0909341SAndroid Build Coastguard Worker    mov                r10d, r11d
7516*c0909341SAndroid Build Coastguard Worker    shr                 r11, 32
7517*c0909341SAndroid Build Coastguard Worker    mov                r13d, rXd
7518*c0909341SAndroid Build Coastguard Worker    shr                  rX, 32
7519*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [stk+ 0]
7520*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [stk+ 4]
7521*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [stk+ 8]
7522*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [stk+12]
7523*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
7524*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
7525*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m5, q1100
7526*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
7527*c0909341SAndroid Build Coastguard Worker    pand                 m7, m9, m4
7528*c0909341SAndroid Build Coastguard Worker    pand                 m8, m9, m6
7529*c0909341SAndroid Build Coastguard Worker    pand                m15, m9, m14
7530*c0909341SAndroid Build Coastguard Worker    pand                 m9, m9, m5
7531*c0909341SAndroid Build Coastguard Worker    pandn                m4, m0
7532*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
7533*c0909341SAndroid Build Coastguard Worker    pandn               m14, m2
7534*c0909341SAndroid Build Coastguard Worker    pandn                m5, m3
7535*c0909341SAndroid Build Coastguard Worker    por                  m7, m4
7536*c0909341SAndroid Build Coastguard Worker    por                  m8, m6
7537*c0909341SAndroid Build Coastguard Worker    por                 m15, m14
7538*c0909341SAndroid Build Coastguard Worker    por                  m9, m5
7539*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m7, m7
7540*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m7
7541*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m8, m8
7542*c0909341SAndroid Build Coastguard Worker    punpckhbw            m8, m8
7543*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
7544*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
7545*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
7546*c0909341SAndroid Build Coastguard Worker    psraw                m8, 8
7547*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m15, m15
7548*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m15
7549*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m9, m9
7550*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m9
7551*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
7552*c0909341SAndroid Build Coastguard Worker    psraw               m15, 8
7553*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
7554*c0909341SAndroid Build Coastguard Worker    psraw                m9, 8
7555*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x10], m0
7556*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m7
7557*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
7558*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m8
7559*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m2
7560*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m15
7561*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m3
7562*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m9
7563*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
7564*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m1
7565*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
7566*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m2
7567*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
7568*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m3
7569*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
7570*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m4
7571*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
7572*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xd0], m5
7573*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
7574*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
7575*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
7576*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0xd0]
7577*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x90]
7578*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0xa0]
7579*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0xb0]
7580*c0909341SAndroid Build Coastguard Worker    mova                 m9, [stk+0xc0]
7581*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6 ; 45a
7582*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45b
7583*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m8 ; 67a
7584*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8     ; 67b
7585*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; 01a
7586*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2     ; 01b
7587*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m9 ; 23a
7588*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m9     ; 23b
7589*c0909341SAndroid Build Coastguard Worker    mova                m10, [stk+0x140]
7590*c0909341SAndroid Build Coastguard Worker    mova                m11, [stk+0x150]
7591*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x160]
7592*c0909341SAndroid Build Coastguard Worker    mova                m15, [stk+0x170]
7593*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m4
7594*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m5
7595*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m6
7596*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m7
7597*c0909341SAndroid Build Coastguard Worker %define hround [rsp+0x10]
7598*c0909341SAndroid Build Coastguard Worker %define shift  [rsp+0x20]
7599*c0909341SAndroid Build Coastguard Worker %if isput
7600*c0909341SAndroid Build Coastguard Worker  %define vround [rsp+0x30]
7601*c0909341SAndroid Build Coastguard Worker %else
7602*c0909341SAndroid Build Coastguard Worker  %define vround [base+pd_m524256]
7603*c0909341SAndroid Build Coastguard Worker %endif
7604*c0909341SAndroid Build Coastguard Worker.dy2_vloop:
7605*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m10
7606*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m10
7607*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m11
7608*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11
7609*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
7610*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
7611*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
7612*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
7613*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [stk+0x90], m14
7614*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [stk+0xa0], m14
7615*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [stk+0xb0], m15
7616*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, [stk+0xc0], m15
7617*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
7618*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
7619*c0909341SAndroid Build Coastguard Worker %if isput
7620*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m12, q1032
7621*c0909341SAndroid Build Coastguard Worker %endif
7622*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8
7623*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9
7624*c0909341SAndroid Build Coastguard Worker%else
7625*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
7626*c0909341SAndroid Build Coastguard Worker    movd                 rX, m4
7627*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0321
7628*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0321
7629*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
7630*c0909341SAndroid Build Coastguard Worker    movd                 r5, m4
7631*c0909341SAndroid Build Coastguard Worker    mova                m14, [stk+0x110]
7632*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r0*8]
7633*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+rX*8]
7634*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r4*8]
7635*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+r5*8]
7636*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7637*c0909341SAndroid Build Coastguard Worker    mova           [stk+16], m14
7638*c0909341SAndroid Build Coastguard Worker    mov                  r0, [stk+ 0]
7639*c0909341SAndroid Build Coastguard Worker    mov                  rX, [stk+ 4]
7640*c0909341SAndroid Build Coastguard Worker    mov                  r4, [stk+ 8]
7641*c0909341SAndroid Build Coastguard Worker    mov                  r5, [stk+12]
7642*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x20], m0
7643*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x30], m1
7644*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
7645*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
7646*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
7647*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
7648*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
7649*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
7650*c0909341SAndroid Build Coastguard Worker    pand                 m0, m9, m4
7651*c0909341SAndroid Build Coastguard Worker    pand                 m1, m9, m6
7652*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9, m7
7653*c0909341SAndroid Build Coastguard Worker    pand                 m3, m9, m5
7654*c0909341SAndroid Build Coastguard Worker    pandn                m4, [stk+0x20]
7655*c0909341SAndroid Build Coastguard Worker    pandn                m6, [stk+0x30]
7656*c0909341SAndroid Build Coastguard Worker    pandn                m7, [stk+0x40]
7657*c0909341SAndroid Build Coastguard Worker    pandn                m5, [stk+0x50]
7658*c0909341SAndroid Build Coastguard Worker    por                  m0, m4
7659*c0909341SAndroid Build Coastguard Worker    por                  m1, m6
7660*c0909341SAndroid Build Coastguard Worker    por                  m2, m7
7661*c0909341SAndroid Build Coastguard Worker    por                  m3, m5
7662*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m0
7663*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
7664*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m1, m1
7665*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m1
7666*c0909341SAndroid Build Coastguard Worker    psraw                m4, 8
7667*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
7668*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8
7669*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
7670*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m2, m2
7671*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m2
7672*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m3, m3
7673*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m3
7674*c0909341SAndroid Build Coastguard Worker    psraw                m6, 8
7675*c0909341SAndroid Build Coastguard Worker    psraw                m2, 8
7676*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
7677*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
7678*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0a0], m4
7679*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0b0], m0
7680*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0c0], m5
7681*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x0d0], m1
7682*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x140], m6
7683*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x150], m2
7684*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x160], m7
7685*c0909341SAndroid Build Coastguard Worker    mova        [stk+0x170], m3
7686*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
7687*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
7688*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
7689*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
7690*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
7691*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
7692*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
7693*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
7694*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x60]
7695*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x70]
7696*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x80]
7697*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x90]
7698*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
7699*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6      ; 45a
7700*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6          ; 45b
7701*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0      ; 67a
7702*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0          ; 67b
7703*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m4
7704*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m5
7705*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m6
7706*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m7
7707*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x20]
7708*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x30]
7709*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x40]
7710*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x50]
7711*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2      ; 01a
7712*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2          ; 01b
7713*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4      ; 23a
7714*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23b
7715*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x180]
7716*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x190]
7717*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x1a0]
7718*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x1b0]
7719*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
7720*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
7721*c0909341SAndroid Build Coastguard Worker.dy2_vloop:
7722*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4
7723*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4
7724*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
7725*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
7726*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
7727*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
7728*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [stk+0x60], m6
7729*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [stk+0x70], m6
7730*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [stk+0x80], m7
7731*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [stk+0x90], m7
7732*c0909341SAndroid Build Coastguard Worker %if isput
7733*c0909341SAndroid Build Coastguard Worker    movd                 m6, [esp+0x18]
7734*c0909341SAndroid Build Coastguard Worker %endif
7735*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
7736*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
7737*c0909341SAndroid Build Coastguard Worker    paddd                m0, vrnd_mem
7738*c0909341SAndroid Build Coastguard Worker    paddd                m1, vrnd_mem
7739*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
7740*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
7741*c0909341SAndroid Build Coastguard Worker%endif
7742*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
7743*c0909341SAndroid Build Coastguard Worker    psrad                m4, m6
7744*c0909341SAndroid Build Coastguard Worker    psrad                m5, m6
7745*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
7746*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
7747*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m7
7748*c0909341SAndroid Build Coastguard Worker    pminsw               m4, pxmaxm
7749*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m4
7750*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
7751*c0909341SAndroid Build Coastguard Worker%else
7752*c0909341SAndroid Build Coastguard Worker    psrad                m4, 6
7753*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
7754*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
7755*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
7756*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
7757*c0909341SAndroid Build Coastguard Worker%endif
7758*c0909341SAndroid Build Coastguard Worker    dec                  hd
7759*c0909341SAndroid Build Coastguard Worker    jz .dy2_hloop_prep
7760*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7761*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
7762*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xd0], m4
7763*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
7764*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0xd0]
7765*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2         ; 01a
7766*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3         ; 01b
7767*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x90] ; 23a
7768*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0xa0] ; 23b
7769*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0xb0] ; 45a
7770*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0xc0] ; 45b
7771*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4, m8     ; 67a
7772*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m8         ; 67b
7773*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m5
7774*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xa0], m6
7775*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xb0], m7
7776*c0909341SAndroid Build Coastguard Worker    mova         [stk+0xc0], m4
7777*c0909341SAndroid Build Coastguard Worker%else
7778*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
7779*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7780*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
7781*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0xa0, 0    ; 9
7782*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0xe0]
7783*c0909341SAndroid Build Coastguard Worker    mova                 m2, [stk+0x60] ; 23a
7784*c0909341SAndroid Build Coastguard Worker    mova                 m3, [stk+0x70] ; 23b
7785*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x80] ; 45a
7786*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x90] ; 45b
7787*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0     ; 67a
7788*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0         ; 67b
7789*c0909341SAndroid Build Coastguard Worker    mova                 m0, [stk+0x40] ; 01a
7790*c0909341SAndroid Build Coastguard Worker    mova                 m1, [stk+0x50] ; 01b
7791*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x40], m2
7792*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x50], m3
7793*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x60], m4
7794*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x70], m5
7795*c0909341SAndroid Build Coastguard Worker    mova                 m4, [stk+0x180]
7796*c0909341SAndroid Build Coastguard Worker    mova                 m5, [stk+0x190]
7797*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x80], m6
7798*c0909341SAndroid Build Coastguard Worker    mova         [stk+0x90], m7
7799*c0909341SAndroid Build Coastguard Worker    mova                 m6, [stk+0x1a0]
7800*c0909341SAndroid Build Coastguard Worker    mova                 m7, [stk+0x1b0]
7801*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
7802*c0909341SAndroid Build Coastguard Worker%endif
7803*c0909341SAndroid Build Coastguard Worker    jmp .dy2_vloop
7804*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
7805*c0909341SAndroid Build Coastguard Worker.ret:
7806*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET 0
7807*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
7808*c0909341SAndroid Build Coastguard Worker %define r0m [rstk+stack_offset+ 4]
7809*c0909341SAndroid Build Coastguard Worker %define r1m [rstk+stack_offset+ 8]
7810*c0909341SAndroid Build Coastguard Worker %define r2m [rstk+stack_offset+12]
7811*c0909341SAndroid Build Coastguard Worker %define r3m [rstk+stack_offset+16]
7812*c0909341SAndroid Build Coastguard Worker%endif
7813*c0909341SAndroid Build Coastguard Worker%undef isput
7814*c0909341SAndroid Build Coastguard Worker%undef isprep
7815*c0909341SAndroid Build Coastguard Worker%endmacro
7816*c0909341SAndroid Build Coastguard Worker
; BILIN_SCALED_FN <variant>
; Emits the <variant>_bilin_scaled_16bpc entry point (instantiated in this
; file with "put" and "prep"). The stub does no filtering itself: it loads the
; same constant into t0d and t1d and then jumps into the matching
; <variant>_8tap_scaled_16bpc function, which does the actual work.
; NOTE(review): t0d/t1d are presumably the filter selectors that the 8-tap
; scaled code expects on entry - confirm against the FN macro and the start
; of MC_8TAP_SCALED, neither of which is visible from here.
7817*c0909341SAndroid Build Coastguard Worker%macro BILIN_SCALED_FN 1
7818*c0909341SAndroid Build Coastguard Workercglobal %1_bilin_scaled_16bpc
7819*c0909341SAndroid Build Coastguard Worker    mov                 t0d, (5*15 << 16) | 5*15 ; 75 in both 16-bit halves
7820*c0909341SAndroid Build Coastguard Worker    mov                 t1d, (5*15 << 16) | 5*15 ; same constant as t0d
; Tail jump: control never comes back to this stub, the target returns to the
; caller. The target symbol is built from private_prefix, the variant name and
; the current SUFFIX.
7821*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
7822*c0909341SAndroid Build Coastguard Worker%endmacro
7823*c0909341SAndroid Build Coastguard Worker
; Pick the temporary registers t0/t1 used by the put variant. The choice
; differs between Win64, other x86-64 targets and 32-bit x86.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk (x86inc layer);
; its arguments are understood to be the register numbers aliased by t0, t1 -
; confirm there before changing any of these numbers.
7824*c0909341SAndroid Build Coastguard Worker%if WIN64
7825*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 5
7826*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64
7827*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 8
7828*c0909341SAndroid Build Coastguard Worker%else
7829*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 1, 2
7830*c0909341SAndroid Build Coastguard Worker%endif
7831*c0909341SAndroid Build Coastguard Worker
; PUT_8TAP_SCALED_FN is shorthand: every use below expands to
; "FN put_8tap_scaled, <name>, <type>, <type>[, <target>]".
; NOTE(review): FN is defined elsewhere in the file; the two filter-type
; arguments presumably select the filters for the two directions, and the
; optional last argument presumably names the function to jump to - confirm
; against FN.
7832*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
; Bilinear entry point; it forwards to put_8tap_scaled_16bpc.
7833*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN put
; One entry per filter-type combination, each naming put_8tap_scaled_16bpc.
7834*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
7835*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
7836*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
7837*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
7838*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
7839*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
7840*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
7841*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
; The last entry passes no target argument and must stay directly in front of
; the MC_8TAP_SCALED body.
; NOTE(review): presumably it falls through into that body instead of
; jumping - confirm against FN before reordering these lines.
7842*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
; Expand the shared scaled 8-tap implementation for the put variant.
7843*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED put
7844*c0909341SAndroid Build Coastguard Worker
; Temporary registers t0/t1 for the prep variant, selected per ABI. The
; 64-bit choices differ from the ones used for the put variant.
; NOTE(review): DECLARE_REG_TMP is defined outside this view; presumably its
; arguments are register numbers bound to t0, t1 in order - confirm.
7845*c0909341SAndroid Build Coastguard Worker%if WIN64
7846*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5, 4
7847*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64
7848*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
7849*c0909341SAndroid Build Coastguard Worker%else
7850*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 1, 2
7851*c0909341SAndroid Build Coastguard Worker%endif
7852*c0909341SAndroid Build Coastguard Worker
; PREP_8TAP_SCALED_FN expands to an FN invocation whose first argument is
; prep_8tap_scaled; each line below supplies a filter name, two filter-type
; tokens and (on all lines but the last) the symbol prep_8tap_scaled_16bpc.
; NOTE(review): FN is defined outside this view; presumably every line emits
; one entry point that jumps to the named function, while the last line has no
; target and runs straight into the MC_8TAP_SCALED body that follows - confirm.
7853*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
7854*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN prep
7855*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
7856*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
7857*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
7858*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
7859*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
7860*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
7861*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
7862*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
7863*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
7864*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED prep
7865*c0909341SAndroid Build Coastguard Worker
; Single temporary register t0 for the two warp_affine functions below, which
; use it for the value r7m >> 11 that indexes the warp8x8 constant tables.
7866*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7867*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6
7868*c0909341SAndroid Build Coastguard Worker%else
7869*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 2
7870*c0909341SAndroid Build Coastguard Worker%endif
7871*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8t_16bpc
; Stores an 8x8 block as rows of eight packed 16-bit values. The filtering is
; done by the .main/.main2/.main3 routines of warp_affine_8x8_16bpc_ssse3,
; which hand back eight 32-bit sums per row in m1 (first four) and m2 (last
; four). Each row is biased with warp8x8t_rnd (m8), arithmetically shifted
; right by 15 and packed with signed saturation. Two rows are written per loop
; iteration and .main initialises the loop counter to 4. Consecutive rows are
; ds*2 bytes apart: x86-64 scales dsq in the address, x86-32 doubles ds in r1m.
7872*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7873*c0909341SAndroid Build Coastguard Worker; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate for
7874*c0909341SAndroid Build Coastguard Worker; that by allocating 16 bytes more stack space so that stack offsets match up.
7875*c0909341SAndroid Build Coastguard Worker%if WIN64 && STACK_ALIGNMENT == 16
7876*c0909341SAndroid Build Coastguard Worker%assign stksz 16*14
7877*c0909341SAndroid Build Coastguard Worker%else
7878*c0909341SAndroid Build Coastguard Worker%assign stksz 16*13
7879*c0909341SAndroid Build Coastguard Worker%endif
7880*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
7881*c0909341SAndroid Build Coastguard Worker                                                 mx, tmp, alpha, beta, \
7882*c0909341SAndroid Build Coastguard Worker                                                 filter, my, gamma, cnt
; Remember the padded stack size; warp_affine_8x8_16bpc ASSERTs that its own
; value is identical, since both functions run the same .main code.
7883*c0909341SAndroid Build Coastguard Worker%assign stack_size_padded_8x8t stack_size_padded
7884*c0909341SAndroid Build Coastguard Worker%else
7885*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
7886*c0909341SAndroid Build Coastguard Worker                                                 filter, mx, my
; x86-32: m8, m9 and the loop counter are stack slots instead of registers.
7887*c0909341SAndroid Build Coastguard Worker%define m8   [esp+16*13]
7888*c0909341SAndroid Build Coastguard Worker%define m9   [esp+16*14]
7889*c0909341SAndroid Build Coastguard Worker%define cntd dword [esp+4*63]
; dstq shares a register with tmpq, which the filter code overwrites, so dst
; is reloaded from dstm before every store. dsq is 0, which turns
; [dstq+dsq*N] into plain [dstq]; the row offset is added by hand from dsm.
7890*c0909341SAndroid Build Coastguard Worker%define dstq tmpq
7891*c0909341SAndroid Build Coastguard Worker%define dsq  0
; With STACK_ALIGNMENT < 16, dst and ds are read from copies that .main places
; in the local frame; otherwise the incoming stack arguments are used directly.
7892*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
7893*c0909341SAndroid Build Coastguard Worker%define dstm [esp+4*65]
7894*c0909341SAndroid Build Coastguard Worker%define dsm  [esp+4*66]
7895*c0909341SAndroid Build Coastguard Worker%else
7896*c0909341SAndroid Build Coastguard Worker%define dstm r0m
7897*c0909341SAndroid Build Coastguard Worker%define dsm  r1m
7898*c0909341SAndroid Build Coastguard Worker%endif
7899*c0909341SAndroid Build Coastguard Worker%endif
; base lets constants be addressed relative to filterq, which holds $$.
7900*c0909341SAndroid Build Coastguard Worker%define base filterq-$$
7901*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r7m ; same argument that warp_affine_8x8_16bpc labels pixel_max
7902*c0909341SAndroid Build Coastguard Worker    LEA             filterq, $$
7903*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11 ; t0 = table index consumed by .main
7904*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7905*c0909341SAndroid Build Coastguard Worker    movddup              m8, [base+warp8x8t_rnd]
7906*c0909341SAndroid Build Coastguard Worker%else
7907*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+warp8x8t_rnd]
7908*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
7909*c0909341SAndroid Build Coastguard Worker    add                  r1, r1
7910*c0909341SAndroid Build Coastguard Worker    mova                 m8, m1 ; m8 is a stack slot here, hence the detour through m1
7911*c0909341SAndroid Build Coastguard Worker    mov                 r1m, r1 ; ds *= 2
7912*c0909341SAndroid Build Coastguard Worker%endif
7913*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
7914*c0909341SAndroid Build Coastguard Worker    jmp .start
7915*c0909341SAndroid Build Coastguard Worker.loop:
7916*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7917*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*4] ; advance two rows
7918*c0909341SAndroid Build Coastguard Worker%else
7919*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm ; dstq already points at the second row
7920*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
7921*c0909341SAndroid Build Coastguard Worker%endif
7922*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
7923*c0909341SAndroid Build Coastguard Worker.start:
7924*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7925*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
7926*c0909341SAndroid Build Coastguard Worker%endif
7927*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8
7928*c0909341SAndroid Build Coastguard Worker    paddd                m2, m8
7929*c0909341SAndroid Build Coastguard Worker    psrad                m1, 15
7930*c0909341SAndroid Build Coastguard Worker    psrad                m2, 15
7931*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
7932*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1 ; first row of the pair
7933*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
7934*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7935*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
7936*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
7937*c0909341SAndroid Build Coastguard Worker%endif
7938*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8
7939*c0909341SAndroid Build Coastguard Worker    paddd                m2, m8
7940*c0909341SAndroid Build Coastguard Worker    psrad                m1, 15
7941*c0909341SAndroid Build Coastguard Worker    psrad                m2, 15
7942*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
7943*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*2], m1 ; second row of the pair
7944*c0909341SAndroid Build Coastguard Worker    dec                cntd
7945*c0909341SAndroid Build Coastguard Worker    jg .loop
7946*c0909341SAndroid Build Coastguard Worker    RET
7947*c0909341SAndroid Build Coastguard Worker
; warp_affine_8x8_16bpc
; Stores an 8x8 block of pixels, two rows per loop iteration (the counter is
; set to 4 by .main). For every row the eight 32-bit sums returned in m1/m2
; are shifted right by 16, packed to words with signed saturation, clamped
; below by m6 (zero), scaled with pmulhrsw by warp8x8_rnd2[t0] (m8) and
; clamped above by the broadcast pixel_max (m9).
7948*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7949*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
7950*c0909341SAndroid Build Coastguard Worker                                                 mx, tmp, alpha, beta, \
7951*c0909341SAndroid Build Coastguard Worker                                                 filter, my, gamma, cnt
; warp_affine_8x8t_16bpc calls into .main/.main2/.main3 below, so both
; functions must end up with the same padded stack size.
7952*c0909341SAndroid Build Coastguard WorkerASSERT stack_size_padded == stack_size_padded_8x8t
7953*c0909341SAndroid Build Coastguard Worker%else
7954*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
7955*c0909341SAndroid Build Coastguard Worker                                                filter, mx, my
7956*c0909341SAndroid Build Coastguard Worker%endif
7957*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r7m
7958*c0909341SAndroid Build Coastguard Worker    LEA             filterq, $$
7959*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11 ; t0 = pixel_max >> 11, index into the warp8x8 tables
7960*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7961*c0909341SAndroid Build Coastguard Worker    movddup              m8, [base+warp8x8_rnd2+t0*8]
7962*c0909341SAndroid Build Coastguard Worker    movd                 m9, r7m ; pixel_max
7963*c0909341SAndroid Build Coastguard Worker    pshufb               m9, [base+pw_256] ; presumably replicates the low word to all lanes - confirm against pw_256
7964*c0909341SAndroid Build Coastguard Worker%else
7965*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+warp8x8_rnd2+t0*8]
7966*c0909341SAndroid Build Coastguard Worker    movd                 m2, r7m ; pixel_max
7967*c0909341SAndroid Build Coastguard Worker    pshufb               m2, [base+pw_256]
7968*c0909341SAndroid Build Coastguard Worker    mova                 m8, m1 ; m8/m9 are stack slots on x86-32
7969*c0909341SAndroid Build Coastguard Worker    mova                 m9, m2
7970*c0909341SAndroid Build Coastguard Worker%endif
7971*c0909341SAndroid Build Coastguard Worker    call .main
7972*c0909341SAndroid Build Coastguard Worker    jmp .start
7973*c0909341SAndroid Build Coastguard Worker.loop:
7974*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7975*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; advance two rows
7976*c0909341SAndroid Build Coastguard Worker%else
7977*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm ; dstq already points at the second row
7978*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
7979*c0909341SAndroid Build Coastguard Worker%endif
7980*c0909341SAndroid Build Coastguard Worker    call .main2
7981*c0909341SAndroid Build Coastguard Worker.start:
7982*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7983*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
7984*c0909341SAndroid Build Coastguard Worker%endif
7985*c0909341SAndroid Build Coastguard Worker    psrad                m1, 16
7986*c0909341SAndroid Build Coastguard Worker    psrad                m2, 16
7987*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
7988*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m6 ; m6 is zero on return from the filter code
7989*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
7990*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m9
7991*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1 ; first row of the pair
7992*c0909341SAndroid Build Coastguard Worker    call .main3
7993*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7994*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
7995*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
7996*c0909341SAndroid Build Coastguard Worker%endif
7997*c0909341SAndroid Build Coastguard Worker    psrad                m1, 16
7998*c0909341SAndroid Build Coastguard Worker    psrad                m2, 16
7999*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
8000*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m6
8001*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
8002*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m9
8003*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1 ; second row of the pair
8004*c0909341SAndroid Build Coastguard Worker    dec                cntd
8005*c0909341SAndroid Build Coastguard Worker    jg .loop
8006*c0909341SAndroid Build Coastguard Worker    RET
; .main: one-time setup followed by priming of the vertical filter window.
;   in:  t0 = table index (r7m >> 11, computed by the caller), filterq = $$
;   - loads warp8x8_shift[t0] into m0 (spilled to [rsp+gprsize]) and
;     warp8x8_rnd1[t0] into m7, then rebases filterq onto mc_warp_filter
;   - reads the four signed 16-bit values alpha, beta, gamma, delta and
;     pre-subtracts 7*alpha from beta and 7*gamma from delta
;   - adds 512+(64<<10) to mx and my, moves src up by three rows, sets cnt = 4
;   - calls .h seven times (row result taken from m0) and stores the
;     word-interleaved row pairs in 16-byte slots at [rsp+gprsize+16*n]:
;       n = 1,2,3    low halves of pairs 01, 23, 45
;       n = 4,5,6    high halves of pairs 01, 23, 45
;       n = 7,8,9    low halves of pairs 12, 34, 56
;       n = 10,11,12 high halves of pairs 12, 34, 56
;   - leaves row 6 in m5 and m6 zeroed by the pxor below, then falls through
;     into .main2
; All stack references carry an extra gprsize because of the return address
; pushed by the call into this routine.
8007*c0909341SAndroid Build Coastguard WorkerALIGN function_align
8008*c0909341SAndroid Build Coastguard Worker.main:
8009*c0909341SAndroid Build Coastguard Worker    ; Stack args offset by one (r4m -> r5m etc.) due to call
8010*c0909341SAndroid Build Coastguard Worker%if WIN64
8011*c0909341SAndroid Build Coastguard Worker    mov              deltaq, r5m
8012*c0909341SAndroid Build Coastguard Worker    mov                 mxd, r6m
8013*c0909341SAndroid Build Coastguard Worker%endif
8014*c0909341SAndroid Build Coastguard Worker    movd                 m0, [base+warp8x8_shift+t0*4]
8015*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+warp8x8_rnd1+t0*8]
8016*c0909341SAndroid Build Coastguard Worker    add             filterq, mc_warp_filter-$$
8017*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8018*c0909341SAndroid Build Coastguard Worker    movsx            alphad, word [deltaq+2*0]
8019*c0909341SAndroid Build Coastguard Worker    movsx             betad, word [deltaq+2*1]
8020*c0909341SAndroid Build Coastguard Worker    movsx            gammad, word [deltaq+2*2]
8021*c0909341SAndroid Build Coastguard Worker    movsx            deltad, word [deltaq+2*3] ; deltaq changes from pointer to value here
8022*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [ssq*3]
8023*c0909341SAndroid Build Coastguard Worker    add                 mxd, 512+(64<<10)
8024*c0909341SAndroid Build Coastguard Worker    sub                srcq, tmpq             ; src -= ss*3
8025*c0909341SAndroid Build Coastguard Worker    imul               tmpd, alphad, -7
8026*c0909341SAndroid Build Coastguard Worker    mov                 myd, r7m
8027*c0909341SAndroid Build Coastguard Worker    add               betad, tmpd             ; beta -= alpha*7
8028*c0909341SAndroid Build Coastguard Worker    imul               tmpd, gammad, -7
8029*c0909341SAndroid Build Coastguard Worker    add                 myd, 512+(64<<10)
8030*c0909341SAndroid Build Coastguard Worker    mov                cntd, 4
8031*c0909341SAndroid Build Coastguard Worker    add              deltad, tmpd             ; delta -= gamma*7
8032*c0909341SAndroid Build Coastguard Worker%else
8033*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
8034*c0909341SAndroid Build Coastguard Worker    %assign stack_offset stack_offset - gprsize
8035*c0909341SAndroid Build Coastguard Worker%endif
8036*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r5m              ; abcd
8037*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
8038*c0909341SAndroid Build Coastguard Worker    mov                  r0, r1m              ; dst
8039*c0909341SAndroid Build Coastguard Worker    mov                  r1, r2m              ; ds
8040*c0909341SAndroid Build Coastguard Worker    mov  [esp+gprsize+4*65], r0 ; slot read back as dstm by the callers
8041*c0909341SAndroid Build Coastguard Worker    mov  [esp+gprsize+4*66], r1 ; slot read back as dsm by the callers
8042*c0909341SAndroid Build Coastguard Worker%endif
8043*c0909341SAndroid Build Coastguard Worker    movsx            alphad, word [r3+2*0]
8044*c0909341SAndroid Build Coastguard Worker    movsx               r2d, word [r3+2*1]
8045*c0909341SAndroid Build Coastguard Worker    movsx            gammad, word [r3+2*2]
8046*c0909341SAndroid Build Coastguard Worker    movsx               r3d, word [r3+2*3]
8047*c0909341SAndroid Build Coastguard Worker    imul                r5d, alphad, -7
8048*c0909341SAndroid Build Coastguard Worker    add                 r2d, r5d              ; beta -= alpha*7
8049*c0909341SAndroid Build Coastguard Worker    imul                r5d, gammad, -7
8050*c0909341SAndroid Build Coastguard Worker    mov  [esp+gprsize+4*60], r2d ; adjusted beta, reloaded by .h
8051*c0909341SAndroid Build Coastguard Worker    add                 r3d, r5d              ; delta -= gamma*7
8052*c0909341SAndroid Build Coastguard Worker    mov  [esp+gprsize+4*61], r3d ; adjusted delta, reloaded by WARP_V
8053*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r4m              ; ss
8054*c0909341SAndroid Build Coastguard Worker    mov                srcq, r3m
8055*c0909341SAndroid Build Coastguard Worker    mov                 mxd, r6m
8056*c0909341SAndroid Build Coastguard Worker    mov                 myd, r7m
8057*c0909341SAndroid Build Coastguard Worker    mov dword [esp+gprsize+4*63], 4           ; cnt
8058*c0909341SAndroid Build Coastguard Worker    mov  [esp+gprsize+4*62], r3 ; ss, added to srcq by .h
8059*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3*3]
8060*c0909341SAndroid Build Coastguard Worker    add                 mxd, 512+(64<<10)
8061*c0909341SAndroid Build Coastguard Worker    add                 myd, 512+(64<<10)
8062*c0909341SAndroid Build Coastguard Worker    sub                srcq, r3               ; src -= ss*3
8063*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
8064*c0909341SAndroid Build Coastguard Worker    %assign stack_offset stack_offset + gprsize
8065*c0909341SAndroid Build Coastguard Worker%endif
8066*c0909341SAndroid Build Coastguard Worker%endif
8067*c0909341SAndroid Build Coastguard Worker    mova      [rsp+gprsize], m0 ; slot 0 = warp8x8_shift[t0]
8068*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
8069*c0909341SAndroid Build Coastguard Worker    call .h
8070*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0 ; row 0
8071*c0909341SAndroid Build Coastguard Worker    call .h
8072*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m0           ; 01
8073*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
8074*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 1], m1
8075*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 4], m5
8076*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0
8077*c0909341SAndroid Build Coastguard Worker    call .h
8078*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m0           ; 12
8079*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
8080*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 7], m1
8081*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*10], m5
8082*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0
8083*c0909341SAndroid Build Coastguard Worker    call .h
8084*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m0           ; 23
8085*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
8086*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 2], m1
8087*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 5], m5
8088*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0
8089*c0909341SAndroid Build Coastguard Worker    call .h
8090*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m0           ; 34
8091*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
8092*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 8], m1
8093*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*11], m5
8094*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0
8095*c0909341SAndroid Build Coastguard Worker    call .h
8096*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m0           ; 45
8097*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
8098*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 3], m1
8099*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 6], m5
8100*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0
8101*c0909341SAndroid Build Coastguard Worker    call .h
8102*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m5, m0           ; 56
8103*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
8104*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16* 9], m1
8105*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*12], m5
8106*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0 ; row 6, consumed by .main2
; .main2: fetches the next horizontally filtered row with .h (result in m0),
; then runs the vertical pass WARP_V on stack slots 1-6.
;   in:  m5 = previous row, m6 = 0, my = vertical filter position
;   out: m1, m2 = eight 32-bit sums for the output row, m5 = newest row
8107*c0909341SAndroid Build Coastguard Worker.main2:
8108*c0909341SAndroid Build Coastguard Worker    call .h
; WARP_V takes six stack slot numbers (16 bytes each, at rsp+gprsize):
; three slots with the low halves and three with the high halves of the
; three older interleaved row pairs. The newest pair is built from m5 and m0.
;   - Eight 8-byte filters a-h are fetched at index my>>10, with my stepped
;     by gamma between fetches; afterwards my is advanced by the delta value
;     that .main pre-adjusted by -7*gamma.
;   - Coefficients are interleaved with the zero register m6 so that each one
;     sits in the high byte of a word (coefficient << 8) before pmaddwd.
;   - m1 accumulates the sums for filters a-d, m2 those for filters e-h.
;   - The slots are rotated while they are read: the first slot of each group
;     receives the second, the second receives the third, and the third
;     receives the newest pair.
;   - m6 is used as scratch in the second half and zeroed again; m5 = m0 at
;     the end. m3, m4 and tmp are clobbered.
8109*c0909341SAndroid Build Coastguard Worker%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
8110*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [myq+gammaq]
8111*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
8112*c0909341SAndroid Build Coastguard Worker    movq                 m4, [filterq+myq*8]  ; a
8113*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmpq+gammaq]
8114*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8115*c0909341SAndroid Build Coastguard Worker    movq                 m2, [filterq+tmpq*8] ; b
8116*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [myq+gammaq]
8117*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
8118*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+myq*8]  ; c
8119*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmpq+gammaq]
8120*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8121*c0909341SAndroid Build Coastguard Worker    movq                 m1, [filterq+tmpq*8] ; d
8122*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [myq+gammaq]
8123*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10 ; index of filter e, loaded further down
8124*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m2
8125*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1
8126*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m3
8127*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m3
8128*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m6, m2           ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
8129*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, [rsp+gprsize+16*%1]
8130*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m6, m2           ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
8131*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+16*%2]
8132*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m2
8133*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*%1], m2 ; rotate: first low slot takes the second
8134*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
8135*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m6, m4           ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
8136*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+16*%3]
8137*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m2
8138*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*%2], m2 ; rotate: second low slot takes the third
8139*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
8140*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5, m0           ; 67
8141*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m6, m4           ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
8142*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3
8143*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*%3], m3 ; rotate: third low slot takes the newest pair
8144*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
8145*c0909341SAndroid Build Coastguard Worker    movq                 m4, [filterq+myq*8]  ; e
8146*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmpq+gammaq]
8147*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8148*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmpq*8] ; f
8149*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [myq+gammaq]
8150*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
8151*c0909341SAndroid Build Coastguard Worker    movq                 m2, [filterq+myq*8]  ; g
8152*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8153*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmpq+deltaq]    ; my += delta
8154*c0909341SAndroid Build Coastguard Worker%else
8155*c0909341SAndroid Build Coastguard Worker    mov                 myd, [esp+gprsize+4*61]
8156*c0909341SAndroid Build Coastguard Worker    add                 myd, tmpd
8157*c0909341SAndroid Build Coastguard Worker%endif
8158*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8159*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m3
8160*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmpq*8] ; h
8161*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
8162*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m4, m2
8163*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m2
8164*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6, m3           ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
8165*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [rsp+gprsize+16*%4]
8166*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m3               ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
8167*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+16*%5]
8168*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m3
8169*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*%4], m3
8170*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3 ; m6 is busy as scratch, so m3 supplies the zero bytes
8171*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
8172*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4               ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
8173*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+16*%6]
8174*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m6
8175*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*%5], m6
8176*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0 ; high half of the newest pair
8177*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6 ; restore m6 = 0
8178*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
8179*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m6, m4           ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
8180*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
8181*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*%6], m5
8182*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0 ; newest row becomes the previous row for the next call
8183*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
8184*c0909341SAndroid Build Coastguard Worker%endmacro
8185*c0909341SAndroid Build Coastguard Worker    WARP_V                1,  2,  3,  4,  5,  6
8186*c0909341SAndroid Build Coastguard Worker    ret
; .main3: same as .main2, but the vertical pass works on stack slots 7-12,
; which .main filled with the row pairs 12, 34, 56 (low halves in 7-9, high
; halves in 10-12). Returns the eight 32-bit sums of the row in m1 and m2.
8187*c0909341SAndroid Build Coastguard Worker.main3:
8188*c0909341SAndroid Build Coastguard Worker    call .h
8189*c0909341SAndroid Build Coastguard Worker    WARP_V                7,  8,  9, 10, 11, 12
8190*c0909341SAndroid Build Coastguard Worker    ret
8191*c0909341SAndroid Build Coastguard WorkerALIGN function_align
8192*c0909341SAndroid Build Coastguard Worker.h:
8193*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [mxq+alphaq]
8194*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
8195*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+mxq*8]
8196*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m6, m3
8197*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq-6]
8198*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3               ; 0
8199*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmpq+alphaq]
8200*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8201*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmpq*8]
8202*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6, m3
8203*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq-4]
8204*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3               ; 1
8205*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [mxq+alphaq]
8206*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
8207*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+mxq*8]
8208*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m2               ; 0 1
8209*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6, m3
8210*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq-2]
8211*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3               ; 2
8212*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmpq+alphaq]
8213*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8214*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmpq*8]
8215*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m6, m3
8216*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+0]
8217*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3               ; 3
8218*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [mxq+alphaq]
8219*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
8220*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+mxq*8]
8221*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m1               ; 2 3
8222*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m6, m3
8223*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+2]
8224*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3               ; 4
8225*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmpq+alphaq]
8226*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8227*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmpq*8]
8228*c0909341SAndroid Build Coastguard Worker    phaddd               m0, m2               ; 0 1 2 3
8229*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6, m3
8230*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+4]
8231*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3               ; 5
8232*c0909341SAndroid Build Coastguard Worker    lea                tmpd, [mxq+alphaq]
8233*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
8234*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+mxq*8]
8235*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m2               ; 4 5
8236*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6, m3
8237*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+6]
8238*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3               ; 6
8239*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8240*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmpq+betaq]     ; mx += beta
8241*c0909341SAndroid Build Coastguard Worker%else
8242*c0909341SAndroid Build Coastguard Worker    mov                 mxd, [esp+gprsize*2+4*60]
8243*c0909341SAndroid Build Coastguard Worker    add                 mxd, tmpd
8244*c0909341SAndroid Build Coastguard Worker%endif
8245*c0909341SAndroid Build Coastguard Worker    shr                tmpd, 10
8246*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmpq*8]
8247*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m6, m3
8248*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+8]
8249*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8250*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
8251*c0909341SAndroid Build Coastguard Worker%else
8252*c0909341SAndroid Build Coastguard Worker    add                srcq, [esp+gprsize*2+4*62]
8253*c0909341SAndroid Build Coastguard Worker%endif
8254*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m4               ; 7
8255*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m3               ; 6 7
8256*c0909341SAndroid Build Coastguard Worker    phaddd               m1, m2               ; 4 5 6 7
8257*c0909341SAndroid Build Coastguard Worker    paddd                m0, m7
8258*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
8259*c0909341SAndroid Build Coastguard Worker    psrad                m0, [rsp+gprsize*2]
8260*c0909341SAndroid Build Coastguard Worker    psrad                m1, [rsp+gprsize*2]
8261*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
8262*c0909341SAndroid Build Coastguard Worker    ret
8263*c0909341SAndroid Build Coastguard Worker
; BIDIR_FN: shared store/loop skeleton expanded inside avg_16bpc, w_avg_16bpc
; and mask_16bpc below.
;
; Expected from the expanding function:
;   wq             - address of the width-specific entry label (.w4 .. .w128)
;   hd             - remaining row count
;   dstq, strideq  - output pointer and row pitch (added to dstq as a byte offset)
;   .main          - local routine that leaves 16 bytes of output in each of
;                    m0 and m1 (32 bytes per call)
;
; .main is called once before the dispatch, so the indirect jump lands directly
; on a store label (.wN). The matching .wN_loop label sits just above it and is
; only reached by the backwards branch: it produces the next 32 bytes and
; advances dstq before falling through into the stores again.
;
; Output consumed per .main call, as implied by the stores and hd updates:
;   w4   : 4 rows of 8 bytes      w32  : 2 calls per 64-byte row
;   w8   : 2 rows of 16 bytes     w64  : 4 calls per 128-byte row
;   w16  : 1 row of 32 bytes      w128 : 8 calls per 256-byte row
;
; NOTE(review): RET (uppercase) is presumably the x86inc epilogue/return macro,
; as opposed to the raw `ret` used by the .main helpers - confirm in x86inc.asm.
; NOTE(review): mova is presumably the aligned 16-byte move, which would require
; 16-byte aligned dst rows for w >= 8 - confirm against the callers.
8264*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 0
8265*c0909341SAndroid Build Coastguard Worker    call .main
8266*c0909341SAndroid Build Coastguard Worker    jmp                  wq
8267*c0909341SAndroid Build Coastguard Worker.w4_loop:
8268*c0909341SAndroid Build Coastguard Worker    call .main
8269*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8270*c0909341SAndroid Build Coastguard Worker.w4:
    ; m0 holds rows 0 and 1 (low / high 8 bytes), m1 holds rows 2 and 3.
8271*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
8272*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
    ; Step two rows here; .w4_loop steps the other two, four rows in total.
8273*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8274*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
8275*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
8276*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
8277*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
8278*c0909341SAndroid Build Coastguard Worker.ret:
8279*c0909341SAndroid Build Coastguard Worker    RET
8280*c0909341SAndroid Build Coastguard Worker.w8_loop:
8281*c0909341SAndroid Build Coastguard Worker    call .main
8282*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8283*c0909341SAndroid Build Coastguard Worker.w8:
    ; One full 16-byte row per register.
8284*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
8285*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
8286*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
    ; NOTE(review): this width tests jne while every other width tests jg; the
    ; two agree only while h is a positive multiple of 2 - confirm with callers.
8287*c0909341SAndroid Build Coastguard Worker    jne .w8_loop
8288*c0909341SAndroid Build Coastguard Worker    RET
8289*c0909341SAndroid Build Coastguard Worker.w16_loop:
8290*c0909341SAndroid Build Coastguard Worker    call .main
8291*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8292*c0909341SAndroid Build Coastguard Worker.w16:
    ; A single .main call covers exactly one 32-byte row.
8293*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
8294*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
8295*c0909341SAndroid Build Coastguard Worker    dec                  hd
8296*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
8297*c0909341SAndroid Build Coastguard Worker    RET
8298*c0909341SAndroid Build Coastguard Worker.w32_loop:
8299*c0909341SAndroid Build Coastguard Worker    call .main
8300*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8301*c0909341SAndroid Build Coastguard Worker.w32:
    ; Two .main calls per row: bytes 0-31, then bytes 32-63.
8302*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
8303*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
8304*c0909341SAndroid Build Coastguard Worker    call .main
8305*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
8306*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
8307*c0909341SAndroid Build Coastguard Worker    dec                  hd
8308*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
8309*c0909341SAndroid Build Coastguard Worker    RET
8310*c0909341SAndroid Build Coastguard Worker.w64_loop:
8311*c0909341SAndroid Build Coastguard Worker    call .main
8312*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8313*c0909341SAndroid Build Coastguard Worker.w64:
    ; Four .main calls per 128-byte row.
8314*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
8315*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
8316*c0909341SAndroid Build Coastguard Worker    call .main
8317*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
8318*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
8319*c0909341SAndroid Build Coastguard Worker    call .main
8320*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
8321*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m1
8322*c0909341SAndroid Build Coastguard Worker    call .main
8323*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
8324*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m1
8325*c0909341SAndroid Build Coastguard Worker    dec                  hd
8326*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
8327*c0909341SAndroid Build Coastguard Worker    RET
8328*c0909341SAndroid Build Coastguard Worker.w128_loop:
8329*c0909341SAndroid Build Coastguard Worker    call .main
8330*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8331*c0909341SAndroid Build Coastguard Worker.w128:
    ; Eight .main calls per 256-byte row.
8332*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 0], m0
8333*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 1], m1
8334*c0909341SAndroid Build Coastguard Worker    call .main
8335*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 2], m0
8336*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 3], m1
8337*c0909341SAndroid Build Coastguard Worker    call .main
8338*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 4], m0
8339*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 5], m1
8340*c0909341SAndroid Build Coastguard Worker    call .main
8341*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 6], m0
8342*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 7], m1
8343*c0909341SAndroid Build Coastguard Worker    call .main
8344*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 8], m0
8345*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 9], m1
8346*c0909341SAndroid Build Coastguard Worker    call .main
8347*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*10], m0
8348*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*11], m1
8349*c0909341SAndroid Build Coastguard Worker    call .main
8350*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*12], m0
8351*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*13], m1
8352*c0909341SAndroid Build Coastguard Worker    call .main
8353*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*14], m0
8354*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*15], m1
8355*c0909341SAndroid Build Coastguard Worker    dec                  hd
8356*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
8357*c0909341SAndroid Build Coastguard Worker    RET
8358*c0909341SAndroid Build Coastguard Worker%endmacro
8359*c0909341SAndroid Build Coastguard Worker
; Choose the register behind the t0 scratch alias used by the functions below:
; register 7 on UNIX64, register 5 everywhere else.
; With register 5, t0 is the same register as the 6th named cglobal argument
; (h, or mask in the x86-32 variant of mask_16bpc). That is why those functions
; finish using t0 before they load h / mask.
; NOTE(review): presumably register 7 is used on UNIX64 because h already
; arrives in register 5 there - confirm against x86inc's register mapping.
8360*c0909341SAndroid Build Coastguard Worker%if UNIX64
8361*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
8362*c0909341SAndroid Build Coastguard Worker%else
8363*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
8364*c0909341SAndroid Build Coastguard Worker%endif
8365*c0909341SAndroid Build Coastguard Worker
; avg_16bpc(dst, stride, tmp1, tmp2, w, h, pixel_max)
; Combines two buffers of 16-bit intermediates into dst:
;   out = ((max(sat16(tmp1 + tmp2), rnd) - rnd) * mul) >> 16
; rnd and mul come from the bidir_rnd / bidir_mul tables (defined elsewhere in
; the file), indexed by pixel_max >> 11.
; cglobal "4, 7, 4" follows x86inc: 4 arguments loaded into registers, 7 GPRs
; and 4 vector registers used.
; tmp1 / tmp2 are read linearly, 32 bytes per .main call, regardless of width.
8366*c0909341SAndroid Build Coastguard Workercglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
8367*c0909341SAndroid Build Coastguard Worker%define base r6-avg_ssse3_table
8368*c0909341SAndroid Build Coastguard Worker    LEA                  r6, avg_ssse3_table
    ; Index of the lowest set bit of w (log2(w) when w is a power of two)
    ; selects the jump table entry.
8369*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
8370*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r6m ; pixel_max
    ; Table entries are signed 32-bit offsets relative to the table itself.
8371*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
    ; pixel_max >> 11: e.g. 0 for 1023, 1 for 4095.
8372*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
    ; movddup loads 8 bytes and duplicates them into both register halves.
8373*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+bidir_rnd+t0*8]
8374*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+bidir_mul+t0*8]
    ; h is loaded only after the last use of t0 (t0 may share h's register).
8375*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; Turn the table-relative offset into an absolute jump target for BIDIR_FN.
8376*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
8377*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
8378*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: produce 16 output words in m0 (first 8) and m1 (next 8).
; Reads m2 (rnd) and m3 (mul); advances tmp1q and tmp2q by 32 bytes.
8379*c0909341SAndroid Build Coastguard Worker.main:
8380*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+16*0]
    ; Signed saturating add, so the sum cannot wrap around.
8381*c0909341SAndroid Build Coastguard Worker    paddsw               m0, [tmp2q+16*0]
8382*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+16*1]
8383*c0909341SAndroid Build Coastguard Worker    paddsw               m1, [tmp2q+16*1]
8384*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
8385*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 16*2
    ; max(x, rnd) - rnd: values below rnd become 0, everything else is
    ; shifted down by rnd.
8386*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m2
8387*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m2
8388*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m2
8389*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2
    ; Keep the high 16 bits of the signed 16x16 product, i.e. (x * mul) >> 16.
8390*c0909341SAndroid Build Coastguard Worker    pmulhw               m0, m3
8391*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m3
8392*c0909341SAndroid Build Coastguard Worker    ret
8393*c0909341SAndroid Build Coastguard Worker
; w_avg_16bpc(dst, stride, tmp1, tmp2, w, h, weight, pixel_max)
; Weighted combination of two buffers of 16-bit intermediates:
;   out = clamp((tmp1*weight + tmp2*(16-weight) + bias) >> 8, 0, pixel_max)
; Both weights are multiplied by 4 when bit 11 (0x800) of pixel_max is set.
; bias is the constant built in m5 below.
; Register roles once setup is done:
;   m4 = {16-weight, weight} word pair in every dword
;   m5 = bias dwords, m6 = pixel_max words, m7 = zero
8394*c0909341SAndroid Build Coastguard Workercglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
8395*c0909341SAndroid Build Coastguard Worker%define base r6-w_avg_ssse3_table
8396*c0909341SAndroid Build Coastguard Worker    LEA                  r6, w_avg_ssse3_table
    ; Index of the lowest set bit of w selects the jump table entry.
8397*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
8398*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r6m ; weight
8399*c0909341SAndroid Build Coastguard Worker    movd                 m6, r7m ; pixel_max
8400*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pd_65538]
8401*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
    ; NOTE(review): presumably pw_256 serves as a byte-shuffle mask that
    ; replicates the low word of m6 (pixel_max) into every word - confirm
    ; against the constant's definition.
8402*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+pw_256]
    ; Last use of r6 as the table base; it is recycled as scratch right after.
8403*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; Build (weight << 16) - (weight - 16) = (weight << 16) + (16 - weight).
8404*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [t0-16]
8405*c0909341SAndroid Build Coastguard Worker    shl                 t0d, 16
8406*c0909341SAndroid Build Coastguard Worker    sub                 t0d, r6d ; 16-weight, weight
    ; Add pixel_max to every word of the pd_65538 constant (shifted further down).
8407*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
    ; r6d = unscaled weight pair, t0d = the same pair times 4.
8408*c0909341SAndroid Build Coastguard Worker    mov                 r6d, t0d
8409*c0909341SAndroid Build Coastguard Worker    shl                 t0d, 2
    ; Take the x4 weights when bit 11 of pixel_max is set (e.g. 4095).
8410*c0909341SAndroid Build Coastguard Worker    test          dword r7m, 0x800
8411*c0909341SAndroid Build Coastguard Worker    cmovnz              r6d, t0d
    ; h is loaded only after the last use of t0 (t0 may share h's register).
8412*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
8413*c0909341SAndroid Build Coastguard Worker    movd                 m4, r6d
    ; Final bias: each dword of m5 shifted left by 7.
8414*c0909341SAndroid Build Coastguard Worker    pslld                m5, 7
    ; m7 = 0, the lower clamp bound used in .main.
8415*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
    ; Broadcast the weight pair to all four dwords.
8416*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0000
8417*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
8418*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: produce 16 output words in m0 (first 8) and m1 (next 8).
; Reads m4-m7 as set up above, clobbers m2 and m3, and advances tmp1q and
; tmp2q by 32 bytes.
8419*c0909341SAndroid Build Coastguard Worker.main:
8420*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp1q+16*0]
8421*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp2q+16*0]
    ; Interleave into {tmp2, tmp1} word pairs so that one pmaddwd against
    ; {16-weight, weight} yields tmp2*(16-weight) + tmp1*weight per dword.
8422*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m2
8423*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
8424*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp1q+16*1]
8425*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp2q+16*1]
8426*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
8427*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 16*2
8428*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m4
8429*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4
8430*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
8431*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
8432*c0909341SAndroid Build Coastguard Worker    psrad                m3, 8
8433*c0909341SAndroid Build Coastguard Worker    psrad                m0, 8
    ; Pack low and high halves back into 8 words in their original order.
8434*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3
    ; Same sequence for the second group of 8 values.
8435*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m1, m2
8436*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2
8437*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m4
8438*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4
8439*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
8440*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
8441*c0909341SAndroid Build Coastguard Worker    psrad                m3, 8
8442*c0909341SAndroid Build Coastguard Worker    psrad                m1, 8
8443*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
    ; Clamp to [0, pixel_max].
8444*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m6
8445*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m6
8446*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m7
8447*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m7
8448*c0909341SAndroid Build Coastguard Worker    ret
8449*c0909341SAndroid Build Coastguard Worker
; mask_16bpc(dst, stride, tmp1, tmp2, w, h, mask, pixel_max)
; Blends two buffers of 16-bit intermediates with a per-pixel byte mask m:
;   v   = sat16((tmp1*m + tmp2*(64-m)) >> 5)
;   out = ((max(v, rnd) - rnd) * mul) >> 16
; rnd and mul come from bidir_rnd / bidir_mul, indexed by pixel_max >> 11.
;
; x86-64 uses 9 vector registers and keeps h in a register. The other variant
; declares only 8 vector registers, so m8 becomes a memory operand. It also
; names mask as the 6th register argument, so hd is redefined to operate
; directly on h's argument slot (dword r5m).
8450*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8451*c0909341SAndroid Build Coastguard Workercglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
8452*c0909341SAndroid Build Coastguard Worker%else
8453*c0909341SAndroid Build Coastguard Workercglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
8454*c0909341SAndroid Build Coastguard Worker%define hd dword r5m
    ; `base` is defined a few lines further down; %define expands at the point
    ; of use, so referencing it here is fine.
8455*c0909341SAndroid Build Coastguard Worker%define m8 [base+pw_64]
8456*c0909341SAndroid Build Coastguard Worker%endif
8457*c0909341SAndroid Build Coastguard Worker%define base r6-mask_ssse3_table
8458*c0909341SAndroid Build Coastguard Worker    LEA                  r6, mask_ssse3_table
    ; Index of the lowest set bit of w selects the jump table entry.
8459*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
8460*c0909341SAndroid Build Coastguard Worker    mov                 t0d, r7m ; pixel_max
    ; pixel_max >> 11: e.g. 0 for 1023, 1 for 4095.
8461*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 11
8462*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
    ; m6 = rnd, m7 = mul, each broadcast to both register halves.
8463*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+bidir_rnd+t0*8]
8464*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+bidir_mul+t0*8]
8465*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8466*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_64]
8467*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
8468*c0909341SAndroid Build Coastguard Worker%endif
    ; Absolute jump target for BIDIR_FN.
8469*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; mask is loaded after the last use of t0 (t0 may share mask's register
    ; in the non-x86-64 variant).
8470*c0909341SAndroid Build Coastguard Worker    mov               maskq, r6mp
8471*c0909341SAndroid Build Coastguard Worker    BIDIR_FN
8472*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: produce 16 output words in m0 (first 8) and m1 (next 8).
; Reads m6 (rnd), m7 (mul) and m8 (pw_64), clobbers m2-m5, and advances maskq
; by 16 bytes and tmp1q / tmp2q by 32 bytes.
8473*c0909341SAndroid Build Coastguard Worker.main:
8474*c0909341SAndroid Build Coastguard Worker    movq                 m3, [maskq+8*0]
8475*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+16*0]
8476*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp2q+16*0]
    ; m5 is zeroed on every call because it is reused as a temporary below.
8477*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
    ; Zero-extend 8 mask bytes to words.
8478*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m5
    ; {tmp1, tmp2} word pairs, high and low halves.
8479*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m4
8480*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
    ; m1 = 64 - m
8481*c0909341SAndroid Build Coastguard Worker    psubw                m1, m8, m3
8482*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m1 ; m, 64-m
8483*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1
8484*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
8485*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
    ; Start loading the second group of 8 values while the first finishes.
8486*c0909341SAndroid Build Coastguard Worker    movq                 m3, [maskq+8*1]
8487*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+16*1]
8488*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp2q+16*1]
8489*c0909341SAndroid Build Coastguard Worker    add               maskq, 8*2
8490*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
8491*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 16*2
8492*c0909341SAndroid Build Coastguard Worker    psrad                m2, 5
8493*c0909341SAndroid Build Coastguard Worker    psrad                m0, 5
8494*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
    ; m5 is still zero here; it is overwritten with 64-m two lines below.
8495*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m5
8496*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m4
8497*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4
8498*c0909341SAndroid Build Coastguard Worker    psubw                m5, m8, m3
8499*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m5 ; m, 64-m
8500*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5
8501*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
8502*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3
8503*c0909341SAndroid Build Coastguard Worker    psrad                m2, 5
8504*c0909341SAndroid Build Coastguard Worker    psrad                m1, 5
8505*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
    ; Same finishing steps as avg_16bpc: max(x, rnd) - rnd, then (x * mul) >> 16.
8506*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m6
8507*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m6
8508*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m6
8509*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m6
8510*c0909341SAndroid Build Coastguard Worker    pmulhw               m0, m7
8511*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m7
8512*c0909341SAndroid Build Coastguard Worker    ret
8513*c0909341SAndroid Build Coastguard Worker
8514*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
8515*c0909341SAndroid Build Coastguard Worker%define base t0-w_mask_420_ssse3_table
8516*c0909341SAndroid Build Coastguard Worker    LEA                  t0, w_mask_420_ssse3_table
8517*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
8518*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
8519*c0909341SAndroid Build Coastguard Worker    movd                 m0, r7m ; sign
8520*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
8521*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
8522*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8523*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
8524*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_64]
8525*c0909341SAndroid Build Coastguard Worker    movddup             m10, [base+bidir_rnd+r6*8]
8526*c0909341SAndroid Build Coastguard Worker    movddup             m11, [base+bidir_mul+r6*8]
8527*c0909341SAndroid Build Coastguard Worker%else
8528*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
8529*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+pw_64]
8530*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+bidir_rnd+r6*8]
8531*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+bidir_mul+r6*8]
8532*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*4
8533*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
8534*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
8535*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m3
8536*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m4
8537*c0909341SAndroid Build Coastguard Worker    %define              m8  [rsp+gprsize+16*0]
8538*c0909341SAndroid Build Coastguard Worker    %define              m9  [rsp+gprsize+16*1]
8539*c0909341SAndroid Build Coastguard Worker    %define             m10  [rsp+gprsize+16*2]
8540*c0909341SAndroid Build Coastguard Worker    %define             m11  [rsp+gprsize+16*3]
8541*c0909341SAndroid Build Coastguard Worker%endif
8542*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+pw_2]
8543*c0909341SAndroid Build Coastguard Worker    psubw                m7, m0
8544*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [base+pw_256]
8545*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
8546*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, r5m
8547*c0909341SAndroid Build Coastguard Worker    mov               maskq, r6mp
8548*c0909341SAndroid Build Coastguard Worker    call .main
8549*c0909341SAndroid Build Coastguard Worker    jmp                  wq
8550*c0909341SAndroid Build Coastguard Worker.w4_loop:
8551*c0909341SAndroid Build Coastguard Worker    call .main
8552*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8553*c0909341SAndroid Build Coastguard Worker    add               maskq, 4
8554*c0909341SAndroid Build Coastguard Worker.w4:
8555*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
8556*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8557*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
8558*c0909341SAndroid Build Coastguard Worker    phaddd               m2, m2
8559*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8560*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8561*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
8562*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8563*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
8564*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m2
8565*c0909341SAndroid Build Coastguard Worker    movd            [maskq], m2
8566*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
8567*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
8568*c0909341SAndroid Build Coastguard Worker    RET
8569*c0909341SAndroid Build Coastguard Worker.w8_loop:
8570*c0909341SAndroid Build Coastguard Worker    call .main
8571*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8572*c0909341SAndroid Build Coastguard Worker    add               maskq, 4
8573*c0909341SAndroid Build Coastguard Worker.w8:
8574*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
8575*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
8576*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m2
8577*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
8578*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8579*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8580*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m2
8581*c0909341SAndroid Build Coastguard Worker    movd            [maskq], m2
8582*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8583*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
8584*c0909341SAndroid Build Coastguard Worker    RET
8585*c0909341SAndroid Build Coastguard Worker.w16_loop:
8586*c0909341SAndroid Build Coastguard Worker    call .main
8587*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8588*c0909341SAndroid Build Coastguard Worker    add               maskq, 8
8589*c0909341SAndroid Build Coastguard Worker.w16:
8590*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m2
8591*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8592*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m3
8593*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
8594*c0909341SAndroid Build Coastguard Worker    call .main
8595*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*0]
8596*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*1]
8597*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
8598*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8599*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
8600*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8601*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8602*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m2
8603*c0909341SAndroid Build Coastguard Worker    movq            [maskq], m2
8604*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8605*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
8606*c0909341SAndroid Build Coastguard Worker    RET
8607*c0909341SAndroid Build Coastguard Worker.w32_loop:
8608*c0909341SAndroid Build Coastguard Worker    call .main
8609*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8610*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
8611*c0909341SAndroid Build Coastguard Worker.w32:
8612*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m2
8613*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8614*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m3
8615*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
8616*c0909341SAndroid Build Coastguard Worker    call .main
8617*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*2], m0
8618*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8619*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m2
8620*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*3], m1
8621*c0909341SAndroid Build Coastguard Worker    call .main
8622*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*0]
8623*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*1]
8624*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
8625*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8626*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m2
8627*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
8628*c0909341SAndroid Build Coastguard Worker    call .main
8629*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8630*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16*2]
8631*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*3]
8632*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m0
8633*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8634*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8635*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8636*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m1
8637*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8638*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m3
8639*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8640*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
8641*c0909341SAndroid Build Coastguard Worker    RET
8642*c0909341SAndroid Build Coastguard Worker.w64_loop:
8643*c0909341SAndroid Build Coastguard Worker    call .main
8644*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8645*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
8646*c0909341SAndroid Build Coastguard Worker.w64:
8647*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m2
8648*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8649*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m3
8650*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
8651*c0909341SAndroid Build Coastguard Worker    call .main
8652*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m2
8653*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*2], m0
8654*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*4], m3
8655*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*3], m1
8656*c0909341SAndroid Build Coastguard Worker    call .main
8657*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*5], m2
8658*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*4], m0
8659*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*6], m3
8660*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*5], m1
8661*c0909341SAndroid Build Coastguard Worker    call .main
8662*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*6], m0
8663*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8664*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*7], m2
8665*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*7], m1
8666*c0909341SAndroid Build Coastguard Worker    call .main
8667*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*1]
8668*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*2]
8669*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
8670*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8671*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m2
8672*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
8673*c0909341SAndroid Build Coastguard Worker    call .main
8674*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*3]
8675*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*4]
8676*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8677*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16*2]
8678*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m0
8679*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8680*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8681*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8682*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m1
8683*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8684*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*0], m3
8685*c0909341SAndroid Build Coastguard Worker    call .main
8686*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*5]
8687*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*6]
8688*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*4], m0
8689*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8690*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*6], m2
8691*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*5], m1
8692*c0909341SAndroid Build Coastguard Worker    call .main
8693*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8694*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16*6]
8695*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*7]
8696*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*6], m0
8697*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8698*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8699*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8700*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*7], m1
8701*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8702*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*1], m3
8703*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8704*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
8705*c0909341SAndroid Build Coastguard Worker    RET
8706*c0909341SAndroid Build Coastguard Worker.w128_loop:
8707*c0909341SAndroid Build Coastguard Worker    call .main
8708*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8709*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*4
8710*c0909341SAndroid Build Coastguard Worker.w128:
8711*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 1], m2
8712*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 0], m0
8713*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 2], m3
8714*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 1], m1
8715*c0909341SAndroid Build Coastguard Worker    call .main
8716*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 3], m2
8717*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 2], m0
8718*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 4], m3
8719*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 3], m1
8720*c0909341SAndroid Build Coastguard Worker    call .main
8721*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 5], m2
8722*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 4], m0
8723*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 6], m3
8724*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 5], m1
8725*c0909341SAndroid Build Coastguard Worker    call .main
8726*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 7], m2
8727*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 6], m0
8728*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 8], m3
8729*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 7], m1
8730*c0909341SAndroid Build Coastguard Worker    call .main
8731*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 9], m2
8732*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 8], m0
8733*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*10], m3
8734*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16* 9], m1
8735*c0909341SAndroid Build Coastguard Worker    call .main
8736*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*11], m2
8737*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*10], m0
8738*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*12], m3
8739*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*11], m1
8740*c0909341SAndroid Build Coastguard Worker    call .main
8741*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*13], m2
8742*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*12], m0
8743*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*14], m3
8744*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*13], m1
8745*c0909341SAndroid Build Coastguard Worker    call .main
8746*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*14], m0
8747*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8748*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*15], m2
8749*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*15], m1
8750*c0909341SAndroid Build Coastguard Worker    call .main
8751*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16* 1]
8752*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16* 2]
8753*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 0], m0
8754*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8755*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 2], m2
8756*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 1], m1
8757*c0909341SAndroid Build Coastguard Worker    call .main
8758*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16* 3]
8759*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16* 4]
8760*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8761*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16* 2]
8762*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 2], m0
8763*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8764*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8765*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8766*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 3], m1
8767*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8768*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*0], m3
8769*c0909341SAndroid Build Coastguard Worker    call .main
8770*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16* 5]
8771*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16* 6]
8772*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 4], m0
8773*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8774*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 6], m2
8775*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 5], m1
8776*c0909341SAndroid Build Coastguard Worker    call .main
8777*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16* 7]
8778*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16* 8]
8779*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8780*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16* 6]
8781*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 6], m0
8782*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8783*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8784*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8785*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 7], m1
8786*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8787*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*1], m3
8788*c0909341SAndroid Build Coastguard Worker    call .main
8789*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16* 9]
8790*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*10]
8791*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 8], m0
8792*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8793*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*10], m2
8794*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16* 9], m1
8795*c0909341SAndroid Build Coastguard Worker    call .main
8796*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*11]
8797*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*12]
8798*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8799*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16*10]
8800*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*10], m0
8801*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8802*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8803*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8804*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*11], m1
8805*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8806*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*2], m3
8807*c0909341SAndroid Build Coastguard Worker    call .main
8808*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*13]
8809*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+strideq*1+16*14]
8810*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*12], m0
8811*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8812*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*14], m2
8813*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*13], m1
8814*c0909341SAndroid Build Coastguard Worker    call .main
8815*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8816*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7, [dstq+strideq*1+16*14]
8817*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*1+16*15]
8818*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*14], m0
8819*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
8820*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8821*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
8822*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*15], m1
8823*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8824*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*3], m3
8825*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8826*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
8827*c0909341SAndroid Build Coastguard Worker    RET
; -----------------------------------------------------------------------------
; .main: shared inner step of the function above.
; Expands W_MASK for the two 16-byte vectors at offsets 0 and 16 of
; tmp1q/tmp2q, then advances both source pointers by 32 bytes.
;   out:      m0, m1 = blended words, m2, m3 = mask weights (one per word)
;   clobbers: m4, m5, m6
; -----------------------------------------------------------------------------
8828*c0909341SAndroid Build Coastguard WorkerALIGN function_align
8829*c0909341SAndroid Build Coastguard Worker.main:
; W_MASK %1, %2
;   %1: index of the destination register and of the 16-byte slot read from
;       tmp1q/tmp2q; %2: index of the register that receives the mask.
;   Per 16-bit lane:
;     64-m = (m8 - |tmp1 - tmp2|, unsigned-saturating) >> 10
;     m    = m9 - (64-m)
;     out  = ((64-m)*tmp2 + m*tmp1) >> 5, packed back to words with signed
;            saturation, then max(out, m10) - m10, then pmulhw by m11.
;   m8..m11 are not written here; the caller's setup code must have loaded
;   them before the first expansion runs.
;   Scratch: m4, m5 (interleaved sources), m6 (64-m).
8830*c0909341SAndroid Build Coastguard Worker%macro W_MASK 2 ; dst/tmp_offset, mask
8831*c0909341SAndroid Build Coastguard Worker    mova                m%1, [tmp1q+16*%1]
8832*c0909341SAndroid Build Coastguard Worker    mova                m%2, [tmp2q+16*%1]
    ; Interleave the sources as (tmp2, tmp1) word pairs so that a single
    ; pmaddwd can apply the (64-m, m) weight pair to both of them.
8833*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m%2, m%1
8834*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m%2, m%1
    ; |tmp1 - tmp2| with signed saturation on the subtraction.
8835*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m%2
8836*c0909341SAndroid Build Coastguard Worker    pabsw               m%1, m%1
    ; Turn the absolute difference into the two blend weights.
8837*c0909341SAndroid Build Coastguard Worker    psubusw              m6, m8, m%1
8838*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 10      ; 64-m
8839*c0909341SAndroid Build Coastguard Worker    psubw               m%2, m9, m6  ; m
    ; Build (64-m, m) word pairs matching the (tmp2, tmp1) layout above.
8840*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m6, m%2
8841*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m%2
    ; Each dword lane becomes (64-m)*tmp2 + m*tmp1.
8842*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m4
8843*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m5
8844*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 5
8845*c0909341SAndroid Build Coastguard Worker    psrad                m6, 5
8846*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m6
    ; Subtract m10 without letting the result drop below zero, then scale
    ; by m11 keeping the high 16 bits of each product.
8847*c0909341SAndroid Build Coastguard Worker    pmaxsw              m%1, m10
8848*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m10
8849*c0909341SAndroid Build Coastguard Worker    pmulhw              m%1, m11
8850*c0909341SAndroid Build Coastguard Worker%endmacro
    ; First vector -> m0 (mask in m2), second vector -> m1 (mask in m3).
8851*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 2
8852*c0909341SAndroid Build Coastguard Worker    W_MASK                1, 3
    ; Step past the 2*16 bytes just consumed from each intermediate buffer.
8853*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
8854*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 16*2
8855*c0909341SAndroid Build Coastguard Worker    ret
8856*c0909341SAndroid Build Coastguard Worker
; -----------------------------------------------------------------------------
; w_mask_422_16bpc
;
; Blends the intermediate buffers tmp1 and tmp2 into dst using a per-pixel
; weight derived from |tmp1 - tmp2| (see W_MASK), and writes one mask byte for
; every two horizontally adjacent pixels to the mask buffer.
;
; Named arguments: dst, stride, tmp1, tmp2, w, h, mask. Two further arguments
; are read straight from their rNm forms: r7m (sign) and r8m (pixel_max).
; cglobal parameters 4, 7, 12 follow the x86inc convention: 4 arguments
; preloaded into registers, 7 general-purpose registers, 12 XMM registers.
;
; Register roles after setup:
;   t0      jump-table address, also the anchor for `base`-relative constants
;   wq      tzcnt(w) -> 32-bit table entry -> absolute branch target
;   r6      scratch for pixel_max >> 11, reloaded with the mask pointer later
;   m7      low byte of `sign` broadcast to all 16 byte lanes
;   m8-m11  W_MASK constants: pw_27615, pw_64, bidir_rnd[i], bidir_mul[i]
;   m0/m1   blended pixels returned by each .main call
;
; Control flow: .main is called once before the indirect jump, so every .wN
; entry label starts by storing data that is already in m0/m1; the matching
; .wN_loop label produces the next batch and advances dst.
; -----------------------------------------------------------------------------
8857*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
8858*c0909341SAndroid Build Coastguard Worker%define base t0-w_mask_422_ssse3_table
8859*c0909341SAndroid Build Coastguard Worker    LEA                  t0, w_mask_422_ssse3_table
    ; Table index is log2 of the width (tzcnt of w).
8860*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
8861*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
8862*c0909341SAndroid Build Coastguard Worker    movd                 m7, r7m ; sign
    ; r6 = pixel_max >> 11 selects the 8-byte bidir_rnd/bidir_mul entry
    ; (0 or 1, assuming pixel_max is 1023 or 4095 -- TODO confirm).
8863*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
8864*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
8865*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: the four W_MASK constants live in m8-m11.
8866*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_27615]
8867*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_64]
8868*c0909341SAndroid Build Coastguard Worker    movddup             m10, [base+bidir_rnd+r6*8]
8869*c0909341SAndroid Build Coastguard Worker    movddup             m11, [base+bidir_mul+r6*8]
8870*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: the same four constants are spilled to 4*16 bytes of stack.
    ; NOTE(review): m8-m11 presumably resolve to these slots through %defines
    ; made earlier in the file -- confirm against the preceding function.
8871*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+pw_27615]
8872*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+pw_64]
8873*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+bidir_rnd+r6*8]
8874*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+bidir_mul+r6*8]
8875*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*4
8876*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
8877*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
8878*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m3
8879*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m4
8880*c0909341SAndroid Build Coastguard Worker%endif
    ; pshufb with an all-zero control vector copies byte 0 of m7 (sign) into
    ; every byte lane.
8881*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
    ; Table entries are offsets relative to the table; make wq absolute.
8882*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
8883*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m0
8884*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, r5m
    ; r6 is free again now that the constants are loaded: fetch the mask pointer.
8885*c0909341SAndroid Build Coastguard Worker    mov               maskq, r6mp
    ; Prime m0/m1 with the first 16 pixels, then dispatch on width.
8886*c0909341SAndroid Build Coastguard Worker    call .main
8887*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 4: one .main call covers four rows of 8 bytes each (two rows per register).
8888*c0909341SAndroid Build Coastguard Worker.w4_loop:
8889*c0909341SAndroid Build Coastguard Worker    call .main
8890*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8891*c0909341SAndroid Build Coastguard Worker.w4:
8892*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
8893*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
8894*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8895*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
8896*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
8897*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
8898*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
8899*c0909341SAndroid Build Coastguard Worker.end:
8900*c0909341SAndroid Build Coastguard Worker    RET
; w == 8: one .main call covers two rows of 16 bytes each.
8901*c0909341SAndroid Build Coastguard Worker.w8_loop:
8902*c0909341SAndroid Build Coastguard Worker    call .main
8903*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8904*c0909341SAndroid Build Coastguard Worker.w8:
8905*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
8906*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
8907*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8908*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
8909*c0909341SAndroid Build Coastguard Worker.w8_end:
8910*c0909341SAndroid Build Coastguard Worker    RET
; w == 16: one .main call per 32-byte row, two rows per iteration.
8911*c0909341SAndroid Build Coastguard Worker.w16_loop:
8912*c0909341SAndroid Build Coastguard Worker    call .main
8913*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8914*c0909341SAndroid Build Coastguard Worker.w16:
8915*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8916*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
8917*c0909341SAndroid Build Coastguard Worker    call .main
8918*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
8919*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
8920*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8921*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
8922*c0909341SAndroid Build Coastguard Worker    RET
; w == 32: two .main calls fill one 64-byte row; one row per iteration.
8923*c0909341SAndroid Build Coastguard Worker.w32_loop:
8924*c0909341SAndroid Build Coastguard Worker    call .main
8925*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8926*c0909341SAndroid Build Coastguard Worker.w32:
8927*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
8928*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
8929*c0909341SAndroid Build Coastguard Worker    call .main
8930*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
8931*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
8932*c0909341SAndroid Build Coastguard Worker    dec                  hd
8933*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
8934*c0909341SAndroid Build Coastguard Worker    RET
; w == 64: four .main calls fill one 128-byte row; one row per iteration.
8935*c0909341SAndroid Build Coastguard Worker.w64_loop:
8936*c0909341SAndroid Build Coastguard Worker    call .main
8937*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8938*c0909341SAndroid Build Coastguard Worker.w64:
8939*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
8940*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
8941*c0909341SAndroid Build Coastguard Worker    call .main
8942*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
8943*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
8944*c0909341SAndroid Build Coastguard Worker    call .main
8945*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
8946*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m1
8947*c0909341SAndroid Build Coastguard Worker    call .main
8948*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
8949*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m1
8950*c0909341SAndroid Build Coastguard Worker    dec                  hd
8951*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
8952*c0909341SAndroid Build Coastguard Worker    RET
; w == 128: eight .main calls fill one 256-byte row; one row per iteration.
8953*c0909341SAndroid Build Coastguard Worker.w128_loop:
8954*c0909341SAndroid Build Coastguard Worker    call .main
8955*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8956*c0909341SAndroid Build Coastguard Worker.w128:
8957*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 0], m0
8958*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 1], m1
8959*c0909341SAndroid Build Coastguard Worker    call .main
8960*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 2], m0
8961*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 3], m1
8962*c0909341SAndroid Build Coastguard Worker    call .main
8963*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 4], m0
8964*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 5], m1
8965*c0909341SAndroid Build Coastguard Worker    call .main
8966*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 6], m0
8967*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 7], m1
8968*c0909341SAndroid Build Coastguard Worker    call .main
8969*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 8], m0
8970*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 9], m1
8971*c0909341SAndroid Build Coastguard Worker    call .main
8972*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*10], m0
8973*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*11], m1
8974*c0909341SAndroid Build Coastguard Worker    call .main
8975*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*12], m0
8976*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*13], m1
8977*c0909341SAndroid Build Coastguard Worker    call .main
8978*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*14], m0
8979*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*15], m1
8980*c0909341SAndroid Build Coastguard Worker    dec                  hd
8981*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
8982*c0909341SAndroid Build Coastguard Worker    RET
; -----------------------------------------------------------------------------
; .main: blend the next 16 pixels and emit the 8 mask bytes that cover them.
;   in:       tmp1q, tmp2q, maskq, m7 (broadcast sign), m8-m11 (constants)
;   out:      m0, m1 = blended pixels; tmp1q/tmp2q += 32; maskq += 8
;   clobbers: m2-m6
;   Each mask byte is (m_a + m_b - sign + 1) >> 1, where m_a and m_b are the
;   W_MASK weights of two horizontally adjacent pixels.
; -----------------------------------------------------------------------------
8983*c0909341SAndroid Build Coastguard WorkerALIGN function_align
8984*c0909341SAndroid Build Coastguard Worker.main:
8985*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 2
8986*c0909341SAndroid Build Coastguard Worker    W_MASK                1, 3
    ; Sum each horizontal pair of word-sized weights: 16 weights -> 8 sums.
8987*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
8988*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
8989*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 16*2
    ; Narrow the sums to bytes; only the low 8 bytes are stored below.
8990*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m2
8991*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
    ; Subtract sign, then average with zero: pavgb computes (x + 0 + 1) >> 1.
8992*c0909341SAndroid Build Coastguard Worker    psubb                m2, m7
8993*c0909341SAndroid Build Coastguard Worker    pavgb                m2, m3
8994*c0909341SAndroid Build Coastguard Worker    movq            [maskq], m2
8995*c0909341SAndroid Build Coastguard Worker    add               maskq, 8
8996*c0909341SAndroid Build Coastguard Worker    ret
8997*c0909341SAndroid Build Coastguard Worker
8998*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
8999*c0909341SAndroid Build Coastguard Worker%define base t0-w_mask_444_ssse3_table
9000*c0909341SAndroid Build Coastguard Worker    LEA                  t0, w_mask_444_ssse3_table
9001*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
9002*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m ; pixel_max
9003*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
9004*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
9005*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9006*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_27615]
9007*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_64]
9008*c0909341SAndroid Build Coastguard Worker    movddup             m10, [base+bidir_rnd+r6*8]
9009*c0909341SAndroid Build Coastguard Worker    movddup             m11, [base+bidir_mul+r6*8]
9010*c0909341SAndroid Build Coastguard Worker%else
9011*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+pw_27615]
9012*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+pw_64]
9013*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+bidir_rnd+r6*8]
9014*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+bidir_mul+r6*8]
9015*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*3
9016*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
9017*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
9018*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m3
9019*c0909341SAndroid Build Coastguard Worker    %define             m11  m7
9020*c0909341SAndroid Build Coastguard Worker%endif
9021*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
9022*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, r5m
9023*c0909341SAndroid Build Coastguard Worker    mov               maskq, r6mp
9024*c0909341SAndroid Build Coastguard Worker    call .main
9025*c0909341SAndroid Build Coastguard Worker    jmp                  wq
9026*c0909341SAndroid Build Coastguard Worker.w4_loop:
9027*c0909341SAndroid Build Coastguard Worker    call .main
9028*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9029*c0909341SAndroid Build Coastguard Worker.w4:
9030*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
9031*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
9032*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9033*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
9034*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
9035*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
9036*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
9037*c0909341SAndroid Build Coastguard Worker.end:
9038*c0909341SAndroid Build Coastguard Worker    RET
9039*c0909341SAndroid Build Coastguard Worker.w8_loop:
9040*c0909341SAndroid Build Coastguard Worker    call .main
9041*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9042*c0909341SAndroid Build Coastguard Worker.w8:
9043*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
9044*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
9045*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9046*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
9047*c0909341SAndroid Build Coastguard Worker.w8_end:
9048*c0909341SAndroid Build Coastguard Worker    RET
9049*c0909341SAndroid Build Coastguard Worker.w16_loop:
9050*c0909341SAndroid Build Coastguard Worker    call .main
9051*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9052*c0909341SAndroid Build Coastguard Worker.w16:
9053*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
9054*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
9055*c0909341SAndroid Build Coastguard Worker    call .main
9056*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
9057*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
9058*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9059*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
9060*c0909341SAndroid Build Coastguard Worker    RET
9061*c0909341SAndroid Build Coastguard Worker.w32_loop:
9062*c0909341SAndroid Build Coastguard Worker    call .main
9063*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9064*c0909341SAndroid Build Coastguard Worker.w32:
9065*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9066*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
9067*c0909341SAndroid Build Coastguard Worker    call .main
9068*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9069*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
9070*c0909341SAndroid Build Coastguard Worker    dec                  hd
9071*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
9072*c0909341SAndroid Build Coastguard Worker    RET
9073*c0909341SAndroid Build Coastguard Worker.w64_loop:
9074*c0909341SAndroid Build Coastguard Worker    call .main
9075*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9076*c0909341SAndroid Build Coastguard Worker.w64:
9077*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9078*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
9079*c0909341SAndroid Build Coastguard Worker    call .main
9080*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9081*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
9082*c0909341SAndroid Build Coastguard Worker    call .main
9083*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
9084*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m1
9085*c0909341SAndroid Build Coastguard Worker    call .main
9086*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
9087*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m1
9088*c0909341SAndroid Build Coastguard Worker    dec                  hd
9089*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
9090*c0909341SAndroid Build Coastguard Worker    RET
9091*c0909341SAndroid Build Coastguard Worker.w128_loop:
9092*c0909341SAndroid Build Coastguard Worker    call .main
9093*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9094*c0909341SAndroid Build Coastguard Worker.w128:
9095*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 0], m0
9096*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 1], m1
9097*c0909341SAndroid Build Coastguard Worker    call .main
9098*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 2], m0
9099*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 3], m1
9100*c0909341SAndroid Build Coastguard Worker    call .main
9101*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 4], m0
9102*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 5], m1
9103*c0909341SAndroid Build Coastguard Worker    call .main
9104*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 6], m0
9105*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 7], m1
9106*c0909341SAndroid Build Coastguard Worker    call .main
9107*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 8], m0
9108*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16* 9], m1
9109*c0909341SAndroid Build Coastguard Worker    call .main
9110*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*10], m0
9111*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*11], m1
9112*c0909341SAndroid Build Coastguard Worker    call .main
9113*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*12], m0
9114*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*13], m1
9115*c0909341SAndroid Build Coastguard Worker    call .main
9116*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*14], m0
9117*c0909341SAndroid Build Coastguard Worker    mova       [dstq+16*15], m1
9118*c0909341SAndroid Build Coastguard Worker    dec                  hd
9119*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
9120*c0909341SAndroid Build Coastguard Worker    RET
9121*c0909341SAndroid Build Coastguard WorkerALIGN function_align
9122*c0909341SAndroid Build Coastguard Worker.main:
9123*c0909341SAndroid Build Coastguard Worker    W_MASK                0, 2
9124*c0909341SAndroid Build Coastguard Worker    W_MASK                1, 3
9125*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
9126*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
9127*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 16*2
9128*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m2
9129*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9130*c0909341SAndroid Build Coastguard Worker    ret
9131*c0909341SAndroid Build Coastguard Worker
9132*c0909341SAndroid Build Coastguard Worker; (a * (64 - m) + b * m + 32) >> 6
9133*c0909341SAndroid Build Coastguard Worker; = (((b - a) * m + 32) >> 6) + a
9134*c0909341SAndroid Build Coastguard Worker; = (((b - a) * (m << 9) + 16384) >> 15) + a
9135*c0909341SAndroid Build Coastguard Worker;   except m << 9 overflows int16_t when m == 64 (which is possible),
9136*c0909341SAndroid Build Coastguard Worker;   but if we negate m it works out (-64 << 9 == -32768).
9137*c0909341SAndroid Build Coastguard Worker; = (((a - b) * (m * -512) + 16384) >> 15) + a
; blend_16bpc: blend tmp into dst in place, weighting every pixel by its own
; 8-bit mask byte. In terms of the derivation directly above, a = dst,
; b = tmp and m = mask:
;     dst += pmulhrsw(dst - tmp, mask * [pw_m512])
; Named arguments (see the cglobal line): dst, stride, tmp, w, h, mask.
; stride3 is a scratch register holding stride*3; only .w4 uses it.
; tmp and mask are consumed as packed buffers: every step reads 16 mask bytes
; and 32 bytes (16 words) of tmp, independent of stride.
; Register roles inside the loops:
;   m6    = zero, used to widen mask bytes to words
;   m7    = [pw_m512] (defined elsewhere; going by its name and the
;           derivation, words of -512 -- confirm at the definition)
;   m0/m1 = dst pixels, m2/m3 = dst - tmp, m4/m5 = widened mask * m7
9138*c0909341SAndroid Build Coastguard Workercglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
9139*c0909341SAndroid Build Coastguard Worker%define base r6-blend_ssse3_table
9140*c0909341SAndroid Build Coastguard Worker    LEA                  r6, blend_ssse3_table
9141*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; table index = log2(w) (w assumed power of 2)
9142*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9143*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4] ; 32-bit table entry, relative to r6
9144*c0909341SAndroid Build Coastguard Worker    movifnidn         maskq, maskmp
9145*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_m512]
9146*c0909341SAndroid Build Coastguard Worker    add                  wq, r6 ; wq = absolute address of the width handler
9147*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
9148*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
9149*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 4: four rows of 4 pixels per iteration (two rows per register).
9150*c0909341SAndroid Build Coastguard Worker.w4:
9151*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq] ; 16 mask bytes = 4 rows x 4 pixels
9152*c0909341SAndroid Build Coastguard Worker    movq                 m0, [dstq+strideq*0]
9153*c0909341SAndroid Build Coastguard Worker    movhps               m0, [dstq+strideq*1] ; m0 = rows 0,1
9154*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+strideq*2]
9155*c0909341SAndroid Build Coastguard Worker    movhps               m1, [dstq+stride3q ] ; m1 = rows 2,3
9156*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+16*0] ; m2 = dst - tmp
9157*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+16*1]
9158*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9159*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
9160*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6 ; low 8 mask bytes -> words
9161*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6 ; high 8 mask bytes -> words
9162*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m7 ; mask * [pw_m512]
9163*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m7
9164*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4 ; rounded (dst - tmp) * scaled mask >> 15
9165*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9166*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; dst += correction
9167*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9168*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
9169*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
9170*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m1
9171*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], m1
9172*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
9173*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
9174*c0909341SAndroid Build Coastguard Worker    jg .w4
9175*c0909341SAndroid Build Coastguard Worker    RET
; w == 8: two rows of 8 pixels per iteration (one row per register).
9176*c0909341SAndroid Build Coastguard Worker.w8:
9177*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq] ; 16 mask bytes = 2 rows x 8 pixels
9178*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+strideq*0]
9179*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+strideq*1]
9180*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+16*0] ; m2 = dst - tmp
9181*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+16*1]
9182*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9183*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
9184*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6
9185*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6
9186*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m7
9187*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m7
9188*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9189*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9190*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9191*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9192*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
9193*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
9194*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9195*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9196*c0909341SAndroid Build Coastguard Worker    jg .w8
9197*c0909341SAndroid Build Coastguard Worker    RET
; w == 16: one row of 16 pixels (two registers) per iteration.
9198*c0909341SAndroid Build Coastguard Worker.w16:
9199*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq] ; 16 mask bytes = 1 row x 16 pixels
9200*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+16*0]
9201*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*1]
9202*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+16*0] ; m2 = dst - tmp
9203*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+16*1]
9204*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9205*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
9206*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6
9207*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6
9208*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m7
9209*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m7
9210*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9211*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9212*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9213*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9214*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9215*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
9216*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9217*c0909341SAndroid Build Coastguard Worker    dec                  hd
9218*c0909341SAndroid Build Coastguard Worker    jg .w16
9219*c0909341SAndroid Build Coastguard Worker    RET
; w == 32: one row per iteration, handled as two 16-pixel halves.
9220*c0909341SAndroid Build Coastguard Worker.w32:
9221*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq+16*0] ; mask for pixels 0..15
9222*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+16*0]
9223*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*1]
9224*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+16*0] ; m2 = dst - tmp
9225*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+16*1]
9226*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6
9227*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6
9228*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m7
9229*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m7
9230*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9231*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9232*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9233*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9234*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9235*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
9236*c0909341SAndroid Build Coastguard Worker    mova                 m5, [maskq+16*1] ; mask for pixels 16..31
9237*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+16*2]
9238*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*3]
9239*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, [tmpq+16*2]
9240*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, [tmpq+16*3]
9241*c0909341SAndroid Build Coastguard Worker    add               maskq, 32 ; one full row of mask bytes
9242*c0909341SAndroid Build Coastguard Worker    add                tmpq, 64 ; one full row of tmp words
9243*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6
9244*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6
9245*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m7
9246*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m7
9247*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9248*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9249*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9250*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9251*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9252*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
9253*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9254*c0909341SAndroid Build Coastguard Worker    dec                  hd
9255*c0909341SAndroid Build Coastguard Worker    jg .w32
9256*c0909341SAndroid Build Coastguard Worker    RET
9257*c0909341SAndroid Build Coastguard Worker
; blend_v_16bpc: blend tmp into dst in place using per-column weights taken
; from obmc_masks (table defined elsewhere). The weight vector(s) are loaded
; once before each loop and reused for every row:
;     dst += pmulhrsw(tmp - dst, weight[x])
; Named arguments (see the cglobal line): dst, stride, tmp, w, h.
; The weights for width w start at word index w of obmc_masks
; (byte offsets 2*2, 4*2, 8*2, 16*2 and 16*4 below).
; tmp is read as a packed buffer of w words per row.
; Dispatches on tzcnt(w) through blend_v_ssse3_table.
9258*c0909341SAndroid Build Coastguard Workercglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
9259*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_ssse3_table
9260*c0909341SAndroid Build Coastguard Worker    LEA                  r5, blend_v_ssse3_table
9261*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; table index = log2(w) (w assumed power of 2)
9262*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9263*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; 32-bit table entry, relative to r5
9264*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
9265*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 2: two rows per iteration, one row (2 pixels = 4 bytes) per register.
9266*c0909341SAndroid Build Coastguard Worker.w2:
9267*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+obmc_masks+2*2] ; 2 column weights
9268*c0909341SAndroid Build Coastguard Worker.w2_loop:
9269*c0909341SAndroid Build Coastguard Worker    movd                 m0, [dstq+strideq*0]
9270*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tmpq+4*0]
9271*c0909341SAndroid Build Coastguard Worker    movd                 m1, [dstq+strideq*1]
9272*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tmpq+4*1]
9273*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
9274*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0 ; m2 = tmp - dst
9275*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
9276*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9277*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
9278*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; dst += weighted difference
9279*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9280*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
9281*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
9282*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9283*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9284*c0909341SAndroid Build Coastguard Worker    jg .w2_loop
9285*c0909341SAndroid Build Coastguard Worker    RET
; w == 4: two rows per iteration packed into one register, so the 4 weights
; are duplicated into both halves of m2.
9286*c0909341SAndroid Build Coastguard Worker.w4:
9287*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+obmc_masks+4*2]
9288*c0909341SAndroid Build Coastguard Worker.w4_loop:
9289*c0909341SAndroid Build Coastguard Worker    movq                 m0, [dstq+strideq*0]
9290*c0909341SAndroid Build Coastguard Worker    movhps               m0, [dstq+strideq*1] ; m0 = rows 0,1
9291*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmpq]
9292*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
9293*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; m1 = tmp - dst
9294*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
9295*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
9296*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
9297*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
9298*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9299*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9300*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
9301*c0909341SAndroid Build Coastguard Worker    RET
; w == 8: two rows per iteration, one row per register, same 8 weights.
9302*c0909341SAndroid Build Coastguard Worker.w8:
9303*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+obmc_masks+8*2]
9304*c0909341SAndroid Build Coastguard Worker.w8_loop:
9305*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+strideq*0]
9306*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq+16*0]
9307*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+strideq*1]
9308*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmpq+16*1]
9309*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
9310*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0 ; m2 = tmp - dst
9311*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
9312*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9313*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
9314*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9315*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9316*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
9317*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
9318*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9319*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9320*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
9321*c0909341SAndroid Build Coastguard Worker    RET
; w == 16: one row per iteration. Only 12 weights are loaded: m5 comes from
; a movq, so its upper four words are zero and pixels 12..15 are written
; back unchanged (pmulhrsw by 0 yields 0).
9322*c0909341SAndroid Build Coastguard Worker.w16:
9323*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+obmc_masks+16*2] ; weights for pixels 0..7
9324*c0909341SAndroid Build Coastguard Worker    movq                 m5, [base+obmc_masks+16*3] ; weights for pixels 8..11
9325*c0909341SAndroid Build Coastguard Worker.w16_loop:
9326*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+16*0]
9327*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq+16*0]
9328*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*1]
9329*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmpq+16*1]
9330*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
9331*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0 ; m2 = tmp - dst
9332*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
9333*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9334*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9335*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9336*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9337*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9338*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
9339*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9340*c0909341SAndroid Build Coastguard Worker    dec                  hd
9341*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
9342*c0909341SAndroid Build Coastguard Worker    RET
; w == 32: one row per iteration. Only pixels 0..23 (three registers) are
; loaded, blended and stored; pixels 24..31 of dst are never touched, while
; tmpq still advances by a full 32-pixel row (16*4 bytes).
; m6 lies outside the 6 xmm registers declared on the cglobal line, so on
; WIN64 it is saved and restored by hand around the loop.
; NOTE(review): [rsp+8] is presumably the caller-provided shadow space and
; relies on rsp being unmodified since function entry -- confirm.
9343*c0909341SAndroid Build Coastguard Worker.w32:
9344*c0909341SAndroid Build Coastguard Worker%if WIN64
9345*c0909341SAndroid Build Coastguard Worker    movaps          [rsp+8], m6
9346*c0909341SAndroid Build Coastguard Worker%endif
9347*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+obmc_masks+16*4] ; weights for pixels 0..7
9348*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+obmc_masks+16*5] ; weights for pixels 8..15
9349*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+obmc_masks+16*6] ; weights for pixels 16..23
9350*c0909341SAndroid Build Coastguard Worker.w32_loop:
9351*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+16*0]
9352*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq+16*0]
9353*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*1]
9354*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmpq+16*1]
9355*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0 ; m2 = tmp - dst
9356*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
9357*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
9358*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9359*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9360*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+16*2] ; m2 now holds dst pixels 16..23
9361*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9362*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmpq+16*2] ; m3 now holds tmp pixels 16..23
9363*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
9364*c0909341SAndroid Build Coastguard Worker    psubw                m3, m2
9365*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
9366*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
9367*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9368*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
9369*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m2
9370*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9371*c0909341SAndroid Build Coastguard Worker    dec                  hd
9372*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
9373*c0909341SAndroid Build Coastguard Worker%if WIN64
9374*c0909341SAndroid Build Coastguard Worker    movaps               m6, [rsp+8]
9375*c0909341SAndroid Build Coastguard Worker%endif
9376*c0909341SAndroid Build Coastguard Worker    RET
9377*c0909341SAndroid Build Coastguard Worker
; BLEND_H_ROW dst_off, tmp_off[, inc_tmp]
; Blends 16 pixels (two 16-byte vectors) of one row in place:
;     dst[16*dst_off ..] += pmulhrsw(tmp[16*tmp_off ..] - dst[16*dst_off ..], m5)
;   %1 (dst_off): offset into dstq, in units of 16 bytes
;   %2 (tmp_off): offset into tmpq, in units of 16 bytes (may be negative)
;   %3 (inc_tmp): optional, default 0; when non-zero, tmpq is advanced by
;                 16*%3 bytes after the tmp loads of this expansion
; Expects the weight vector in m5; uses m0-m3 as scratch.
9378*c0909341SAndroid Build Coastguard Worker%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
9379*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+16*(%1+0)]
9380*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq+16*(%2+0)]
9381*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*(%1+1)]
9382*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmpq+16*(%2+1)]
9383*c0909341SAndroid Build Coastguard Worker%if %3
9384*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*%3
9385*c0909341SAndroid Build Coastguard Worker%endif
9386*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0 ; m2 = tmp - dst
9387*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
9388*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5 ; same weight for both vectors
9389*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
9390*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; dst += weighted difference
9391*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9392*c0909341SAndroid Build Coastguard Worker    mova   [dstq+16*(%1+0)], m0
9393*c0909341SAndroid Build Coastguard Worker    mova   [dstq+16*(%1+1)], m1
9394*c0909341SAndroid Build Coastguard Worker%endmacro
9395*c0909341SAndroid Build Coastguard Worker
; blend_h_16bpc: blend tmp into dst in place using one weight per row, taken
; from obmc_masks (table defined elsewhere):
;     dst += pmulhrsw(tmp - dst, weight[y])
; Named arguments (see the cglobal line): dst, ds (dst stride), tmp, w, h;
; mask is a scratch register pointing into obmc_masks.
; Only the first (h*3)>>2 rows are processed. The setup leaves
;     hq    = -((h*3)>>2)
;     maskq = obmc_masks + 2*h - 2*hq
; so [maskq+hq*2] starts at word index h of obmc_masks and the loops count hq
; up towards zero (jl).
; tmp is read as a packed buffer of w words per row.
; m4 holds the low 8 bytes of blend_shuf duplicated into both halves; it is
; used as a pshufb control on the weight just loaded.
; NOTE(review): blend_shuf is defined elsewhere; from its use here its low
; 8 bytes presumably replicate word 0 and its high 8 bytes word 1 -- confirm.
; Dispatches on tzcnt(w) through blend_h_ssse3_table.
9396*c0909341SAndroid Build Coastguard Workercglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
9397*c0909341SAndroid Build Coastguard Worker%define base r6-blend_h_ssse3_table
9398*c0909341SAndroid Build Coastguard Worker    LEA                  r6, blend_h_ssse3_table
9399*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; table index = log2(w) (w assumed power of 2)
; NOTE(review): an unconditional mov (not movifnidn) is used for h, presumably
; so that the upper half of hq is always cleared before hq is used in the
; address computations below -- confirm.
9400*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
9401*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4] ; 32-bit table entry, relative to r6
9402*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+blend_shuf]
9403*c0909341SAndroid Build Coastguard Worker    lea               maskq, [base+obmc_masks+hq*2] ; &obmc_masks[h] (words)
9404*c0909341SAndroid Build Coastguard Worker    lea                  hd, [hq*3]
9405*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
9406*c0909341SAndroid Build Coastguard Worker    shr                  hd, 2 ; h * 3/4
9407*c0909341SAndroid Build Coastguard Worker    lea               maskq, [maskq+hq*2] ; bias so that negative hq indexes from the start
9408*c0909341SAndroid Build Coastguard Worker    neg                  hq
9409*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 2: two rows (2 pixels each) per iteration, packed into the low 8 bytes.
9410*c0909341SAndroid Build Coastguard Worker.w2:
9411*c0909341SAndroid Build Coastguard Worker    movd                 m0, [dstq+dsq*0]
9412*c0909341SAndroid Build Coastguard Worker    movd                 m2, [dstq+dsq*1]
9413*c0909341SAndroid Build Coastguard Worker    movd                 m3, [maskq+hq*2] ; weights of rows y and y+1
9414*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tmpq]
9415*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
9416*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2 ; m0 = row y | row y+1
9417*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3 ; m3 = w0 w0 w1 w1
9418*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; m1 = tmp - dst
9419*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
9420*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
9421*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0
9422*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32 ; bring row y+1 down to the low dword
9423*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m0
9424*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9425*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
9426*c0909341SAndroid Build Coastguard Worker    jl .w2
9427*c0909341SAndroid Build Coastguard Worker    RET
; w == 4: two rows per iteration packed into one register; the full 16-byte
; blend_shuf spreads the two loaded weights over the register.
9428*c0909341SAndroid Build Coastguard Worker.w4:
9429*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+blend_shuf]
9430*c0909341SAndroid Build Coastguard Worker.w4_loop:
9431*c0909341SAndroid Build Coastguard Worker    movq                 m0, [dstq+dsq*0]
9432*c0909341SAndroid Build Coastguard Worker    movhps               m0, [dstq+dsq*1] ; m0 = row y | row y+1
9433*c0909341SAndroid Build Coastguard Worker    movd                 m2, [maskq+hq*2] ; weights of rows y and y+1
9434*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmpq]
9435*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
9436*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; m1 = tmp - dst
9437*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
9438*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
9439*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
9440*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
9441*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
9442*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9443*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
9444*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
9445*c0909341SAndroid Build Coastguard Worker    RET
; w == 8: two rows per iteration, one row per register. m4 and m5 are the
; pshufb controls for the first and second row's weight respectively.
; m6/m7 lie outside the 6 xmm registers declared on the cglobal line, so on
; WIN64 they are saved and restored by hand around the loop.
; NOTE(review): [rsp+8]/[rsp+24] are presumably the caller-provided shadow
; space and rely on rsp being unmodified since function entry -- confirm.
9446*c0909341SAndroid Build Coastguard Worker.w8:
9447*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+blend_shuf+8]
9448*c0909341SAndroid Build Coastguard Worker%if WIN64
9449*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+ 8], m6
9450*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+24], m7
9451*c0909341SAndroid Build Coastguard Worker%endif
9452*c0909341SAndroid Build Coastguard Worker.w8_loop:
9453*c0909341SAndroid Build Coastguard Worker    movd                 m7, [maskq+hq*2] ; weights of rows y and y+1
9454*c0909341SAndroid Build Coastguard Worker    mova                 m0, [dstq+dsq*0]
9455*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq+16*0]
9456*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+dsq*1]
9457*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmpq+16*1]
9458*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
9459*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m7, m4 ; m6 = weight vector for row y
9460*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0 ; m2 = tmp - dst
9461*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m5 ; m7 = weight vector for row y+1
9462*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
9463*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
9464*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7
9465*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
9466*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
9467*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0
9468*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
9469*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9470*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
9471*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
9472*c0909341SAndroid Build Coastguard Worker%if WIN64
9473*c0909341SAndroid Build Coastguard Worker    movaps               m6, [rsp+ 8]
9474*c0909341SAndroid Build Coastguard Worker    movaps               m7, [rsp+24]
9475*c0909341SAndroid Build Coastguard Worker%endif
9476*c0909341SAndroid Build Coastguard Worker    RET
; w >= 16: one row per iteration. The row's weight is loaded into m5 and
; shuffled with m4, then BLEND_H_ROW is expanded once per 16 pixels. The
; last expansion that reads at a non-negative tmp offset also advances tmpq
; by one full row.
9477*c0909341SAndroid Build Coastguard Worker.w16:
9478*c0909341SAndroid Build Coastguard Worker    movd                 m5, [maskq+hq*2]
9479*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m4
9480*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0, 0, 2
9481*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
9482*c0909341SAndroid Build Coastguard Worker    inc                  hq
9483*c0909341SAndroid Build Coastguard Worker    jl .w16
9484*c0909341SAndroid Build Coastguard Worker    RET
9485*c0909341SAndroid Build Coastguard Worker.w32:
9486*c0909341SAndroid Build Coastguard Worker    movd                 m5, [maskq+hq*2]
9487*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m4
9488*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0, 0
9489*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           2, 2, 4
9490*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
9491*c0909341SAndroid Build Coastguard Worker    inc                  hq
9492*c0909341SAndroid Build Coastguard Worker    jl .w32
9493*c0909341SAndroid Build Coastguard Worker    RET
9494*c0909341SAndroid Build Coastguard Worker.w64:
9495*c0909341SAndroid Build Coastguard Worker    movd                 m5, [maskq+hq*2]
9496*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m4
9497*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0, 0
9498*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           2, 2
9499*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           4, 4
9500*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           6, 6, 8
9501*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
9502*c0909341SAndroid Build Coastguard Worker    inc                  hq
9503*c0909341SAndroid Build Coastguard Worker    jl .w64
9504*c0909341SAndroid Build Coastguard Worker    RET
; w == 128: tmpq is advanced by the whole row (16*16 bytes) in the middle of
; the row, so the second half of tmp is addressed with negative offsets
; (-8..-2) relative to the already-advanced pointer.
; NOTE(review): presumably done to keep the tmp displacements small enough
; for a 1-byte encoding -- confirm.
9505*c0909341SAndroid Build Coastguard Worker.w128:
9506*c0909341SAndroid Build Coastguard Worker    movd                 m5, [maskq+hq*2]
9507*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m4
9508*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           0,  0
9509*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           2,  2
9510*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           4,  4
9511*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           6,  6, 16
9512*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW           8, -8
9513*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW          10, -6
9514*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW          12, -4
9515*c0909341SAndroid Build Coastguard Worker    BLEND_H_ROW          14, -2
9516*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
9517*c0909341SAndroid Build Coastguard Worker    inc                  hq
9518*c0909341SAndroid Build Coastguard Worker    jl .w128
9519*c0909341SAndroid Build Coastguard Worker    RET
9520*c0909341SAndroid Build Coastguard Worker
9521*c0909341SAndroid Build Coastguard Worker; emu_edge args:
9522*c0909341SAndroid Build Coastguard Worker; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
9523*c0909341SAndroid Build Coastguard Worker; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
9524*c0909341SAndroid Build Coastguard Worker; const pixel *ref, const ptrdiff_t ref_stride
9525*c0909341SAndroid Build Coastguard Worker;
9526*c0909341SAndroid Build Coastguard Worker; bw, bh total filled size
9527*c0909341SAndroid Build Coastguard Worker; iw, ih, copied block -> fill bottom, right
9528*c0909341SAndroid Build Coastguard Worker; x, y, offset in bw/bh -> fill top, left
;
; Setup phase. Moves the source pointer to the clamped (x, y) position and
; derives the sizes used by the copy/extension loops that follow:
;   top_ext, bottom_ext  in [0, bh - 1]
;   left_ext, right_ext  in [0, bw - 1]
;   center_h = bh - top_ext - bottom_ext
;   center_w = bw - left_ext - right_ext
; Horizontal offsets are scaled by 2 (16-bit pixels); the strides are added
; unscaled.
; DEFINE_ARGS only renames the argument registers; register contents are
; kept, e.g. the register that held y is negated in place and then called
; topext.
; x86-32 has too few registers, so values are parked in argument stack slots:
;   r4m = bottom_ext, r3m = right_ext, r1m = center_h, r2m = blk
; (on x86-64 blk is saved in r9m), and r0/r1/r6 are temporarily reused and
; reloaded from r0m/r1m/r6m where noted.
9529*c0909341SAndroid Build Coastguard Workercglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
9530*c0909341SAndroid Build Coastguard Worker                             y, dst, dstride, src, sstride, \
9531*c0909341SAndroid Build Coastguard Worker                             bottomext, rightext, blk
9532*c0909341SAndroid Build Coastguard Worker    ; we assume that the buffer (stride) is larger than width, so we can
9533*c0909341SAndroid Build Coastguard Worker    ; safely overwrite by a few bytes
9534*c0909341SAndroid Build Coastguard Worker
; Per-architecture aliases for the scratch registers used during setup.
; reg_zero holds constant 0 because cmov cannot take an immediate operand.
9535*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9536*c0909341SAndroid Build Coastguard Worker %define reg_zero       r12q
9537*c0909341SAndroid Build Coastguard Worker %define reg_tmp        r10
9538*c0909341SAndroid Build Coastguard Worker %define reg_src        srcq
9539*c0909341SAndroid Build Coastguard Worker %define reg_bottomext  bottomextq
9540*c0909341SAndroid Build Coastguard Worker %define reg_rightext   rightextq
9541*c0909341SAndroid Build Coastguard Worker %define reg_blkm       r9m
9542*c0909341SAndroid Build Coastguard Worker%else
9543*c0909341SAndroid Build Coastguard Worker %define reg_zero       r6
9544*c0909341SAndroid Build Coastguard Worker %define reg_tmp        r0
9545*c0909341SAndroid Build Coastguard Worker %define reg_src        r1
9546*c0909341SAndroid Build Coastguard Worker %define reg_bottomext  r0
9547*c0909341SAndroid Build Coastguard Worker %define reg_rightext   r1
9548*c0909341SAndroid Build Coastguard Worker %define reg_blkm       r2m
9549*c0909341SAndroid Build Coastguard Worker%endif
9550*c0909341SAndroid Build Coastguard Worker    ;
9551*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
9552*c0909341SAndroid Build Coastguard Worker    xor            reg_zero, reg_zero
9553*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [ihq-1]      ; default: y >= ih -> ih - 1
9554*c0909341SAndroid Build Coastguard Worker    cmp                  yq, ihq
9555*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, yq           ; y - ih negative -> keep y
9556*c0909341SAndroid Build Coastguard Worker    test                 yq, yq
9557*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, reg_zero     ; y negative -> 0
9558*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9559*c0909341SAndroid Build Coastguard Worker    imul            reg_tmp, sstrideq
9560*c0909341SAndroid Build Coastguard Worker    add                srcq, reg_tmp
9561*c0909341SAndroid Build Coastguard Worker%else
9562*c0909341SAndroid Build Coastguard Worker    imul            reg_tmp, sstridem
9563*c0909341SAndroid Build Coastguard Worker    mov             reg_src, srcm
9564*c0909341SAndroid Build Coastguard Worker    add             reg_src, reg_tmp
9565*c0909341SAndroid Build Coastguard Worker%endif
9566*c0909341SAndroid Build Coastguard Worker    ;
9567*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(x, 0, iw - 1)
9568*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [iwq-1]      ; default: x >= iw -> iw - 1
9569*c0909341SAndroid Build Coastguard Worker    cmp                  xq, iwq
9570*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, xq           ; x - iw negative -> keep x
9571*c0909341SAndroid Build Coastguard Worker    test                 xq, xq
9572*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, reg_zero     ; x negative -> 0
9573*c0909341SAndroid Build Coastguard Worker    lea             reg_src, [reg_src+reg_tmp*2] ; *2: 16-bit pixels
9574*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9575*c0909341SAndroid Build Coastguard Worker    mov                srcm, reg_src      ; keep the adjusted pointer in the src slot
9576*c0909341SAndroid Build Coastguard Worker%endif
9577*c0909341SAndroid Build Coastguard Worker    ;
9578*c0909341SAndroid Build Coastguard Worker    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
9579*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9580*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m ; restore bh
9581*c0909341SAndroid Build Coastguard Worker%endif
9582*c0909341SAndroid Build Coastguard Worker    lea       reg_bottomext, [yq+bhq]
9583*c0909341SAndroid Build Coastguard Worker    sub       reg_bottomext, ihq
9584*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bhq-1]      ; r3 = bh - 1 (upper clamp); lea keeps the flags of sub
9585*c0909341SAndroid Build Coastguard Worker    cmovs     reg_bottomext, reg_zero     ; negative -> 0; upper clamp is applied below
9586*c0909341SAndroid Build Coastguard Worker    ;
9587*c0909341SAndroid Build Coastguard Worker
9588*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, x, \
9589*c0909341SAndroid Build Coastguard Worker                topext, dst, dstride, src, sstride, \
9590*c0909341SAndroid Build Coastguard Worker                bottomext, rightext, blk
9591*c0909341SAndroid Build Coastguard Worker
9592*c0909341SAndroid Build Coastguard Worker    ; top_ext = iclip(-y, 0, bh - 1)
9593*c0909341SAndroid Build Coastguard Worker    neg             topextq               ; register still holds y: topext = -y
9594*c0909341SAndroid Build Coastguard Worker    cmovs           topextq, reg_zero     ; -y negative -> 0
9595*c0909341SAndroid Build Coastguard Worker    cmp       reg_bottomext, bhq
9596*c0909341SAndroid Build Coastguard Worker    cmovns    reg_bottomext, r3           ; bottom_ext >= bh -> bh - 1
9597*c0909341SAndroid Build Coastguard Worker    cmp             topextq, bhq
    ; The clamp must be inclusive: with top_ext == bh the centre height would
    ; become 0 and the row loop (which always runs once) would write a row
    ; at dst + bh * stride, i.e. outside the bw x bh block.
9598*c0909341SAndroid Build Coastguard Worker    cmovge          topextq, r3           ; top_ext >= bh -> bh - 1
9599*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
9600*c0909341SAndroid Build Coastguard Worker    mov                 r4m, reg_bottomext ; park bottom_ext in the r4m slot
9601*c0909341SAndroid Build Coastguard Worker    ;
9602*c0909341SAndroid Build Coastguard Worker    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
9603*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m ; restore bw
9604*c0909341SAndroid Build Coastguard Worker %endif
    ; right_ext = iclip(x + bw - iw, 0, bw - 1) (both architectures)
9605*c0909341SAndroid Build Coastguard Worker    lea        reg_rightext, [xq+bwq]
9606*c0909341SAndroid Build Coastguard Worker    sub        reg_rightext, iwq
9607*c0909341SAndroid Build Coastguard Worker    lea                  r2, [bwq-1]      ; r2 = bw - 1 (upper clamp); iw is no longer needed
9608*c0909341SAndroid Build Coastguard Worker    cmovs      reg_rightext, reg_zero     ; negative -> 0
9609*c0909341SAndroid Build Coastguard Worker
9610*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, leftext, \
9611*c0909341SAndroid Build Coastguard Worker                topext, dst, dstride, src, sstride, \
9612*c0909341SAndroid Build Coastguard Worker                bottomext, rightext, blk
9613*c0909341SAndroid Build Coastguard Worker
9614*c0909341SAndroid Build Coastguard Worker    ; left_ext = iclip(-x, 0, bw - 1)
9615*c0909341SAndroid Build Coastguard Worker    neg            leftextq               ; register still holds x: leftext = -x
9616*c0909341SAndroid Build Coastguard Worker    cmovs          leftextq, reg_zero     ; -x negative -> 0
9617*c0909341SAndroid Build Coastguard Worker    cmp        reg_rightext, bwq
9618*c0909341SAndroid Build Coastguard Worker    cmovns     reg_rightext, r2           ; right_ext >= bw -> bw - 1
9619*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
9620*c0909341SAndroid Build Coastguard Worker    mov                 r3m, r1           ; park right_ext (reg_rightext is r1) in the r3m slot
9621*c0909341SAndroid Build Coastguard Worker %endif
9622*c0909341SAndroid Build Coastguard Worker    cmp            leftextq, bwq
9623*c0909341SAndroid Build Coastguard Worker    cmovns         leftextq, r2           ; left_ext >= bw -> bw - 1
9624*c0909341SAndroid Build Coastguard Worker
9625*c0909341SAndroid Build Coastguard Worker%undef reg_zero
9626*c0909341SAndroid Build Coastguard Worker%undef reg_tmp
9627*c0909341SAndroid Build Coastguard Worker%undef reg_src
9628*c0909341SAndroid Build Coastguard Worker%undef reg_bottomext
9629*c0909341SAndroid Build Coastguard Worker%undef reg_rightext
9630*c0909341SAndroid Build Coastguard Worker
9631*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
9632*c0909341SAndroid Build Coastguard Worker                topext, dst, dstride, src, sstride, \
9633*c0909341SAndroid Build Coastguard Worker                bottomext, rightext, blk
9634*c0909341SAndroid Build Coastguard Worker
9635*c0909341SAndroid Build Coastguard Worker    ; center_h = bh - top_ext - bottom_ext
9636*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9637*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bottomextq+topextq]
9638*c0909341SAndroid Build Coastguard Worker    sub            centerhq, r3           ; register still holds bh
9639*c0909341SAndroid Build Coastguard Worker%else
9640*c0909341SAndroid Build Coastguard Worker    mov                   r1, centerhm ; restore r1
9641*c0909341SAndroid Build Coastguard Worker    sub             centerhq, topextq
9642*c0909341SAndroid Build Coastguard Worker    sub             centerhq, r4m         ; r4m = bottom_ext
9643*c0909341SAndroid Build Coastguard Worker    mov                  r1m, centerhq    ; the row loop counts center_h down in this slot
9644*c0909341SAndroid Build Coastguard Worker%endif
9645*c0909341SAndroid Build Coastguard Worker    ;
9646*c0909341SAndroid Build Coastguard Worker    ; blk += top_ext * PXSTRIDE(dst_stride)
9647*c0909341SAndroid Build Coastguard Worker    mov                  r2, topextq
9648*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9649*c0909341SAndroid Build Coastguard Worker    imul                 r2, dstrideq
9650*c0909341SAndroid Build Coastguard Worker%else
9651*c0909341SAndroid Build Coastguard Worker    mov                  r6, r6m ; restore dstq
9652*c0909341SAndroid Build Coastguard Worker    imul                 r2, dstridem
9653*c0909341SAndroid Build Coastguard Worker%endif
9654*c0909341SAndroid Build Coastguard Worker    add                dstq, r2           ; dst now points at the first centre row
9655*c0909341SAndroid Build Coastguard Worker    mov            reg_blkm, dstq ; save pointer for ext
9656*c0909341SAndroid Build Coastguard Worker    ;
9657*c0909341SAndroid Build Coastguard Worker    ; center_w = bw - left_ext - right_ext
9658*c0909341SAndroid Build Coastguard Worker    mov            centerwq, bwq
9659*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9660*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rightextq+leftextq]
9661*c0909341SAndroid Build Coastguard Worker    sub            centerwq, r3
9662*c0909341SAndroid Build Coastguard Worker%else
9663*c0909341SAndroid Build Coastguard Worker    sub            centerwq, r3m          ; r3m = right_ext
9664*c0909341SAndroid Build Coastguard Worker    sub            centerwq, leftextq
9665*c0909341SAndroid Build Coastguard Worker%endif
9666*c0909341SAndroid Build Coastguard Worker
9667*c0909341SAndroid Build Coastguard Worker; vloop Macro
;
; v_loop need_left_ext, need_right_ext, suffix
; Emits the per-row loop over the centre rows of the block. Each iteration:
;   1. (need_left_ext)  fills the left extension with the first source pixel
;      of the row,
;   2. copies centerw pixels from the source row,
;   3. (need_right_ext) fills the right extension with the last copied pixel
;      (src[centerw - 1]),
;   4. advances dst and src by their strides and counts centerh down.
; suffix is appended to the local labels so each expansion gets unique names.
; Every inner loop is a do-while that stores one full register (mmsize/2
; pixels) per step: it always runs at least once and may store past the
; requested count. The left fill stores with mova (aligned) to dstq; the copy
; and the right fill use movu.
; Modifies dst, src, centerh (in centerhm on x86-32), r3 and m0, plus reg_tmp
; when an extension is requested; on x86-32 also r0 and r1.
9668*c0909341SAndroid Build Coastguard Worker%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
9669*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9670*c0909341SAndroid Build Coastguard Worker    %define reg_tmp        r12
9671*c0909341SAndroid Build Coastguard Worker  %else
9672*c0909341SAndroid Build Coastguard Worker    %define reg_tmp        r0
9673*c0909341SAndroid Build Coastguard Worker  %endif
9674*c0909341SAndroid Build Coastguard Worker.v_loop_%3:
9675*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_32
    ; r0 and r1 are overwritten further down in every iteration (reg_tmp,
    ; source pointer, source stride), so reload them from their stack slots
9676*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
9677*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
9678*c0909341SAndroid Build Coastguard Worker  %endif
9679*c0909341SAndroid Build Coastguard Worker%if %1
9680*c0909341SAndroid Build Coastguard Worker    ; left extension
9681*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9682*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq]
9683*c0909341SAndroid Build Coastguard Worker  %else
9684*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcm
9685*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r3]
9686*c0909341SAndroid Build Coastguard Worker  %endif
9687*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000    ; replicate word 0 across the low 4 words
9688*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0           ; ... and into the high half: m0 = first pixel x8
9689*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3           ; r3 = pixel index within the extension
9690*c0909341SAndroid Build Coastguard Worker.left_loop_%3:
9691*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r3*2], m0
9692*c0909341SAndroid Build Coastguard Worker    add                  r3, mmsize/2     ; mmsize/2 16-bit pixels per store
9693*c0909341SAndroid Build Coastguard Worker    cmp                  r3, leftextq
9694*c0909341SAndroid Build Coastguard Worker    jl .left_loop_%3
9695*c0909341SAndroid Build Coastguard Worker    ; body
9696*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [dstq+leftextq*2] ; centre starts right after the left extension
9697*c0909341SAndroid Build Coastguard Worker%endif
9698*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3           ; r3 = pixel index within the centre
9699*c0909341SAndroid Build Coastguard Worker.body_loop_%3:
9700*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9701*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r3*2]
9702*c0909341SAndroid Build Coastguard Worker  %else
9703*c0909341SAndroid Build Coastguard Worker    mov                  r1, srcm
9704*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r1+r3*2]
9705*c0909341SAndroid Build Coastguard Worker  %endif
9706*c0909341SAndroid Build Coastguard Worker%if %1
9707*c0909341SAndroid Build Coastguard Worker    movu     [reg_tmp+r3*2], m0
9708*c0909341SAndroid Build Coastguard Worker%else
9709*c0909341SAndroid Build Coastguard Worker    movu        [dstq+r3*2], m0
9710*c0909341SAndroid Build Coastguard Worker%endif
9711*c0909341SAndroid Build Coastguard Worker    add                  r3, mmsize/2
9712*c0909341SAndroid Build Coastguard Worker    cmp                  r3, centerwq
9713*c0909341SAndroid Build Coastguard Worker    jl .body_loop_%3
9714*c0909341SAndroid Build Coastguard Worker%if %2
9715*c0909341SAndroid Build Coastguard Worker    ; right extension
9716*c0909341SAndroid Build Coastguard Worker%if %1
9717*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [reg_tmp+centerwq*2] ; reg_tmp = start of the right extension
9718*c0909341SAndroid Build Coastguard Worker%else
9719*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [dstq+centerwq*2]
9720*c0909341SAndroid Build Coastguard Worker%endif
9721*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9722*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+centerwq*2-2] ; low word = src[centerw - 1]
9723*c0909341SAndroid Build Coastguard Worker  %else
9724*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcm
9725*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r3+centerwq*2-2]
9726*c0909341SAndroid Build Coastguard Worker  %endif
9727*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000    ; broadcast that pixel to all 8 words
9728*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
9729*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
9730*c0909341SAndroid Build Coastguard Worker.right_loop_%3:
9731*c0909341SAndroid Build Coastguard Worker    movu     [reg_tmp+r3*2], m0
9732*c0909341SAndroid Build Coastguard Worker    add                  r3, mmsize/2
9733*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9734*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rightextq
9735*c0909341SAndroid Build Coastguard Worker  %else
9736*c0909341SAndroid Build Coastguard Worker    cmp                  r3, r3m          ; r3m holds right_ext on x86-32
9737*c0909341SAndroid Build Coastguard Worker  %endif
9738*c0909341SAndroid Build Coastguard Worker    jl .right_loop_%3
9739*c0909341SAndroid Build Coastguard Worker%endif
9740*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9741*c0909341SAndroid Build Coastguard Worker    add                dstq, dstrideq
9742*c0909341SAndroid Build Coastguard Worker    add                srcq, sstrideq
9743*c0909341SAndroid Build Coastguard Worker    dec            centerhq
9744*c0909341SAndroid Build Coastguard Worker    jg .v_loop_%3
9745*c0909341SAndroid Build Coastguard Worker  %else
9746*c0909341SAndroid Build Coastguard Worker    add                dstq, dstridem
9747*c0909341SAndroid Build Coastguard Worker    mov                  r0, sstridem
9748*c0909341SAndroid Build Coastguard Worker    add                srcm, r0           ; source pointer lives in its stack slot
9749*c0909341SAndroid Build Coastguard Worker    sub       dword centerhm, 1           ; row counter lives in memory; sub sets the flags for jg
9750*c0909341SAndroid Build Coastguard Worker    jg .v_loop_%3
9751*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m ; restore r0
9752*c0909341SAndroid Build Coastguard Worker  %endif
9753*c0909341SAndroid Build Coastguard Worker%endmacro ; vloop MACRO
9754*c0909341SAndroid Build Coastguard Worker
    ; Pick one of four v_loop expansions depending on whether a left and/or
    ; right extension is needed; every path ends up at .body_done.
9755*c0909341SAndroid Build Coastguard Worker    test           leftextq, leftextq
9756*c0909341SAndroid Build Coastguard Worker    jnz .need_left_ext
9757*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9758*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
9759*c0909341SAndroid Build Coastguard Worker    jnz .need_right_ext
9760*c0909341SAndroid Build Coastguard Worker %else
9761*c0909341SAndroid Build Coastguard Worker    cmp            leftextq, r3m ; leftextq == 0 here, so this tests right_ext (r3m) against 0
9762*c0909341SAndroid Build Coastguard Worker    jne .need_right_ext
9763*c0909341SAndroid Build Coastguard Worker %endif
9764*c0909341SAndroid Build Coastguard Worker    v_loop                0, 0, 0
9765*c0909341SAndroid Build Coastguard Worker    jmp .body_done
9766*c0909341SAndroid Build Coastguard Worker
9767*c0909341SAndroid Build Coastguard Worker    ;left right extensions
9768*c0909341SAndroid Build Coastguard Worker.need_left_ext:
9769*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9770*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
9771*c0909341SAndroid Build Coastguard Worker %else
9772*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
9773*c0909341SAndroid Build Coastguard Worker    test                 r3, r3
9774*c0909341SAndroid Build Coastguard Worker %endif
9775*c0909341SAndroid Build Coastguard Worker    jnz .need_left_right_ext
9776*c0909341SAndroid Build Coastguard Worker    v_loop                1, 0, 1
9777*c0909341SAndroid Build Coastguard Worker    jmp .body_done
9778*c0909341SAndroid Build Coastguard Worker
9779*c0909341SAndroid Build Coastguard Worker.need_left_right_ext:
9780*c0909341SAndroid Build Coastguard Worker    v_loop                1, 1, 2
9781*c0909341SAndroid Build Coastguard Worker    jmp .body_done
9782*c0909341SAndroid Build Coastguard Worker
9783*c0909341SAndroid Build Coastguard Worker.need_right_ext:
9784*c0909341SAndroid Build Coastguard Worker    v_loop                0, 1, 3
9785*c0909341SAndroid Build Coastguard Worker
; Register roles for the vertical extension loops below:
9786*c0909341SAndroid Build Coastguard Worker.body_done:
9787*c0909341SAndroid Build Coastguard Worker; r0 ; bw
9788*c0909341SAndroid Build Coastguard Worker; r1 ;; x loop
9789*c0909341SAndroid Build Coastguard Worker; r4 ;; y loop
9790*c0909341SAndroid Build Coastguard Worker; r5 ; topextq
9791*c0909341SAndroid Build Coastguard Worker; r6 ;dstq
9792*c0909341SAndroid Build Coastguard Worker; r7 ;dstrideq
9793*c0909341SAndroid Build Coastguard Worker; r8 ; srcq
9794*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9795*c0909341SAndroid Build Coastguard Worker %define reg_dstride    dstrideq
9796*c0909341SAndroid Build Coastguard Worker%else
9797*c0909341SAndroid Build Coastguard Worker %define reg_dstride    r2
9798*c0909341SAndroid Build Coastguard Worker%endif
9799*c0909341SAndroid Build Coastguard Worker    ;
9800*c0909341SAndroid Build Coastguard Worker    ; bottom edge extension
    ; dst points one row past the last centre row here; that last row is
    ; replicated downwards bottom_ext times, one register-wide column strip
    ; (mmsize/2 pixels) at a time.
9801*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9802*c0909341SAndroid Build Coastguard Worker    test         bottomextq, bottomextq
9803*c0909341SAndroid Build Coastguard Worker    jz .top
9804*c0909341SAndroid Build Coastguard Worker %else
9805*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1           ; r1 = 0 is reused below as the initial x index
9806*c0909341SAndroid Build Coastguard Worker    cmp                  r1, r4m          ; r4m = bottom_ext
9807*c0909341SAndroid Build Coastguard Worker    je .top
9808*c0909341SAndroid Build Coastguard Worker %endif
9809*c0909341SAndroid Build Coastguard Worker    ;
9810*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9811*c0909341SAndroid Build Coastguard Worker    mov                srcq, dstq
9812*c0909341SAndroid Build Coastguard Worker    sub                srcq, dstrideq     ; src = last row written by the centre loop
9813*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1           ; r1 = x index in pixels
9814*c0909341SAndroid Build Coastguard Worker %else
9815*c0909341SAndroid Build Coastguard Worker    mov                  r3, dstq
9816*c0909341SAndroid Build Coastguard Worker    mov         reg_dstride, dstridem
9817*c0909341SAndroid Build Coastguard Worker    sub                  r3, reg_dstride
9818*c0909341SAndroid Build Coastguard Worker    mov                srcm, r3           ; same row pointer, kept in the src slot
9819*c0909341SAndroid Build Coastguard Worker %endif
9820*c0909341SAndroid Build Coastguard Worker    ;
9821*c0909341SAndroid Build Coastguard Worker.bottom_x_loop:
9822*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9823*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1*2]
9824*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1*2]  ; r3 = write pointer for this column strip
9825*c0909341SAndroid Build Coastguard Worker    mov                  r4, bottomextq   ; r4 = rows left to fill
9826*c0909341SAndroid Build Coastguard Worker %else
9827*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcm
9828*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+r1*2]
9829*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1*2]
9830*c0909341SAndroid Build Coastguard Worker    mov                  r4, r4m
9831*c0909341SAndroid Build Coastguard Worker %endif
9832*c0909341SAndroid Build Coastguard Worker    ;
9833*c0909341SAndroid Build Coastguard Worker.bottom_y_loop:
9834*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
9835*c0909341SAndroid Build Coastguard Worker    add                  r3, reg_dstride
9836*c0909341SAndroid Build Coastguard Worker    dec                  r4
9837*c0909341SAndroid Build Coastguard Worker    jg .bottom_y_loop
9838*c0909341SAndroid Build Coastguard Worker    add                  r1, mmsize/2     ; next strip of mmsize/2 pixels
9839*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
9840*c0909341SAndroid Build Coastguard Worker    jl .bottom_x_loop
9841*c0909341SAndroid Build Coastguard Worker
9842*c0909341SAndroid Build Coastguard Worker.top:
9843*c0909341SAndroid Build Coastguard Worker    ; top edge extension
    ; replicates the first centre row (pointer saved in reg_blkm) into the
    ; top_ext rows that start at the original dst
9844*c0909341SAndroid Build Coastguard Worker    test            topextq, topextq
9845*c0909341SAndroid Build Coastguard Worker    jz .end
9846*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9847*c0909341SAndroid Build Coastguard Worker    mov                srcq, reg_blkm
9848*c0909341SAndroid Build Coastguard Worker%else
9849*c0909341SAndroid Build Coastguard Worker    mov                  r3, reg_blkm
9850*c0909341SAndroid Build Coastguard Worker    mov         reg_dstride, dstridem
9851*c0909341SAndroid Build Coastguard Worker%endif
9852*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm         ; reload the original dst argument
9853*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1           ; r1 = x index in pixels
9854*c0909341SAndroid Build Coastguard Worker    ;
9855*c0909341SAndroid Build Coastguard Worker.top_x_loop:
9856*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9857*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1*2]
9858*c0909341SAndroid Build Coastguard Worker%else
9859*c0909341SAndroid Build Coastguard Worker    mov                  r3, reg_blkm     ; r3 is reused as write pointer below, so reload it per strip
9860*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+r1*2]
9861*c0909341SAndroid Build Coastguard Worker%endif
9862*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1*2]
9863*c0909341SAndroid Build Coastguard Worker    mov                  r4, topextq      ; r4 = rows left to fill
9864*c0909341SAndroid Build Coastguard Worker    ;
9865*c0909341SAndroid Build Coastguard Worker.top_y_loop:
9866*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
9867*c0909341SAndroid Build Coastguard Worker    add                  r3, reg_dstride
9868*c0909341SAndroid Build Coastguard Worker    dec                  r4
9869*c0909341SAndroid Build Coastguard Worker    jg .top_y_loop
9870*c0909341SAndroid Build Coastguard Worker    add                  r1, mmsize/2
9871*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
9872*c0909341SAndroid Build Coastguard Worker    jl .top_x_loop
9873*c0909341SAndroid Build Coastguard Worker
9874*c0909341SAndroid Build Coastguard Worker.end:
9875*c0909341SAndroid Build Coastguard Worker    RET
9876*c0909341SAndroid Build Coastguard Worker
; drop the function-local aliases so later code cannot pick them up
9877*c0909341SAndroid Build Coastguard Worker%undef reg_dstride
9878*c0909341SAndroid Build Coastguard Worker%undef reg_blkm
9879*c0909341SAndroid Build Coastguard Worker%undef reg_tmp
9880*c0909341SAndroid Build Coastguard Worker
; SCRATCH src, alias, slot
; Makes the value currently held in m<src> reachable under the name m<alias>.
;   x86-32:    stores m<src> to the stack slot [rsp + slot*mmsize] and
;              defines m<alias> as that memory operand.
;   otherwise: expands to SWAP src, alias (presumably the x86inc register
;              renaming, i.e. no store is emitted -- confirm in x86inc.asm).
; The x86-32 form assumes the caller reserved at least (slot+1)*mmsize bytes
; of stack at rsp.
9881*c0909341SAndroid Build Coastguard Worker%macro SCRATCH 3
9882*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9883*c0909341SAndroid Build Coastguard Worker    mova [rsp+%3*mmsize], m%1
9884*c0909341SAndroid Build Coastguard Worker%define m%2 [rsp+%3*mmsize]
9885*c0909341SAndroid Build Coastguard Worker%else
9886*c0909341SAndroid Build Coastguard Worker    SWAP             %1, %2
9887*c0909341SAndroid Build Coastguard Worker%endif
9888*c0909341SAndroid Build Coastguard Worker%endmacro
9889*c0909341SAndroid Build Coastguard Worker
9890*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9891*c0909341SAndroid Build Coastguard Workercglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
9892*c0909341SAndroid Build Coastguard Worker                                       dst_w, h, src_w, dx, mx0, pxmax
9893*c0909341SAndroid Build Coastguard Worker%elif STACK_ALIGNMENT >= 16
9894*c0909341SAndroid Build Coastguard Workercglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
9895*c0909341SAndroid Build Coastguard Worker                                     dst_w, h, src_w, dx, mx0, pxmax
9896*c0909341SAndroid Build Coastguard Worker%else
9897*c0909341SAndroid Build Coastguard Workercglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
9898*c0909341SAndroid Build Coastguard Worker                                     dst_w, h, src_w, dx, mx0, pxmax
9899*c0909341SAndroid Build Coastguard Worker%endif
9900*c0909341SAndroid Build Coastguard Worker    movifnidn         dstq, dstmp
9901*c0909341SAndroid Build Coastguard Worker    movifnidn         srcq, srcmp
9902*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
9903*c0909341SAndroid Build Coastguard Worker    movifnidn       dst_wd, dst_wm
9904*c0909341SAndroid Build Coastguard Worker%endif
9905*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9906*c0909341SAndroid Build Coastguard Worker    movifnidn           hd, hm
9907*c0909341SAndroid Build Coastguard Worker%endif
9908*c0909341SAndroid Build Coastguard Worker    sub         dword mx0m, 4<<14
9909*c0909341SAndroid Build Coastguard Worker    sub       dword src_wm, 8
9910*c0909341SAndroid Build Coastguard Worker    movd                m4, pxmaxm
9911*c0909341SAndroid Build Coastguard Worker    movd                m7, dxm
9912*c0909341SAndroid Build Coastguard Worker    movd                m6, mx0m
9913*c0909341SAndroid Build Coastguard Worker    movd                m5, src_wm
9914*c0909341SAndroid Build Coastguard Worker    punpcklwd           m4, m4
9915*c0909341SAndroid Build Coastguard Worker    pshufd              m4, m4, q0000
9916*c0909341SAndroid Build Coastguard Worker    pshufd              m7, m7, q0000
9917*c0909341SAndroid Build Coastguard Worker    pshufd              m6, m6, q0000
9918*c0909341SAndroid Build Coastguard Worker    pshufd              m5, m5, q0000
9919*c0909341SAndroid Build Coastguard Worker    mova [rsp+16*3*ARCH_X86_32], m4
9920*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9921*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
9922*c0909341SAndroid Build Coastguard Worker    LEA                 r7, $$
9923*c0909341SAndroid Build Coastguard Worker %define base r7-$$
9924*c0909341SAndroid Build Coastguard Worker%else
9925*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
9926*c0909341SAndroid Build Coastguard Worker %define hd dword r5m
9927*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT >= 16
9928*c0909341SAndroid Build Coastguard Worker    LEA                 r6, $$
9929*c0909341SAndroid Build Coastguard Worker  %define base r6-$$
9930*c0909341SAndroid Build Coastguard Worker %else
9931*c0909341SAndroid Build Coastguard Worker    LEA                 r4, $$
9932*c0909341SAndroid Build Coastguard Worker  %define base r4-$$
9933*c0909341SAndroid Build Coastguard Worker %endif
9934*c0909341SAndroid Build Coastguard Worker%endif
9935*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9936*c0909341SAndroid Build Coastguard Worker    mova               m12, [base+pd_64]
9937*c0909341SAndroid Build Coastguard Worker    mova               m11, [base+pd_63]
9938*c0909341SAndroid Build Coastguard Worker%else
9939*c0909341SAndroid Build Coastguard Worker %define m12 [base+pd_64]
9940*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_63]
9941*c0909341SAndroid Build Coastguard Worker%endif
9942*c0909341SAndroid Build Coastguard Worker    pmaddwd             m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
9943*c0909341SAndroid Build Coastguard Worker    pslld               m7, 2                      ; dx*4
9944*c0909341SAndroid Build Coastguard Worker    pslld               m5, 14
9945*c0909341SAndroid Build Coastguard Worker    paddd               m6, m4                     ; mx+[0..3]*dx
9946*c0909341SAndroid Build Coastguard Worker    SCRATCH              7, 15, 0
9947*c0909341SAndroid Build Coastguard Worker    SCRATCH              6, 14, 1
9948*c0909341SAndroid Build Coastguard Worker    SCRATCH              5, 13, 2
9949*c0909341SAndroid Build Coastguard Worker    pxor                m1, m1
9950*c0909341SAndroid Build Coastguard Worker.loop_y:
9951*c0909341SAndroid Build Coastguard Worker    xor                 xd, xd
9952*c0909341SAndroid Build Coastguard Worker    mova                m0, m14            ; per-line working version of mx
9953*c0909341SAndroid Build Coastguard Worker.loop_x:
9954*c0909341SAndroid Build Coastguard Worker    pcmpgtd             m1, m0
9955*c0909341SAndroid Build Coastguard Worker    pandn               m1, m0
9956*c0909341SAndroid Build Coastguard Worker    psrad               m2, m0, 8          ; filter offset (unmasked)
9957*c0909341SAndroid Build Coastguard Worker    pcmpgtd             m3, m13, m1
9958*c0909341SAndroid Build Coastguard Worker    pand                m1, m3
9959*c0909341SAndroid Build Coastguard Worker    pandn               m3, m13
9960*c0909341SAndroid Build Coastguard Worker    por                 m1, m3
9961*c0909341SAndroid Build Coastguard Worker    psubd               m3, m0, m1         ; pshufb offset
9962*c0909341SAndroid Build Coastguard Worker    psrad               m1, 14             ; clipped src_x offset
9963*c0909341SAndroid Build Coastguard Worker    psrad               m3, 14             ; pshufb edge_emu offset
9964*c0909341SAndroid Build Coastguard Worker    pand                m2, m11            ; filter offset (masked)
9965*c0909341SAndroid Build Coastguard Worker    ; load source pixels
9966*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9967*c0909341SAndroid Build Coastguard Worker    movd               r8d, m1
9968*c0909341SAndroid Build Coastguard Worker    pshuflw             m1, m1, q3232
9969*c0909341SAndroid Build Coastguard Worker    movd               r9d, m1
9970*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m1, m1
9971*c0909341SAndroid Build Coastguard Worker    movd              r10d, m1
9972*c0909341SAndroid Build Coastguard Worker    psrlq               m1, 32
9973*c0909341SAndroid Build Coastguard Worker    movd              r11d, m1
9974*c0909341SAndroid Build Coastguard Worker    movu                m4, [srcq+r8*2]
9975*c0909341SAndroid Build Coastguard Worker    movu                m5, [srcq+r9*2]
9976*c0909341SAndroid Build Coastguard Worker    movu                m6, [srcq+r10*2]
9977*c0909341SAndroid Build Coastguard Worker    movu                m7, [srcq+r11*2]
9978*c0909341SAndroid Build Coastguard Worker    ; if no emulation is required, we don't need to shuffle or emulate edges
9979*c0909341SAndroid Build Coastguard Worker    packssdw            m3, m3
9980*c0909341SAndroid Build Coastguard Worker    movq               r11, m3
9981*c0909341SAndroid Build Coastguard Worker    test               r11, r11
9982*c0909341SAndroid Build Coastguard Worker    jz .filter
9983*c0909341SAndroid Build Coastguard Worker    movsx               r8, r11w
9984*c0909341SAndroid Build Coastguard Worker    sar                r11, 16
9985*c0909341SAndroid Build Coastguard Worker    movsx               r9, r11w
9986*c0909341SAndroid Build Coastguard Worker    sar                r11, 16
9987*c0909341SAndroid Build Coastguard Worker    movsx              r10, r11w
9988*c0909341SAndroid Build Coastguard Worker    sar                r11, 16
9989*c0909341SAndroid Build Coastguard Worker    movu                m1, [base+resize_shuf+8+r8*2]
9990*c0909341SAndroid Build Coastguard Worker    movu                m3, [base+resize_shuf+8+r9*2]
9991*c0909341SAndroid Build Coastguard Worker    movu                m8, [base+resize_shuf+8+r10*2]
9992*c0909341SAndroid Build Coastguard Worker    movu                m9, [base+resize_shuf+8+r11*2]
9993*c0909341SAndroid Build Coastguard Worker    pshufb              m4, m1
9994*c0909341SAndroid Build Coastguard Worker    pshufb              m5, m3
9995*c0909341SAndroid Build Coastguard Worker    pshufb              m6, m8
9996*c0909341SAndroid Build Coastguard Worker    pshufb              m7, m9
9997*c0909341SAndroid Build Coastguard Worker.filter:
9998*c0909341SAndroid Build Coastguard Worker    movd               r8d, m2
9999*c0909341SAndroid Build Coastguard Worker    pshuflw             m2, m2, q3232
10000*c0909341SAndroid Build Coastguard Worker    movd               r9d, m2
10001*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m2, m2
10002*c0909341SAndroid Build Coastguard Worker    movd              r10d, m2
10003*c0909341SAndroid Build Coastguard Worker    psrlq               m2, 32
10004*c0909341SAndroid Build Coastguard Worker    movd              r11d, m2
10005*c0909341SAndroid Build Coastguard Worker    movq                m8, [base+resize_filter+r8*8]
10006*c0909341SAndroid Build Coastguard Worker    movq                m2, [base+resize_filter+r9*8]
10007*c0909341SAndroid Build Coastguard Worker    pxor                m9, m9
10008*c0909341SAndroid Build Coastguard Worker    punpcklbw           m1, m9, m8
10009*c0909341SAndroid Build Coastguard Worker    punpcklbw           m3, m9, m2
10010*c0909341SAndroid Build Coastguard Worker    psraw               m1, 8
10011*c0909341SAndroid Build Coastguard Worker    psraw               m3, 8
10012*c0909341SAndroid Build Coastguard Worker    movq               m10, [base+resize_filter+r10*8]
10013*c0909341SAndroid Build Coastguard Worker    movq                m2, [base+resize_filter+r11*8]
10014*c0909341SAndroid Build Coastguard Worker    punpcklbw           m8, m9, m10
10015*c0909341SAndroid Build Coastguard Worker    punpcklbw           m9, m2
10016*c0909341SAndroid Build Coastguard Worker    psraw               m8, 8
10017*c0909341SAndroid Build Coastguard Worker    psraw               m9, 8
10018*c0909341SAndroid Build Coastguard Worker    pmaddwd             m4, m1
10019*c0909341SAndroid Build Coastguard Worker    pmaddwd             m5, m3
10020*c0909341SAndroid Build Coastguard Worker    pmaddwd             m6, m8
10021*c0909341SAndroid Build Coastguard Worker    pmaddwd             m7, m9
10022*c0909341SAndroid Build Coastguard Worker    phaddd              m4, m5
10023*c0909341SAndroid Build Coastguard Worker%else
10024*c0909341SAndroid Build Coastguard Worker    movd                r3, m1
10025*c0909341SAndroid Build Coastguard Worker    pshuflw             m1, m1, q3232
10026*c0909341SAndroid Build Coastguard Worker    movd                r1, m1
10027*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m1, m1
10028*c0909341SAndroid Build Coastguard Worker    movu                m4, [srcq+r3*2]
10029*c0909341SAndroid Build Coastguard Worker    movu                m5, [srcq+r1*2]
10030*c0909341SAndroid Build Coastguard Worker    movd                r3, m1
10031*c0909341SAndroid Build Coastguard Worker    psrlq               m1, 32
10032*c0909341SAndroid Build Coastguard Worker    movd                r1, m1
10033*c0909341SAndroid Build Coastguard Worker    movu                m6, [srcq+r3*2]
10034*c0909341SAndroid Build Coastguard Worker    movu                m7, [srcq+r1*2]
10035*c0909341SAndroid Build Coastguard Worker    ; if no emulation is required, we don't need to shuffle or emulate edges
10036*c0909341SAndroid Build Coastguard Worker    pxor                m1, m1
10037*c0909341SAndroid Build Coastguard Worker    pcmpeqb             m1, m3
10038*c0909341SAndroid Build Coastguard Worker    pmovmskb           r3d, m1
10039*c0909341SAndroid Build Coastguard Worker    cmp                r3d, 0xffff
10040*c0909341SAndroid Build Coastguard Worker    je .filter
10041*c0909341SAndroid Build Coastguard Worker    movd                r3, m3
10042*c0909341SAndroid Build Coastguard Worker    movu                m1, [base+resize_shuf+8+r3*2]
10043*c0909341SAndroid Build Coastguard Worker    pshuflw             m3, m3, q3232
10044*c0909341SAndroid Build Coastguard Worker    movd                r1, m3
10045*c0909341SAndroid Build Coastguard Worker    pshufb              m4, m1
10046*c0909341SAndroid Build Coastguard Worker    movu                m1, [base+resize_shuf+8+r1*2]
10047*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m3, m3
10048*c0909341SAndroid Build Coastguard Worker    movd                r3, m3
10049*c0909341SAndroid Build Coastguard Worker    pshufb              m5, m1
10050*c0909341SAndroid Build Coastguard Worker    movu                m1, [base+resize_shuf+8+r3*2]
10051*c0909341SAndroid Build Coastguard Worker    psrlq               m3, 32
10052*c0909341SAndroid Build Coastguard Worker    movd                r1, m3
10053*c0909341SAndroid Build Coastguard Worker    pshufb              m6, m1
10054*c0909341SAndroid Build Coastguard Worker    movu                m1, [base+resize_shuf+8+r1*2]
10055*c0909341SAndroid Build Coastguard Worker    pshufb              m7, m1
10056*c0909341SAndroid Build Coastguard Worker.filter:
10057*c0909341SAndroid Build Coastguard Worker    mova        [esp+4*16], m6
10058*c0909341SAndroid Build Coastguard Worker    mova        [esp+5*16], m7
10059*c0909341SAndroid Build Coastguard Worker    movd                r3, m2
10060*c0909341SAndroid Build Coastguard Worker    pshuflw             m2, m2, q3232
10061*c0909341SAndroid Build Coastguard Worker    movd                r1, m2
10062*c0909341SAndroid Build Coastguard Worker    movq                m6, [base+resize_filter+r3*8]
10063*c0909341SAndroid Build Coastguard Worker    movq                m7, [base+resize_filter+r1*8]
10064*c0909341SAndroid Build Coastguard Worker    pxor                m3, m3
10065*c0909341SAndroid Build Coastguard Worker    punpcklbw           m1, m3, m6
10066*c0909341SAndroid Build Coastguard Worker    punpcklbw           m3, m7
10067*c0909341SAndroid Build Coastguard Worker    psraw               m1, 8
10068*c0909341SAndroid Build Coastguard Worker    psraw               m3, 8
10069*c0909341SAndroid Build Coastguard Worker    pmaddwd             m4, m1
10070*c0909341SAndroid Build Coastguard Worker    pmaddwd             m5, m3
10071*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m2, m2
10072*c0909341SAndroid Build Coastguard Worker    movd                r3, m2
10073*c0909341SAndroid Build Coastguard Worker    psrlq               m2, 32
10074*c0909341SAndroid Build Coastguard Worker    movd                r1, m2
10075*c0909341SAndroid Build Coastguard Worker    phaddd              m4, m5
10076*c0909341SAndroid Build Coastguard Worker    movq                m2, [base+resize_filter+r3*8]
10077*c0909341SAndroid Build Coastguard Worker    movq                m5, [base+resize_filter+r1*8]
10078*c0909341SAndroid Build Coastguard Worker    mova                m6, [esp+4*16]
10079*c0909341SAndroid Build Coastguard Worker    mova                m7, [esp+5*16]
10080*c0909341SAndroid Build Coastguard Worker    pxor                m3, m3
10081*c0909341SAndroid Build Coastguard Worker    punpcklbw           m1, m3, m2
10082*c0909341SAndroid Build Coastguard Worker    punpcklbw           m3, m5
10083*c0909341SAndroid Build Coastguard Worker    psraw               m1, 8
10084*c0909341SAndroid Build Coastguard Worker    psraw               m3, 8
10085*c0909341SAndroid Build Coastguard Worker    pmaddwd             m6, m1
10086*c0909341SAndroid Build Coastguard Worker    pmaddwd             m7, m3
10087*c0909341SAndroid Build Coastguard Worker%endif
10088*c0909341SAndroid Build Coastguard Worker    phaddd              m6, m7
10089*c0909341SAndroid Build Coastguard Worker    phaddd              m4, m6
10090*c0909341SAndroid Build Coastguard Worker    pxor                m1, m1
10091*c0909341SAndroid Build Coastguard Worker    psubd               m2, m12, m4
10092*c0909341SAndroid Build Coastguard Worker    psrad               m2, 7
10093*c0909341SAndroid Build Coastguard Worker    packssdw            m2, m2
10094*c0909341SAndroid Build Coastguard Worker    pmaxsw              m2, m1
10095*c0909341SAndroid Build Coastguard Worker    pminsw              m2, [rsp+16*3*ARCH_X86_32]
10096*c0909341SAndroid Build Coastguard Worker    movq       [dstq+xq*2], m2
10097*c0909341SAndroid Build Coastguard Worker    paddd               m0, m15
10098*c0909341SAndroid Build Coastguard Worker    add                 xd, 4
10099*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
10100*c0909341SAndroid Build Coastguard Worker    cmp                 xd, dst_wd
10101*c0909341SAndroid Build Coastguard Worker%else
10102*c0909341SAndroid Build Coastguard Worker    cmp                 xd, dst_wm
10103*c0909341SAndroid Build Coastguard Worker%endif
10104*c0909341SAndroid Build Coastguard Worker    jl .loop_x
10105*c0909341SAndroid Build Coastguard Worker    add               dstq, dst_stridemp
10106*c0909341SAndroid Build Coastguard Worker    add               srcq, src_stridemp
10107*c0909341SAndroid Build Coastguard Worker    dec                 hd
10108*c0909341SAndroid Build Coastguard Worker    jg .loop_y
10109*c0909341SAndroid Build Coastguard Worker    RET
10110