xref: /aosp_15_r20/external/libdav1d/src/x86/looprestoration_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker
27*c0909341SAndroid Build Coastguard Worker%include "config.asm"
28*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
29*c0909341SAndroid Build Coastguard Worker
SECTION_RODATA 16

; ---------------------------------------------------------------------------
; pshufb control vectors (one byte index per output byte; an index of -1 has
; its top bit set, which makes pshufb write zero to that byte).
;
; wiener_init   reorders the 8 coefficient bytes loaded with movq from the
;               filter pointer: output bytes 0-7 repeat source bytes 6,7
;               (coefficient word 3), bytes 8-11 repeat source byte 0 and
;               bytes 12-15 alternate source bytes 2 and 4.
; wiener_shufA  byte pairs (1,7) (2,8) ... (8,14) for pmaddubsw.
; wiener_shufB  byte pairs (2,3) (3,4) ... (9,10) for pmaddubsw.
; wiener_shufC  byte pairs (6,5) (7,6) ... (13,12) for pmaddubsw.
; wiener_shufD  zero-extends source bytes 4-11 to eight words.
; wiener_l_shuf replicates source byte 0 into bytes 0-4 and places source
;               bytes 1-11 in bytes 5-15 (used by .h_extend_left).
; sgr_lshuf3    replicates source byte 0 into bytes 0-2, then bytes 1-13.
; sgr_lshuf5    replicates source byte 0 into bytes 0-3, then bytes 1-12.
; pb_0to15      identity byte indices 0..15.
; ---------------------------------------------------------------------------
wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf3:    db  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
sgr_lshuf5:    db  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

; 24 bytes of 0xff followed by 8 bytes of 0. .extend_right reads two unaligned
; 16-byte windows from this table at (pb_right_ext_mask+21+x) and 8 bytes
; further; 0xff lanes keep the loaded pixel, 0 lanes take the replicated edge
; pixel.
pb_right_ext_mask: times 24 db 0xff
                   times 8 db 0
; Broadcast constants: the same value in every byte/word/dword lane of a
; 16-byte vector.
pb_1:          times 16 db 1
pw_256:        times 8 dw 256
pw_2056:       times 8 dw 2056
pw_m16380:     times 8 dw -16380
pd_4096:       times 4 dd 4096
pd_34816:      times 4 dd 34816
pd_0xffff:     times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7

; Table defined outside this file (cextern applies the project symbol prefix).
cextern sgr_x_by_x

SECTION .text
57*c0909341SAndroid Build Coastguard Worker
; movif64 dst, src
; Emits `mov dst, src` only when assembling for x86-64 (ARCH_X86_64 nonzero);
; expands to nothing otherwise. Lets shared code carry a move that is needed
; on one architecture only.
%macro movif64 2 ; dst, src
 %if ARCH_X86_64
    mov             %1, %2
 %endif
%endmacro
63*c0909341SAndroid Build Coastguard Worker
; movif32 dst, src
; Counterpart of movif64: emits `mov dst, src` only when assembling for
; 32-bit x86 (ARCH_X86_32 nonzero); expands to nothing otherwise.
%macro movif32 2 ; dst, src
 %if ARCH_X86_32
    mov             %1, %2
 %endif
%endmacro
69*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; Position-independent data addressing helpers.
;
; x86-32: constants are addressed relative to a base register (PIC_reg) that
; holds the address of PIC_base_offset ($$, the start of the current section).
;   SETUP_PIC reg [, save [, restore]]   (defaults: save=1, restore=0)
;     - resets the slot toggle (pic_reg_stk_off) to 4 and binds PIC_reg to reg
;     - save == 1:    stores reg's current value to [esp] before overwriting it
;     - loads the PIC base into reg with the LEA macro (defined outside this
;       file)
;     - restore == 1: immediately swaps the original value back in via
;       XCHG_PIC_REG
;   XCHG_PIC_REG
;     - stores PIC_reg to [esp+pic_reg_stk_off], flips pic_reg_stk_off between
;       4 and 0, then reloads PIC_reg from the other slot. Successive calls
;       therefore alternate PIC_reg between the PIC base and the value saved
;       at [esp]. Uses the two dwords at [esp+0] and [esp+4].
;   PIC_sym(sym) evaluates to PIC_reg + (sym - PIC_base_offset).
;
; x86-64: XCHG_PIC_REG expands to nothing and PIC_sym(sym) is just sym.
; SETUP_PIC is not defined on x86-64.
; ---------------------------------------------------------------------------
%if ARCH_X86_32
 %define PIC_base_offset $$

 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov        [esp], %1
  %endif
    LEA      PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)

%else
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym)   (sym)
%endif
99*c0909341SAndroid Build Coastguard Worker
100*c0909341SAndroid Build Coastguard Worker%macro WIENER 0
; ---------------------------------------------------------------------------
; wiener_filter7_8bpc: function entry, coefficient setup and row scheduling.
;
; Arguments (by position): dst, stride, left, lpf, w, h, flt (arg 6),
; edge (arg 7). Edge bits tested here: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
; 4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM.
;
; Stack: 384*12 bytes of intermediate row storage (rows are 384*2 bytes
; apart) preceded by 16 bytes (x86-64) or stk_off bytes (x86-32) of scalars.
; [rsp] holds the pointer used for the rows below the block.
;
; t0-t6 are the row ("ring buffer") pointers. dst and lpf are advanced by w
; and w is negated, so rows are walked with a negative index that counts up
; to zero.
;
; The subroutines called here (.h, .h_top, .hv, .hv_bottom and the .v label
; of the ssse3 instantiation) use plain call/ret inside this function.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt, x
    ; x86-64 has enough registers: constants are addressed directly (base 0)
    ; and tmpstride is the same register as stride.
    %define tmpstrideq strideq
    %define base 0
    mov           fltq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    movq           m14, [fltq]      ; 8 bytes of x coefficients at flt+0
    add           lpfq, wq
    movq            m7, [fltq+16]   ; 8 bytes of y coefficients at flt+16
    add           dstq, wq
    lea             t1, [rsp+wq*2+16] ; end of the first intermediate row
    mova           m15, [pw_2056]
    neg             wq
%if cpuflag(ssse3)
    ; Arrange the x coefficients as pmaddubsw/pmullw operands.
    pshufb         m14, [wiener_init]
    mova            m8, [wiener_shufA]
    pshufd         m12, m14, q2222  ; x0 x0
    mova            m9, [wiener_shufB]
    pshufd         m13, m14, q3333  ; x1 x2
    mova           m10, [wiener_shufC]
    punpcklqdq     m14, m14         ; x3
    mova           m11, [wiener_shufD]
%else
    ; SSE2: broadcast each x coefficient as words for pmullw.
    mova           m10, [pw_m16380]
    punpcklwd      m14, m14
    pshufd         m11, m14, q0000 ; x0
    pshufd         m12, m14, q1111 ; x1
    pshufd         m13, m14, q2222 ; x2
    pshufd         m14, m14, q3333 ; x3
%endif
%else
    ; x86-32: only 8 vector registers and 7 GPRs, so m8-m15 and several
    ; scalars/row pointers are aliased to memory operands.
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
    %define m10         [base+wiener_shufC]
    %define m11         [base+wiener_shufD]
    %define stk_off     96
%else
    %define m10         [base+pw_m16380]
    %define m11         [stk+96]
    %define stk_off     112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
    ; r6 is loaded with pb_right_ext_mask+21 below, so [base+sym] resolves to
    ; sym while [r6+x] indexes the right-extension mask directly.
    %define base        r6-pb_right_ext_mask-21
    %define stk         esp
    %define dstq        leftq
    %define edgeb       byte edged
    %define edged       [stk+ 8]
    %define dstmp       [stk+12]
    %define hd    dword [stk+16]
    %define wq          [stk+20]
    %define strideq     [stk+24]
    %define leftmp      [stk+28]
    %define t2          [stk+32]
    %define t4          [stk+36]
    %define t5          [stk+40]
    %define t6          [stk+44]
    %define m8          [base+wiener_shufA]
    %define m9          [base+wiener_shufB]
    %define m12         [stk+48]
    %define m13         [stk+64]
    %define m14         [stk+80]
    %define m15         [base+pw_2056]
    mov             r1, r6m ; flt
    mov             r0, r0m ; dst
    mov             r4, r4m ; w
    mov           lpfq, lpfm
    mov             r2, r7m ; edge
    mov             r5, r5m ; h
    movq            m3, [r1+ 0]
    movq            m7, [r1+16]
    add             r0, r4
    mov             r1, r1m ; stride
    add           lpfq, r4
    mov          edged, r2
    mov             r2, r2m ; left
    mov          dstmp, r0
    lea             t1, [rsp+r4*2+stk_off]
    mov             hd, r5
    neg             r4
    LEA             r6, pb_right_ext_mask+21
    mov             wq, r4
    mov        strideq, r1
    mov         leftmp, r2
    mov             r4, r1          ; tmpstride (r4) = stride
%if cpuflag(ssse3)
    pshufb          m3, [base+wiener_init]
    pshufd          m1, m3, q2222
    pshufd          m2, m3, q3333
    punpcklqdq      m3, m3
%else
    punpcklwd       m3, m3
    pshufd          m0, m3, q0000
    pshufd          m1, m3, q1111
    pshufd          m2, m3, q2222
    pshufd          m3, m3, q3333
    mova           m11, m0
%endif
    ; Spill the prepared x coefficients to their stack slots.
    mova           m12, m1
    mova           m13, m2
    mova           m14, m3
%endif
    psllw           m7, 5
    pshufd          m6, m7, q0000 ; y0 y1
    pshufd          m7, m7, q1111 ; y2 y3
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Two rows above the block are available at lpf: filter them into the
    ; first two intermediate rows (t6/t5 share the first one).
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    ; lpf has advanced by one stride; 4+1 more strides give the same
    ; lpf+6*stride that .no_top computes.
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    add             t3, tmpstrideq
    mov          [rsp], t3 ; below
    mov             t4, t1
    add             t1, 384*2
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    ; Steady state: one .hv call per remaining row.
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    ; Two rows below the block are read from the pointer saved at [rsp].
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    ; .v1/.v2/.v3 run one/two/three trailing .v calls respectively (.v3 falls
    ; into .v2, which jumps here). The .v label of the ssse3 instantiation is
    ; called regardless of which instruction set this macro is expanded for.
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    RET
.no_top:
    ; No rows above: all row pointers start on the first filtered row.
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    lea             t3, [t3+tmpstrideq*2]
    mov          [rsp], t3
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    jmp .v1
; ---------------------------------------------------------------------------
; .extend_right: pad the right edge of the current pixel vectors.
;
; In:    m4, m5 = pixel bytes loaded for the current x position
;        lpfq   = row pointer (already advanced by w, so [lpfq-1] is the
;                 byte just before the row end)
;        xq     = current (negative) column index
;        r6     = pb_right_ext_mask+21 (x86-32 only; set up at function entry)
; Out:   m4, m5 with every byte whose mask lane is 0 replaced by the byte at
;        [lpfq-1]; lanes whose mask is 0xff are unchanged.
; Clobb: m0, m1, m2 (and m3 on the ssse3 path); flags untouched by the SIMD
;        ops. x86-64 temporarily pushes/pops r0 to hold the mask address.
; ---------------------------------------------------------------------------
.extend_right:
    movd            m2, [lpfq-1]    ; edge pixel lands in byte 0
%if ARCH_X86_64
    push            r0
    lea             r0, [pb_right_ext_mask+21]
    movu            m0, [r0+xq+0]   ; mask for m4
    movu            m1, [r0+xq+8]   ; mask for m5
    pop             r0
%else
    movu            m0, [r6+xq+0]   ; mask for m4
    movu            m1, [r6+xq+8]   ; mask for m5
%endif
    ; Broadcast byte 0 of m2 to all 16 bytes.
%if cpuflag(ssse3)
    pxor            m3, m3
    pshufb          m2, m3
%else
    punpcklbw       m2, m2
    pshuflw         m2, m2, q0000
    punpcklqdq      m2, m2
%endif
    ; m4 = (m4 & mask) | (~mask & edge), likewise for m5.
    pand            m4, m0
    pand            m5, m1
    pandn           m0, m2
    pandn           m1, m2
    por             m4, m0
    por             m5, m1
    ret
; ---------------------------------------------------------------------------
; .h / .h_top: horizontal 7-tap pass over one row.
;
; Entry points:
;   .h      when LR_HAVE_LEFT is set, the first four bytes come from the
;           `left` pointer, which is then advanced by 4.
;   .h_top  when LR_HAVE_LEFT is set, the bytes left of the row are read
;           directly from [lpfq+x-4].
;   Both fall back to .h_extend_left (replicate the first pixel) when
;   LR_HAVE_LEFT is clear.
;
; In:    lpfq = row pointer (advanced by w), wq = -w, t1 = destination row
;        (advanced by w words), edge flags, coefficients in m8-m15 as set up
;        at function entry.
; Out:   16 words per iteration written to [t1+x*2], x stepping by 16 until
;        it reaches zero.
; Clobb: m0-m5, xq, flags.
; ---------------------------------------------------------------------------
.h:
    %define stk esp+4 ; offset due to call
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn    leftq, leftmp
    mova            m4, [lpfq+xq]
    movd            m5, [leftq]
    add          leftq, 4
    pslldq          m4, 4           ; make room for the four left bytes
    por             m4, m5
    movifnidn   leftmp, leftq
    jmp .h_main
.h_extend_left:
    ; Build m4 = first pixel repeated in bytes 0-4, then pixels 1-11.
%if cpuflag(ssse3)
    mova            m4, [lpfq+xq]
    pshufb          m4, [base+wiener_l_shuf]
%else
    mova            m5, [lpfq+xq]
    pshufd          m4, m5, q2103
    punpcklbw       m5, m5
    punpcklwd       m5, m5
    movss           m4, m5
%endif
    jmp .h_main
.h_top:
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+xq-4]
.h_main:
    ; m4 covers bytes x-4.., m5 covers bytes x+4..
    movu            m5, [lpfq+xq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             xd, -18
    jl .h_have_right                ; far enough from the row end: no padding
    call .extend_right
.h_have_right:
; %%h7: 7-tap horizontal filter of m4 (-> m0) and m5 (-> m1), 8 words each.
; Clobbers m2-m5.
%macro %%h7 0
%if cpuflag(ssse3)
    ; Byte pairs selected by m8/m9/m10 are multiplied by the paired
    ; coefficients in m12/m13 and summed by pmaddubsw.
    pshufb          m0, m4, m8
    pmaddubsw       m0, m12
    pshufb          m1, m5, m8
    pmaddubsw       m1, m12
    pshufb          m2, m4, m9
    pmaddubsw       m2, m13
    pshufb          m3, m5, m9
    pmaddubsw       m3, m13
    paddw           m0, m2
    pshufb          m2, m4, m10
    pmaddubsw       m2, m13
    paddw           m1, m3
    pshufb          m3, m5, m10
    pmaddubsw       m3, m13
    pshufb          m4, m11         ; center bytes 4-11 zero-extended to words
    paddw           m0, m2
    pmullw          m2, m14, m4
    pshufb          m5, m11
    paddw           m1, m3
    pmullw          m3, m14, m5
    psllw           m4, 7
    psllw           m5, 7
    paddw           m0, m2
    mova            m2, [base+pw_m16380]
    paddw           m1, m3
    paddw           m4, m2          ; (center << 7) + pw_m16380 bias
    paddw           m5, m2
    paddsw          m0, m4          ; final add saturates
    paddsw          m1, m5
%else
    ; SSE2: unpack bytes to words with shifts/unpacks and use pmullw with the
    ; broadcast coefficients in m11 (x0), m12 (x1), m13 (x2), m14 (x3).
    psrldq          m0, m4, 1
    pslldq          m1, m4, 1
    pxor            m3, m3
    punpcklbw       m0, m3
    punpckhbw       m1, m3
    paddw           m0, m1
    pmullw          m0, m11
    psrldq          m1, m4, 2
    pslldq          m2, m4, 2
    punpcklbw       m1, m3
    punpckhbw       m2, m3
    paddw           m1, m2
    pmullw          m1, m12
    paddw           m0, m1
    pshufd          m2, m4, q0321
    punpcklbw       m2, m3
    pmullw          m1, m14, m2
    paddw           m0, m1
    psrldq          m1, m4, 3
    pslldq          m4, 3
    punpcklbw       m1, m3
    punpckhbw       m4, m3
    paddw           m1, m4
    pmullw          m1, m13
    paddw           m0, m1
    psllw           m2, 7
    paddw           m2, m10
    paddsw          m0, m2
    ; Same sequence for the second vector (m5 -> m1).
    psrldq          m1, m5, 1
    pslldq          m2, m5, 1
    punpcklbw       m1, m3
    punpckhbw       m2, m3
    paddw           m1, m2
    pmullw          m1, m11
    psrldq          m2, m5, 2
    pslldq          m4, m5, 2
    punpcklbw       m2, m3
    punpckhbw       m4, m3
    paddw           m2, m4
    pmullw          m2, m12
    paddw           m1, m2
    pshufd          m4, m5, q0321
    punpcklbw       m4, m3
    pmullw          m2, m14, m4
    paddw           m1, m2
    psrldq          m2, m5, 3
    pslldq          m5, 3
    punpcklbw       m2, m3
    punpckhbw       m5, m3
    paddw           m2, m5
    pmullw          m2, m13
    paddw           m1, m2
    psllw           m4, 7
    paddw           m4, m10
    paddsw          m1, m4
%endif
%endmacro
    %%h7
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m15         ; + pw_2056
    paddw           m1, m15
    mova  [t1+xq*2+ 0], m0
    mova  [t1+xq*2+16], m1
    add             xq, 16
    jl .h_loop
    ret
; .hv / .hv_bottom: filter one more input row horizontally and emit one row
; of output by combining it vertically with the rows buffered at t1..t6.
;   .hv        - steps lpfq forward by one stride first; with LR_HAVE_LEFT the
;                4 pixels left of the row come from the `left` array.
;   .hv_bottom - alternate entry that uses lpfq exactly as the caller set it;
;                with LR_HAVE_LEFT the left pixels are read from [lpfq+xq-4].
; xq starts at wq and is stepped by 16 until it is no longer negative, so wq
; is expected to hold a negative offset here (NOTE(review): the setup of wq
; for this function is outside this excerpt).
; On return the new 16-bit row has been stored at t0, 16 output bytes per
; iteration have been written to dstq+xq, dstq has advanced by one stride and
; the row pointers t0..t6 have been rotated by one row.
456*c0909341SAndroid Build Coastguard WorkerALIGN function_align
457*c0909341SAndroid Build Coastguard Worker.hv:
458*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
459*c0909341SAndroid Build Coastguard Worker    mov             xq, wq
460*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
461*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
    ; Build m4 = 4 bytes from `left` followed by the first 12 bytes of the
    ; row: shift the row up by 4 bytes and OR the left pixels into the gap.
    ; leftq is advanced by 4 and written back for the next row (movifnidn is
    ; the x86inc "mov if operands are not identical" helper).
462*c0909341SAndroid Build Coastguard Worker    movifnidn    leftq, leftmp
463*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+xq]
464*c0909341SAndroid Build Coastguard Worker    movd            m5, [leftq]
465*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
466*c0909341SAndroid Build Coastguard Worker    pslldq          m4, 4
467*c0909341SAndroid Build Coastguard Worker    por             m4, m5
468*c0909341SAndroid Build Coastguard Worker    movifnidn   leftmp, leftq
469*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
470*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
    ; No left neighbours available: synthesise them from the row itself.
471*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3)
    ; Single byte shuffle through the wiener_l_shuf table (table contents are
    ; outside this excerpt; presumably it replicates the first pixel).
472*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+xq]
473*c0909341SAndroid Build Coastguard Worker    pshufb          m4, [base+wiener_l_shuf]
474*c0909341SAndroid Build Coastguard Worker%else
    ; SSE2 fallback: rotate the dwords so the row moves up by 4 bytes, expand
    ; byte 0 of the row to a full dword (byte->word->dword duplication) and
    ; drop that dword into the low lane with movss.
475*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+xq]
476*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m5, q2103
477*c0909341SAndroid Build Coastguard Worker    punpcklbw       m5, m5
478*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m5
479*c0909341SAndroid Build Coastguard Worker    movss           m4, m5
480*c0909341SAndroid Build Coastguard Worker%endif
481*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
482*c0909341SAndroid Build Coastguard Worker.hv_bottom:
483*c0909341SAndroid Build Coastguard Worker    mov             xq, wq
484*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
485*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
486*c0909341SAndroid Build Coastguard Worker.hv_loop:
    ; m4 = 16 source bytes starting 4 bytes left of x.
487*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+xq-4]
488*c0909341SAndroid Build Coastguard Worker.hv_main:
    ; m5 = 16 source bytes starting 4 bytes right of x.
489*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+xq+4]
490*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
491*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
    ; Without right neighbours, .extend_right (defined outside this excerpt)
    ; is only called once x >= -18, i.e. close to the end of the row.
492*c0909341SAndroid Build Coastguard Worker    cmp             xd, -18
493*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
494*c0909341SAndroid Build Coastguard Worker    call .extend_right
495*c0909341SAndroid Build Coastguard Worker.hv_have_right:
    ; Horizontal pass: the %%h7 macro leaves its 16-bit results in m0 and m1.
496*c0909341SAndroid Build Coastguard Worker    %%h7
    ; Vertical pass, first 8 columns. Rows that are summed before the multiply
    ; share one coefficient: m2 = t4 + t2, m3 = t3, m5 = t5 + t1 and
    ; m4 = new row + t6.
    ; In the %else (x86-32) branches r2 is loaded with whichever row pointer
    ; is needed next before it is dereferenced.
497*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
498*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+xq*2]
499*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t2+xq*2]
500*c0909341SAndroid Build Coastguard Worker%else
501*c0909341SAndroid Build Coastguard Worker    mov             r2, t4
502*c0909341SAndroid Build Coastguard Worker    mova            m2, [r2+xq*2]
503*c0909341SAndroid Build Coastguard Worker    mov             r2, t2
504*c0909341SAndroid Build Coastguard Worker    paddw           m2, [r2+xq*2]
505*c0909341SAndroid Build Coastguard Worker    mov             r2, t5
506*c0909341SAndroid Build Coastguard Worker%endif
507*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+xq*2]
508*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
509*c0909341SAndroid Build Coastguard Worker    mova            m5, [t5+xq*2]
510*c0909341SAndroid Build Coastguard Worker%else
511*c0909341SAndroid Build Coastguard Worker    mova            m5, [r2+xq*2]
512*c0909341SAndroid Build Coastguard Worker    mov             r2, t6
513*c0909341SAndroid Build Coastguard Worker%endif
514*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t1+xq*2]
    ; Finish the horizontal results: arithmetic shift right by 3, then add
    ; the constant held in m15.
515*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
516*c0909341SAndroid Build Coastguard Worker    psraw           m1, 3
517*c0909341SAndroid Build Coastguard Worker    paddw           m0, m15
518*c0909341SAndroid Build Coastguard Worker    paddw           m1, m15
    ; The t6 row is read before the new row is stored to t0: after the pointer
    ; rotation at the end of this routine t0 and t6 refer to the same buffer,
    ; so the load has to come first.
519*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
520*c0909341SAndroid Build Coastguard Worker    paddw           m4, m0, [t6+xq*2]
521*c0909341SAndroid Build Coastguard Worker%else
522*c0909341SAndroid Build Coastguard Worker    paddw           m4, m0, [r2+xq*2]
523*c0909341SAndroid Build Coastguard Worker    mov             r2, t4
524*c0909341SAndroid Build Coastguard Worker%endif
525*c0909341SAndroid Build Coastguard Worker    mova     [t0+xq*2], m0
    ; Interleave word pairs and let pmaddwd form a*c0 + b*c1 per dword:
    ; (t4+t2, t3) is weighted by m7 and (new+t6, t5+t1) by m6.
526*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2, m3
527*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m7
528*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
529*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m7
530*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
531*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m6
532*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5
533*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m6
534*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
    ; Loads for the second 8 columns (+16 bytes) are interleaved with the
    ; remaining arithmetic of the first 8.
535*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+xq*2+16]
536*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2
537*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
538*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+xq*2+16]
539*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t2+xq*2+16]
540*c0909341SAndroid Build Coastguard Worker    mova            m5, [t5+xq*2+16]
541*c0909341SAndroid Build Coastguard Worker%else
542*c0909341SAndroid Build Coastguard Worker    mova            m2, [r2+xq*2+16]
543*c0909341SAndroid Build Coastguard Worker    mov             r2, t2
544*c0909341SAndroid Build Coastguard Worker    paddw           m2, [r2+xq*2+16]
545*c0909341SAndroid Build Coastguard Worker    mov             r2, t5
546*c0909341SAndroid Build Coastguard Worker    mova            m5, [r2+xq*2+16]
547*c0909341SAndroid Build Coastguard Worker    mov             r2, t6
548*c0909341SAndroid Build Coastguard Worker%endif
549*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t1+xq*2+16]
    ; packuswb is applied to the 32-bit sums on purpose: each dword is seen as
    ; two signed words that saturate to unsigned bytes, and the later psrlw 8
    ; keeps only the byte that came from the upper word of each sum.
550*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m4
551*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
552*c0909341SAndroid Build Coastguard Worker    paddw           m4, m1, [t6+xq*2+16]
553*c0909341SAndroid Build Coastguard Worker%else
554*c0909341SAndroid Build Coastguard Worker    paddw           m4, m1, [r2+xq*2+16]
    ; x86-32: fetch the destination pointer from its stack slot for the store
    ; at the end of the iteration.
555*c0909341SAndroid Build Coastguard Worker    mov           dstq, dstmp
556*c0909341SAndroid Build Coastguard Worker%endif
557*c0909341SAndroid Build Coastguard Worker    mova  [t0+xq*2+16], m1
558*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
559*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m7
560*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
561*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m7
562*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
563*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m6
564*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5
565*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m6
566*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
567*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
568*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m2
    ; Keep the upper-word byte of every sum, then pack the 16 results to bytes.
569*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 8
570*c0909341SAndroid Build Coastguard Worker    psrlw           m1, 8
571*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m1
572*c0909341SAndroid Build Coastguard Worker    mova     [dstq+xq], m0
573*c0909341SAndroid Build Coastguard Worker    add             xq, 16
574*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
    ; Row finished: advance dst and rotate the row pointers by one.
    ; t6<-t5, t5<-t4, t4<-t3, t3<-t2, t2<-t1, t1<-t0, and t0 receives the
    ; former t5 (the same value as the new t6) as the next write target.
575*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
576*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
577*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
578*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
579*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
580*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
581*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
582*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
583*c0909341SAndroid Build Coastguard Worker    mov             t0, t6
584*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: same rotation, with r1/r2 as temporaries for t5/t4, and the
    ; updated dst pointer written back to its stack slot.
585*c0909341SAndroid Build Coastguard Worker    mov          dstmp, dstq
586*c0909341SAndroid Build Coastguard Worker    mov             r1, t5
587*c0909341SAndroid Build Coastguard Worker    mov             r2, t4
588*c0909341SAndroid Build Coastguard Worker    mov             t6, r1
589*c0909341SAndroid Build Coastguard Worker    mov             t5, r2
590*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
591*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
592*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
593*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
594*c0909341SAndroid Build Coastguard Worker    mov             t0, r1
595*c0909341SAndroid Build Coastguard Worker%endif
596*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical-only row step. Emits one output row from the rows already
; buffered at t1..t6 without filtering a new input row. Compared with .hv, the
; t1 row is used in place of a newly filtered row (it is paired with both t6
; and t5), and t1 itself is left unchanged by the pointer shift at the end.
; The body is only assembled for the ssse3 build, as stated on the %if line.
; On return dstq has advanced by one stride and t2..t6 have been shifted.
597*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
598*c0909341SAndroid Build Coastguard Worker.v:
599*c0909341SAndroid Build Coastguard Worker    mov             xq, wq
600*c0909341SAndroid Build Coastguard Worker.v_loop:
    ; First 8 columns: m1 = t4 + t2, m2 = t3, m3 = t1 + t6, m4 = t1 + t5.
    ; In the %else (x86-32) branches r2 is loaded with whichever row pointer
    ; is needed next before it is dereferenced.
601*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
602*c0909341SAndroid Build Coastguard Worker    mova            m1, [t4+xq*2]
603*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+xq*2]
604*c0909341SAndroid Build Coastguard Worker%else
605*c0909341SAndroid Build Coastguard Worker    mov             r2, t4
606*c0909341SAndroid Build Coastguard Worker    mova            m1, [r2+xq*2]
607*c0909341SAndroid Build Coastguard Worker    mov             r2, t2
608*c0909341SAndroid Build Coastguard Worker    paddw           m1, [r2+xq*2]
609*c0909341SAndroid Build Coastguard Worker    mov             r2, t6
610*c0909341SAndroid Build Coastguard Worker%endif
611*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+xq*2]
612*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+xq*2]
613*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
614*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t6+xq*2]
615*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t5+xq*2]
616*c0909341SAndroid Build Coastguard Worker%else
617*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [r2+xq*2]
618*c0909341SAndroid Build Coastguard Worker    mov             r2, t5
619*c0909341SAndroid Build Coastguard Worker    paddw           m4, [r2+xq*2]
620*c0909341SAndroid Build Coastguard Worker    mov             r2, t4
621*c0909341SAndroid Build Coastguard Worker%endif
    ; Interleave word pairs and let pmaddwd form a*c0 + b*c1 per dword:
    ; (t4+t2, t3) is weighted by m7 and (t1+t6, t1+t5) by m6.
622*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m2
623*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m7
624*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m2
625*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m7
626*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4
627*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m6
628*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
629*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m6
630*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
631*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
    ; Second 8 columns (+16 bytes): m2 = t4 + t2, m3 = t3, m4 = t1 + t6,
    ; m5 = t1 + t5.
632*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
633*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+xq*2+16]
634*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t2+xq*2+16]
635*c0909341SAndroid Build Coastguard Worker%else
636*c0909341SAndroid Build Coastguard Worker    mova            m2, [r2+xq*2+16]
637*c0909341SAndroid Build Coastguard Worker    mov             r2, t2
638*c0909341SAndroid Build Coastguard Worker    paddw           m2, [r2+xq*2+16]
639*c0909341SAndroid Build Coastguard Worker    mov             r2, t6
640*c0909341SAndroid Build Coastguard Worker%endif
641*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+xq*2+16]
642*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+xq*2+16]
643*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
644*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5, [t6+xq*2+16]
645*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t5+xq*2+16]
646*c0909341SAndroid Build Coastguard Worker%else
647*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5, [r2+xq*2+16]
648*c0909341SAndroid Build Coastguard Worker    mov             r2, t5
649*c0909341SAndroid Build Coastguard Worker    paddw           m5, [r2+xq*2+16]
    ; x86-32: fetch the destination pointer from its stack slot for the store
    ; at the end of the iteration.
650*c0909341SAndroid Build Coastguard Worker    movifnidn     dstq, dstmp
651*c0909341SAndroid Build Coastguard Worker%endif
    ; packuswb is applied to the 32-bit sums on purpose: each dword is seen as
    ; two signed words that saturate to unsigned bytes, and the later psrlw 8
    ; keeps only the byte that came from the upper word of each sum.
652*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m1
653*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
654*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m7
655*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
656*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m7
657*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
658*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m6
659*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5
660*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m6
661*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
662*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
663*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m2
664*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 8
665*c0909341SAndroid Build Coastguard Worker    psrlw           m1, 8
666*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m1
667*c0909341SAndroid Build Coastguard Worker    mova     [dstq+xq], m0
668*c0909341SAndroid Build Coastguard Worker    add             xq, 16
669*c0909341SAndroid Build Coastguard Worker    jl .v_loop
    ; Row finished: advance dst and shift the row pointers t6<-t5, t5<-t4,
    ; t4<-t3, t3<-t2, t2<-t1. t1 and t0 are not modified here.
670*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
671*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
672*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
673*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
674*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: r1/r2 are temporaries for t5/t4; the updated dst pointer is
    ; written back to its stack slot.
675*c0909341SAndroid Build Coastguard Worker    mov          dstmp, dstq
676*c0909341SAndroid Build Coastguard Worker    mov             r1, t5
677*c0909341SAndroid Build Coastguard Worker    mov             r2, t4
678*c0909341SAndroid Build Coastguard Worker    mov             t6, r1
679*c0909341SAndroid Build Coastguard Worker    mov             t5, r2
680*c0909341SAndroid Build Coastguard Worker%endif
681*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
682*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
683*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
684*c0909341SAndroid Build Coastguard Worker    ret
685*c0909341SAndroid Build Coastguard Worker%endif
686*c0909341SAndroid Build Coastguard Worker
; wiener_filter5_8bpc: function entry and per-architecture setup.
; Both branches load the horizontal coefficients from [flt+0] and the
; vertical coefficients from [flt+16], bias the lpf and dst pointers by w,
; negate w so that row loops can count x up towards zero, and point t1 at the
; first row buffer on the stack. The x86-64 branch keeps everything in
; registers; the x86-32 branch has only 8 XMM registers and redirects several
; "mN" names to memory operands.
687*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; cglobal operands follow the x86inc convention: 4 arguments loaded on entry,
; 13 general-purpose registers, 16 XMM registers, 384*8+16 bytes of stack.
688*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
689*c0909341SAndroid Build Coastguard Worker                                                  w, h, edge, flt, x
    ; Fetch the remaining arguments: flt (arg 6), w, h and edge (arg 7).
690*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
691*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
692*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
693*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
    ; m14 = 8 bytes of horizontal coefficients, m7 = 8 bytes of vertical
    ; coefficients. Pointer adjustments are interleaved with the loads.
694*c0909341SAndroid Build Coastguard Worker    movq           m14, [fltq]
695*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
696*c0909341SAndroid Build Coastguard Worker    movq            m7, [fltq+16]
697*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
698*c0909341SAndroid Build Coastguard Worker    mova            m8, [pw_m16380]
    ; t1 is computed from the still-positive w, so after the negation below
    ; [t1+x*2] with x in [-w, 0) addresses the start of the row buffer.
699*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+16]
700*c0909341SAndroid Build Coastguard Worker    mova           m15, [pw_2056]
701*c0909341SAndroid Build Coastguard Worker    neg             wq
702*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3)
    ; SSSE3: rearrange the coefficients through the wiener_init shuffle and
    ; keep the shuffle masks used by the horizontal filter in m9-m12.
703*c0909341SAndroid Build Coastguard Worker    pshufb         m14, [wiener_init]
704*c0909341SAndroid Build Coastguard Worker    mova            m9, [wiener_shufB]
705*c0909341SAndroid Build Coastguard Worker    pshufd         m13, m14, q3333  ; x1 x2
706*c0909341SAndroid Build Coastguard Worker    mova           m10, [wiener_shufC]
707*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m14, m14         ; x3
708*c0909341SAndroid Build Coastguard Worker    mova           m11, [wiener_shufD]
709*c0909341SAndroid Build Coastguard Worker    mova           m12, [wiener_l_shuf]
710*c0909341SAndroid Build Coastguard Worker%else
    ; SSE2: duplicate every coefficient word, then broadcast one duplicated
    ; pair to all lanes of its own register.
711*c0909341SAndroid Build Coastguard Worker    punpcklwd      m14, m14
712*c0909341SAndroid Build Coastguard Worker    pshufd         m11, m14, q1111 ; x1
713*c0909341SAndroid Build Coastguard Worker    pshufd         m13, m14, q2222 ; x2
714*c0909341SAndroid Build Coastguard Worker    pshufd         m14, m14, q3333 ; x3
715*c0909341SAndroid Build Coastguard Worker%endif
716*c0909341SAndroid Build Coastguard Worker%else
; x86-32: stk_off is the number of stack bytes that precede the row buffers.
; The SSE2 build reserves 16 more bytes because it also keeps m11 on the
; stack.
717*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3)
718*c0909341SAndroid Build Coastguard Worker    %define stk_off     80
719*c0909341SAndroid Build Coastguard Worker%else
720*c0909341SAndroid Build Coastguard Worker    %define m11         [stk+80]
721*c0909341SAndroid Build Coastguard Worker    %define stk_off     96
722*c0909341SAndroid Build Coastguard Worker%endif
; No arguments are loaded automatically (0); 7 general-purpose and 8 XMM
; registers are used.
723*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
    ; Names that are registers on x86-64 become stack slots or constant-table
    ; operands here (NOTE(review): `base`, `dstmp`, `strideq`, `edged`, `wq`,
    ; `hd` and `m13` are defined outside this excerpt).
724*c0909341SAndroid Build Coastguard Worker    %define stk         esp
725*c0909341SAndroid Build Coastguard Worker    %define leftmp      [stk+28]
726*c0909341SAndroid Build Coastguard Worker    %define m8          [base+pw_m16380]
727*c0909341SAndroid Build Coastguard Worker    %define m12         [base+wiener_l_shuf]
728*c0909341SAndroid Build Coastguard Worker    %define m14         [stk+48]
    ; Load all arguments by hand into r0-r5.
729*c0909341SAndroid Build Coastguard Worker    mov             r1, r6m ; flt
730*c0909341SAndroid Build Coastguard Worker    mov             r0, r0m ; dst
731*c0909341SAndroid Build Coastguard Worker    mov             r4, r4m ; w
732*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
733*c0909341SAndroid Build Coastguard Worker    mov             r2, r7m ; edge
734*c0909341SAndroid Build Coastguard Worker    mov             r5, r5m ; h
    ; m2 = horizontal coefficients, m7 = vertical coefficients; r1 is free for
    ; reuse once both have been read through it.
735*c0909341SAndroid Build Coastguard Worker    movq            m2, [r1+ 0]
736*c0909341SAndroid Build Coastguard Worker    movq            m7, [r1+16]
737*c0909341SAndroid Build Coastguard Worker    add             r0, r4
738*c0909341SAndroid Build Coastguard Worker    mov             r1, r1m ; stride
739*c0909341SAndroid Build Coastguard Worker    add           lpfq, r4
740*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
741*c0909341SAndroid Build Coastguard Worker    mov             r2, r2m ; left
742*c0909341SAndroid Build Coastguard Worker    mov          dstmp, r0
    ; t1 is computed from the still-positive w (r4), which is negated below.
743*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+r4*2+stk_off]
744*c0909341SAndroid Build Coastguard Worker    mov             hd, r5
745*c0909341SAndroid Build Coastguard Worker    neg             r4
    ; r6 = address of pb_right_ext_mask+21 (presumably the anchor that `base`
    ; is expressed against; confirm against the `base` definition).
746*c0909341SAndroid Build Coastguard Worker    LEA             r6, pb_right_ext_mask+21
747*c0909341SAndroid Build Coastguard Worker    mov             wq, r4
748*c0909341SAndroid Build Coastguard Worker    mov        strideq, r1
749*c0909341SAndroid Build Coastguard Worker    mov         leftmp, r2
    ; r4 is named tmpstride in the cglobal line; it receives a copy of stride.
750*c0909341SAndroid Build Coastguard Worker    mov             r4, r1
751*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3)
752*c0909341SAndroid Build Coastguard Worker    pshufb          m2, [base+wiener_init]
753*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m2, q3333
754*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m2, m2
755*c0909341SAndroid Build Coastguard Worker%else
756*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m2
757*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m2, q1111
758*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m2, q2222
759*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m2, q3333
760*c0909341SAndroid Build Coastguard Worker    mova           m11, m0
761*c0909341SAndroid Build Coastguard Worker%endif
    ; Store the prepared coefficients under the same names the x86-64 branch
    ; uses (m14 is the stack slot defined above).
762*c0909341SAndroid Build Coastguard Worker    mova           m13, m1
763*c0909341SAndroid Build Coastguard Worker    mova           m14, m2
764*c0909341SAndroid Build Coastguard Worker%endif
    ; Row scheduling for wiener_filter5_8bpc. The first rows are filtered
    ; horizontally into the row buffers (.h_top/.h), then one .hv call is made
    ; per remaining row, and finally the bottom is handled either with two
    ; .hv_bottom calls (LR_HAVE_BOTTOM) or with the vertical-only .v routine.
    ; Row buffers are 384*2 bytes apart; t1 is the buffer written by
    ; .h/.h_top.
    ;
    ; Vertical coefficients: multiply by 32 (shift left by 5), then broadcast
    ; dword 0 into m6 and dword 1 into m7.
765*c0909341SAndroid Build Coastguard Worker    psllw           m7, 5
766*c0909341SAndroid Build Coastguard Worker    pshufd          m6, m7, q0000 ; __ y1
767*c0909341SAndroid Build Coastguard Worker    pshufd          m7, m7, q1111 ; y2 y3
768*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
769*c0909341SAndroid Build Coastguard Worker    jz .no_top
    ; With top rows: filter the two rows at lpf into the buffers that become
    ; t4 and t3.
770*c0909341SAndroid Build Coastguard Worker    call .h_top
771*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
772*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
773*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
774*c0909341SAndroid Build Coastguard Worker    call .h_top
    ; xq is used as scratch: lpfq + 5*tmpstride is saved at [rsp] and reloaded
    ; later as the source of the bottom rows. lpfq then switches to the
    ; destination rows (dstmp).
775*c0909341SAndroid Build Coastguard Worker    lea             xq, [lpfq+tmpstrideq*4]
776*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstmp
777*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
778*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
779*c0909341SAndroid Build Coastguard Worker    add             xq, tmpstrideq
780*c0909341SAndroid Build Coastguard Worker    mov          [rsp], xq ; below
781*c0909341SAndroid Build Coastguard Worker    call .h
782*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
    ; h is decremented once per processed row; if it reaches zero here the
    ; code goes straight to the vertical-only tail.
783*c0909341SAndroid Build Coastguard Worker    dec             hd
784*c0909341SAndroid Build Coastguard Worker    jz .v1
785*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
786*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
787*c0909341SAndroid Build Coastguard Worker    call .h
788*c0909341SAndroid Build Coastguard Worker    dec             hd
789*c0909341SAndroid Build Coastguard Worker    jz .v2
790*c0909341SAndroid Build Coastguard Worker.main:
    ; t0 is set to the t4 buffer before entering the loop (presumably the
    ; buffer .hv writes its newly filtered row to; .hv of this function is
    ; outside this excerpt).
791*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
792*c0909341SAndroid Build Coastguard Worker.main_loop:
793*c0909341SAndroid Build Coastguard Worker    call .hv
794*c0909341SAndroid Build Coastguard Worker    dec             hd
795*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
796*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
797*c0909341SAndroid Build Coastguard Worker    jz .v2
    ; Bottom rows available: continue from the pointer saved at [rsp].
798*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
799*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
800*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
801*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
802*c0909341SAndroid Build Coastguard Worker.end:
803*c0909341SAndroid Build Coastguard Worker    RET
804*c0909341SAndroid Build Coastguard Worker.no_top:
    ; No top rows: t3 is used as scratch to save lpfq + 6*tmpstride at [rsp]
    ; as the bottom-row pointer, then the first row is filtered and reused for
    ; t4, t3 and t2.
805*c0909341SAndroid Build Coastguard Worker    lea             t3, [lpfq+tmpstrideq*4]
806*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstmp
807*c0909341SAndroid Build Coastguard Worker    lea             t3, [t3+tmpstrideq*2]
808*c0909341SAndroid Build Coastguard Worker    mov          [rsp], t3
809*c0909341SAndroid Build Coastguard Worker    call .h
810*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
811*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
812*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
813*c0909341SAndroid Build Coastguard Worker    dec             hd
814*c0909341SAndroid Build Coastguard Worker    jz .v1
815*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
816*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
817*c0909341SAndroid Build Coastguard Worker    call .h
818*c0909341SAndroid Build Coastguard Worker    dec             hd
819*c0909341SAndroid Build Coastguard Worker    jz .v2
    ; The first two .hv calls are given explicit t0 values (t1+384*2, then
    ; 384*6 bytes further) before the shared .main loop takes over with
    ; t0 = t4.
820*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
821*c0909341SAndroid Build Coastguard Worker    call .hv
822*c0909341SAndroid Build Coastguard Worker    dec             hd
823*c0909341SAndroid Build Coastguard Worker    jz .v2
824*c0909341SAndroid Build Coastguard Worker    add             t0, 384*6
825*c0909341SAndroid Build Coastguard Worker    call .hv
826*c0909341SAndroid Build Coastguard Worker    dec             hd
827*c0909341SAndroid Build Coastguard Worker    jnz .main
828*c0909341SAndroid Build Coastguard Worker.v2:
    ; Two vertical-only rows remain. The .v routine is always called through
    ; the ssse3 symbol of this function, so the SSE2 build reuses that code.
    ; Between the two calls dst advances and the row pointers shift by one.
829*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
830*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
831*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
832*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
833*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
834*c0909341SAndroid Build Coastguard Worker    movifnidn    dstmp, dstq
835*c0909341SAndroid Build Coastguard Worker.v1:
    ; One vertical-only row remains; return through the common epilogue.
836*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
837*c0909341SAndroid Build Coastguard Worker    jmp .end
838*c0909341SAndroid Build Coastguard Worker.h:
839*c0909341SAndroid Build Coastguard Worker    %define stk esp+4
840*c0909341SAndroid Build Coastguard Worker    mov             xq, wq
841*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
842*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
843*c0909341SAndroid Build Coastguard Worker    movifnidn    leftq, leftmp
844*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+xq]
845*c0909341SAndroid Build Coastguard Worker    movd            m5, [leftq]
846*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
847*c0909341SAndroid Build Coastguard Worker    pslldq          m4, 4
848*c0909341SAndroid Build Coastguard Worker    por             m4, m5
849*c0909341SAndroid Build Coastguard Worker    movifnidn   leftmp, leftq
850*c0909341SAndroid Build Coastguard Worker    jmp .h_main
851*c0909341SAndroid Build Coastguard Worker.h_extend_left:
852*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3)
853*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+xq]
854*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m12
855*c0909341SAndroid Build Coastguard Worker%else
856*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+xq]
857*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m5, q2103
858*c0909341SAndroid Build Coastguard Worker    punpcklbw       m5, m5
859*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m5
860*c0909341SAndroid Build Coastguard Worker    movss           m4, m5
861*c0909341SAndroid Build Coastguard Worker%endif
862*c0909341SAndroid Build Coastguard Worker    jmp .h_main
863*c0909341SAndroid Build Coastguard Worker.h_top:
864*c0909341SAndroid Build Coastguard Worker    mov             xq, wq
865*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
866*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
867*c0909341SAndroid Build Coastguard Worker.h_loop:
868*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+xq-4]
869*c0909341SAndroid Build Coastguard Worker.h_main:
870*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+xq+4]
871*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
872*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
873*c0909341SAndroid Build Coastguard Worker    cmp             xd, -17
874*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
875*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
876*c0909341SAndroid Build Coastguard Worker.h_have_right:
; %%h5: 5-tap horizontal filter step, expanded in both the .h_loop and the
; .hv_loop row loops.
; In:   m4 = 16 source bytes starting 4 bytes before the current x position
;            (loaded from [lpfq+xq-4] or assembled by the left-edge paths)
;       m5 = 16 source bytes from [lpfq+xq+4], i.e. 8 pixels further on
; Out:  m0 = eight 16-bit filter sums for outputs x+0..x+7
;       m1 = eight 16-bit filter sums for outputs x+8..x+15
; Clobbers m2-m5. The constants in m8-m14 are loaded outside this excerpt and
; have different roles in the SSSE3 and SSE2 paths.
%macro %%h5 0
%if cpuflag(ssse3)
    ; SSSE3: m9/m10/m11 are used as pshufb masks, m13 as the pmaddubsw
    ; multiplier and m14 as the pmullw multiplier.
    ; NOTE(review): presumably m9/m10 gather the tap byte pairs and m11 widens
    ; the centre pixels to words -- confirm at the constant setup.
    pshufb          m0, m4, m9
    pmaddubsw       m0, m13
    pshufb          m1, m5, m9
    pmaddubsw       m1, m13
    pshufb          m2, m4, m10
    pmaddubsw       m2, m13
    pshufb          m3, m5, m10
    pmaddubsw       m3, m13
    pshufb          m4, m11             ; m4/m5 are used as word lanes from here
    paddw           m0, m2
    pmullw          m2, m14, m4
    pshufb          m5, m11
    paddw           m1, m3
    pmullw          m3, m14, m5
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m8              ; (m11-shuffled words << 7) + constant m8
    paddw           m5, m8
    paddw           m0, m2
    paddw           m1, m3
    paddsw          m0, m4              ; final add uses signed saturation
    paddsw          m1, m5
%else
    ; SSE2: byte i of m4 is pixel x-4+i, so for output j (0..7) the centre tap
    ; is byte j+4. m11 multiplies the +/-2 pair, m13 the +/-1 pair and m14 the
    ; centre pixel.
    psrldq          m0, m4, 2           ; low 8 bytes  = pixels x+j-2
    pslldq          m1, m4, 2           ; high 8 bytes = pixels x+j+2
    pxor            m3, m3              ; zero, used to widen bytes to words
    punpcklbw       m0, m3
    punpckhbw       m1, m3
    paddw           m0, m1              ; p[-2] + p[+2]
    pmullw          m0, m11
    pshufd          m2, m4, q0321       ; rotate down one dword: low 8 = centre
    punpcklbw       m2, m3
    pmullw          m1, m14, m2
    paddw           m0, m1
    psrldq          m1, m4, 3           ; low 8 bytes  = pixels x+j-1
    pslldq          m4, 3               ; high 8 bytes = pixels x+j+1
    punpcklbw       m1, m3
    punpckhbw       m4, m3
    paddw           m1, m4              ; p[-1] + p[+1]
    pmullw          m1, m13
    paddw           m0, m1
    psllw           m2, 7
    paddw           m2, m8              ; (centre << 7) + constant m8
    paddsw          m0, m2              ; final add uses signed saturation
    ; Same sequence for the second 8 outputs, sourced from m5.
    psrldq          m1, m5, 2
    pslldq          m4, m5, 2
    punpcklbw       m1, m3
    punpckhbw       m4, m3
    paddw           m1, m4
    pmullw          m1, m11
    pshufd          m4, m5, q0321
    punpcklbw       m4, m3
    pmullw          m2, m14, m4
    paddw           m1, m2
    psrldq          m2, m5, 3
    pslldq          m5, 3
    punpcklbw       m2, m3
    punpckhbw       m5, m3
    paddw           m2, m5
    pmullw          m2, m13
    paddw           m1, m2
    psllw           m4, 7
    paddw           m4, m8
    paddsw          m1, m4
%endif
%endmacro
945*c0909341SAndroid Build Coastguard Worker    %%h5
946*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
947*c0909341SAndroid Build Coastguard Worker    psraw           m1, 3
948*c0909341SAndroid Build Coastguard Worker    paddw           m0, m15
949*c0909341SAndroid Build Coastguard Worker    paddw           m1, m15
950*c0909341SAndroid Build Coastguard Worker    mova  [t1+xq*2+ 0], m0
951*c0909341SAndroid Build Coastguard Worker    mova  [t1+xq*2+16], m1
952*c0909341SAndroid Build Coastguard Worker    add             xq, 16
953*c0909341SAndroid Build Coastguard Worker    jl .h_loop
954*c0909341SAndroid Build Coastguard Worker    ret
; .hv: horizontally filter one input row and emit one vertically filtered
; output row.
;   - Advances lpfq by strideq, then walks xq from wq up towards zero in steps
;     of 16 (the loop runs while xq is negative).
;   - The horizontal result (after >>3 and +m15) is stored to row buffer t0.
;   - The vertical 5-tap sum combines t0 (current) with t1..t4 and writes 16
;     bytes to [dstq+xq].
;   - On exit dstq has advanced by strideq and the row pointers are rotated.
; .hv_bottom is a second entry point: it does not advance lpfq and, when
; LR_HAVE_LEFT is set, reads the left pixels straight from the row
; ([lpfq+xq-4]) instead of from the left[] buffer.
; Constants m6/m7/m12/m15 and the t0-t4 definitions live outside this excerpt.
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    ; Left edge available: prepend 4 bytes from left[] to the first 12 row
    ; bytes, then step the left pointer to its next 4-byte entry.
    movifnidn    leftq, leftmp
    mova            m4, [lpfq+xq]
    movd            m5, [leftq]
    add          leftq, 4
    pslldq          m4, 4
    por             m4, m5
    movifnidn   leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    ; m12 is a shuffle mask loaded outside this excerpt; presumably it produces
    ; the same layout as the SSE2 sequence below -- confirm at its definition.
    mova            m4, [lpfq+xq]
    pshufb          m4, m12
%else
    ; Build [p0 p0 p0 p0 | row bytes 0..11]: rotate the row up one dword, then
    ; overwrite the low dword with the first pixel replicated four times.
    mova            m5, [lpfq+xq]
    pshufd          m4, m5, q2103
    punpcklbw       m5, m5
    punpcklwd       m5, m5
    movss           m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+xq-4]
.hv_main:
    movu            m5, [lpfq+xq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             xd, -17
    jl .hv_have_right
    ; No right edge and xd >= -17: use the helper shared with the 7-tap filter.
    ; Its body is outside this excerpt; presumably it pads m4/m5 past the edge.
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
    %%h5
    mova            m2, [t3+xq*2]
    paddw           m2, [t1+xq*2]       ; m2 = row t3 + row t1
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m15
    paddw           m1, m15
%if ARCH_X86_64
    mova            m3, [t2+xq*2]       ; m3 = row t2
    paddw           m4, m0, [t4+xq*2]   ; m4 = current row + row t4
%else
    ; x86-32: t2/t4 are copied into r2 before being used as a base
    ; (presumably they are stack slots there -- see their definitions).
    mov             r2, t2
    mova            m3, [r2+xq*2]
    mov             r2, t4
    paddw           m4, m0, [r2+xq*2]
%endif
    mova     [t0+xq*2], m0              ; store only after [t4] has been read
    punpcklwd       m0, m2, m3
    pmaddwd         m0, m7              ; (t3+t1, t2) word pairs times m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m4
    pmaddwd         m3, m6              ; (t0+t4) paired with itself times m6
    punpckhwd       m4, m4
    pmaddwd         m4, m6
    paddd           m0, m3              ; dword sums for outputs 0..3
    paddd           m4, m2              ; dword sums for outputs 4..7
    mova            m2, [t3+xq*2+16]
    paddw           m2, [t1+xq*2+16]
    ; packuswb views each dword as two signed words and saturates each to a
    ; byte; the later psrlw 8 keeps the byte from the upper word, so every
    ; result word ends up as clamp(sum >> 16, 0, 255).
    packuswb        m0, m4
%if ARCH_X86_64
    mova            m3, [t2+xq*2+16]
    paddw           m4, m1, [t4+xq*2+16]
%else
    paddw           m4, m1, [r2+xq*2+16] ; r2 still holds t4 here
    mov             r2, t2
    mova            m3, [r2+xq*2+16]
    mov           dstq, dstmp
%endif
    mova  [t0+xq*2+16], m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m4
    pmaddwd         m3, m6
    punpckhwd       m4, m4
    pmaddwd         m4, m6
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1              ; 16 words -> 16 output bytes
    mova     [dstq+xq], m0
    add             xq, 16
    jl .hv_loop
    add           dstq, strideq
    ; Rotate the row pointers. t0 ends up equal to the new t4 (the old t3), so
    ; the two share one buffer; this is safe because the loop above reads
    ; [t4+...] before it stores to [t0+...] at the same offset.
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    movifnidn    dstmp, dstq
    ret
; .v: vertical-only pass that emits one output row from the stored
; intermediate rows, without running the horizontal filter.
;   - Row t1 is used twice: paired with t3 (taps around t2) and with t4.
;   - xq walks from wq up towards zero in steps of 16; 16 bytes are written to
;     [dstq+xq] per iteration.
;   - Neither dstq nor the t0-t4 pointers are updated here.
; This definition is only assembled for the SSSE3 instantiation (it sits inside
; %if cpuflag(ssse3)). Constants m6/m7 are loaded outside this excerpt.
%if cpuflag(ssse3)
.v:
    mov             xq, wq
.v_loop:
    mova            m3, [t1+xq*2]
    paddw           m1, m3, [t3+xq*2]   ; m1 = row t1 + row t3
%if ARCH_X86_64
    mova            m2, [t2+xq*2]       ; m2 = row t2
    paddw           m3, [t4+xq*2]       ; m3 = row t1 + row t4
%else
    ; x86-32: t2/t4 are copied into r2 before being used as a base.
    mov             r2, t2
    mova            m2, [r2+xq*2]
    mov             r2, t4
    paddw           m3, [r2+xq*2]
%endif
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m7              ; (t1+t3, t2) word pairs times m7
    punpckhwd       m1, m2
    pmaddwd         m1, m7
    ; The low half pairs t2 with (t1+t4) while the high half pairs (t1+t4)
    ; with itself; both halves agree only if the even words of m6 are zero.
    ; NOTE(review): presumably guaranteed by the m6 setup -- confirm there.
    punpcklwd       m2, m3
    pmaddwd         m2, m6
    punpckhwd       m3, m3
    pmaddwd         m3, m6
    paddd           m0, m2              ; dword sums for outputs 0..3
    paddd           m1, m3              ; dword sums for outputs 4..7
    mova            m4, [t1+xq*2+16]
    paddw           m2, m4, [t3+xq*2+16]
%if ARCH_X86_64
    mova            m3, [t2+xq*2+16]
    paddw           m4, [t4+xq*2+16]
%else
    paddw           m4, [r2+xq*2+16]    ; r2 still holds t4 here
    mov             r2, t2
    mova            m3, [r2+xq*2+16]
    mov           dstq, dstmp
%endif
    ; packuswb saturates both words of every dword to bytes; psrlw 8 further
    ; down keeps the byte from the upper word: clamp(sum >> 16, 0, 255).
    packuswb        m0, m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4
    pmaddwd         m3, m6
    punpckhwd       m4, m4
    pmaddwd         m4, m6
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1              ; 16 words -> 16 output bytes
    mova     [dstq+xq], m0
    add             xq, 16
    jl .v_loop
    ret
%endif
1116*c0909341SAndroid Build Coastguard Worker%endmacro
1117*c0909341SAndroid Build Coastguard Worker
; Expand the WIENER macro body once per instruction set. The cpuflag(ssse3)
; conditionals inside the body select the code path for each expansion.
INIT_XMM sse2
WIENER

INIT_XMM ssse3
WIENER
1123*c0909341SAndroid Build Coastguard Worker
;;;;;;;;;;;;;;;;;;;;;;;;;;
;;      self-guided     ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;
1127*c0909341SAndroid Build Coastguard Worker
; GATHERDD dst, src, tmp
; Emulated 4-lane table gather (no gather instruction is used).
;   src = xmm register holding four dword indices
;   dst = xmm register receiving the gathered data (must differ from src)
;   tmp = general-purpose register used as the index scratch
; Table base: r13 on x86-64 (set to sgr_x_by_x-0xf03 by the caller), or
; base+sgr_x_by_x-0xf03 on x86-32.
; Only the TOP BYTE of each dst dword is meaningful: it is the table byte at
; base+idx+3, i.e. sgr_x_by_x[idx-0xf00]. Lane 0 is loaded with a 4-byte movd;
; lanes 1-3 insert the 16-bit word at base+idx+2 into the upper word of the
; lane, so their lower words stay zero. Callers are expected to shift right by
; 24 afterwards (GATHER_X_BY_X does so).
; Lane 0 uses the full 32-bit index; lanes 1-3 use only the low 16 bits of
; their index (pextrw), so indices must fit in 16 bits.
%macro GATHERDD 3 ; dst, src, tmp
    movd           %3d, %2              ; index of lane 0
 %if ARCH_X86_64
    movd            %1, [r13+%3]        ; also zeroes the rest of dst
    pextrw         %3d, %2, 2           ; index of lane 1 (low word)
    pinsrw          %1, [r13+%3+2], 3
    pextrw         %3d, %2, 4           ; index of lane 2
    pinsrw          %1, [r13+%3+2], 5
    pextrw         %3d, %2, 6           ; index of lane 3
    pinsrw          %1, [r13+%3+2], 7
 %else
    movd            %1, [base+sgr_x_by_x-0xf03+%3]
    pextrw          %3, %2, 2
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
    pextrw          %3, %2, 4
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
    pextrw          %3, %2, 6
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
 %endif
%endmacro
1148*c0909341SAndroid Build Coastguard Worker
; GATHER_X_BY_X dst, src0, src1, tmp32, tmp32_restore
; Looks up eight table bytes through GATHERDD and returns them as eight words.
;   src0, src1    = xmm registers with four dword indices each
;   dst           = words 0-3 from the src0 lookups, words 4-7 from src1
;   tmp32         = scratch GPR used on x86-32 (x86-64 always uses r14)
;   tmp32_restore = operand handed to movif32 to reload tmp32 afterwards
;                   (movif32 is defined elsewhere; presumably it only emits a
;                   mov on x86-32)
; Clobbers src0 (it receives the src1 lookups) and the scratch register.
; Also (re)defines the single-line macro `tmp`.
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
 %if ARCH_X86_64
  %define tmp r14
 %else
  %define tmp %4
 %endif
    GATHERDD        %1, %2, tmp
    GATHERDD        %2, %3, tmp         ; src0 is dead by now; reuse it as dst
    movif32         %4, %5
    psrld           %1, 24              ; keep only the looked-up top byte
    psrld           %2, 24
    packssdw        %1, %2              ; values are 0..255, so no saturation
%endmacro
1162*c0909341SAndroid Build Coastguard Worker
; MULLD dst, src, tmp
; Per-dword multiply keeping the low 32 bits, built from 16-bit multiplies
; (pmulld needs SSE4.1, which this file's sse2/ssse3 targets do not have).
;   dst = four unsigned dwords, replaced by the products
;   src = multiplier; the arithmetic below yields dst*s (mod 2^32) only when
;         each dword of src holds the same 16-bit value s in both of its words
;         -- NOTE(review): confirm at the call sites
;   tmp = scratch xmm register, must differ from dst and src
; With dst = hi:lo (16-bit halves):
;   pmullw  -> words (lo*s & 0xffff) and (hi*s & 0xffff)
;   pmulhuw -> upper 16 bits of lo*s, moved into the high word by pslld 16
;   paddd   -> lo*s + ((hi*s) << 16), truncated to 32 bits
%macro MULLD 3 ; dst, src, tmp
    pmulhuw         %3, %1, %2          ; must run before dst is overwritten
    pmullw          %1, %2
    pslld           %3, 16
    paddd           %1, %3
%endmacro
1169*c0909341SAndroid Build Coastguard Worker
1170*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1171*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 0, 1, 2, 3, 5
1172*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1173*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 5*16
1174*c0909341SAndroid Build Coastguard Worker %else
1175*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 3*16
1176*c0909341SAndroid Build Coastguard Worker %endif
1177*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
1178*c0909341SAndroid Build Coastguard Worker                             dst, stride, left, lpf, w
1179*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1180*c0909341SAndroid Build Coastguard Worker  %define dstm         dword [esp+calloff+16*0+4*6]
1181*c0909341SAndroid Build Coastguard Worker  %define stridemp     dword [esp+calloff+16*0+4*7]
1182*c0909341SAndroid Build Coastguard Worker  %define leftm        dword [esp+calloff+16*3+4*0]
1183*c0909341SAndroid Build Coastguard Worker  %define lpfm         dword [esp+calloff+16*3+4*1]
1184*c0909341SAndroid Build Coastguard Worker  %define w0m          dword [esp+calloff+16*3+4*2]
1185*c0909341SAndroid Build Coastguard Worker  %define hd           dword [esp+calloff+16*3+4*3]
1186*c0909341SAndroid Build Coastguard Worker  %define edgeb         byte [esp+calloff+16*3+4*4]
1187*c0909341SAndroid Build Coastguard Worker  %define edged        dword [esp+calloff+16*3+4*4]
1188*c0909341SAndroid Build Coastguard Worker  %define leftmp leftm
1189*c0909341SAndroid Build Coastguard Worker %else
1190*c0909341SAndroid Build Coastguard Worker  %define w0m wm
1191*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
1192*c0909341SAndroid Build Coastguard Worker  %define edgeb  byte r7m
1193*c0909341SAndroid Build Coastguard Worker  %define edged dword r7m
1194*c0909341SAndroid Build Coastguard Worker %endif
1195*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0]
1196*c0909341SAndroid Build Coastguard Worker %define w1m    dword [esp+calloff+4*1]
1197*c0909341SAndroid Build Coastguard Worker %define t0m    dword [esp+calloff+4*2]
1198*c0909341SAndroid Build Coastguard Worker %define t2m    dword [esp+calloff+4*3]
1199*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*4]
1200*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*5]
1201*c0909341SAndroid Build Coastguard Worker %define  m8 [base+pb_1]
1202*c0909341SAndroid Build Coastguard Worker %define  m9 [esp+calloff+16*2]
1203*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0xf00800a4]
1204*c0909341SAndroid Build Coastguard Worker %define m11 [base+sgr_lshuf5]
1205*c0909341SAndroid Build Coastguard Worker %define m12 [base+pd_34816]
1206*c0909341SAndroid Build Coastguard Worker %define m13 [base+pb_0to15]
1207*c0909341SAndroid Build Coastguard Worker %define r10 r4
1208*c0909341SAndroid Build Coastguard Worker %define base r6-$$
1209*c0909341SAndroid Build Coastguard Worker %assign calloff 0
1210*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1211*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rstk+stack_offset+ 8]
1212*c0909341SAndroid Build Coastguard Worker    mov          leftq, [rstk+stack_offset+12]
1213*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rstk+stack_offset+16]
1214*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
1215*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1216*c0909341SAndroid Build Coastguard Worker    mov       stridemp, strideq
1217*c0909341SAndroid Build Coastguard Worker    mov          leftm, leftq
1218*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+24]
1219*c0909341SAndroid Build Coastguard Worker    mov             r2, [rstk+stack_offset+32]
1220*c0909341SAndroid Build Coastguard Worker    mov           lpfm, lpfq
1221*c0909341SAndroid Build Coastguard Worker    mov             hd, r1
1222*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
1223*c0909341SAndroid Build Coastguard Worker %endif
1224*c0909341SAndroid Build Coastguard Worker%else
1225*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12
1226*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
1227*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, params
1228*c0909341SAndroid Build Coastguard Worker%endif
1229*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
1230*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
1231*c0909341SAndroid Build Coastguard Worker%endif
1232*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1233*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1234*c0909341SAndroid Build Coastguard Worker    lea            r13, [sgr_x_by_x-0xf03]
1235*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1236*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1237*c0909341SAndroid Build Coastguard Worker    movu            m9, [paramsq]
1238*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1239*c0909341SAndroid Build Coastguard Worker    mova            m8, [pb_1]
1240*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+20]
1241*c0909341SAndroid Build Coastguard Worker    mova           m10, [pd_0xf00800a4]
1242*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1243*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+400*12+16]
1244*c0909341SAndroid Build Coastguard Worker    mova           m12, [pd_34816]  ; (1 << 11) + (1 << 15)
1245*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq*2+400*20+16]
1246*c0909341SAndroid Build Coastguard Worker    pshufhw         m7, m9, q0000
1247*c0909341SAndroid Build Coastguard Worker    pshufb          m9, [pw_256]  ; s0
1248*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m7, m7        ; w0
1249*c0909341SAndroid Build Coastguard Worker    neg             wq
1250*c0909341SAndroid Build Coastguard Worker    mova           m13, [pb_0to15]
1251*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1252*c0909341SAndroid Build Coastguard Worker    mova           m11, [sgr_lshuf5]
1253*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1254*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
1255*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
1256*c0909341SAndroid Build Coastguard Worker%else
1257*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+28] ; params
1258*c0909341SAndroid Build Coastguard Worker    LEA             r6, $$
1259*c0909341SAndroid Build Coastguard Worker    movu            m1, [r1]
1260*c0909341SAndroid Build Coastguard Worker    add           lpfm, wq
1261*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+extra_stack+wq*2+20]
1262*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1263*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+extra_stack+wq*4+400*12+16]
1264*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1265*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+extra_stack+wq*2+400*20+16]
1266*c0909341SAndroid Build Coastguard Worker    mov            t3m, t3
1267*c0909341SAndroid Build Coastguard Worker    pshufhw         m7, m1, q0000
1268*c0909341SAndroid Build Coastguard Worker    mov            t4m, t4
1269*c0909341SAndroid Build Coastguard Worker    pshufb          m1, [base+pw_256] ; s0
1270*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m7, m7            ; w0
1271*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1272*c0909341SAndroid Build Coastguard Worker    neg             wq
1273*c0909341SAndroid Build Coastguard Worker    mova            m9, m1
1274*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1275*c0909341SAndroid Build Coastguard Worker    mov            w1m, wd
1276*c0909341SAndroid Build Coastguard Worker    sub             wd, 2
1277*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1278*c0909341SAndroid Build Coastguard Worker    mov            w0m, wd
1279*c0909341SAndroid Build Coastguard Worker %define strideq r5
1280*c0909341SAndroid Build Coastguard Worker%endif
1281*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1282*c0909341SAndroid Build Coastguard Worker    jz .no_top
1283*c0909341SAndroid Build Coastguard Worker    call .h_top
1284*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1285*c0909341SAndroid Build Coastguard Worker    movif32        t2m, t1
1286*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1287*c0909341SAndroid Build Coastguard Worker    call .top_fixup
1288*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6
1289*c0909341SAndroid Build Coastguard Worker    call .h_top
1290*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1291*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1292*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1293*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1294*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
1295*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t2
1296*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1297*c0909341SAndroid Build Coastguard Worker    dec             hd
1298*c0909341SAndroid Build Coastguard Worker    jz .height1
1299*c0909341SAndroid Build Coastguard Worker    or           edged, 16
1300*c0909341SAndroid Build Coastguard Worker    call .h
1301*c0909341SAndroid Build Coastguard Worker.main:
1302*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1303*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1304*c0909341SAndroid Build Coastguard Worker    call .hv
1305*c0909341SAndroid Build Coastguard Worker    call .prep_n
1306*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1307*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
1308*c0909341SAndroid Build Coastguard Worker.main_loop:
1309*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1310*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1311*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1312*c0909341SAndroid Build Coastguard Worker    test            hb, hb
1313*c0909341SAndroid Build Coastguard Worker%else
1314*c0909341SAndroid Build Coastguard Worker    mov             r4, hd
1315*c0909341SAndroid Build Coastguard Worker    test            r4, r4
1316*c0909341SAndroid Build Coastguard Worker%endif
1317*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1318*c0909341SAndroid Build Coastguard Worker    call .h
1319*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1320*c0909341SAndroid Build Coastguard Worker    call .hv
1321*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1322*c0909341SAndroid Build Coastguard Worker    call .n0
1323*c0909341SAndroid Build Coastguard Worker    call .n1
1324*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1325*c0909341SAndroid Build Coastguard Worker    movif32         t0, t0m
1326*c0909341SAndroid Build Coastguard Worker    jge .main_loop
1327*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1328*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1329*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1330*c0909341SAndroid Build Coastguard Worker    call .h_top
1331*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1332*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
1333*c0909341SAndroid Build Coastguard Worker.end:
1334*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1335*c0909341SAndroid Build Coastguard Worker    call .n0
1336*c0909341SAndroid Build Coastguard Worker    call .n1
1337*c0909341SAndroid Build Coastguard Worker.end2:
1338*c0909341SAndroid Build Coastguard Worker    RET
; .height1: path for a single remaining row (the branch that reaches this
; label is outside this excerpt). Runs one .hv pass, seeds the stored
; neighbour sums with .prep_n, then joins the shared tail at .odd_height_end.
1339*c0909341SAndroid Build Coastguard Worker.height1:
1340*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1341*c0909341SAndroid Build Coastguard Worker    call .hv
1342*c0909341SAndroid Build Coastguard Worker    call .prep_n
1343*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; .odd_height: taken from the main loop when the remaining-height test yields
; zero. Runs one .hv pass and writes two output rows (.n0, .n1).
; .odd_height_end (shared with .height1 and .no_top_height1) then runs the
; vertical-only pass .v, writes one more row with .n0 and returns via .end2.
1344*c0909341SAndroid Build Coastguard Worker.odd_height:
1345*c0909341SAndroid Build Coastguard Worker    call .hv
1346*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1347*c0909341SAndroid Build Coastguard Worker    call .n0
1348*c0909341SAndroid Build Coastguard Worker    call .n1
1349*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1350*c0909341SAndroid Build Coastguard Worker    call .v
1351*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1352*c0909341SAndroid Build Coastguard Worker    call .n0
1353*c0909341SAndroid Build Coastguard Worker    jmp .end2
; .extend_bottom: taken when edge bit 8 (LR_HAVE_BOTTOM) is clear. Instead of
; reading further rows through .h_top/.hv_bottom, it runs the vertical-only
; pass .v on the sums already held in t1/t2 and then finishes via .end.
1354*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1355*c0909341SAndroid Build Coastguard Worker    call .v
1356*c0909341SAndroid Build Coastguard Worker    jmp .end
; .no_top: start-up path; the branch that reaches this label is outside this
; excerpt (presumably taken when no rows above the block are available -
; confirm against the function prologue).
;   - lpfm  <- lpfq + 6*stride (built with two lea steps); lpfm is what the main
;     loop reloads into lpfq at its LR_HAVE_BOTTOM step.
;   - lpfq  <- dstq, so the first horizontal pass (.h) reads the dst row.
;   - t2    <- t1 + 400*6, then .top_fixup writes the doubled t1 sums to t2.
;   - If only one row remains after the decrement, go to .no_top_height1.
;   - Otherwise set edge bit 16 (tested in .h with the "y > 0" comment),
;     make t0 = old t1 and t1 = t2, and enter .main.
1357*c0909341SAndroid Build Coastguard Worker.no_top:
1358*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1359*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1360*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1361*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1362*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
1363*c0909341SAndroid Build Coastguard Worker    call .h
1364*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
1365*c0909341SAndroid Build Coastguard Worker    movif32        t2m, t2
1366*c0909341SAndroid Build Coastguard Worker    call .top_fixup
1367*c0909341SAndroid Build Coastguard Worker    dec             hd
1368*c0909341SAndroid Build Coastguard Worker    jz .no_top_height1
1369*c0909341SAndroid Build Coastguard Worker    or           edged, 16
1370*c0909341SAndroid Build Coastguard Worker    mov             t0, t1
1371*c0909341SAndroid Build Coastguard Worker    mov             t1, t2
1372*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
1373*c0909341SAndroid Build Coastguard Worker    jmp .main
; .no_top_height1: continuation of .no_top when the height counter reached
; zero after its decrement. Reloads t3/t4 from their memory copies on x86-32
; builds, runs the vertical-only pass .v, seeds the neighbour sums with
; .prep_n and joins the shared tail at .odd_height_end.
1374*c0909341SAndroid Build Coastguard Worker.no_top_height1:
1375*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
1376*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1377*c0909341SAndroid Build Coastguard Worker    call .v
1378*c0909341SAndroid Build Coastguard Worker    call .prep_n
1379*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; .extend_right: per-byte blend applied to the 16 loaded pixels in m5 when the
; right edge is not available. Called from .h_main and .hv_main.
;   In:    m5 = loaded pixel bytes, wd = current column counter,
;          m6/m8/m13 = constants set up outside this excerpt
;          (m6 is used as the zero register elsewhere in this function;
;          m8/m13 presumably hold a byte-index ramp/threshold - confirm).
;   Out:   m5 = (m5 AND mask) OR (broadcast byte AND NOT mask)
;   Clobb: m1, m2, m3
; The %assign lines are assemble-time bookkeeping only: stack_offset/calloff
; are adjusted for this routine's body and adjusted again after its ret
; (NOTE(review): presumably because it is reached through a nested call).
1380*c0909341SAndroid Build Coastguard Worker.extend_right:
1381*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8
1382*c0909341SAndroid Build Coastguard Worker%assign calloff 8
1383*c0909341SAndroid Build Coastguard Worker    movd            m1, wd
1384*c0909341SAndroid Build Coastguard Worker    movd            m3, [lpfq-1]
; pshufb with m6 as the shuffle mask: with an all-zero mask this replicates
; byte 0 of the register into every lane
1385*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m6
1386*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m6
; m2 = signed byte compare (m8 - m1) > m13 -> per-byte select mask
1387*c0909341SAndroid Build Coastguard Worker    psubb           m2, m8, m1
1388*c0909341SAndroid Build Coastguard Worker    pcmpgtb         m2, m13
; keep m5 where the mask is set, take the byte from m3 elsewhere
1389*c0909341SAndroid Build Coastguard Worker    pand            m5, m2
1390*c0909341SAndroid Build Coastguard Worker    pandn           m2, m3
1391*c0909341SAndroid Build Coastguard Worker    por             m5, m2
1392*c0909341SAndroid Build Coastguard Worker    ret
1393*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4
1394*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; .h / .h_top: horizontal 5-tap box sums for one row read from lpfq.
;   Entry .h     : uses the left-edge buffer (leftq) when LR_HAVE_LEFT is set
;                  and advances leftmp by 4.
;   Entry .h_top : same computation, but never touches the left-edge buffer;
;                  with LR_HAVE_LEFT set it starts directly at .h_loop.
;   Per iteration (8 output columns, wq += 8 while wq stays negative):
;     m0       = sum of 5 adjacent pixels (words)
;     m1, m2   = sum of the 5 squared pixels (dwords, low/high half)
;   When edge bit 16 is set ("y > 0") the values already stored in the t1 row
;   are added before storing, so the t1 row then holds the sum of two rows.
;   Stores: [t1+wq*2+400*0] = sum, [t1+wq*2+400*2] / [t1+wq*2+400*4] = sumsq.
;   Clobb: m0-m5, wq, flags (plus m1-m3 again via .extend_right).
1395*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
1396*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1397*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1398*c0909341SAndroid Build Coastguard Worker%else
1399*c0909341SAndroid Build Coastguard Worker %define leftq r4
1400*c0909341SAndroid Build Coastguard Worker%endif
1401*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1402*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1403*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
; m4 = the 8 bytes at leftq-4, duplicated into both halves
1404*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]
1405*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1406*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
1407*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4
; m5 = top 3 bytes of m4 followed by the first 13 bytes of the row load
1408*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 13
1409*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1410*c0909341SAndroid Build Coastguard Worker.h_extend_left:
1411*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1412*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
; m11 is a shuffle mask set up outside this excerpt
; (presumably replicates the first pixel to the left - confirm)
1413*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m11
1414*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1415*c0909341SAndroid Build Coastguard Worker.h_top:
1416*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1417*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1418*c0909341SAndroid Build Coastguard Worker%endif
1419*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1420*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1421*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1422*c0909341SAndroid Build Coastguard Worker.h_loop:
1423*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq-1]
1424*c0909341SAndroid Build Coastguard Worker.h_main:
; right-edge handling is only needed without LR_HAVE_RIGHT and once wd >= -10
1425*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1426*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
1427*c0909341SAndroid Build Coastguard Worker    cmp             wd, -10
1428*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
1429*c0909341SAndroid Build Coastguard Worker    call .extend_right
1430*c0909341SAndroid Build Coastguard Worker.h_have_right:
; widen the 16 bytes to words: m4 = pixels 0-7, m5 = pixels 8-15
1431*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m6
1432*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m6
; taps at word offsets 1 and 3; their squares are paired through pmaddwd
1433*c0909341SAndroid Build Coastguard Worker    palignr         m2, m5, m4, 2
1434*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m2
1435*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 6
1436*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1437*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
1438*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1439*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
1440*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
; taps at word offsets 0 and 4
1441*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
1442*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5
1443*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
1444*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1445*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1446*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m5
1447*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
; centre tap (word offset 2), squared against the zero register m6
1448*c0909341SAndroid Build Coastguard Worker    shufps          m4, m5, q2121
1449*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4             ; sum
1450*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6
1451*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1452*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
1453*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1454*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
1455*c0909341SAndroid Build Coastguard Worker    test         edgeb, 16             ; y > 0
1456*c0909341SAndroid Build Coastguard Worker    jz .h_loop_end
; accumulate onto the sums already stored in the t1 row
1457*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t1+wq*2+400*0]
1458*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t1+wq*2+400*2]
1459*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+wq*2+400*4]
1460*c0909341SAndroid Build Coastguard Worker.h_loop_end:
1461*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5             ; sumsq
1462*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
1463*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*0], m0
1464*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*2], m1
1465*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*4], m2
1466*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1467*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1468*c0909341SAndroid Build Coastguard Worker    ret
; .top_fixup: reads the sum / sumsq values of the t1 row, doubles them and
; writes the result to the same offsets of the t2 row. The t1 row itself is
; not modified. Processes 8 columns per iteration while wq stays negative.
;   Clobb: m0, m1, m2, wq, flags
1469*c0909341SAndroid Build Coastguard Worker.top_fixup:
1470*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1471*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1472*c0909341SAndroid Build Coastguard Worker%else
1473*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m
1474*c0909341SAndroid Build Coastguard Worker%endif
1475*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row need to be doubled
1476*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400*0]
1477*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq*2+400*2]
1478*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq*2+400*4]
1479*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1480*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
1481*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1482*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m0
1483*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m1
1484*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m2
1485*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1486*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1487*c0909341SAndroid Build Coastguard Worker    ret
; .hv / .hv_bottom: horizontal box sums for the row at lpfq, combined with
; the sums stored in the t1 and t2 rows, followed by the a/b computation.
;   Entry .hv        : uses the left-edge buffer (leftq) when LR_HAVE_LEFT is
;                      set and advances leftmp by 4.
;   Entry .hv_bottom : never touches the left-edge buffer.
;   On x86-32 builds lpfq is saved to hvsrcm on entry and reloaded at the top
;   of every loop iteration; t3 temporarily carries hd for the last-row test
;   and is reloaded from t3m before the final stores.
;   Per iteration (8 columns, wq += 8 while wq stays negative):
;     - the horizontal sum / sumsq of the current row are stored to the t0 row
;     - the combined sums are turned into p = a*25 - b*b, scaled by m9, and
;       reduced to an index z (see the inline comments)
;     - GATHER_X_BY_X (macro defined outside this excerpt) yields m3 from
;       m4/m5 (presumably a table lookup x = f(z) - confirm)
;     - m3 is stored at [t4+wq*2+4] (words), the b values at
;       [t3+wq*4+8] / [t3+wq*4+24] (dwords)
;   After the loop the row pointers are rotated: t2 = old t1, t1 = old t0,
;   t0 = old t1 (also written back to t2m/t0m on x86-32).
;   m6, m9, m10 and m12 are constants set up outside this excerpt.
;   Clobb: m0-m5, wq, flags
1488*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1489*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
1490*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1491*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1492*c0909341SAndroid Build Coastguard Worker%else
1493*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
1494*c0909341SAndroid Build Coastguard Worker%endif
1495*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1496*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1497*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
1498*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]
1499*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1500*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
1501*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4
; m5 = top 3 bytes of m4 followed by the first 13 bytes of the row load
1502*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 13
1503*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1504*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
1505*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1506*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
1507*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m11
1508*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1509*c0909341SAndroid Build Coastguard Worker.hv_bottom:
1510*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1511*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1512*c0909341SAndroid Build Coastguard Worker%else
1513*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
1514*c0909341SAndroid Build Coastguard Worker%endif
1515*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1516*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1517*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
; x86-32: lpfq is still valid on first entry, so skip the reload in .hv_loop
1518*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1519*c0909341SAndroid Build Coastguard Worker    jmp .hv_loop_start
1520*c0909341SAndroid Build Coastguard Worker%endif
1521*c0909341SAndroid Build Coastguard Worker.hv_loop:
1522*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1523*c0909341SAndroid Build Coastguard Worker.hv_loop_start:
1524*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq-1]
1525*c0909341SAndroid Build Coastguard Worker.hv_main:
; right-edge handling is only needed without LR_HAVE_RIGHT and once wd >= -10
1526*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1527*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
1528*c0909341SAndroid Build Coastguard Worker    cmp             wd, -10
1529*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
1530*c0909341SAndroid Build Coastguard Worker    call .extend_right
1531*c0909341SAndroid Build Coastguard Worker.hv_have_right:
1532*c0909341SAndroid Build Coastguard Worker    movif32         t3, hd
; widen to words and form the 5-tap horizontal sum (m0) and sumsq (m2/m3)
1533*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m6
1534*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m6
1535*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2
1536*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m3
1537*c0909341SAndroid Build Coastguard Worker    palignr         m1, m5, m4, 6
1538*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1539*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m1
1540*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1541*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1
1542*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1543*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
1544*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5
1545*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m4, m5
1546*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1547*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1
1548*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m5
1549*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1550*c0909341SAndroid Build Coastguard Worker    shufps          m4, m5, q2121
1551*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4            ; h sum
1552*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6
1553*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1554*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
1555*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1556*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1
1557*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5            ; h sumsq
1558*c0909341SAndroid Build Coastguard Worker    paddd           m3, m4
; m1/m4/m5 = current-row sums + t1 row; m0/m2/m3 keep the raw current-row sums
1559*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t1+wq*2+400*0]
1560*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+wq*2+400*2]
1561*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+wq*2+400*4]
; height counter == 0 -> last-row handling
1562*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1563*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1564*c0909341SAndroid Build Coastguard Worker%else
1565*c0909341SAndroid Build Coastguard Worker    test            t3, t3
1566*c0909341SAndroid Build Coastguard Worker%endif
1567*c0909341SAndroid Build Coastguard Worker    jz .hv_last_row
1568*c0909341SAndroid Build Coastguard Worker.hv_main2:
1569*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+wq*2+400*0] ; hv sum
1570*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t2+wq*2+400*2] ; hv sumsq
1571*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t2+wq*2+400*4]
; store the raw horizontal sums to the t0 row, interleaved with
; a*25 = a + (a << 4) + (a << 3)
1572*c0909341SAndroid Build Coastguard Worker    mova [t0+wq*2+400*0], m0
1573*c0909341SAndroid Build Coastguard Worker    pslld           m0, m4, 4
1574*c0909341SAndroid Build Coastguard Worker    mova [t0+wq*2+400*2], m2
1575*c0909341SAndroid Build Coastguard Worker    mova [t0+wq*2+400*4], m3
1576*c0909341SAndroid Build Coastguard Worker    pslld           m2, m4, 3
1577*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0
1578*c0909341SAndroid Build Coastguard Worker    pslld           m0, m5, 4
1579*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 25
1580*c0909341SAndroid Build Coastguard Worker    pslld           m2, m5, 3
1581*c0909341SAndroid Build Coastguard Worker    paddd           m5, m0
1582*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
1583*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1584*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1585*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0         ; b * b
1586*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1587*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1588*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1589*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m2         ; p * s
1590*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m2
1591*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 164
1592*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
; saturating word add of m10 before the shift
; (NOTE(review): presumably rounding/clamping for min(z, 255) - confirm)
1593*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10
1594*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
1595*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
1596*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
1597*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1598*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, t2, t2m
; duplicate each word of m3 into both halves of a dword for the multiply
1599*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
1600*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
1601*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m2
1602*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m2
1603*c0909341SAndroid Build Coastguard Worker    paddd           m0, m12            ; x * b * 164 + (1 << 11) + (1 << 15)
1604*c0909341SAndroid Build Coastguard Worker    paddd           m1, m12
1605*c0909341SAndroid Build Coastguard Worker    mova   [t4+wq*2+4], m3
1606*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12             ; b
1607*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
1608*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+ 8], m0
1609*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+24], m1
1610*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1611*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
; rotate the row pointers: t2 = old t1, t1 = old t0, t0 = old t1
1612*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1613*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
1614*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1615*c0909341SAndroid Build Coastguard Worker    movif32        t2m, t2
1616*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
1617*c0909341SAndroid Build Coastguard Worker    ret
; last row: write (current row + t1 row) back into the t1 row, then add the
; current-row sums a second time before rejoining at .hv_main2
1618*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights
1619*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*0], m1
1620*c0909341SAndroid Build Coastguard Worker    paddw             m1, m0
1621*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*2], m4
1622*c0909341SAndroid Build Coastguard Worker    paddd             m4, m2
1623*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*4], m5
1624*c0909341SAndroid Build Coastguard Worker    paddd             m5, m3
1625*c0909341SAndroid Build Coastguard Worker    jmp .hv_main2
; .v: vertical-only variant of the a/b computation; reads no pixel data.
;   Combined sums are formed as (t2 row) + 3 * (t1 row): t1 + t2 is computed
;   first, then 2 * t1 is added. The rest matches the tail of .hv:
;   p = a*25 - b*b, scaled by m9, reduced to z, GATHER_X_BY_X (macro defined
;   outside this excerpt) yields m3, and the results are stored at
;   [t4+wq*2+4] (words) and [t3+wq*4+8] / [t3+wq*4+24] (dwords).
;   Unlike .hv, nothing is written to the t0 row and the t0/t1/t2 pointers
;   are not rotated.
;   Processes 8 columns per iteration while wq stays negative.
;   Clobb: m0-m5, wq, flags
1626*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
1627*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1628*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1629*c0909341SAndroid Build Coastguard Worker%else
1630*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m
1631*c0909341SAndroid Build Coastguard Worker%endif
1632*c0909341SAndroid Build Coastguard Worker.v_loop:
1633*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400*0]
1634*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq*2+400*2]
1635*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+wq*2+400*4]
1636*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400*0]
1637*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+wq*2+400*2]
1638*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+wq*2+400*4]
; double the t1 values so the totals below become t2 + 3 * t1
1639*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1640*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1641*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
1642*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; hv sum
1643*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; hv sumsq
1644*c0909341SAndroid Build Coastguard Worker    pslld           m0, m4, 4
1645*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1646*c0909341SAndroid Build Coastguard Worker    pslld           m2, m4, 3
1647*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0
1648*c0909341SAndroid Build Coastguard Worker    pslld           m0, m5, 4
1649*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 25
1650*c0909341SAndroid Build Coastguard Worker    pslld           m2, m5, 3
1651*c0909341SAndroid Build Coastguard Worker    paddd           m5, m0
1652*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
1653*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6
1654*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1655*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0         ; b * b
1656*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1657*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1658*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1659*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m2         ; p * s
1660*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m2
1661*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 164
1662*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
1663*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10
1664*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
1665*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
1666*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1667*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, t2, t2m
; duplicate each word of m3 into both halves of a dword for the multiply
1668*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
1669*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
1670*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m2
1671*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m2
1672*c0909341SAndroid Build Coastguard Worker    paddd           m0, m12            ; x * b * 164 + (1 << 11) + (1 << 15)
1673*c0909341SAndroid Build Coastguard Worker    paddd           m1, m12
1674*c0909341SAndroid Build Coastguard Worker    mova   [t4+wq*2+4], m3
1675*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12             ; b
1676*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
1677*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+ 8], m0
1678*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+24], m1
1679*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1680*c0909341SAndroid Build Coastguard Worker    jl .v_loop
1681*c0909341SAndroid Build Coastguard Worker    ret
; .prep_n: builds the first set of horizontally weighted neighbour sums.
;   For each column the left / centre / right entries are combined as
;   5*left + 6*centre + 5*right ("565"): first l+c+r, then c + (l+c+r),
;   then + 4*(l+c+r).
;   Reads:  word entries at [t4+wq*2+0/2/4],
;           dword entries at [t3+wq*4+0/4/8] and [t3+wq*4+16/20/24]
;   Writes: [t4+wq*2+400*2], [t3+wq*4+400*4+0], [t3+wq*4+400*4+16]
;   The column counter starts from r4 (x86-64) or w1m (x86-32) and advances by
;   8 while it stays negative.
;   Clobb: m0-m5, wq, flags
1682*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
1683*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
1684*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
1685*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
; m0/m1/m2 = centre entries, m3/m4/m5 = right entries
1686*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+ 2]
1687*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+ 4]
1688*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+ 4]
1689*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+ 8]
1690*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+20]
1691*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+24]
1692*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0
1693*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
1694*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
; add the left entries: m3/m4/m5 = left + centre + right
1695*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+ 0]
1696*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+ 0]
1697*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+16]
1698*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1699*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
1700*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1701*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
1702*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1703*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
1704*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1705*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4             ; b 565
1706*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1707*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*2+ 0], m0
1708*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+ 0], m1
1709*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+16], m2
1710*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1711*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1712*c0909341SAndroid Build Coastguard Worker    ret
; .n0: neighbour pass and pixel output for one row, 8 pixels per iteration.
;   1. Computes fresh 565-weighted sums from the entries at t4 / t3 (same
;      arithmetic as .prep_n).
;   2. Adds them to the previously stored 565 sums at [t4+wq*2+400*2] and
;      [t3+wq*4+400*4+0/16] to get the a / b values used for this row, then
;      overwrites the stored sums with the fresh ones.
;   3. Loads 8 pixels from [dstq+wq], forms (b - a*src) >> 9 (arithmetic),
;      packs to words, applies pmulhrsw with m7 (set up outside this excerpt;
;      presumably the filter weight - confirm), adds the result to the source
;      pixels, packs with unsigned saturation and stores 8 bytes back.
;   After the loop dstq is advanced by one stride. On x86-32 the updated dstq
;   is not written back to dstm here (.n1 does that).
;   Clobb: m0-m5, wq, flags
1713*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1714*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
1715*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
1716*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
1717*c0909341SAndroid Build Coastguard Worker.n0_loop:
1718*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+ 2]
1719*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+ 4]
1720*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+ 4]
1721*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+ 8]
1722*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+20]
1723*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+24]
1724*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0
1725*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
1726*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
1727*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+ 0]
1728*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+ 0]
1729*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+16]
1730*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1731*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
1732*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1733*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
1734*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1735*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
1736*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1737*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4             ; b 565
1738*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
; combine with the stored 565 sums, then replace the stored sums
1739*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+wq*2+400*2+ 0]
1740*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+wq*4+400*4+ 0]
1741*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*4+400*4+16]
1742*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*2+ 0], m0
1743*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+ 0], m1
1744*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+16], m2
; load 8 pixels and widen bytes -> words -> dwords against the zero register m6
1745*c0909341SAndroid Build Coastguard Worker    movq            m0, [dstq+wq]
1746*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m6
1747*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1748*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1749*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1              ; a * src
1750*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1751*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1752*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1753*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2              ; b - a * src + (1 << 8)
1754*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1755*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
1756*c0909341SAndroid Build Coastguard Worker    psrad           m5, 9
1757*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5
1758*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7
1759*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
1760*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0
1761*c0909341SAndroid Build Coastguard Worker    movq     [dstq+wq], m0
1762*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1763*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1764*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp
1765*c0909341SAndroid Build Coastguard Worker    ret
; .n1: pixel output for one row using only the stored 565 sums, 8 pixels per
; iteration. It recomputes no neighbour sums and modifies no stored sums.
;   a is read from [t4+wq*2+400*2], b from [t3+wq*4+400*4+0/16].
;   Forms (b - a*src) >> 8 (arithmetic; .n0 shifts by 9), packs to words,
;   applies pmulhrsw with m7 (set up outside this excerpt), adds the result
;   to the source pixels, packs with unsigned saturation and stores 8 bytes
;   to [dstq+wq].
;   After the loop dstq is advanced by one stride and, on x86-32 builds,
;   written back to dstm.
;   Clobb: m0-m5, wq, flags
1766*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1767*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
1768*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
1769*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
1770*c0909341SAndroid Build Coastguard Worker.n1_loop:
1771*c0909341SAndroid Build Coastguard Worker    movq            m0, [dstq+wq]
1772*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+wq*2+400*2+ 0]
1773*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+wq*4+400*4+ 0]
1774*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+wq*4+400*4+16]
; widen bytes -> words -> dwords against the zero register m6
1775*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m6
1776*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1777*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1778*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1              ; a * src
1779*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1780*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1781*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1782*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2              ; b - a * src + (1 << 7)
1783*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1784*c0909341SAndroid Build Coastguard Worker    psrad           m4, 8
1785*c0909341SAndroid Build Coastguard Worker    psrad           m5, 8
1786*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5
1787*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7
1788*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
1789*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0
1790*c0909341SAndroid Build Coastguard Worker    movq     [dstq+wq], m0
1791*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1792*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1793*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp
1794*c0909341SAndroid Build Coastguard Worker    movif32       dstm, dstq
1795*c0909341SAndroid Build Coastguard Worker    ret
1796*c0909341SAndroid Build Coastguard Worker
; sgr_filter_3x3_8bpc entry: declares the function for x86-32 or x86-64 and
; sets up the pointers, row buffers (t1/t3/t4 inside the stack allocation)
; and constants used by the code that follows.
; After setup dst and lpf have w added to them and wq holds -w, so rows are
; indexed with a negative offset that counts up towards zero.
1797*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1798*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1799*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 4*16
1800*c0909341SAndroid Build Coastguard Worker %else
1801*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 2*16
1802*c0909341SAndroid Build Coastguard Worker %endif
1803*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
1804*c0909341SAndroid Build Coastguard Worker                             dst, stride, left, lpf, w
; With less than 16-byte stack alignment the arguments are copied into local
; slots placed after the first two 16-byte areas at esp+calloff.
1805*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1806*c0909341SAndroid Build Coastguard Worker  %define dstm         dword [esp+calloff+16*2+4*0]
1807*c0909341SAndroid Build Coastguard Worker  %define stridemp     dword [esp+calloff+16*2+4*1]
1808*c0909341SAndroid Build Coastguard Worker  %define leftm        dword [esp+calloff+16*2+4*2]
1809*c0909341SAndroid Build Coastguard Worker  %define lpfm         dword [esp+calloff+16*2+4*3]
1810*c0909341SAndroid Build Coastguard Worker  %define w0m          dword [esp+calloff+16*2+4*4]
1811*c0909341SAndroid Build Coastguard Worker  %define hd           dword [esp+calloff+16*2+4*5]
1812*c0909341SAndroid Build Coastguard Worker  %define edgeb         byte [esp+calloff+16*2+4*6]
1813*c0909341SAndroid Build Coastguard Worker  %define edged        dword [esp+calloff+16*2+4*6]
1814*c0909341SAndroid Build Coastguard Worker  %define leftmp leftm
1815*c0909341SAndroid Build Coastguard Worker %else
1816*c0909341SAndroid Build Coastguard Worker  %define w0m wm
1817*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
1818*c0909341SAndroid Build Coastguard Worker  %define edgeb  byte r7m
1819*c0909341SAndroid Build Coastguard Worker  %define edged dword r7m
1820*c0909341SAndroid Build Coastguard Worker %endif
; Scratch dwords in the first 16 bytes at esp+calloff; calloff is reassigned
; before the called routines further down.
1821*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0]
1822*c0909341SAndroid Build Coastguard Worker %define w1m    dword [esp+calloff+4*1]
1823*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*2]
1824*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*3]
; x86-32 only declares 8 xmm registers here, so m8-m13 are mapped to memory
; operands (constants relative to base, or a stack slot) and m12 aliases m6.
1825*c0909341SAndroid Build Coastguard Worker %define  m8 [base+pb_0to15]
1826*c0909341SAndroid Build Coastguard Worker %define  m9 [esp+calloff+16*1]
1827*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0xf00801c7]
1828*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_34816]
1829*c0909341SAndroid Build Coastguard Worker %define m12 m6
1830*c0909341SAndroid Build Coastguard Worker %define m13 [base+sgr_lshuf3]
1831*c0909341SAndroid Build Coastguard Worker %define base r6-$$
1832*c0909341SAndroid Build Coastguard Worker %assign calloff 0
1833*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1834*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rstk+stack_offset+ 8] ; fetch stride/left/lpf/w from the caller's stack
1835*c0909341SAndroid Build Coastguard Worker    mov          leftq, [rstk+stack_offset+12]
1836*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rstk+stack_offset+16]
1837*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
1838*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq                   ; and keep copies in the local slots
1839*c0909341SAndroid Build Coastguard Worker    mov       stridemp, strideq
1840*c0909341SAndroid Build Coastguard Worker    mov          leftm, leftq
1841*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+24] ; stored to hd below
1842*c0909341SAndroid Build Coastguard Worker    mov             r2, [rstk+stack_offset+32] ; stored to edged below
1843*c0909341SAndroid Build Coastguard Worker    mov           lpfm, lpfq
1844*c0909341SAndroid Build Coastguard Worker    mov             hd, r1
1845*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
1846*c0909341SAndroid Build Coastguard Worker %endif
1847*c0909341SAndroid Build Coastguard Worker%else
1848*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
1849*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
1850*c0909341SAndroid Build Coastguard Worker%endif
1851*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
1852*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
1853*c0909341SAndroid Build Coastguard Worker%endif
1854*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1855*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1856*c0909341SAndroid Build Coastguard Worker    lea            r13, [sgr_x_by_x-0xf03]
1857*c0909341SAndroid Build Coastguard Worker    mov             hd, hm
1858*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1859*c0909341SAndroid Build Coastguard Worker    movq            m9, [paramsq+4] ; 8 bytes of params starting at byte offset 4
1860*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq          ; lpf now points at the end of its row
1861*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+12]
1862*c0909341SAndroid Build Coastguard Worker    mova            m8, [pb_0to15]
1863*c0909341SAndroid Build Coastguard Worker    add           dstq, wq          ; dst now points at the end of its row
1864*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+400*12+8]
1865*c0909341SAndroid Build Coastguard Worker    mova           m10, [pd_0xf00801c7]
1866*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq*2+400*32+8]
1867*c0909341SAndroid Build Coastguard Worker    mova           m11, [pd_34816]
1868*c0909341SAndroid Build Coastguard Worker    pshuflw         m7, m9, q3333
1869*c0909341SAndroid Build Coastguard Worker    pshufb          m9, [pw_256]  ; s1
1870*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m7, m7        ; w1
1871*c0909341SAndroid Build Coastguard Worker    neg             wq            ; wq (r4 at this point) = -w
1872*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6        ; m6 = 0
1873*c0909341SAndroid Build Coastguard Worker    mova           m13, [sgr_lshuf3]
1874*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4         ; w1 << 4
; From here on w names the 11th register; r4 keeps -w and is what the
; routines below reload wq from (lea wq, [r4-2]).
1875*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
1876*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
1877*c0909341SAndroid Build Coastguard Worker%else
1878*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+28] ; params
1879*c0909341SAndroid Build Coastguard Worker    LEA             r6, $$                     ; r6 = $$, so base (r6-$$) addresses the constants
1880*c0909341SAndroid Build Coastguard Worker    movq            m1, [r1+4]                 ; 8 bytes of params starting at byte offset 4
1881*c0909341SAndroid Build Coastguard Worker    add           lpfm, wq                     ; stored lpf now points at the end of its row
1882*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+extra_stack+wq*2+20]
1883*c0909341SAndroid Build Coastguard Worker    add           dstq, wq                     ; dst now points at the end of its row
1884*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+extra_stack+wq*4+400*12+16]
1885*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1886*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+extra_stack+wq*2+400*32+16]
1887*c0909341SAndroid Build Coastguard Worker    mov            t3m, t3                     ; t3/t4 are also kept in stack slots
1888*c0909341SAndroid Build Coastguard Worker    pshuflw         m7, m1, q3333
1889*c0909341SAndroid Build Coastguard Worker    mov            t4m, t4
1890*c0909341SAndroid Build Coastguard Worker    pshufb          m1, [base+pw_256] ; s1
1891*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m7, m7            ; w1
1892*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4             ; w1 << 4
1893*c0909341SAndroid Build Coastguard Worker    neg             wq                ; wq = -w
1894*c0909341SAndroid Build Coastguard Worker    mova            m9, m1            ; m9 is a stack slot here: spill s1
1895*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6            ; m6 = 0
1896*c0909341SAndroid Build Coastguard Worker    mov            w1m, wd            ; w1m = -w
1897*c0909341SAndroid Build Coastguard Worker    sub             wd, 2
1898*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1899*c0909341SAndroid Build Coastguard Worker    mov            w0m, wd            ; w0m = -w - 2
1900*c0909341SAndroid Build Coastguard Worker %define strideq r5
1901*c0909341SAndroid Build Coastguard Worker%endif
; Row driver. With LR_HAVE_TOP set, two rows starting at lpf are summed with
; .h_top; otherwise (.no_top) the first dst row is summed with .h and its sums
; are copied into the second row buffer. Rows are then consumed in pairs
; (.hv0 + .hv1 followed by .n0 + .n1), and the tail either reads two more rows
; from the saved "below" pointer (LR_HAVE_BOTTOM) or calls .v0/.v1 instead.
; NOTE(review): .v0, .v1, .n0, .n1 and .prep_n of this function are defined
; outside the visible range; their exact roles are not described here.
1902*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1903*c0909341SAndroid Build Coastguard Worker    jz .no_top
1904*c0909341SAndroid Build Coastguard Worker    call .h_top                     ; horizontal sums of the row at lpf -> t1
1905*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp    ; next lpf row
1906*c0909341SAndroid Build Coastguard Worker    mov             t2, t1          ; t2 = buffer just written
1907*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6       ; t1 = next row buffer
1908*c0909341SAndroid Build Coastguard Worker    call .h_top
1909*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1910*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1911*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq        ; following rows are read from dst
1912*c0909341SAndroid Build Coastguard Worker    add            r10, strideq     ; r10 = (second lpf row) + 5*stride
1913*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
1914*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1915*c0909341SAndroid Build Coastguard Worker    call .hv0
1916*c0909341SAndroid Build Coastguard Worker.main:
1917*c0909341SAndroid Build Coastguard Worker    dec             hd
1918*c0909341SAndroid Build Coastguard Worker    jz .height1                     ; h was 1
1919*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm      ; x86-32: reload the row pointer saved by .hv0/.hv1
1920*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1921*c0909341SAndroid Build Coastguard Worker    call .hv1
1922*c0909341SAndroid Build Coastguard Worker    call .prep_n
1923*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1924*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom               ; h was 2: no further input rows
1925*c0909341SAndroid Build Coastguard Worker.main_loop:
1926*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1927*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1928*c0909341SAndroid Build Coastguard Worker    call .hv0
; h == 0 here means the row just read was the last one.
; NOTE(review): the x86-64 path tests only the low byte of h, so it relies on
; the remaining row count fitting in 8 bits -- confirm the caller's limit.
1929*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1930*c0909341SAndroid Build Coastguard Worker    test            hb, hb
1931*c0909341SAndroid Build Coastguard Worker%else
1932*c0909341SAndroid Build Coastguard Worker    mov             r4, hd
1933*c0909341SAndroid Build Coastguard Worker    test            r4, r4
1934*c0909341SAndroid Build Coastguard Worker%endif
1935*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1936*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1937*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1938*c0909341SAndroid Build Coastguard Worker    call .hv1
1939*c0909341SAndroid Build Coastguard Worker    call .n0
1940*c0909341SAndroid Build Coastguard Worker    call .n1
1941*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1942*c0909341SAndroid Build Coastguard Worker    jge .main_loop
1943*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1944*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1945*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm        ; the "below" pointer saved during setup
1946*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1947*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1948*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1949*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1950*c0909341SAndroid Build Coastguard Worker.end:
1951*c0909341SAndroid Build Coastguard Worker    call .n0
1952*c0909341SAndroid Build Coastguard Worker    call .n1
1953*c0909341SAndroid Build Coastguard Worker.end2:
1954*c0909341SAndroid Build Coastguard Worker    RET
1955*c0909341SAndroid Build Coastguard Worker.height1:                           ; reached when h == 1
1956*c0909341SAndroid Build Coastguard Worker    call .v1
1957*c0909341SAndroid Build Coastguard Worker    call .prep_n
1958*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
1959*c0909341SAndroid Build Coastguard Worker.odd_height:                        ; reached from .main_loop when h hits 0 after .hv0
1960*c0909341SAndroid Build Coastguard Worker    call .v1
1961*c0909341SAndroid Build Coastguard Worker    call .n0
1962*c0909341SAndroid Build Coastguard Worker    call .n1
1963*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1964*c0909341SAndroid Build Coastguard Worker    call .v0
1965*c0909341SAndroid Build Coastguard Worker    call .v1
1966*c0909341SAndroid Build Coastguard Worker    call .n0
1967*c0909341SAndroid Build Coastguard Worker    jmp .end2
1968*c0909341SAndroid Build Coastguard Worker.extend_bottom:                     ; no bottom rows to read: .v0/.v1 replace .hv0/.hv1
1969*c0909341SAndroid Build Coastguard Worker    call .v0
1970*c0909341SAndroid Build Coastguard Worker    call .v1
1971*c0909341SAndroid Build Coastguard Worker    jmp .end
1972*c0909341SAndroid Build Coastguard Worker.no_top:
1973*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1974*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1975*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq        ; rows are read from dst
1976*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2] ; r10 = lpf + 6*stride
1977*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
1978*c0909341SAndroid Build Coastguard Worker    call .h                         ; horizontal sums of the first dst row -> t1
1979*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1980*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
1981*c0909341SAndroid Build Coastguard Worker%else
1982*c0909341SAndroid Build Coastguard Worker    mov             wq, w0m
1983*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
1984*c0909341SAndroid Build Coastguard Worker%endif
1985*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
1986*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:                    ; copy the three planes of t1 into t2
1987*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400*0]
1988*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq*2+400*2]
1989*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq*2+400*4]
1990*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m0
1991*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m1
1992*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m2
1993*c0909341SAndroid Build Coastguard Worker    add             wq, 8
1994*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1995*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
1996*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1997*c0909341SAndroid Build Coastguard Worker    call .v0
1998*c0909341SAndroid Build Coastguard Worker    jmp .main
; .extend_right: pad the 16 loaded pixels in m5 past the right edge.
; Byte i of m5 is kept when (-w as a signed byte) > m8[i]; every other byte
; is replaced by the pixel at [lpfq-1]. Uses m0-m3; m6 must be zero.
; NOTE(review): m8 presumably holds the byte indices 0..15 (pb_0to15), and the
; calloff values presumably track the return addresses on the x86-32 stack
; for the stack-relative defines -- confirm both.
1999*c0909341SAndroid Build Coastguard Worker.extend_right:
2000*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8
2001*c0909341SAndroid Build Coastguard Worker%assign calloff 8
2002*c0909341SAndroid Build Coastguard Worker    movd            m0, [lpfq-1]    ; byte 0 = pixel just before lpfq
2003*c0909341SAndroid Build Coastguard Worker    movd            m1, wd
2004*c0909341SAndroid Build Coastguard Worker    mova            m3, m8
2005*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m6          ; zero shuffle mask: broadcast byte 0 (edge pixel)
2006*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m6          ; broadcast the low byte of w
2007*c0909341SAndroid Build Coastguard Worker    mova            m2, m6
2008*c0909341SAndroid Build Coastguard Worker    psubb           m2, m1          ; m2 = -w in every byte
2009*c0909341SAndroid Build Coastguard Worker    pcmpgtb         m2, m3          ; mask = (-w > m8[i]), signed byte compare
2010*c0909341SAndroid Build Coastguard Worker    pand            m5, m2          ; keep the pixels selected by the mask
2011*c0909341SAndroid Build Coastguard Worker    pandn           m2, m0          ; edge pixel everywhere else
2012*c0909341SAndroid Build Coastguard Worker    por             m5, m2
2013*c0909341SAndroid Build Coastguard Worker    ret
2014*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4
2015*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; .h / .h_top: horizontal 3-tap box sums for one row read through lpfq.
; For each group of 8 pixels it stores the sum of three neighbouring pixels
; (words) at t1+400*0 and the sum of their squares (dwords) at t1+400*2
; (first four) and t1+400*4 (last four).
; .h takes the left edge from the left buffer when LR_HAVE_LEFT is set;
; .h_top reads the pixels straight from the row in that case. Both share
; .h_extend_left when LR_HAVE_LEFT is clear.
2016*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
2017*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2018*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2019*c0909341SAndroid Build Coastguard Worker%else
2020*c0909341SAndroid Build Coastguard Worker %define leftq r4
2021*c0909341SAndroid Build Coastguard Worker%endif
2022*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2023*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2024*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2025*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]   ; 8 bytes at left-4, duplicated into both halves
2026*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2027*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2] ; first 16 pixels of the row
2028*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4           ; advance the left pointer by 4 bytes
2029*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 14      ; top 2 bytes of m4, then the first 14 row pixels
2030*c0909341SAndroid Build Coastguard Worker    jmp .h_main
2031*c0909341SAndroid Build Coastguard Worker.h_extend_left:
2032*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2033*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2034*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m13         ; rearrange with the sgr_lshuf3 mask (presumably repeats the first pixel)
2035*c0909341SAndroid Build Coastguard Worker    jmp .h_main
2036*c0909341SAndroid Build Coastguard Worker.h_top:
2037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2038*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2039*c0909341SAndroid Build Coastguard Worker%endif
2040*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2041*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2042*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2043*c0909341SAndroid Build Coastguard Worker.h_loop:
2044*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq]
2045*c0909341SAndroid Build Coastguard Worker.h_main:
2046*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2047*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
2048*c0909341SAndroid Build Coastguard Worker    cmp             wd, -9
2049*c0909341SAndroid Build Coastguard Worker    jl .h_have_right                ; pad only when w >= -9
2050*c0909341SAndroid Build Coastguard Worker    call .extend_right
2051*c0909341SAndroid Build Coastguard Worker.h_have_right:
2052*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m6      ; pixels 0-7 as words (m6 = 0)
2053*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m6          ; pixels 8-15 as words
2054*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 2   ; pixels 1-8
2055*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, m0      ; p[x] + p[x+1]
2056*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m0
2057*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2          ; p[x]^2 + p[x+1]^2 (x = 0..3)
2058*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m0
2059*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3          ; same for x = 4..7
2060*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4       ; pixels 2-9
2061*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5             ; sum
2062*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
2063*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4          ; p[x+2]^2 (x = 0..3)
2064*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2065*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5          ; p[x+2]^2 (x = 4..7)
2066*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; sumsq
2067*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
2068*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*0], m1        ; sums
2069*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*2], m2        ; sums of squares, first four
2070*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*4], m3        ; sums of squares, last four
2071*c0909341SAndroid Build Coastguard Worker    add             wq, 8
2072*c0909341SAndroid Build Coastguard Worker    jl .h_loop
2073*c0909341SAndroid Build Coastguard Worker    ret
2074*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv0 / .hv0_bottom: for one row read through lpfq, compute the horizontal
; 3-tap sums, combine them with the sums held in t1 and t2 into 3-row box
; sums, and derive the per-pixel values stored at t4 (words) and t3 (dwords).
; On exit t1 holds this row's horizontal sums and t2 holds this row plus the
; previous contents of t1.
; .hv0 takes the left edge from the left buffer when LR_HAVE_LEFT is set;
; .hv0_bottom reads the pixels straight from the row in that case.
; NOTE(review): MULLD and GATHER_X_BY_X are macros defined elsewhere; their
; operand roles are taken from the existing comments -- confirm.
2075*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
2076*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2077*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2078*c0909341SAndroid Build Coastguard Worker%else
2079*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq        ; remember the row pointer for later reloads
2080*c0909341SAndroid Build Coastguard Worker%endif
2081*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2082*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2083*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2084*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]   ; 8 bytes at left-4, duplicated into both halves
2085*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2086*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2] ; first 16 pixels of the row
2087*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4           ; advance the left pointer by 4 bytes
2088*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 14      ; top 2 bytes of m4, then the first 14 row pixels
2089*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2090*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
2091*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2092*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2093*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m13         ; rearrange with the sgr_lshuf3 mask (presumably repeats the first pixel)
2094*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2095*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
2096*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2097*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2098*c0909341SAndroid Build Coastguard Worker%else
2099*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2100*c0909341SAndroid Build Coastguard Worker%endif
2101*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2102*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2103*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2104*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2105*c0909341SAndroid Build Coastguard Worker    jmp .hv0_loop_start             ; lpfq is still valid on entry: skip the reload
2106*c0909341SAndroid Build Coastguard Worker%endif
2107*c0909341SAndroid Build Coastguard Worker.hv0_loop:
2108*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm      ; x86-32: reload the row pointer each iteration
2109*c0909341SAndroid Build Coastguard Worker.hv0_loop_start:
2110*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq]
2111*c0909341SAndroid Build Coastguard Worker.hv0_main:
2112*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2113*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
2114*c0909341SAndroid Build Coastguard Worker    cmp             wd, -9
2115*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right              ; pad only when w >= -9
2116*c0909341SAndroid Build Coastguard Worker    call .extend_right
2117*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
2118*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m6      ; pixels 0-7 as words (m6 = 0)
2119*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m6          ; pixels 8-15 as words
2120*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 2   ; pixels 1-8
2121*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, m0      ; p[x] + p[x+1]
2122*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m0
2123*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2          ; p[x]^2 + p[x+1]^2 (x = 0..3)
2124*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m0
2125*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3          ; same for x = 4..7
2126*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4       ; pixels 2-9
2127*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5             ; sum
2128*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
2129*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4          ; p[x+2]^2 (x = 0..3)
2130*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2131*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5          ; p[x+2]^2 (x = 4..7)
2132*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; sumsq
2133*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
2134*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+wq*2+400*0] ; this row + contents of t1
2135*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+wq*2+400*2]
2136*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+wq*2+400*4]
2137*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*0], m1        ; t1 <- this row's horizontal sums
2138*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*2], m2
2139*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*4], m3
2140*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400*0] ; ... + contents of t2 = 3-row sums
2141*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq*2+400*2]
2142*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq*2+400*4]
2143*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m0        ; t2 <- this row + old t1
2144*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m4
2145*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m5
2146*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2147*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2148*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 9
2149*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2150*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
2151*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0         ; b * b
2152*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2153*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
2154*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2155*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2156*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m12        ; p * s
2157*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m12
2158*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2159*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2160*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10         ; unsigned saturating word add of the m10 constant
2161*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2162*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2163*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2164*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2165*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
2166*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3      ; duplicate each gathered word into a dword
2167*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
2168*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m12
2169*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m12
2170*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2171*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6          ; m12 aliases m6 on x86-32: restore the zero register
2172*c0909341SAndroid Build Coastguard Worker%endif
2173*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2174*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2175*c0909341SAndroid Build Coastguard Worker    mova   [t4+wq*2+4], m3          ; store the gathered words
2176*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2177*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2178*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+ 8], m0          ; store the dword results (first four)
2179*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+24], m1          ; store the dword results (last four)
2180*c0909341SAndroid Build Coastguard Worker    add             wq, 8
2181*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
2182*c0909341SAndroid Build Coastguard Worker    ret
2183*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2184*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
2185*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2186*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2187*c0909341SAndroid Build Coastguard Worker%else
2188*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2189*c0909341SAndroid Build Coastguard Worker%endif
2190*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2191*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2192*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2193*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]
2194*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2195*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2196*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4
2197*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 14
2198*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2199*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
2200*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2201*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2202*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m13
2203*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2204*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
2205*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2206*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2207*c0909341SAndroid Build Coastguard Worker%else
2208*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2209*c0909341SAndroid Build Coastguard Worker%endif
2210*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2211*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2212*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2213*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2214*c0909341SAndroid Build Coastguard Worker    jmp .hv1_loop_start
2215*c0909341SAndroid Build Coastguard Worker%endif
2216*c0909341SAndroid Build Coastguard Worker.hv1_loop:
2217*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2218*c0909341SAndroid Build Coastguard Worker.hv1_loop_start:
2219*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq]
2220*c0909341SAndroid Build Coastguard Worker.hv1_main:
2221*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2222*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
2223*c0909341SAndroid Build Coastguard Worker    cmp             wd, -9
2224*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
2225*c0909341SAndroid Build Coastguard Worker    call .extend_right
2226*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
2227*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m6
2228*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m6
2229*c0909341SAndroid Build Coastguard Worker    palignr         m1, m5, m4, 2
2230*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m1
2231*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m1
2232*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2233*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m1
2234*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2235*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4
2236*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5             ; h sum
2237*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m5, m6
2238*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
2239*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2240*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2241*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1             ; h sumsq
2242*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
2243*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400*0]
2244*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+wq*2+400*2]
2245*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+wq*2+400*4]
2246*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m0
2247*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m2
2248*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m3
2249*c0909341SAndroid Build Coastguard Worker    pslld           m2, m4, 3
2250*c0909341SAndroid Build Coastguard Worker    pslld           m3, m5, 3
2251*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 9
2252*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2253*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
2254*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0         ; b * b
2255*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2256*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
2257*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2258*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2259*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m12        ; p * s
2260*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m12
2261*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2262*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2263*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10
2264*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2265*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2266*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2267*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2268*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
2269*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
2270*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
2271*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m12
2272*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m12
2273*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2274*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
2275*c0909341SAndroid Build Coastguard Worker%endif
2276*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2277*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2278*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*2 +4], m3
2279*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2280*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2281*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+ 8], m0
2282*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+24], m1
2283*c0909341SAndroid Build Coastguard Worker    add             wq, 8
2284*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
2285*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
2286*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2287*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2288*c0909341SAndroid Build Coastguard Worker    ret
2289*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab (even rows)
    ; Reads no pixels: combines the per-row horizontal sums already stored in
    ; the t1 and t2 rows as 2*[t1] + [t2], then derives the two coefficient
    ; values with the same arithmetic as the tail of .hv1.
    ; Row layout (as written by .hv1): +400*0 = 16-bit box sums,
    ; +400*2 / +400*4 = 32-bit sums of squares for the low / high four columns
    ; of each group of eight.
    ; wq is a negative column index stepped by 8 until it is no longer negative.
    ; NOTE(review): m9-m12 and m6 are set up outside this view; their meaning
    ; below is taken from the existing inline comments - confirm at the caller.
2290*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2291*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]         ; start index derived from r4
2292*c0909341SAndroid Build Coastguard Worker%else
2293*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m            ; x86-32: start index kept in the w0m stack slot
2294*c0909341SAndroid Build Coastguard Worker%endif
2295*c0909341SAndroid Build Coastguard Worker.v0_loop:
2296*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400*0] ; t1 row: box sums (8 words)
2297*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq*2+400*2] ; t1 row: sums of squares, low 4 dwords
2298*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq*2+400*4] ; t1 row: sums of squares, high 4 dwords
2299*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0             ; count the t1 row twice
2300*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2301*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
2302*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400*0] ; m1 = 2*[t1] + [t2] (sum)
2303*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq*2+400*2] ; m2/m3 = 2*[t1] + [t2] (sumsq)
2304*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq*2+400*4]
2305*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m0           ; t2 row is overwritten with the doubled t1 row
2306*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m4
2307*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m5
2308*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3          ; a * 8
2309*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2310*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 9
2311*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2312*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b (words widened with m6, presumably zero - confirm)
2313*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0         ; b * b
2314*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2315*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
2316*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2317*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2318*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m12        ; p * s
2319*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m12
2320*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2321*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2322*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10            ; unsigned-saturating word add ahead of the shift
2323*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2324*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2325*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2326*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm ; macro defined elsewhere; presumably m3 = per-column x looked up from z in m4/m5 - confirm
2327*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3         ; each x word duplicated into both halves of a dword (low 4)
2328*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3         ; same for the high 4
2329*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m12
2330*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m12
2331*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2332*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6             ; re-zero m6 (presumably used as scratch by the macros above - confirm)
2333*c0909341SAndroid Build Coastguard Worker%endif
2334*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2335*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2336*c0909341SAndroid Build Coastguard Worker    mova   [t4+wq*2+4], m3             ; 8 x words -> t4 plane, row +400*0, two elements right of column wq
2337*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2338*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2339*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+ 8], m0             ; low 4 dwords -> t3 plane, row +400*0, two elements right
2340*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+24], m1             ; high 4 dwords
2341*c0909341SAndroid Build Coastguard Worker    add             wq, 8              ; next group of 8 columns
2342*c0909341SAndroid Build Coastguard Worker    jl .v0_loop                        ; loop while wq is still negative
2343*c0909341SAndroid Build Coastguard Worker    ret
2344*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
    ; Reads no pixels: combines the stored horizontal sums as [t1] + [t2]
    ; (no doubling, unlike .v0), copies the t1 row into the t2 row, derives the
    ; coefficient values and writes them to the second row of the t4 / t3
    ; planes (+400*2 / +400*4). On exit t1 and t2 are exchanged.
    ; Row layout (as written by .hv1): +400*0 = 16-bit box sums,
    ; +400*2 / +400*4 = 32-bit sums of squares (low / high four columns).
    ; NOTE(review): m9-m12 and m6 are set up outside this view; their meaning
    ; below is taken from the existing inline comments - confirm at the caller.
2345*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2346*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]         ; start index derived from r4
2347*c0909341SAndroid Build Coastguard Worker%else
2348*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m            ; x86-32: start index kept in the w0m stack slot
2349*c0909341SAndroid Build Coastguard Worker%endif
2350*c0909341SAndroid Build Coastguard Worker.v1_loop:
2351*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400*0] ; t1 row: box sums (8 words)
2352*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq*2+400*2] ; t1 row: sums of squares, low 4 dwords
2353*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq*2+400*4] ; t1 row: sums of squares, high 4 dwords
2354*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400*0] ; m1 = [t1] + [t2] (sum)
2355*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq*2+400*2] ; m2/m3 = [t1] + [t2] (sumsq)
2356*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq*2+400*4]
2357*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m0           ; t2 row is overwritten with a copy of the t1 row
2358*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m4
2359*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m5
2360*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3          ; a * 8
2361*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2362*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 9
2363*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2364*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b (words widened with m6, presumably zero - confirm)
2365*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0         ; b * b
2366*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2367*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
2368*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2369*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2370*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m12        ; p * s
2371*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m12
2372*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2373*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2374*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10            ; unsigned-saturating word add ahead of the shift
2375*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2376*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2377*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2378*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm ; macro defined elsewhere; presumably m3 = per-column x looked up from z in m4/m5 - confirm
2379*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3         ; each x word duplicated into both halves of a dword (low 4)
2380*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3         ; same for the high 4
2381*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m12
2382*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m12
2383*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2384*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6             ; re-zero m6 (presumably used as scratch by the macros above - confirm)
2385*c0909341SAndroid Build Coastguard Worker%endif
2386*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2387*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2388*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*2+ 4], m3        ; 8 x words -> t4 plane, row +400*2, two elements right of column wq
2389*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2390*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2391*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+ 8], m0        ; low 4 dwords -> t3 plane, row +400*4, two elements right
2392*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+24], m1        ; high 4 dwords
2393*c0909341SAndroid Build Coastguard Worker    add             wq, 8              ; next group of 8 columns
2394*c0909341SAndroid Build Coastguard Worker    jl .v1_loop                        ; loop while wq is still negative
2395*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; swap t1 <-> t2, using r10 as the temporary
2396*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2397*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2398*c0909341SAndroid Build Coastguard Worker    ret
2399*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
    ; Seeds the horizontally weighted rows that .n0 / .n1 later combine.
    ; With l / c / r = the elements at +0 / +1 / +2 elements from column wq
    ; (words in the t4 plane, dwords in the t3 plane):
    ;   "444" = 4 * (l + c + r)
    ;   "343" = 444 - (l + r) = 3*l + 4*c + 3*r
    ; Inputs:  t4 rows +400*0 and +400*2, t3 rows +400*0 and +400*4.
    ; Outputs: t4 +400*4 (343 of first row), +400*6 (444 of second row),
    ;          +400*8 (343 of second row); t3 +400*8, +400*12, +400*16 likewise.
    ; The t3 plane is processed as two 16-byte halves (+0 and +16) per iteration.
2400*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
2401*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
2402*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
2403*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+400*0+ 4] ; first row, t4: r
2404*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*0+ 8] ; first row, t3: r (low half)
2405*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+400*0+24] ; first row, t3: r (high half)
2406*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*0+ 2] ; c
2407*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+400*0+ 4]
2408*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*0+20]
2409*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*2+400*0+ 0] ; l + r
2410*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*0+ 0]
2411*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*4+400*0+16]
2412*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0               ; l + c + r
2413*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
2414*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
2415*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                ; a[-1] 444
2416*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                ; b[-1] 444
2417*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
2418*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0               ; a[-1] 343
2419*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1               ; b[-1] 343
2420*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
2421*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*4], m3             ; only the 343 form of the first row is kept
2422*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+ 0], m4
2423*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+16], m5
2424*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+400*2+ 4] ; second row, t4: r
2425*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*4+ 8] ; second row, t3: r (low half)
2426*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+400*4+24] ; second row, t3: r (high half)
2427*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*2+ 2] ; c
2428*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+400*4+ 4]
2429*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*4+20]
2430*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*2+400*2+ 0] ; l + r
2431*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*4+ 0]
2432*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*4+400*4+16]
2433*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0                ; l + c + r
2434*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
2435*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
2436*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                 ; a[ 0] 444
2437*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                 ; b[ 0] 444
2438*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
2439*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400* 6], m3             ; second row: keep the 444 form ...
2440*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+ 0], m4
2441*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+16], m5
2442*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0                ; a[ 0] 343
2443*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1                ; b[ 0] 343
2444*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
2445*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400* 8], m3             ; ... and the 343 form
2446*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+ 0], m4
2447*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+16], m5
2448*c0909341SAndroid Build Coastguard Worker    add             wq, 8                 ; next group of 8 columns
2449*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop                       ; loop while wq is still negative
2450*c0909341SAndroid Build Coastguard Worker    ret
2451*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2452*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
    ; Produces one row of output pixels, 8 per iteration, in place at dst.
    ; For the t4 / t3 rows at +400*0 it computes the "444" (4*(l+c+r)) and
    ; "343" (444 - (l+r)) weighted sums, then adds the new 343 to the values
    ; stored at t4 +400*4 and +400*6 (t3: +400*8 and +400*12), which .prep_n
    ; labels as the previous 343 and 444 rows. The stored rows are then
    ; replaced: +400*4 <- new 343, +400*6 <- new 444 (t3: +400*8, +400*12).
    ; NOTE(review): m6 is presumably zero and m7 a filter weight; both are set
    ; up outside this view - confirm at the caller.
2453*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
2454*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
2455*c0909341SAndroid Build Coastguard Worker.n0_loop:
2456*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*0+4] ; r
2457*c0909341SAndroid Build Coastguard Worker    movu            m1, [t4+wq*2+400*0+2] ; c
2458*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*0+0] ; l + r
2459*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3               ; l + c + r
2460*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444
2461*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343
2462*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+wq*2+400*4] ; new 343 + stored 343
2463*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*6]  ; + stored 444 -> combined t4 weights (8 words)
2464*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*4], m2             ; stored 343 <- new 343
2465*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*6], m1             ; stored 444 <- new 444
2466*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+400*0+8] ; same steps on the t3 plane, low 4 dwords
2467*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*0+4]
2468*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+400*0+0]
2469*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2470*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
2471*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m4           ; b[ 1] 343
2472*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t3+wq*4+400* 8+ 0]
2473*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+400*12+ 0]
2474*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400* 8+ 0], m2
2475*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+ 0], m1
2476*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*0+24] ; t3 plane, high 4 dwords
2477*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*0+20]
2478*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+400*0+16]
2479*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5
2480*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2
2481*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m5
2482*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*4+400* 8+16]
2483*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+400*12+16]
2484*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400* 8+16], m2
2485*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+16], m1
2486*c0909341SAndroid Build Coastguard Worker    movq            m0, [dstq+wq]        ; 8 pixels (bytes) from dst
2487*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m6               ; bytes -> words
2488*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6           ; low 4 pixels -> dwords
2489*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6           ; low 4 t4 weights -> dwords
2490*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
2491*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6           ; high 4 pixels
2492*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6               ; high 4 t4 weights
2493*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
2494*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; b - a * src + (1 << 8)
2495*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2496*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
2497*c0909341SAndroid Build Coastguard Worker    psrad           m5, 9
2498*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5               ; 8 signed words
2499*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7               ; scale by m7 with rounding
2500*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; add the correction to the source pixels
2501*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0               ; saturate to unsigned bytes
2502*c0909341SAndroid Build Coastguard Worker    movq     [dstq+wq], m0               ; write 8 pixels back in place
2503*c0909341SAndroid Build Coastguard Worker    add             wq, 8                ; next group of 8 columns
2504*c0909341SAndroid Build Coastguard Worker    jl .n0_loop                          ; loop while wq is still negative
2505*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp         ; advance dst by one stride
2506*c0909341SAndroid Build Coastguard Worker    ret
2507*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2508*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
    ; Counterpart of .n0 for the other row of the pair. It reads the t4 / t3
    ; rows at +400*2 / +400*4, computes their "444" (4*(l+c+r)) and "343"
    ; (444 - (l+r)) sums, and adds the new 343 to the values stored at
    ; t4 +400*6 and +400*8 (t3: +400*12 and +400*16). The stored rows are then
    ; replaced: +400*6 <- new 444, +400*8 <- new 343 (t3: +400*12, +400*16).
    ; Output is written in place at dst, 8 pixels per iteration.
    ; NOTE(review): m6 is presumably zero and m7 a filter weight; both are set
    ; up outside this view - confirm at the caller.
2509*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
2510*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
2511*c0909341SAndroid Build Coastguard Worker.n1_loop:
2512*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*2+4] ; r
2513*c0909341SAndroid Build Coastguard Worker    movu            m1, [t4+wq*2+400*2+2] ; c
2514*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*2+0] ; l + r
2515*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3               ; l + c + r
2516*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444
2517*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343
2518*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+wq*2+400*6] ; new 343 + stored 444
2519*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*8]  ; + stored 343 -> combined t4 weights (8 words)
2520*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*6], m1             ; stored 444 <- new 444
2521*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*8], m2             ; stored 343 <- new 343
2522*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+400*4+8] ; same steps on the t3 plane, low 4 dwords
2523*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*4+4]
2524*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+400*4+0]
2525*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2526*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
2527*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m4           ; b[ 1] 343
2528*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t3+wq*4+400*12+ 0]
2529*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+400*16+ 0]
2530*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+ 0], m1
2531*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+ 0], m2
2532*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*4+24] ; t3 plane, high 4 dwords
2533*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*4+20]
2534*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+400*4+16]
2535*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5
2536*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2
2537*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m5
2538*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*4+400*12+16]
2539*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+400*16+16]
2540*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+16], m1
2541*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+16], m2
2542*c0909341SAndroid Build Coastguard Worker    movq            m0, [dstq+wq]        ; 8 pixels (bytes) from dst
2543*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m6               ; bytes -> words
2544*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6           ; low 4 pixels -> dwords
2545*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6           ; low 4 t4 weights -> dwords
2546*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
2547*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6           ; high 4 pixels
2548*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6               ; high 4 t4 weights
2549*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
2550*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; b - a * src + (1 << 8)
2551*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2552*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
2553*c0909341SAndroid Build Coastguard Worker    psrad           m5, 9
2554*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5               ; 8 signed words
2555*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7               ; scale by m7 with rounding
2556*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; add the correction to the source pixels
2557*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0               ; saturate to unsigned bytes
2558*c0909341SAndroid Build Coastguard Worker    movq     [dstq+wq], m0               ; write 8 pixels back in place
2559*c0909341SAndroid Build Coastguard Worker    add             wq, 8                ; next group of 8 columns
2560*c0909341SAndroid Build Coastguard Worker    jl .n1_loop                          ; loop while wq is still negative
2561*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp         ; advance dst by one stride
2562*c0909341SAndroid Build Coastguard Worker    movif32       dstm, dstq             ; save the advanced dst to dstm (judging by the macro name, x86-32 only - confirm)
2563*c0909341SAndroid Build Coastguard Worker    ret
2564*c0909341SAndroid Build Coastguard Worker
2565*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2566*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
2567*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 10*16
2568*c0909341SAndroid Build Coastguard Worker %else
2569*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 8*16
2570*c0909341SAndroid Build Coastguard Worker %endif
2571*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
2572*c0909341SAndroid Build Coastguard Worker                             dst, stride, left, lpf, w
2573*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
2574*c0909341SAndroid Build Coastguard Worker  %define dstm         dword [esp+calloff+16*8+4*0]
2575*c0909341SAndroid Build Coastguard Worker  %define stridemp     dword [esp+calloff+16*8+4*1]
2576*c0909341SAndroid Build Coastguard Worker  %define leftm        dword [esp+calloff+16*8+4*2]
2577*c0909341SAndroid Build Coastguard Worker  %define lpfm         dword [esp+calloff+16*8+4*3]
2578*c0909341SAndroid Build Coastguard Worker  %define w0m          dword [esp+calloff+16*8+4*4]
2579*c0909341SAndroid Build Coastguard Worker  %define hd           dword [esp+calloff+16*8+4*5]
2580*c0909341SAndroid Build Coastguard Worker  %define edgeb         byte [esp+calloff+16*8+4*6]
2581*c0909341SAndroid Build Coastguard Worker  %define edged        dword [esp+calloff+16*8+4*6]
2582*c0909341SAndroid Build Coastguard Worker  %define leftmp leftm
2583*c0909341SAndroid Build Coastguard Worker %else
2584*c0909341SAndroid Build Coastguard Worker  %define w0m wm
2585*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
2586*c0909341SAndroid Build Coastguard Worker  %define edgeb  byte r7m
2587*c0909341SAndroid Build Coastguard Worker  %define edged dword r7m
2588*c0909341SAndroid Build Coastguard Worker %endif
2589*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0]
2590*c0909341SAndroid Build Coastguard Worker %define w1m    dword [esp+calloff+4*1]
2591*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*2]
2592*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*3]
2593*c0909341SAndroid Build Coastguard Worker %xdefine m8 m6
2594*c0909341SAndroid Build Coastguard Worker %define  m9 [base+pd_0xffff]
2595*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_34816]
2596*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0xf00801c7]
2597*c0909341SAndroid Build Coastguard Worker %define m12 [base+pd_0xf00800a4]
2598*c0909341SAndroid Build Coastguard Worker %define m13 [esp+calloff+16*4]
2599*c0909341SAndroid Build Coastguard Worker %define m14 [esp+calloff+16*5]
2600*c0909341SAndroid Build Coastguard Worker %define m15 [esp+calloff+16*6]
2601*c0909341SAndroid Build Coastguard Worker %define  m6 [esp+calloff+16*7]
2602*c0909341SAndroid Build Coastguard Worker %define base r6-$$
2603*c0909341SAndroid Build Coastguard Worker %assign calloff 0
2604*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
2605*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rstk+stack_offset+ 8]
2606*c0909341SAndroid Build Coastguard Worker    mov          leftq, [rstk+stack_offset+12]
2607*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rstk+stack_offset+16]
2608*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
2609*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
2610*c0909341SAndroid Build Coastguard Worker    mov       stridemp, strideq
2611*c0909341SAndroid Build Coastguard Worker    mov          leftm, leftq
2612*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+24]
2613*c0909341SAndroid Build Coastguard Worker    mov             r2, [rstk+stack_offset+32]
2614*c0909341SAndroid Build Coastguard Worker    mov           lpfm, lpfq
2615*c0909341SAndroid Build Coastguard Worker    mov             hd, r1
2616*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
2617*c0909341SAndroid Build Coastguard Worker %endif
2618*c0909341SAndroid Build Coastguard Worker%else
2619*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
2620*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, params
2621*c0909341SAndroid Build Coastguard Worker%endif
2622*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
2623*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
2624*c0909341SAndroid Build Coastguard Worker%endif
2625*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2626*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
2627*c0909341SAndroid Build Coastguard Worker    lea            r13, [sgr_x_by_x-0xf03]
2628*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
2629*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
2630*c0909341SAndroid Build Coastguard Worker    mova           m15, [paramsq]
2631*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
2632*c0909341SAndroid Build Coastguard Worker    mova            m9, [pd_0xffff]
2633*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+44]
2634*c0909341SAndroid Build Coastguard Worker    mova           m10, [pd_34816]
2635*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
2636*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+400*24+40]
2637*c0909341SAndroid Build Coastguard Worker    mova           m11, [pd_0xf00801c7]
2638*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq*2+400*52+40]
2639*c0909341SAndroid Build Coastguard Worker    mova           m12, [base+pd_0xf00800a4]
2640*c0909341SAndroid Build Coastguard Worker    neg             wq
2641*c0909341SAndroid Build Coastguard Worker    pshuflw        m13, m15, q0000
2642*c0909341SAndroid Build Coastguard Worker    pshuflw        m14, m15, q2222
2643*c0909341SAndroid Build Coastguard Worker    pshufhw        m15, m15, q1010
2644*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m13, m13 ; s0
2645*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m14, m14 ; s1
2646*c0909341SAndroid Build Coastguard Worker    punpckhqdq     m15, m15 ; w0 w1
2647*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
2648*c0909341SAndroid Build Coastguard Worker    psllw          m15, 2
2649*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
2650*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
2651*c0909341SAndroid Build Coastguard Worker%else
2652*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+28] ; params
2653*c0909341SAndroid Build Coastguard Worker    LEA             r6, $$
2654*c0909341SAndroid Build Coastguard Worker    mova            m2, [r1]
2655*c0909341SAndroid Build Coastguard Worker    add           lpfm, wq
2656*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+extra_stack+wq*2+52]
2657*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
2658*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+extra_stack+wq*4+400*24+48]
2659*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
2660*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+extra_stack+wq*2+400*52+48]
2661*c0909341SAndroid Build Coastguard Worker    mov            t3m, t3
2662*c0909341SAndroid Build Coastguard Worker    mov            t4m, t4
2663*c0909341SAndroid Build Coastguard Worker    neg             wq
2664*c0909341SAndroid Build Coastguard Worker    pshuflw         m0, m2, q0000
2665*c0909341SAndroid Build Coastguard Worker    pshuflw         m1, m2, q2222
2666*c0909341SAndroid Build Coastguard Worker    pshufhw         m2, m2, q1010
2667*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m0, m0 ; s0
2668*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m1, m1 ; s1
2669*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m2, m2 ; w0 w1
2670*c0909341SAndroid Build Coastguard Worker    mov            w1m, wd
2671*c0909341SAndroid Build Coastguard Worker    pxor            m3, m3
2672*c0909341SAndroid Build Coastguard Worker    psllw           m2, 2
2673*c0909341SAndroid Build Coastguard Worker    mova           m13, m0
2674*c0909341SAndroid Build Coastguard Worker    mova           m14, m1
2675*c0909341SAndroid Build Coastguard Worker    sub             wd, 2
2676*c0909341SAndroid Build Coastguard Worker    mova           m15, m2
2677*c0909341SAndroid Build Coastguard Worker    mova            m6, m3
2678*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
2679*c0909341SAndroid Build Coastguard Worker    mov            w0m, wd
2680*c0909341SAndroid Build Coastguard Worker %define strideq r5
2681*c0909341SAndroid Build Coastguard Worker%endif
; Row driver. Primes the row-sum buffers from the rows above the block (or
; branches to .no_top when LR_HAVE_TOP is clear), then walks the block two
; rows per iteration: .hv0 handles even rows and .hv1 odd rows.
; .prep_n, .n0, .n1, .v0 and .v1 are defined outside this excerpt.
; NOTE(review): presumably .n0/.n1 emit the filtered output rows and .v0/.v1
; are vertical-only passes used when no new source row is read - confirm
; against their definitions.
2682*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
2683*c0909341SAndroid Build Coastguard Worker    jz .no_top
2684*c0909341SAndroid Build Coastguard Worker    call .h_top ; horizontal sums of the row at lpfq -> t1
2685*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp ; next row
2686*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
; The fixup code is shared with the 5x5 filter (defined outside this excerpt).
; x86-32 enters at the loop label after loading wq itself.
2687*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2688*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
2689*c0909341SAndroid Build Coastguard Worker%else
2690*c0909341SAndroid Build Coastguard Worker    mov             wq, w0m
2691*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
2692*c0909341SAndroid Build Coastguard Worker%endif
2693*c0909341SAndroid Build Coastguard Worker    add             t1, 400*12 ; t2 keeps the old t1; t1 moves 400*12 bytes on
2694*c0909341SAndroid Build Coastguard Worker    call .h_top ; horizontal sums of the second row
2695*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
2696*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
2697*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; continue with the block's own rows
2698*c0909341SAndroid Build Coastguard Worker    add            r10, strideq ; r10 = old lpfq + 5*stride
2699*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
2700*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
2701*c0909341SAndroid Build Coastguard Worker    call .hv0
2702*c0909341SAndroid Build Coastguard Worker.main:
2703*c0909341SAndroid Build Coastguard Worker    dec             hd
2704*c0909341SAndroid Build Coastguard Worker    jz .height1 ; h == 1
2705*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2706*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2707*c0909341SAndroid Build Coastguard Worker    call .hv1
2708*c0909341SAndroid Build Coastguard Worker    call .prep_n
2709*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
2710*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom ; taken when hd went negative
2711*c0909341SAndroid Build Coastguard Worker.main_loop:
2712*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2713*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2714*c0909341SAndroid Build Coastguard Worker    call .hv0
; hd == 0 here means the even row just processed was the last one.
; x86-32 copies hd to r4 before testing (presumably hd is a stack slot there,
; so "test hd, hd" would not encode - confirm against the x86-32 defines).
2715*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2716*c0909341SAndroid Build Coastguard Worker    test            hd, hd
2717*c0909341SAndroid Build Coastguard Worker%else
2718*c0909341SAndroid Build Coastguard Worker    mov             r4, hd
2719*c0909341SAndroid Build Coastguard Worker    test            r4, r4
2720*c0909341SAndroid Build Coastguard Worker%endif
2721*c0909341SAndroid Build Coastguard Worker    jz .odd_height
2722*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2723*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2724*c0909341SAndroid Build Coastguard Worker    call .hv1
2725*c0909341SAndroid Build Coastguard Worker    call .n0
2726*c0909341SAndroid Build Coastguard Worker    call .n1
2727*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
2728*c0909341SAndroid Build Coastguard Worker    jge .main_loop
2729*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
2730*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
2731*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm ; pointer saved earlier as "below"
2732*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
2733*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2734*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2735*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
2736*c0909341SAndroid Build Coastguard Worker.end:
2737*c0909341SAndroid Build Coastguard Worker    call .n0
2738*c0909341SAndroid Build Coastguard Worker    call .n1
2739*c0909341SAndroid Build Coastguard Worker.end2:
2740*c0909341SAndroid Build Coastguard Worker    RET
2741*c0909341SAndroid Build Coastguard Worker.height1: ; reached from .main when hd was 1
2742*c0909341SAndroid Build Coastguard Worker    call .v1
2743*c0909341SAndroid Build Coastguard Worker    call .prep_n
2744*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
2745*c0909341SAndroid Build Coastguard Worker.odd_height: ; reached from .main_loop when hd hit 0 after an even row
2746*c0909341SAndroid Build Coastguard Worker    call .v1
2747*c0909341SAndroid Build Coastguard Worker    call .n0
2748*c0909341SAndroid Build Coastguard Worker    call .n1
2749*c0909341SAndroid Build Coastguard Worker.odd_height_end:
2750*c0909341SAndroid Build Coastguard Worker    call .v0
2751*c0909341SAndroid Build Coastguard Worker    call .v1
2752*c0909341SAndroid Build Coastguard Worker    call .n0
2753*c0909341SAndroid Build Coastguard Worker    jmp .end2 ; only .n0 runs on this path, .n1 is skipped
2754*c0909341SAndroid Build Coastguard Worker.extend_bottom: ; no further source rows are read on this path
2755*c0909341SAndroid Build Coastguard Worker    call .v0
2756*c0909341SAndroid Build Coastguard Worker    call .v1
2757*c0909341SAndroid Build Coastguard Worker    jmp .end
; .no_top: entry used when LR_HAVE_TOP is clear. Runs .h on the block's first
; row (lpfq = dstq), then fills the t2 buffers from t1 before joining .main.
2758*c0909341SAndroid Build Coastguard Worker.no_top:
2759*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
2760*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
2761*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
2762*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2] ; r10 = old lpfq + 6*stride
2763*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
2764*c0909341SAndroid Build Coastguard Worker    call .h
; Reset the column counter to its start value (negated width minus 2).
; x86-32 also records the current row pointer in hvsrcm.
2765*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2766*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2767*c0909341SAndroid Build Coastguard Worker%else
2768*c0909341SAndroid Build Coastguard Worker    mov             wq, w0m
2769*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2770*c0909341SAndroid Build Coastguard Worker%endif
2771*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*12]
; Copy the six t1 buffers to t2, 16 bytes of each per iteration. The buffers
; at 400*0/2/4 (sum5 / sumsq5 as written by .h) are doubled on the way; the
; ones at 400*6/8/10 (sum3 / sumsq3) are copied unchanged.
; NOTE(review): the doubling presumably stands in for the two missing rows
; above the block - confirm against the C reference.
2772*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
2773*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400* 0]
2774*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq*2+400* 2]
2775*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq*2+400* 4]
2776*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0 ; words: sum5 * 2
2777*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+wq*2+400* 6]
2778*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1 ; dwords: sumsq5 * 2
2779*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq*2+400* 8]
2780*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
2781*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq*2+400*10]
2782*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 0], m0
2783*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 2], m1
2784*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 4], m2
2785*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 6], m3
2786*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 8], m4
2787*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*10], m5
2788*c0909341SAndroid Build Coastguard Worker    add             wq, 8
2789*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop ; wq counts up from a negative start towards 0
2790*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2791*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
2792*c0909341SAndroid Build Coastguard Worker    call .v0
2793*c0909341SAndroid Build Coastguard Worker    jmp .main
; .extend_right: right-edge padding for the 16 source bytes held in m5.
; Lanes whose index is below (1 - w) keep their loaded value; every other lane
; is replaced by the byte at [lpfq-1].
; In:  m5 = loaded source bytes, wd = current (negative) column counter,
;      m8 = all-zero (x86-32 callers clear it with pxor before the call; on
;      x86-64 the SWAP below maps the name m8 to the register the prologue
;      zeroed as m6).
; Out: m5 padded; m1, m2, m3 are overwritten.
; Callers only reach this with wd >= -10, so (1 - w) fits a signed byte.
; NOTE(review): [lpfq-1] is presumably the last valid pixel of the row (the
; row pointers are pre-advanced by the width) - confirm for every caller.
2794*c0909341SAndroid Build Coastguard Worker.extend_right:
; Assemble-time offset bookkeeping for code that runs two calls deep.
; NOTE(review): presumably 2 x 4-byte return addresses on x86-32 - confirm.
2795*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8
2796*c0909341SAndroid Build Coastguard Worker%assign calloff 8
2797*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2798*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
2799*c0909341SAndroid Build Coastguard Worker%endif
2800*c0909341SAndroid Build Coastguard Worker    movd            m1, wd
2801*c0909341SAndroid Build Coastguard Worker    movd            m3, [lpfq-1]
2802*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m8 ; all-zero shuffle control: broadcast byte 0 (w)
2803*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m8 ; broadcast the padding byte
2804*c0909341SAndroid Build Coastguard Worker    psubb           m2, [base+pb_1], m1 ; per byte: pb_1 - w
2805*c0909341SAndroid Build Coastguard Worker    pcmpgtb         m2, [base+pb_0to15] ; 0xff where (pb_1 - w) > pb_0to15 lane value
2806*c0909341SAndroid Build Coastguard Worker    pand            m5, m2 ; keep the lanes selected by the mask
2807*c0909341SAndroid Build Coastguard Worker    pandn           m2, m3 ; padding byte in the remaining lanes
2808*c0909341SAndroid Build Coastguard Worker    por             m5, m2
2809*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2810*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m8 ; undo the register renaming done on entry
2811*c0909341SAndroid Build Coastguard Worker%endif
2812*c0909341SAndroid Build Coastguard Worker    ret
; Net +4 relative to the values before .extend_right: applies to the
; subroutines that follow, which are called directly from the row driver.
2813*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4
2814*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; .h / .h_top: horizontal box sums for the source row at lpfq, 8 pixels per
; loop iteration. Results go to the t1 row buffers:
;   [t1+wq*2+400* 0]     sum5    (words)
;   [t1+wq*2+400* 2/4]   sumsq5  (dwords, low/high half)
;   [t1+wq*2+400* 6]     sum3    (words)
;   [t1+wq*2+400* 8/10]  sumsq3  (dwords, low/high half)
; .h     takes the pixels left of the row from the `left` array when
;        LR_HAVE_LEFT is set, and advances the stored left pointer by 4.
; .h_top reads them straight from the row ([lpfq+wq-1]) instead.
; Both use .h_extend_left when LR_HAVE_LEFT is clear.
; wq starts at (negated width - 2): r4-2 on x86-64, w0m on x86-32.
2815*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
2816*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2817*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2818*c0909341SAndroid Build Coastguard Worker%else
2819*c0909341SAndroid Build Coastguard Worker %define leftq r4
2820*c0909341SAndroid Build Coastguard Worker%endif
2821*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2822*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2823*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2824*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4] ; 8 bytes at left-4, duplicated in both halves
2825*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2826*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2] ; aligned load of 16 row bytes
2827*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4
2828*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 13 ; m5 = top 3 bytes of m4, then 13 row bytes
2829*c0909341SAndroid Build Coastguard Worker    jmp .h_main
2830*c0909341SAndroid Build Coastguard Worker.h_extend_left:
2831*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2832*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2833*c0909341SAndroid Build Coastguard Worker    pshufb          m5, [base+sgr_lshuf5] ; NOTE(review): table not visible here; presumably replicates the first pixel leftwards
2834*c0909341SAndroid Build Coastguard Worker    jmp .h_main
2835*c0909341SAndroid Build Coastguard Worker.h_top:
2836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2837*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2838*c0909341SAndroid Build Coastguard Worker%endif
2839*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2840*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2841*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2842*c0909341SAndroid Build Coastguard Worker.h_loop:
2843*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq-1]
2844*c0909341SAndroid Build Coastguard Worker.h_main:
; The flags from this test are consumed by the jnz below; the pxor / SWAP in
; between does not touch EFLAGS. m8 is made to be (or to name) a zero vector.
2845*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2846*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2847*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
2848*c0909341SAndroid Build Coastguard Worker%else
2849*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
2850*c0909341SAndroid Build Coastguard Worker%endif
2851*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
2852*c0909341SAndroid Build Coastguard Worker    cmp             wd, -10
2853*c0909341SAndroid Build Coastguard Worker    jl .h_have_right ; wd < -10: skip the padding for this load
2854*c0909341SAndroid Build Coastguard Worker    call .extend_right
2855*c0909341SAndroid Build Coastguard Worker.h_have_right:
; Below, "pN" is the N-th 16-bit pixel of the 16 unpacked from m5.
2856*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m8 ; p0..p7  (bytes zero-extended to words)
2857*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m8 ; p8..p15
2858*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2 ; p1..p8
2859*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 4 ; p2..p9
2860*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
2861*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
2862*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; p1^2 + p2^2 (low 4 outputs)
2863*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
2864*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3 ; same for the high 4 outputs
2865*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 6 ; p3..p10
2866*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; sum3
2867*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m0, m8
2868*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7 ; p3^2
2869*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m8
2870*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
2871*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2872*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m8 ; zero vector is m6 again; m8 becomes scratch
2873*c0909341SAndroid Build Coastguard Worker%endif
2874*c0909341SAndroid Build Coastguard Worker    paddd           m2, m7             ; sumsq3
2875*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8 ; p4..p11
2876*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m5, m4
2877*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m5 ; p0 + p4 (the two outer taps)
2878*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7 ; p4^2 + p0^2
2879*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4
2880*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2881*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0 ; sumsq3, high half
2882*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 6], m1
2883*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 8], m2
2884*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*10], m3
2885*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1             ; sum5
2886*c0909341SAndroid Build Coastguard Worker    paddd           m7, m2             ; sumsq5
2887*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2888*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 0], m8
2889*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 2], m7
2890*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 4], m5
2891*c0909341SAndroid Build Coastguard Worker    add             wq, 8
2892*c0909341SAndroid Build Coastguard Worker    jl .h_loop ; loop while wq is still negative
2893*c0909341SAndroid Build Coastguard Worker    ret
2894*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv0 / .hv0_bottom: even-row pass. For each group of 8 pixels it
;   1. computes the horizontal sum3/sumsq3 and sum5/sumsq5 of the row at lpfq
;      (same arithmetic as .h),
;   2. stores the row's own sum5/sumsq5 to t3 and accumulates them into the
;      t1 buffers at 400*0/2/4,
;   3. combines sum3/sumsq3 with the values kept in t1 and t2 (400*6/8/10) to
;      get a 3-row total, updating both buffers for the next row,
;   4. derives the 3x3 coefficients and stores them to t4 and t3.
; .hv0 takes left-edge pixels from the `left` array; .hv0_bottom reads them
; from the row itself. Both use .hv0_extend_left when LR_HAVE_LEFT is clear.
; Register roles set up by the x86-64 prologue: m14 = s1, m11 = pd_0xf00801c7,
; m10 = pd_34816, m6 = zero.
2895*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
2896*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2897*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2898*c0909341SAndroid Build Coastguard Worker%else
2899*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2900*c0909341SAndroid Build Coastguard Worker%endif
2901*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2902*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2903*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2904*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]
2905*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2906*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2907*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4 ; advance the stored left pointer for the next row
2908*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 13 ; top 3 bytes of m4, then 13 row bytes
2909*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2910*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
2911*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2912*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
2913*c0909341SAndroid Build Coastguard Worker    pshufb          m5, [base+sgr_lshuf5]
2914*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2915*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
2916*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2917*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
2918*c0909341SAndroid Build Coastguard Worker%else
2919*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2920*c0909341SAndroid Build Coastguard Worker%endif
2921*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2922*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2923*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2924*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2925*c0909341SAndroid Build Coastguard Worker    jmp .hv0_loop_start ; lpfq is already valid on first entry
2926*c0909341SAndroid Build Coastguard Worker%endif
2927*c0909341SAndroid Build Coastguard Worker.hv0_loop:
; x86-32 reloads the row pointer saved in hvsrcm on every iteration.
; NOTE(review): presumably its register is reused inside the loop - confirm.
2928*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2929*c0909341SAndroid Build Coastguard Worker.hv0_loop_start:
2930*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq-1]
2931*c0909341SAndroid Build Coastguard Worker.hv0_main:
2932*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2933*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2934*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
2935*c0909341SAndroid Build Coastguard Worker%else
2936*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
2937*c0909341SAndroid Build Coastguard Worker%endif
2938*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right ; uses the flags of the test above
2939*c0909341SAndroid Build Coastguard Worker    cmp             wd, -10
2940*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right
2941*c0909341SAndroid Build Coastguard Worker    call .extend_right
2942*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
2943*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m8 ; bytes -> words, low 8 pixels
2944*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m8 ; high 8 pixels
2945*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2
2946*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 4
2947*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2948*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
2949*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
2950*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2951*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
2952*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2953*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 6
2954*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; h sum3
2955*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m0, m8
2956*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
2957*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m8
2958*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2959*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m8 ; zero vector is m6 again; m8 becomes scratch
2960*c0909341SAndroid Build Coastguard Worker%endif
2961*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
2962*c0909341SAndroid Build Coastguard Worker    paddd           m2, m7             ; h sumsq3
2963*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
2964*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m5, m4
2965*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m5
2966*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
2967*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4
2968*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2969*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0
2970*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1             ; h sum5
2971*c0909341SAndroid Build Coastguard Worker    paddd           m7, m2             ; h sumsq5
2972*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
; Keep this row's own 5-tap sums in t3, then add them into the t1 totals.
2973*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+ 8], m8
2974*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*0+ 8], m7
2975*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*0+24], m5
2976*c0909341SAndroid Build Coastguard Worker    paddw           m8, [t1+wq*2+400* 0]
2977*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t1+wq*2+400* 2]
2978*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t1+wq*2+400* 4]
2979*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 0], m8
2980*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 2], m7
2981*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 4], m5
; 3-tap sums: m0/m4/m5 = this row + t1; t1 is then overwritten with this row.
2982*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+wq*2+400* 6]
2983*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+wq*2+400* 8]
2984*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+wq*2+400*10]
2985*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 6], m1
2986*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400* 8], m2
2987*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*10], m3
; m1/m2/m3 = (this row + t1) + t2; t2 is then overwritten with (this row + t1).
2988*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400* 6]
2989*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq*2+400* 8]
2990*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq*2+400*10]
2991*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 6], m0
2992*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 8], m4
2993*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*10], m5
; m7 temporarily becomes (or names) the zero vector for the word unpacks.
2994*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2995*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
2996*c0909341SAndroid Build Coastguard Worker%else
2997*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
2998*c0909341SAndroid Build Coastguard Worker%endif
2999*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
3000*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
3001*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a3 * 9
3002*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
3003*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
3004*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0 ; b3 * b3
3005*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
3006*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
3007*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3008*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6 ; restore the names; m7 is scratch again
3009*c0909341SAndroid Build Coastguard Worker%endif
3010*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
3011*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
; MULLD and GATHER_X_BY_X are macros defined outside this excerpt.
; NOTE(review): the last MULLD operand (m7) looks like a scratch register, and
; r0/dstm in GATHER_X_BY_X look like a scratch GPR plus the saved dst - confirm.
3012*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m14, m7        ; p3 * s1
3013*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m14, m7
3014*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3015*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
3016*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11 ; unsigned saturating add on 16-bit lanes
3017*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
3018*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z3, 255)
3019*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
3020*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
3021*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
3022*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
3023*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m7
3024*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m7
3025*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3026*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
3027*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*2+ 4], m3 ; gathered x3 values (words)
3028*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3029*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
3030*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+ 8], m0 ; scaled products (dwords), low half
3031*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+24], m1 ; high half
3032*c0909341SAndroid Build Coastguard Worker    add             wq, 8
3033*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop ; loop while wq is still negative
3034*c0909341SAndroid Build Coastguard Worker    ret
3035*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3036*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
3037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3038*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
3039*c0909341SAndroid Build Coastguard Worker%else
3040*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
3041*c0909341SAndroid Build Coastguard Worker%endif
3042*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
3043*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
3044*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
3045*c0909341SAndroid Build Coastguard Worker    movddup         m4, [leftq-4]
3046*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
3047*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
3048*c0909341SAndroid Build Coastguard Worker    add         leftmp, 4
3049*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 13
3050*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
3051*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
3052*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
3053*c0909341SAndroid Build Coastguard Worker    mova            m5, [lpfq+wq+2]
3054*c0909341SAndroid Build Coastguard Worker    pshufb          m5, [base+sgr_lshuf5]
3055*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
3056*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
3057*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3058*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]
3059*c0909341SAndroid Build Coastguard Worker%else
3060*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
3061*c0909341SAndroid Build Coastguard Worker%endif
3062*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
3063*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
3064*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
3065*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3066*c0909341SAndroid Build Coastguard Worker    jmp .hv1_loop_start
3067*c0909341SAndroid Build Coastguard Worker%endif
3068*c0909341SAndroid Build Coastguard Worker.hv1_loop:
3069*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
3070*c0909341SAndroid Build Coastguard Worker.hv1_loop_start:
3071*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq-1]
3072*c0909341SAndroid Build Coastguard Worker.hv1_main:
3073*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
3074*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3075*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
3076*c0909341SAndroid Build Coastguard Worker%else
3077*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3078*c0909341SAndroid Build Coastguard Worker%endif
3079*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
3080*c0909341SAndroid Build Coastguard Worker    cmp             wd, -10
3081*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
3082*c0909341SAndroid Build Coastguard Worker    call .extend_right
3083*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
3084*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m8
3085*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m8
3086*c0909341SAndroid Build Coastguard Worker    palignr         m7, m5, m4, 2
3087*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 4
3088*c0909341SAndroid Build Coastguard Worker    paddw           m2, m7, m3
3089*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m7, m3
3090*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
3091*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m3
3092*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
3093*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 6
3094*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3             ; h sum3
3095*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m3, m8
3096*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
3097*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m8
3098*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3099*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m8
3100*c0909341SAndroid Build Coastguard Worker%endif
3101*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
3102*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1             ; h sumsq3
3103*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
3104*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m5
3105*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m5
3106*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
3107*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5
3108*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
3109*c0909341SAndroid Build Coastguard Worker    paddd           m7, m3
3110*c0909341SAndroid Build Coastguard Worker    paddw           m5, m2, [t2+wq*2+400* 6]
3111*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 6], m2
3112*c0909341SAndroid Build Coastguard Worker    paddw           m8, m2             ; h sum5
3113*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t2+wq*2+400* 8]
3114*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7, [t2+wq*2+400*10]
3115*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 8], m0
3116*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*10], m7
3117*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0             ; h sumsq5
3118*c0909341SAndroid Build Coastguard Worker    paddd           m1, m7
3119*c0909341SAndroid Build Coastguard Worker    pslld           m0, m2, 3
3120*c0909341SAndroid Build Coastguard Worker    pslld           m7, m3, 3
3121*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0             ; a3 * 9
3122*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7
3123*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3124*c0909341SAndroid Build Coastguard Worker    mova      [esp+20], m8
3125*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
3126*c0909341SAndroid Build Coastguard Worker%else
3127*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3128*c0909341SAndroid Build Coastguard Worker%endif
3129*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m5, m8         ; b3
3130*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m0, m0
3131*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m8
3132*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m5, m5
3133*c0909341SAndroid Build Coastguard Worker    psubd           m2, m7             ; p3
3134*c0909341SAndroid Build Coastguard Worker    psubd           m3, m8
3135*c0909341SAndroid Build Coastguard Worker    MULLD           m2, m14, m8        ; p3 * s1
3136*c0909341SAndroid Build Coastguard Worker    MULLD           m3, m14, m8
3137*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3138*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m11
3139*c0909341SAndroid Build Coastguard Worker    paddusw         m2, m11
3140*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m11
3141*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; min(z3, 255)
3142*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
3143*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
3144*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m8, m2, m3, r0, dstm
3145*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m8, m8
3146*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m8, m8
3147*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m2, m7
3148*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m3, m7
3149*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3150*c0909341SAndroid Build Coastguard Worker    paddd           m5, m10
3151*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3152*c0909341SAndroid Build Coastguard Worker    psrld           m5, 12
3153*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*4+ 4], m8
3154*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+ 8], m0
3155*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+24], m5
3156*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3157*c0909341SAndroid Build Coastguard Worker    mova            m8, [esp+20]
3158*c0909341SAndroid Build Coastguard Worker%else
3159*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m8
3160*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
3161*c0909341SAndroid Build Coastguard Worker%endif
3162*c0909341SAndroid Build Coastguard Worker    paddw           m5, m8, [t2+wq*2+400*0]
3163*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq*2+400*2]
3164*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t2+wq*2+400*4]
3165*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t1+wq*2+400*0]
3166*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+wq*2+400*2]
3167*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+wq*2+400*4]
3168*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m8
3169*c0909341SAndroid Build Coastguard Worker    pslld           m0, m2, 4
3170*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m4
3171*c0909341SAndroid Build Coastguard Worker    pslld           m8, m3, 4
3172*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m1
3173*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
3174*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0
3175*c0909341SAndroid Build Coastguard Worker    pslld           m7, m3, 3
3176*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
3177*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; a5 * 25
3178*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7
3179*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3180*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3181*c0909341SAndroid Build Coastguard Worker%else
3182*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3183*c0909341SAndroid Build Coastguard Worker%endif
3184*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m5, m7         ; b5
3185*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m0, m0
3186*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m7
3187*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m5, m5
3188*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3189*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3190*c0909341SAndroid Build Coastguard Worker%endif
3191*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4             ; p5
3192*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1
3193*c0909341SAndroid Build Coastguard Worker    MULLD           m2, m13, m7        ; p5 * s0
3194*c0909341SAndroid Build Coastguard Worker    MULLD           m3, m13, m7
3195*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b5 * 164
3196*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m12
3197*c0909341SAndroid Build Coastguard Worker    paddusw         m2, m12
3198*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
3199*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; min(z5, 255)
3200*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
3201*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m1, m2, m3, r0, dstm
3202*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m1, m1
3203*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1, m1
3204*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m2, m7
3205*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m3, m7
3206*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
3207*c0909341SAndroid Build Coastguard Worker    paddd           m5, m10
3208*c0909341SAndroid Build Coastguard Worker    mova   [t4+wq*2+4], m1
3209*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3210*c0909341SAndroid Build Coastguard Worker    psrld           m5, 12
3211*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+ 8], m0
3212*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+24], m5
3213*c0909341SAndroid Build Coastguard Worker    add             wq, 8
3214*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
3215*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
3216*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
3217*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
3218*c0909341SAndroid Build Coastguard Worker    ret
3219*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows)
    ; Purpose: for each group of 8 columns, add the 3-tap sums of the t1 row
    ; (counted twice) to those of the t2 row, turn the result into the box3
    ; x3 / b3 outputs, then stash and double the 5-tap sums of the t1 row.
    ; Row layout as written by .hv1: +400*6 = sum3, +400*8 / +400*10 = the two
    ; dword halves of sumsq3, +400*0 = sum5, +400*2 / +400*4 = sumsq5 halves.
    ; Returns with a plain ret; wq and m0-m5/m7 are overwritten.
3220*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3221*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]  ; column counter, advanced by 8 until no longer negative
3222*c0909341SAndroid Build Coastguard Worker%else
3223*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m  ; x86-32: starting counter is loaded from w0m
3224*c0909341SAndroid Build Coastguard Worker%endif
3225*c0909341SAndroid Build Coastguard Worker.v0_loop:
3226*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq*2+400* 6]  ; sum3 of the t1 row
3227*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq*2+400* 8]  ; sumsq3 of the t1 row, first half
3228*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq*2+400*10]  ; sumsq3 of the t1 row, second half
3229*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0  ; count the t1 row twice
3230*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
3231*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
3232*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq*2+400* 6]  ; sum3   = 2*t1 + t2
3233*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq*2+400* 8]  ; sumsq3 = 2*t1 + t2
3234*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq*2+400*10]
3235*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 6], m0  ; t2 row now holds the doubled t1 sums
3236*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 8], m4
3237*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*10], m5
    ; m7 is used as the zero operand of the word->dword unpacks below.
    ; NOTE(review): the x86-64 path only renames m6 to m7, so m6 is presumably
    ; kept zeroed by the caller/previous code - confirm.
3238*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3239*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3240*c0909341SAndroid Build Coastguard Worker%else
3241*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3242*c0909341SAndroid Build Coastguard Worker%endif
3243*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3  ; a3 * 8
3244*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
3245*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a3 * 9
3246*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
3247*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
3248*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0  ; b3 * b3
3249*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
3250*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
3251*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
3252*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
3253*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3254*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3255*c0909341SAndroid Build Coastguard Worker%endif
3256*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m14, m7        ; p3 * s1
3257*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m14, m7
3258*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3259*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
    ; NOTE(review): the saturating word add of m11 presumably supplies the
    ; rounding/clamp that makes the shift below yield min(z3, 255) - confirm
    ; against the definition of the constant held in m11.
3260*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11
3261*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
3262*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z3, 255)
3263*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
3264*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm  ; m3 = x3 words for the z3 values (macro defined outside this chunk)
3265*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3  ; each x3 word duplicated into both halves of a dword
3266*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
3267*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m7
3268*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m7
3269*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3270*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
3271*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*2+4], m3  ; store the x3 words
3272*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3273*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
3274*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+wq*2+400*0]  ; sum5 of the t1 row
3275*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq*2+400*2]  ; sumsq5 of the t1 row, first half
3276*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq*2+400*4]  ; sumsq5 of the t1 row, second half
    ; Stash the undoubled 5-tap sums in t3; .v1 loads these same three slots
    ; before overwriting them with its own results.
3277*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+ 8], m3
3278*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*0+ 8], m4
3279*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*0+24], m5
3280*c0909341SAndroid Build Coastguard Worker    paddw           m3, m3 ; cc5
3281*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
3282*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
3283*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*0], m3  ; t1 row keeps the doubled 5-tap sums
3284*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*2], m4
3285*c0909341SAndroid Build Coastguard Worker    mova [t1+wq*2+400*4], m5
3286*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+ 8], m0  ; store the shifted x3 * b3 * 455 dwords
3287*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*4+24], m1
3288*c0909341SAndroid Build Coastguard Worker    add             wq, 8  ; next 8 columns
3289*c0909341SAndroid Build Coastguard Worker    jl .v0_loop
3290*c0909341SAndroid Build Coastguard Worker    ret
3291*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
    ; Purpose: for each group of 8 columns, add the 3-tap sums of the t1 and
    ; t2 rows and produce the box3 x3 / b3 outputs, then add the 5-tap sums
    ; stashed in t3 to those of the t2 and t1 rows and produce the box5
    ; x5 / b5 outputs. The t1 and t2 pointers are exchanged before returning.
    ; Row layout as written by .hv1: +400*6 = sum3, +400*8 / +400*10 = the two
    ; dword halves of sumsq3, +400*0 = sum5, +400*2 / +400*4 = sumsq5 halves.
    ; Overwrites wq, r10 and m0-m5/m7/m8.
3292*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3293*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-2]  ; column counter, advanced by 8 until no longer negative
3294*c0909341SAndroid Build Coastguard Worker%else
3295*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m  ; x86-32: starting counter is loaded from w0m
3296*c0909341SAndroid Build Coastguard Worker%endif
3297*c0909341SAndroid Build Coastguard Worker.v1_loop:
3298*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq*2+400* 6]  ; sum3 of the t1 row
3299*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq*2+400* 8]  ; sumsq3 of the t1 row, first half
3300*c0909341SAndroid Build Coastguard Worker    mova            m7, [t1+wq*2+400*10]  ; sumsq3 of the t1 row, second half
3301*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+wq*2+400* 6]  ; sum3   = t1 + t2
3302*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+wq*2+400* 8]  ; sumsq3 = t1 + t2
3303*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7, [t2+wq*2+400*10]
3304*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 6], m4  ; copy the t1 3-tap sums into the t2 row
3305*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400* 8], m5
3306*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*10], m7
    ; m7 is used as the zero operand of the word->dword unpacks below.
    ; NOTE(review): the x86-64 path only renames m6 to m7, so m6 is presumably
    ; kept zeroed by the caller/previous code - confirm.
3307*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3308*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3309*c0909341SAndroid Build Coastguard Worker%else
3310*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3311*c0909341SAndroid Build Coastguard Worker%endif
3312*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3  ; a3 * 8
3313*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
3314*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a3 * 9
3315*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
3316*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
3317*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0  ; b3 * b3
3318*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
3319*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
3320*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
3321*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
3322*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3323*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3324*c0909341SAndroid Build Coastguard Worker%endif
3325*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m14, m7        ; p3 * s1
3326*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m14, m7
3327*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3328*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
    ; NOTE(review): the saturating word add of m11 presumably supplies the
    ; rounding/clamp that makes the shift below yield min(z3, 255) - confirm
    ; against the definition of the constant held in m11.
3329*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11
3330*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
3331*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z3, 255)
3332*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
3333*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm  ; m3 = x3 words for the z3 values (macro defined outside this chunk)
3334*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3  ; each x3 word duplicated into both halves of a dword
3335*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
3336*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m7
3337*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m7
3338*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3339*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
3340*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*4+4], m3  ; store the x3 words
3341*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3342*c0909341SAndroid Build Coastguard Worker    psrld           m8, m1, 12  ; second half goes to m8 because m1 is reused just below
    ; Box5 part: fetch the 5-tap sums held in t3 (the slots .v0 stores to)
    ; before the 400*8 slots are overwritten with the box3 results.
3343*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+wq*4+400*8+ 8]  ; stashed sum5
3344*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+wq*4+400*0+ 8]  ; stashed sumsq5, first half
3345*c0909341SAndroid Build Coastguard Worker    mova            m7, [t3+wq*4+400*0+24]  ; stashed sumsq5, second half
3346*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+wq*2+400*0]  ; sum5   = stash + t2 ...
3347*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+wq*2+400*2]  ; sumsq5 = stash + t2 ...
3348*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7, [t2+wq*2+400*4]
3349*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+wq*2+400*0]  ; ... + t1
3350*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+wq*2+400*2]
3351*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+wq*2+400*4]
3352*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*0], m4  ; t2 row takes over the stashed 5-tap sums
3353*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*2], m5
3354*c0909341SAndroid Build Coastguard Worker    mova [t2+wq*2+400*4], m7
3355*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 4  ; a5 * 16
3356*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+ 8], m0  ; store the shifted x3 * b3 * 455 dwords, first half
3357*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 4
3358*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*8+24], m8  ; second half
3359*c0909341SAndroid Build Coastguard Worker    pslld           m7, m2, 3  ; a5 * 8
3360*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4  ; a5 * 17
3361*c0909341SAndroid Build Coastguard Worker    pslld           m8, m3, 3
3362*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
3363*c0909341SAndroid Build Coastguard Worker    paddd           m2, m7             ; a5 * 25
3364*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
3365*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3366*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3367*c0909341SAndroid Build Coastguard Worker%else
3368*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3369*c0909341SAndroid Build Coastguard Worker%endif
3370*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b5
3371*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m0, m0  ; b5 * b5
3372*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
3373*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m1, m1
3374*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4             ; p5
3375*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5
3376*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3377*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3378*c0909341SAndroid Build Coastguard Worker%endif
3379*c0909341SAndroid Build Coastguard Worker    MULLD           m2, m13, m7        ; p5 * s0
3380*c0909341SAndroid Build Coastguard Worker    MULLD           m3, m13, m7
3381*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b5 * 164
3382*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
3383*c0909341SAndroid Build Coastguard Worker    paddusw         m2, m12  ; same saturating-add step as for z3, using the box5 constant
3384*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
3385*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; min(z5, 255)
3386*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
3387*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m4, m2, m3, r0, dstm  ; m4 = x5 words for the z5 values
3388*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m4  ; each x5 word duplicated into both halves of a dword
3389*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m4
3390*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m2, m7
3391*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m3, m7
3392*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
3393*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
3394*c0909341SAndroid Build Coastguard Worker    mova   [t4+wq*2+4], m4  ; store the x5 words
3395*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3396*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
3397*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+ 8], m0  ; store the shifted x5 * b5 * 164 dwords
3398*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*4+24], m1
3399*c0909341SAndroid Build Coastguard Worker    add             wq, 8  ; next 8 columns
3400*c0909341SAndroid Build Coastguard Worker    jl .v1_loop
3401*c0909341SAndroid Build Coastguard Worker    mov            r10, t2  ; exchange the t1 and t2 row pointers via r10
3402*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
3403*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
3404*c0909341SAndroid Build Coastguard Worker    ret
3405*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
    ; Purpose: for each group of 8 columns, combine three horizontally adjacent
    ; entries (offsets +0, +1, +2 elements) of the a/b rows in t4 (words) and
    ; t3 (dwords) into weighted sums and store them for later use:
    ;   5,6,5 weights of the rows at t4+400*0 / t3+400*0  -> t4+400*6  / t3+400*12
    ;   3,4,3 weights of the rows at t4+400*2 / t3+400*4  -> t4+400*8  / t3+400*16
    ;   4,4,4 weights of the rows at t4+400*4 / t3+400*8  -> t4+400*10 / t3+400*20
    ;   3,4,3 weights of the same rows                    -> t4+400*12 / t3+400*24
    ; Overwrites wq and m0-m5/m7/m8; returns with a plain ret.
3406*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4  ; presumably emitted on x86-64 only: counter starts at r4
3407*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m  ; presumably emitted on x86-32 only: counter starts at w1m
3408*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
3409*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+400*0+ 2]  ; a5, middle entry
3410*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*0+ 4]  ; b5, middle entry (first 4 dwords)
3411*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+400*0+20]  ; b5, middle entry (next 4 dwords)
3412*c0909341SAndroid Build Coastguard Worker    movu            m7, [t4+wq*2+400*0+ 4]  ; a5, entry at +2 elements
3413*c0909341SAndroid Build Coastguard Worker    movu            m8, [t3+wq*4+400*0+ 8]
3414*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+wq*2+400*0+ 0]  ; middle + entry at +0
3415*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+wq*4+400*0+ 0]
3416*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*4+400*0+16]
3417*c0909341SAndroid Build Coastguard Worker    paddw           m3, m7  ; sum of all three entries
3418*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
3419*c0909341SAndroid Build Coastguard Worker    movu            m7, [t3+wq*4+400*0+24]
3420*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3  ; middle + sum
3421*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
3422*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2  ; sum * 4
3423*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
3424*c0909341SAndroid Build Coastguard Worker    paddd           m5, m7
3425*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
3426*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3427*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3               ; a5 565
3428*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4               ; b5 565
3429*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
3430*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400* 6+ 0], m0
3431*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+ 0], m1
3432*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+16], m2
3433*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+400*2+ 4]  ; a3[-1], entry at +2 elements
3434*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*4+ 8]
3435*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+400*4+24]
3436*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*2+ 2]  ; a3[-1], middle entry
3437*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+400*4+ 4]
3438*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*4+20]
3439*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*2+400*2+ 0]  ; sum of the two outer entries
3440*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*4+ 0]
3441*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*4+400*4+16]
3442*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0  ; sum of all three entries
3443*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3444*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
3445*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                ; a3[-1] 444
3446*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                ; b3[-1] 444
3447*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3448*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0               ; a3[-1] 343
3449*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1               ; b3[-1] 343
3450*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
3451*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400* 8+ 0], m3
3452*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+ 0], m4
3453*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+16], m5
3454*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+400*4+ 4]  ; a3[ 0], entry at +2 elements
3455*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*8+ 8]
3456*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*4+400*8+24]
3457*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*4+ 2]  ; a3[ 0], middle entry
3458*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+400*8+ 4]
3459*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*8+20]
3460*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*2+400*4+ 0]  ; sum of the two outer entries
3461*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*8+ 0]
3462*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*4+400*8+16]
3463*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0  ; sum of all three entries
3464*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3465*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
3466*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                 ; a3[ 0] 444
3467*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                 ; b3[ 0] 444
3468*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3469*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*10+ 0], m3  ; the 444 form is stored as well as the 343 form below
3470*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*20+ 0], m4
3471*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*20+16], m5
3472*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0                ; a3[ 0] 343
3473*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1                ; b3[ 0] 343
3474*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
3475*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*12+ 0], m3
3476*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*24+ 0], m4
3477*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*24+16], m5
3478*c0909341SAndroid Build Coastguard Worker    add             wq, 8  ; next 8 columns; loop until the counter is no longer negative
3479*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
3480*c0909341SAndroid Build Coastguard Worker    ret
3481*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3482*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
; Purpose: for an even row, build the horizontally weighted a5/b5 and a3/b3
; neighbor sums, combine them with rows kept from earlier calls, apply them to
; the pixels at dstq and write the filtered pixels back in place.
;
; Each loop iteration handles 8 pixels at [dstq+wq]. wq is advanced by 8 and
; the loop repeats while the result is still negative (jl); afterwards dstq is
; advanced by stridemp.
;
; Buffers (as used here): t4 is indexed as 16-bit words (wq*2) and holds the
; "a" values, t3 is indexed as 32-bit dwords (wq*4) and holds the "b" values.
; The 400*N displacements select different rows/slots inside those buffers.
;
; Registers set up outside this view: m9 (bit mask), m15 (pmaddwd
; multipliers), m6 (used as an all-zero register for unpacking).
; NOTE(review): movif64/movif32 are macros defined elsewhere; by their names
; each presumably emits its mov only on the matching architecture — confirm.
3483*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
3484*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
3485*c0909341SAndroid Build Coastguard Worker.n0_loop:
; --- a5: weights 5,6,5 over three adjacent words x0,x1,x2 of the row at t4.
; m0 = x0+x1+x2, m2 = x0+2*x1+x2, result = 4*m0 + m2 = 5*x0 + 6*x1 + 5*x2.
3486*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*2+ 4]
3487*c0909341SAndroid Build Coastguard Worker    movu            m2, [t4+wq*2+ 2]
3488*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*2+ 0]
3489*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
3490*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0
3491*c0909341SAndroid Build Coastguard Worker    psllw           m0, 2
3492*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2               ; a5
; --- b5: the same 5,6,5 weighting on the dword row at t3.
; m4 covers the first four columns, m5 the next four.
3493*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*4+ 8]
3494*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+24]
3495*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+ 4]
3496*c0909341SAndroid Build Coastguard Worker    movu            m3, [t3+wq*4+20]
3497*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*4+ 0]
3498*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*4+16]
3499*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3500*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
3501*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
3502*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
3503*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
3504*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3505*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1               ; b5
3506*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
; m2 = a5 row kept in the 400*6 slot + the new a5 row; the slot is then
; overwritten with the new row.
3507*c0909341SAndroid Build Coastguard Worker    movu            m2, [t4+wq*2+400* 6]
3508*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0
3509*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400* 6], m0
; m0/m1 = new b5 + b5 row kept in the 400*12 slot (low/high four columns);
; the slot is overwritten with the new row. The high half (m1) is spilled to
; the stack because m1 is reused below.
3510*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4, [t3+wq*4+400*12+ 0]
3511*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5, [t3+wq*4+400*12+16]
3512*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+ 0], m4
3513*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*12+16], m5
3514*c0909341SAndroid Build Coastguard Worker    mova [rsp+16+ARCH_X86_32*4], m1
; --- a3 from the word row at t4+400*2 (adjacent words y0,y1,y2):
; "444" = 4*(y0+y1+y2); "343" = 444 - (y0+y2) = 3*y0 + 4*y1 + 3*y2.
3515*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*2+4]
3516*c0909341SAndroid Build Coastguard Worker    movu            m5, [t4+wq*2+400*2+2]
3517*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*2+0]
3518*c0909341SAndroid Build Coastguard Worker    paddw           m5, m3
3519*c0909341SAndroid Build Coastguard Worker    psllw           m5, 2                ; a3[ 1] 444
3520*c0909341SAndroid Build Coastguard Worker    psubw           m4, m5, m3           ; a3[ 1] 343
; m3 = value in the 400*8 slot + value in the 400*10 slot + new 343.
; Afterwards the 400*8 slot receives the new 343 and the 400*10 slot the
; new 444.
3521*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400* 8]
3522*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*10]
3523*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
3524*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400* 8], m4
3525*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*10], m5
; --- b3 from the dword row at t3+400*4, same 444/343 construction.
; m1/m5 handle the first four columns, m7/m8 the next four.
3526*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*4+ 8]
3527*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*4+ 4]
3528*c0909341SAndroid Build Coastguard Worker    movu            m7, [t3+wq*4+400*4+24]
3529*c0909341SAndroid Build Coastguard Worker    movu            m8, [t3+wq*4+400*4+20]
3530*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*4+ 0]
3531*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*4+400*4+16]
3532*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1
3533*c0909341SAndroid Build Coastguard Worker    paddd           m8, m7
3534*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
3535*c0909341SAndroid Build Coastguard Worker    pslld           m8, 2
3536*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
; Both the 444 and the 343 value of the high half are needed below.
; x86-32: park the 444 value at [esp+52], then turn m8 into 343.
; x86-64: compute 343 into m6 and swap the names, so that m8 = 343 and
;         m6 temporarily holds 444 (m6 is re-zeroed further down).
3537*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3538*c0909341SAndroid Build Coastguard Worker    mova      [esp+52], m8
3539*c0909341SAndroid Build Coastguard Worker    psubd           m8, m7
3540*c0909341SAndroid Build Coastguard Worker%else
3541*c0909341SAndroid Build Coastguard Worker    psubd           m6, m8, m7
3542*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3543*c0909341SAndroid Build Coastguard Worker%endif
; m1/m7 = new 343 + value in the 400*16 slot + value in the 400*20 slot.
; Afterwards the 400*16 slot receives the new 343 and the 400*20 slot the
; new 444.
3544*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+wq*4+400*16+ 0]
3545*c0909341SAndroid Build Coastguard Worker    paddd           m7, m8, [t3+wq*4+400*16+16]
3546*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*20+ 0]
3547*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*4+400*20+16]
3548*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+ 0], m4
3549*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*16+16], m8
3550*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*20+ 0], m5
; Bring the high-half 444 value back into m8. On x86-64 this undoes the
; earlier SWAP and clears m6, which serves as the zero register for the
; unpack instructions below.
3551*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3552*c0909341SAndroid Build Coastguard Worker    mova            m8, [esp+52]
3553*c0909341SAndroid Build Coastguard Worker%else
3554*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3555*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
3556*c0909341SAndroid Build Coastguard Worker%endif
3557*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*20+16], m8
3558*c0909341SAndroid Build Coastguard Worker    mova [rsp+32+ARCH_X86_32*4], m7      ; spill b3 high half, m7 is reused next
; --- Apply to the pixels: load 8 bytes from dst and widen them to words by
; interleaving with m6.
; NOTE(review): this requires m6 == 0. The x86-64 path clears it just above;
; the x86-32 path does not write m6 in this block, so it presumably is zero
; on entry — confirm against the caller.
3559*c0909341SAndroid Build Coastguard Worker    movq            m4, [dstq+wq]
3560*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m6
; Interleaving words with zero gives one dword per column whose upper word is
; 0, so pmaddwd yields a plain per-column product a * src.
3561*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6
3562*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m2, m6
3563*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m5               ; a5 * src
3564*c0909341SAndroid Build Coastguard Worker    punpcklwd       m8, m3, m6
3565*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m5               ; a3 * src
3566*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4, m6
3567*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m6
3568*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m5               ; a5 * src, high four columns
3569*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
3570*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m5               ; a3 * src, high four columns
3571*c0909341SAndroid Build Coastguard Worker    psubd           m0, m7               ; b5 - a5 * src + (1 << 8) - (src << 13)
3572*c0909341SAndroid Build Coastguard Worker    psubd           m1, m8               ; b3 - a3 * src + (1 << 8) - (src << 13)
; Merge both terms into one dword per column: the bits selected by m9 are
; taken from (b5 term >> 9), all other bits from (b3 term << 7). The b5 term
; here is a sum of two rows (new + stored).
; NOTE(review): m9 is presumably a low-16-bit mask so that each dword ends up
; as a {b5, b3} word pair for the pmaddwd with m15 below — confirm.
3573*c0909341SAndroid Build Coastguard Worker    psrld           m0, 9
3574*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
3575*c0909341SAndroid Build Coastguard Worker    pand            m0, m9
3576*c0909341SAndroid Build Coastguard Worker    pandn           m8, m9, m1
3577*c0909341SAndroid Build Coastguard Worker    por             m0, m8
; Same merge for the high four columns, using the two values spilled earlier.
3578*c0909341SAndroid Build Coastguard Worker    mova            m1, [rsp+16+ARCH_X86_32*4]
3579*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2
3580*c0909341SAndroid Build Coastguard Worker    mova            m2, [rsp+32+ARCH_X86_32*4]
3581*c0909341SAndroid Build Coastguard Worker    psubd           m2, m3
3582*c0909341SAndroid Build Coastguard Worker    mova            m3, [base+pd_4096]
3583*c0909341SAndroid Build Coastguard Worker    psrld           m1, 9
3584*c0909341SAndroid Build Coastguard Worker    pslld           m2, 7
3585*c0909341SAndroid Build Coastguard Worker    pand            m1, m9
3586*c0909341SAndroid Build Coastguard Worker    pandn           m5, m9, m2
3587*c0909341SAndroid Build Coastguard Worker    por             m1, m5
; Multiply each word pair by the word pair in m15 and add the two products,
; add the pd_4096 constant, arithmetic-shift right by 13.
3588*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
3589*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m15
3590*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
3591*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
3592*c0909341SAndroid Build Coastguard Worker    psrad           m0, 13
3593*c0909341SAndroid Build Coastguard Worker    psrad           m1, 13
; Narrow to words, add the source pixels (m4), saturate to unsigned bytes and
; store the 8 output pixels over the input.
3594*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
3595*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
3596*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0
3597*c0909341SAndroid Build Coastguard Worker    movq     [dstq+wq], m0
3598*c0909341SAndroid Build Coastguard Worker    add             wq, 8
3599*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
3600*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp         ; advance dst to the next row
3601*c0909341SAndroid Build Coastguard Worker    ret
3602*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3603*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
; Purpose: for an odd row, build the horizontally weighted a3/b3 neighbor
; sums, combine them with rows kept from earlier calls, and filter the pixels
; at dstq in place. Unlike .n0, no new a5/b5 row is computed here: the a5 row
; in the t4+400*6 slot and the b5 row in the t3+400*12 slot are read as they
; are, which matches the smaller right shift (8 instead of 9) on the b5 term.
;
; Each loop iteration handles 8 pixels at [dstq+wq]. wq is advanced by 8 and
; the loop repeats while the result is still negative (jl); afterwards dstq is
; advanced by stridemp.
;
; Registers set up outside this view: m9 (bit mask), m15 (pmaddwd
; multipliers), m6 (used as an all-zero register for unpacking; this block
; never writes it).
; NOTE(review): movif64/movif32 are macros defined elsewhere; by their names
; each presumably emits its mov only on the matching architecture — confirm.
3604*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
3605*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
3606*c0909341SAndroid Build Coastguard Worker.n1_loop:
; --- a3 from the word row at t4+400*4 (adjacent words y0,y1,y2):
; "444" = 4*(y0+y1+y2); "343" = 444 - (y0+y2) = 3*y0 + 4*y1 + 3*y2.
3607*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*2+400*4+4]
3608*c0909341SAndroid Build Coastguard Worker    movu            m5, [t4+wq*2+400*4+2]
3609*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*4+0]
3610*c0909341SAndroid Build Coastguard Worker    paddw           m5, m3
3611*c0909341SAndroid Build Coastguard Worker    psllw           m5, 2                ; a3[ 1] 444
3612*c0909341SAndroid Build Coastguard Worker    psubw           m4, m5, m3           ; a3[ 1] 343
; m3 = new 343 + value in the 400*12 slot + value in the 400*10 slot.
; Afterwards the 400*10 slot receives the new 444 and the 400*12 slot the
; new 343.
3613*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t4+wq*2+400*12]
3614*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*2+400*10]
3615*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*10], m5
3616*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*2+400*12], m4
; --- b3 from the dword row at t3+400*8, same 444/343 construction.
; m1/m5/m4 handle the first four columns, m7/m8/m0 the next four.
3617*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*4+400*8+ 8]
3618*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*4+400*8+ 4]
3619*c0909341SAndroid Build Coastguard Worker    movu            m7, [t3+wq*4+400*8+24]
3620*c0909341SAndroid Build Coastguard Worker    movu            m8, [t3+wq*4+400*8+20]
3621*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*8+ 0]
3622*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*4+400*8+16]
3623*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1
3624*c0909341SAndroid Build Coastguard Worker    paddd           m8, m7
3625*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
3626*c0909341SAndroid Build Coastguard Worker    pslld           m8, 2
3627*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
3628*c0909341SAndroid Build Coastguard Worker    psubd           m0, m8, m7           ; 343, high four columns
; m1/m7 = new 343 + value in the 400*24 slot + value in the 400*20 slot.
; Afterwards the 400*20 slot receives the new 444 and the 400*24 slot the
; new 343.
3629*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+wq*4+400*24+ 0]
3630*c0909341SAndroid Build Coastguard Worker    paddd           m7, m0, [t3+wq*4+400*24+16]
3631*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*4+400*20+ 0]
3632*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*4+400*20+16]
3633*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*20+ 0], m5
3634*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*20+16], m8
3635*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*24+ 0], m4
3636*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*4+400*24+16], m0
; --- Apply to the pixels: load 8 bytes from dst and the a5 row kept in the
; t4+400*6 slot, then widen the pixels to words by interleaving with m6.
; NOTE(review): this requires m6 == 0 on entry — confirm against the caller.
3637*c0909341SAndroid Build Coastguard Worker    movq            m5, [dstq+wq]
3638*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+wq*2+400* 6]
3639*c0909341SAndroid Build Coastguard Worker    punpcklbw       m5, m6
; Interleaving words with zero gives one dword per column whose upper word is
; 0, so pmaddwd yields a plain per-column product a * src.
3640*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
3641*c0909341SAndroid Build Coastguard Worker    punpcklwd       m8, m2, m6
3642*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m4               ; a5 * src
3643*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m3, m6
3644*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m4               ; a3 * src
3645*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5, m6
3646*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m6
3647*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4               ; a5 * src, high four columns
3648*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
3649*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4               ; a3 * src, high four columns
3650*c0909341SAndroid Build Coastguard Worker    psubd           m1, m0               ; b3 - a3 * src + (1 << 8) - (src << 13)
3651*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+wq*4+400*12+ 0]
3652*c0909341SAndroid Build Coastguard Worker    psubd           m0, m8               ; b5 - a5 * src + (1 << 8) - (src << 13)
3653*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+wq*4+400*12+16]
3654*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; b5 term, high four columns
3655*c0909341SAndroid Build Coastguard Worker    psubd           m7, m3               ; b3 term, high four columns
; Merge both terms into one dword per column: the bits selected by m9 are
; taken from (b5 term >> 8), all other bits from (b3 term << 7).
; NOTE(review): m9 is presumably a low-16-bit mask so that each dword ends up
; as a {b5, b3} word pair for the pmaddwd with m15 below — confirm.
3656*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
3657*c0909341SAndroid Build Coastguard Worker    psrld           m0, 8
3658*c0909341SAndroid Build Coastguard Worker    psrld           m4, 8
3659*c0909341SAndroid Build Coastguard Worker    pslld           m7, 7
3660*c0909341SAndroid Build Coastguard Worker    pandn           m3, m9, m1
3661*c0909341SAndroid Build Coastguard Worker    pand            m0, m9
3662*c0909341SAndroid Build Coastguard Worker    por             m0, m3
3663*c0909341SAndroid Build Coastguard Worker    pand            m4, m9
3664*c0909341SAndroid Build Coastguard Worker    pandn           m2, m9, m7
3665*c0909341SAndroid Build Coastguard Worker    por             m2, m4
; Multiply each word pair by the word pair in m15 and add the two products,
; add the pd_4096 constant, arithmetic-shift right by 13.
3666*c0909341SAndroid Build Coastguard Worker    mova            m1, [base+pd_4096]
3667*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
3668*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
3669*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1
3670*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1
3671*c0909341SAndroid Build Coastguard Worker    psrad           m0, 13
3672*c0909341SAndroid Build Coastguard Worker    psrad           m2, 13
; Narrow to words, add the source pixels (m5), saturate to unsigned bytes and
; store the 8 output pixels over the input.
3673*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m2
3674*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5
3675*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0
3676*c0909341SAndroid Build Coastguard Worker    movq     [dstq+wq], m0
3677*c0909341SAndroid Build Coastguard Worker    add             wq, 8
3678*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
3679*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp         ; advance dst to the next row
3680*c0909341SAndroid Build Coastguard Worker    movif32       dstm, dstq             ; write the advanced dst pointer back to dstm (movif32 form)
3681*c0909341SAndroid Build Coastguard Worker    ret
3682