xref: /aosp_15_r20/external/libdav1d/src/x86/looprestoration16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
; Read-only constant tables. The *shuf* and pb_* tables are byte-index
; control masks for pshufb; the pw_*/pd_* tables are broadcast word/dword
; constants.
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA
30*c0909341SAndroid Build Coastguard Worker
; wiener_shufA..D are loaded into m6..m9 by wiener_filter7_16bpc and applied
; to the source vectors in its .h/.hv passes to pair up 16-bit taps ahead of
; pmaddwd.
31*c0909341SAndroid Build Coastguard Workerwiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
32*c0909341SAndroid Build Coastguard Workerwiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
33*c0909341SAndroid Build Coastguard Workerwiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
; -1 index bytes have the top bit set, so pshufb writes zero to those lanes.
34*c0909341SAndroid Build Coastguard Workerwiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
35*c0909341SAndroid Build Coastguard Workerwiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
; Left-edge masks: each repeats the first 16-bit element (bytes 0,1) several
; times before continuing with the following elements. wiener_lshuf7 is the
; one used by wiener_filter7_16bpc when LR_HAVE_LEFT is clear.
36*c0909341SAndroid Build Coastguard Workerwiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
37*c0909341SAndroid Build Coastguard Workerwiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
38*c0909341SAndroid Build Coastguard Workersgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
39*c0909341SAndroid Build Coastguard Workersgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
; Identity byte index; .extend_right uses it as the pminub upper bound so a
; lane never selects a byte beyond its own position.
40*c0909341SAndroid Build Coastguard Workerpb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
41*c0909341SAndroid Build Coastguard Worker
; Byte-pair constants repeated across all 8 words. pb_6_7, pb_m2_m1 and
; pb_m10_m9 are the base indices from which .extend_right subtracts the low
; byte of w.
42*c0909341SAndroid Build Coastguard Workerpb_m14_m13:    times 8 db -14,-13
43*c0909341SAndroid Build Coastguard Workerpb_m10_m9:     times 8 db -10, -9
44*c0909341SAndroid Build Coastguard Workerpb_m6_m5:      times 8 db  -6, -5
45*c0909341SAndroid Build Coastguard Workerpb_m2_m1:      times 8 db  -2, -1
46*c0909341SAndroid Build Coastguard Workerpb_2_3:        times 8 db   2,  3
47*c0909341SAndroid Build Coastguard Workerpb_6_7:        times 8 db   6,  7
; Broadcast word/dword constants.
48*c0909341SAndroid Build Coastguard Workerpw_256:        times 8 dw 256
49*c0909341SAndroid Build Coastguard Workerpw_1023:       times 8 dw 1023
50*c0909341SAndroid Build Coastguard Workerpd_8:          times 4 dd 8
51*c0909341SAndroid Build Coastguard Workerpd_4096:       times 4 dd 4096
52*c0909341SAndroid Build Coastguard Workerpd_34816:      times 4 dd 34816
; (1 << 4) - (1 << 18); added to the horizontal Wiener sums before the shift.
53*c0909341SAndroid Build Coastguard Workerpd_m262128:    times 4 dd -262128
54*c0909341SAndroid Build Coastguard Workerpd_0xffff:     times 4 dd 0xffff
55*c0909341SAndroid Build Coastguard Workerpd_0xf00800a4: times 4 dd 0xf00800a4
56*c0909341SAndroid Build Coastguard Workerpd_0xf00801c7: times 4 dd 0xf00801c7
57*c0909341SAndroid Build Coastguard Workerpd_0xfffffff0: times 4 dd 0xfffffff0
58*c0909341SAndroid Build Coastguard Worker
; Two-entry tables indexed by (pixel_max >> 11): wiener_shifts is read with
; an 8-byte stride (four words per entry), wiener_round with a 4-byte stride
; (one dword per entry).
; NOTE(review): presumably entry 0 is for 10-bit and entry 1 for 12-bit
; content - confirm against the callers' pixel_max values.
59*c0909341SAndroid Build Coastguard Workerwiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
60*c0909341SAndroid Build Coastguard Workerwiener_round:  dd 1049600, 1048832
61*c0909341SAndroid Build Coastguard Worker
62*c0909341SAndroid Build Coastguard Workercextern sgr_x_by_x
63*c0909341SAndroid Build Coastguard Worker
64*c0909341SAndroid Build Coastguard WorkerSECTION .text
65*c0909341SAndroid Build Coastguard Worker
; movif64 dst, src
; Emits `mov dst, src` only when assembling for x86-64 (ARCH_X86_64 nonzero);
; expands to nothing on other targets.
66*c0909341SAndroid Build Coastguard Worker%macro movif64 2 ; dst, src
67*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
68*c0909341SAndroid Build Coastguard Worker    mov             %1, %2
69*c0909341SAndroid Build Coastguard Worker %endif
70*c0909341SAndroid Build Coastguard Worker%endmacro
71*c0909341SAndroid Build Coastguard Worker
; movif32 dst, src
; Emits `mov dst, src` only when assembling for x86-32 (ARCH_X86_32 nonzero);
; expands to nothing on other targets. Used below to spill/reload values that
; live in registers on x86-64 but in stack slots on x86-32.
72*c0909341SAndroid Build Coastguard Worker%macro movif32 2 ; dst, src
73*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
74*c0909341SAndroid Build Coastguard Worker    mov             %1, %2
75*c0909341SAndroid Build Coastguard Worker %endif
76*c0909341SAndroid Build Coastguard Worker%endmacro
77*c0909341SAndroid Build Coastguard Worker
78*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
79*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
80*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5, 6
81*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
82*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 13*16
83*c0909341SAndroid Build Coastguard Worker %else
84*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 12*16
85*c0909341SAndroid Build Coastguard Worker %endif
86*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
87*c0909341SAndroid Build Coastguard Worker                              dst, stride, left, lpf, w, flt
88*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
89*c0909341SAndroid Build Coastguard Worker  %define lpfm        dword [esp+calloff+16*12+ 0]
90*c0909341SAndroid Build Coastguard Worker  %define wm          dword [esp+calloff+16*12+ 4]
91*c0909341SAndroid Build Coastguard Worker  %define hd          dword [esp+calloff+16*12+ 8]
92*c0909341SAndroid Build Coastguard Worker  %define edgeb        byte [esp+calloff+16*12+12]
93*c0909341SAndroid Build Coastguard Worker  %define edged       dword [esp+calloff+16*12+12]
94*c0909341SAndroid Build Coastguard Worker %else
95*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
96*c0909341SAndroid Build Coastguard Worker  %define edgeb byte r7m
97*c0909341SAndroid Build Coastguard Worker %endif
98*c0909341SAndroid Build Coastguard Worker %define PICmem dword [esp+calloff+4*0]
99*c0909341SAndroid Build Coastguard Worker %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
100*c0909341SAndroid Build Coastguard Worker %define t1m    dword [esp+calloff+4*2]
101*c0909341SAndroid Build Coastguard Worker %define t2m    dword [esp+calloff+4*3]
102*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*4]
103*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*5]
104*c0909341SAndroid Build Coastguard Worker %define t5m    dword [esp+calloff+4*6]
105*c0909341SAndroid Build Coastguard Worker %define t6m    dword [esp+calloff+4*7]
106*c0909341SAndroid Build Coastguard Worker %define t2 t2m
107*c0909341SAndroid Build Coastguard Worker %define t3 t3m
108*c0909341SAndroid Build Coastguard Worker %define t4 t4m
109*c0909341SAndroid Build Coastguard Worker %define t5 t5m
110*c0909341SAndroid Build Coastguard Worker %define t6 t6m
111*c0909341SAndroid Build Coastguard Worker %define  m8 [esp+calloff+16*2]
112*c0909341SAndroid Build Coastguard Worker %define  m9 [esp+calloff+16*3]
113*c0909341SAndroid Build Coastguard Worker %define m10 [esp+calloff+16*4]
114*c0909341SAndroid Build Coastguard Worker %define m11 [esp+calloff+16*5]
115*c0909341SAndroid Build Coastguard Worker %define m12 [esp+calloff+16*6]
116*c0909341SAndroid Build Coastguard Worker %define m13 [esp+calloff+16*7]
117*c0909341SAndroid Build Coastguard Worker %define m14 [esp+calloff+16*8]
118*c0909341SAndroid Build Coastguard Worker %define m15 [esp+calloff+16*9]
119*c0909341SAndroid Build Coastguard Worker %define r10 r4
120*c0909341SAndroid Build Coastguard Worker %define base t0-wiener_shifts
121*c0909341SAndroid Build Coastguard Worker %assign calloff 0
122*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
123*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
124*c0909341SAndroid Build Coastguard Worker    mov             wm, wd
125*c0909341SAndroid Build Coastguard Worker    mov             r5, [rstk+stack_offset+24]
126*c0909341SAndroid Build Coastguard Worker    mov             hd, r5
127*c0909341SAndroid Build Coastguard Worker    mov             r5, [rstk+stack_offset+32]
128*c0909341SAndroid Build Coastguard Worker    mov          edged, r5 ; edge
129*c0909341SAndroid Build Coastguard Worker %endif
130*c0909341SAndroid Build Coastguard Worker%else
131*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
132*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
133*c0909341SAndroid Build Coastguard Worker                                                     w, h, edge, flt
134*c0909341SAndroid Build Coastguard Worker %define base
135*c0909341SAndroid Build Coastguard Worker%endif
136*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
137*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
138*c0909341SAndroid Build Coastguard Worker%endif
139*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
140*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
141*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
142*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
143*c0909341SAndroid Build Coastguard Worker    mov            t3d, r8m ; pixel_max
144*c0909341SAndroid Build Coastguard Worker    movq           m13, [fltq]
145*c0909341SAndroid Build Coastguard Worker    movq           m15, [fltq+16]
146*c0909341SAndroid Build Coastguard Worker%else
147*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
148*c0909341SAndroid Build Coastguard Worker    mov             t0, [rstk+stack_offset+28]
149*c0909341SAndroid Build Coastguard Worker    mov             t1, [rstk+stack_offset+36] ; pixel_max
150*c0909341SAndroid Build Coastguard Worker    movq            m1, [t0]    ; fx
151*c0909341SAndroid Build Coastguard Worker    movq            m3, [t0+16] ; fy
152*c0909341SAndroid Build Coastguard Worker    LEA             t0, wiener_shifts
153*c0909341SAndroid Build Coastguard Worker %else
154*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6m
155*c0909341SAndroid Build Coastguard Worker    movq            m1, [fltq]
156*c0909341SAndroid Build Coastguard Worker    movq            m3, [fltq+16]
157*c0909341SAndroid Build Coastguard Worker    LEA             t0, wiener_shifts
158*c0909341SAndroid Build Coastguard Worker    mov             t1, r8m ; pixel_max
159*c0909341SAndroid Build Coastguard Worker %endif
160*c0909341SAndroid Build Coastguard Worker    mov         PICmem, t0
161*c0909341SAndroid Build Coastguard Worker%endif
162*c0909341SAndroid Build Coastguard Worker    mova            m6, [base+wiener_shufA]
163*c0909341SAndroid Build Coastguard Worker    mova            m7, [base+wiener_shufB]
164*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
165*c0909341SAndroid Build Coastguard Worker    lea             t4, [wiener_shifts]
166*c0909341SAndroid Build Coastguard Worker    add             wd, wd
167*c0909341SAndroid Build Coastguard Worker    pshufd         m12, m13, q0000 ; x0 x1
168*c0909341SAndroid Build Coastguard Worker    pshufd         m13, m13, q1111 ; x2 x3
169*c0909341SAndroid Build Coastguard Worker    pshufd         m14, m15, q0000 ; y0 y1
170*c0909341SAndroid Build Coastguard Worker    pshufd         m15, m15, q1111 ; y2 y3
171*c0909341SAndroid Build Coastguard Worker    mova            m8, [wiener_shufC]
172*c0909341SAndroid Build Coastguard Worker    mova            m9, [wiener_shufD]
173*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
174*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+16]
175*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
176*c0909341SAndroid Build Coastguard Worker    neg             wq
177*c0909341SAndroid Build Coastguard Worker    shr            t3d, 11
178*c0909341SAndroid Build Coastguard Worker %define base t4-wiener_shifts
179*c0909341SAndroid Build Coastguard Worker    movd           m10, [base+wiener_round+t3*4]
180*c0909341SAndroid Build Coastguard Worker    movq           m11, [base+wiener_shifts+t3*8]
181*c0909341SAndroid Build Coastguard Worker    pshufd         m10, m10, q0000
182*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m11, q0000
183*c0909341SAndroid Build Coastguard Worker    pshufd         m11, m11, q1111
184*c0909341SAndroid Build Coastguard Worker    pmullw         m12, m0 ; upshift filter coefs to make the
185*c0909341SAndroid Build Coastguard Worker    pmullw         m13, m0 ; horizontal downshift constant
186*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
187*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
188*c0909341SAndroid Build Coastguard Worker %define base
189*c0909341SAndroid Build Coastguard Worker %define wiener_lshuf7_mem [wiener_lshuf7]
190*c0909341SAndroid Build Coastguard Worker %define pd_m262128_mem [pd_m262128]
191*c0909341SAndroid Build Coastguard Worker%else
192*c0909341SAndroid Build Coastguard Worker    add             wd, wd
193*c0909341SAndroid Build Coastguard Worker    mova            m4, [base+wiener_shufC]
194*c0909341SAndroid Build Coastguard Worker    mova            m5, [base+wiener_shufD]
195*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m1, q0000
196*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m1, q1111
197*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m3, q0000
198*c0909341SAndroid Build Coastguard Worker    pshufd          m3, m3, q1111
199*c0909341SAndroid Build Coastguard Worker    mova            m8, m4
200*c0909341SAndroid Build Coastguard Worker    mova            m9, m5
201*c0909341SAndroid Build Coastguard Worker    mova           m14, m2
202*c0909341SAndroid Build Coastguard Worker    mova           m15, m3
203*c0909341SAndroid Build Coastguard Worker    shr             t1, 11
204*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
205*c0909341SAndroid Build Coastguard Worker    mova            m3, [base+pd_m262128]
206*c0909341SAndroid Build Coastguard Worker    movd            m4, [base+wiener_round+t1*4]
207*c0909341SAndroid Build Coastguard Worker    movq            m5, [base+wiener_shifts+t1*8]
208*c0909341SAndroid Build Coastguard Worker    lea             t1, [esp+extra_stack+wq+16]
209*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
210*c0909341SAndroid Build Coastguard Worker    neg             wq
211*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m4, q0000
212*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m5, q0000
213*c0909341SAndroid Build Coastguard Worker    pshufd          m5, m5, q1111
214*c0909341SAndroid Build Coastguard Worker    mov             wm, wq
215*c0909341SAndroid Build Coastguard Worker    pmullw          m0, m2
216*c0909341SAndroid Build Coastguard Worker    pmullw          m1, m2
217*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+wiener_lshuf7]
218*c0909341SAndroid Build Coastguard Worker %define pd_m262128_mem [esp+calloff+16*10]
219*c0909341SAndroid Build Coastguard Worker    mova pd_m262128_mem, m3
220*c0909341SAndroid Build Coastguard Worker    mova           m10, m4
221*c0909341SAndroid Build Coastguard Worker    mova           m11, m5
222*c0909341SAndroid Build Coastguard Worker    mova           m12, m0
223*c0909341SAndroid Build Coastguard Worker    mova           m13, m1
224*c0909341SAndroid Build Coastguard Worker %define wiener_lshuf7_mem [esp+calloff+16*11]
225*c0909341SAndroid Build Coastguard Worker    mova wiener_lshuf7_mem, m2
226*c0909341SAndroid Build Coastguard Worker%endif
227*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
228*c0909341SAndroid Build Coastguard Worker    jz .no_top
229*c0909341SAndroid Build Coastguard Worker    call .h_top
230*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
231*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
232*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
233*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
234*c0909341SAndroid Build Coastguard Worker    call .h_top
235*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
236*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
237*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
238*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
239*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
240*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
241*c0909341SAndroid Build Coastguard Worker    call .h
242*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
243*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
244*c0909341SAndroid Build Coastguard Worker    dec             hd
245*c0909341SAndroid Build Coastguard Worker    jz .v1
246*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
247*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
248*c0909341SAndroid Build Coastguard Worker    call .h
249*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
250*c0909341SAndroid Build Coastguard Worker    dec             hd
251*c0909341SAndroid Build Coastguard Worker    jz .v2
252*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
253*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
254*c0909341SAndroid Build Coastguard Worker    call .h
255*c0909341SAndroid Build Coastguard Worker    dec             hd
256*c0909341SAndroid Build Coastguard Worker    jz .v3
257*c0909341SAndroid Build Coastguard Worker.main:
258*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
259*c0909341SAndroid Build Coastguard Worker.main_loop:
260*c0909341SAndroid Build Coastguard Worker    call .hv
261*c0909341SAndroid Build Coastguard Worker    dec             hd
262*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
263*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
264*c0909341SAndroid Build Coastguard Worker    jz .v3
265*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
266*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
267*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
268*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
269*c0909341SAndroid Build Coastguard Worker.v1:
270*c0909341SAndroid Build Coastguard Worker    call .v
271*c0909341SAndroid Build Coastguard Worker    RET
272*c0909341SAndroid Build Coastguard Worker.no_top:
273*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
274*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
275*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
276*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
277*c0909341SAndroid Build Coastguard Worker    call .h
278*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
279*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
280*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
281*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
282*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
283*c0909341SAndroid Build Coastguard Worker    dec             hd
284*c0909341SAndroid Build Coastguard Worker    jz .v1
285*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
286*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
287*c0909341SAndroid Build Coastguard Worker    call .h
288*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
289*c0909341SAndroid Build Coastguard Worker    dec             hd
290*c0909341SAndroid Build Coastguard Worker    jz .v2
291*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
292*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
293*c0909341SAndroid Build Coastguard Worker    call .h
294*c0909341SAndroid Build Coastguard Worker    dec             hd
295*c0909341SAndroid Build Coastguard Worker    jz .v3
296*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
297*c0909341SAndroid Build Coastguard Worker    call .hv
298*c0909341SAndroid Build Coastguard Worker    dec             hd
299*c0909341SAndroid Build Coastguard Worker    jz .v3
300*c0909341SAndroid Build Coastguard Worker    add             t0, 384*8
301*c0909341SAndroid Build Coastguard Worker    call .hv
302*c0909341SAndroid Build Coastguard Worker    dec             hd
303*c0909341SAndroid Build Coastguard Worker    jnz .main
304*c0909341SAndroid Build Coastguard Worker.v3:
305*c0909341SAndroid Build Coastguard Worker    call .v
306*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
307*c0909341SAndroid Build Coastguard Worker.v2:
308*c0909341SAndroid Build Coastguard Worker    call .v
309*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
310*c0909341SAndroid Build Coastguard Worker    jmp .v1
; .extend_right: right-edge helper for the horizontal passes. Called from
; .h_main/.hv_main only when LR_HAVE_RIGHT is clear and wd >= -20.
; In:    wd     = current negative byte offset (only its low byte is used)
;        m3-m5  = the three overlapping source loads of the current row
; Out:   m3-m5  = reshuffled with per-byte indices min(const - w, lane index)
; Clobb: m0, m1, m2; on x86-32 t0 is temporarily replaced by the PIC base
;        and reloaded from t0m before returning.
; NOTE(review): the net effect appears to be replicating the last valid
; pixel into the lanes past the right edge - confirm.
311*c0909341SAndroid Build Coastguard Worker.extend_right:
; Assembly-time bookkeeping: this code uses calloff 8 whereas its callers
; use calloff 4. NOTE(review): presumably this accounts for the extra return
; address on the x86-32 stack so the [esp+calloff+...] slots resolve
; correctly - confirm.
312*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8
313*c0909341SAndroid Build Coastguard Worker%assign calloff 8
314*c0909341SAndroid Build Coastguard Worker    movif32         t0, PICmem
; m1 = low byte of w broadcast to all 16 lanes (pshufb with an all-zero mask)
315*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
316*c0909341SAndroid Build Coastguard Worker    movd            m1, wd
317*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+pb_0to15]
318*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m0
; m3: index = min((6,7) - w, lane), unsigned
319*c0909341SAndroid Build Coastguard Worker    mova            m0, [base+pb_6_7]
320*c0909341SAndroid Build Coastguard Worker    psubb           m0, m1
321*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
322*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m0
; m4: index = min((-2,-1) - w, lane), unsigned
323*c0909341SAndroid Build Coastguard Worker    mova            m0, [base+pb_m2_m1]
324*c0909341SAndroid Build Coastguard Worker    psubb           m0, m1
325*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
326*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m0
; m5: index = min((-10,-9) - w, lane), unsigned
327*c0909341SAndroid Build Coastguard Worker    mova            m0, [base+pb_m10_m9]
328*c0909341SAndroid Build Coastguard Worker    psubb           m0, m1
329*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
330*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m0
331*c0909341SAndroid Build Coastguard Worker    movif32         t0, t0m
332*c0909341SAndroid Build Coastguard Worker    ret
; Reset the assembly-time offsets to the values used by the row helpers that
; follow (calloff 4; .extend_right above uses calloff 8).
333*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4
334*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; .h / .h_top: horizontal 7-tap filter over one row.
; In:    lpfq = source row pointer, already advanced by the row width in bytes
;        t1   = destination row in the intermediate buffer, biased likewise
;        leftq = left-edge pixels (.h only, when LR_HAVE_LEFT is set)
;        m6-m9 = shuffle masks, m12/m13 = pre-scaled horizontal coefficients
; Out:   one row of 16-bit intermediates written to [t1+wq]
; Clobb: m0-m5, wq (reloaded from wm at exit on x86-32), leftq (+8 in .h)
; .h takes the leftmost pixels from leftq; .h_top reads them directly from
; memory at [lpfq+wq-8].
335*c0909341SAndroid Build Coastguard Worker.h:
; wq restarts at the negated row width in bytes and counts up towards zero.
336*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
337*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
338*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
339*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; m3 = 8 bytes from leftq in the low half, first 8 row bytes in the high half
340*c0909341SAndroid Build Coastguard Worker    movq            m3, [leftq]
341*c0909341SAndroid Build Coastguard Worker    movhps          m3, [lpfq+wq]
342*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
343*c0909341SAndroid Build Coastguard Worker    jmp .h_main
344*c0909341SAndroid Build Coastguard Worker.h_extend_left:
345*c0909341SAndroid Build Coastguard Worker    mova            m3, [lpfq+wq]         ; avoid accessing memory located
346*c0909341SAndroid Build Coastguard Worker    pshufb          m3, wiener_lshuf7_mem ; before the start of the buffer
347*c0909341SAndroid Build Coastguard Worker    jmp .h_main
348*c0909341SAndroid Build Coastguard Worker.h_top:
349*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
350*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
351*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Steady state: m3/m4/m5 are loads at byte offsets -8/0/+8 from lpfq+wq.
352*c0909341SAndroid Build Coastguard Worker.h_loop:
353*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+wq-8]
354*c0909341SAndroid Build Coastguard Worker.h_main:
355*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+0]
356*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+8]
; Without a right neighbour, fix up the loads once 20 or fewer bytes remain.
357*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
358*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
359*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
360*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
361*c0909341SAndroid Build Coastguard Worker    call .extend_right
; m0 and m1 each accumulate four dword sums: two pmaddwd products (taps
; paired by the m6-m9 shuffles, coefficients in m12/m13) plus the
; pd_m262128 bias.
362*c0909341SAndroid Build Coastguard Worker.h_have_right:
363*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m6
364*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m7
365*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
366*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m8
367*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12
368*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m9
369*c0909341SAndroid Build Coastguard Worker    paddw           m3, m1
370*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m6
371*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m13
372*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5, m7
373*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
374*c0909341SAndroid Build Coastguard Worker    mova            m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
375*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m8
376*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
377*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m9
378*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
379*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m13
380*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
381*c0909341SAndroid Build Coastguard Worker    paddd           m1, m2
382*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
383*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
; Shift the dword sums down by 4, pack to signed words with saturation,
; then halve once more.
384*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
385*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
386*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
387*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
388*c0909341SAndroid Build Coastguard Worker    mova       [t1+wq], m0
; Advance 16 bytes per iteration; loop while the offset is still negative.
389*c0909341SAndroid Build Coastguard Worker    add             wq, 16
390*c0909341SAndroid Build Coastguard Worker    jl .h_loop
391*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
392*c0909341SAndroid Build Coastguard Worker    ret
393*c0909341SAndroid Build Coastguard WorkerALIGN function_align
394*c0909341SAndroid Build Coastguard Worker.hv:
395*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
396*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
397*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
398*c0909341SAndroid Build Coastguard Worker    movif32        t1m, t1
399*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
400*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
401*c0909341SAndroid Build Coastguard Worker    movq            m3, [leftq]
402*c0909341SAndroid Build Coastguard Worker    movhps          m3, [lpfq+wq]
403*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
404*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
; Remainder of a combined horizontal+vertical filtering routine. Its main `.hv`
; entry point lies above this view; judging by wiener_lshuf7_mem and the t0..t6
; row pointers this is presumably the 7-tap Wiener filter -- confirm upstream.
; Per call: one input row is filtered horizontally and stored at [t0+wq], then
; combined vertically with the rows at t1..t6 and written to [dstq+wq]; finally
; the row pointers are rotated and dstq advances by one stride.
; wq is a negative byte offset that counts up to zero (loop exits on wq >= 0).
; movif64/movif32 are macros defined outside this view; presumably they emit
; the mov only on the named architecture -- confirm.
;
; No LR_HAVE_LEFT: derive the left-context vector m3 from the row's own first
; 16 bytes with a shuffle instead of loading from before lpfq+wq.
405*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
406*c0909341SAndroid Build Coastguard Worker    mova            m3, [lpfq+wq]
407*c0909341SAndroid Build Coastguard Worker    pshufb          m3, wiener_lshuf7_mem
408*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
; Alternate entry point. Unlike the code just above .hv_extend_left it never
; reads from leftq: with LR_HAVE_LEFT set it falls into .hv_loop, which loads
; the left context directly from [lpfq+wq-8].
409*c0909341SAndroid Build Coastguard Worker.hv_bottom:
410*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
; x86-32: spill t0/t1 to their memory slots, the registers are reused below.
411*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
412*c0909341SAndroid Build Coastguard Worker    movif32        t1m, t1
413*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
414*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
; m3/m4/m5 = three overlapping 16-byte windows at byte offsets -8, 0 and +8.
415*c0909341SAndroid Build Coastguard Worker.hv_loop:
416*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+wq-8]
417*c0909341SAndroid Build Coastguard Worker.hv_main:
418*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+0]
419*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+8]
; Right-edge padding is only needed when LR_HAVE_RIGHT is clear and at most
; 20 bytes of the row remain (wd >= -20). The .extend_right called here is the
; one belonging to this function and is defined outside this view.
420*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
421*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
422*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
423*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
424*c0909341SAndroid Build Coastguard Worker    call .extend_right
425*c0909341SAndroid Build Coastguard Worker.hv_have_right:
; x86-32: preload the t4 and t2 row pointers into the t1/t0 registers.
426*c0909341SAndroid Build Coastguard Worker    movif32         t1, t4m
427*c0909341SAndroid Build Coastguard Worker    movif32         t0, t2m
; Horizontal pass. Shuffled copies of the windows are added pairwise (paddw)
; and multiplied-accumulated with the coefficient vectors m12 and m13; the
; shuffle masks m6..m9 and the coefficients are set up outside this view.
; m0/m3 produce the low four outputs, m1/m4 the high four.
428*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m6
429*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m7
430*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
431*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m8
432*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12
433*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m9
434*c0909341SAndroid Build Coastguard Worker    paddw           m3, m1
435*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m6
436*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m13
437*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5, m7
438*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
; m2 = 32-bit bias/rounding constant added to both halves below.
439*c0909341SAndroid Build Coastguard Worker    mova            m2, pd_m262128_mem
440*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m8
441*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
442*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m9
443*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
444*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m13
445*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
446*c0909341SAndroid Build Coastguard Worker    paddd           m1, m2
; Start gathering rows for the vertical pass while the horizontal sums finish:
; m2 = row[t4] + row[t2], m5 = row[t3].
447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
448*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+wq]
449*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t2+wq]
450*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+wq]
451*c0909341SAndroid Build Coastguard Worker%else
; x86-32: t1/t0 currently hold t4/t2 (loaded at .hv_have_right); afterwards
; t1 is pointed at t3 for the load, t0 at t5, and t1 is restored from t1m.
452*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq]
453*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t0+wq]
454*c0909341SAndroid Build Coastguard Worker    mov             t1, t3m
455*c0909341SAndroid Build Coastguard Worker    mov             t0, t5m
456*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq]
457*c0909341SAndroid Build Coastguard Worker    mov             t1, t1m
458*c0909341SAndroid Build Coastguard Worker%endif
; Finish the horizontal result: sum the two partial products, shift right by
; 4, pack to words with signed saturation, then one more arithmetic shift by 1
; (psraw, applied after the pack in both arch branches below).
459*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
460*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
461*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
462*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
463*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
; m4 = row[t5] + row[t1]; m3 = new row (m0) + row[t6].
464*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
465*c0909341SAndroid Build Coastguard Worker    mova            m4, [t5+wq]
466*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t1+wq]
467*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
468*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t6+wq]
469*c0909341SAndroid Build Coastguard Worker%else
; x86-32: t0 holds t5 here; afterwards t0 is restored from t0m and t1 is
; loaded with t6 for the final add.
470*c0909341SAndroid Build Coastguard Worker    mova            m4, [t0+wq]
471*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t1+wq]
472*c0909341SAndroid Build Coastguard Worker    mov             t0, t0m
473*c0909341SAndroid Build Coastguard Worker    mov             t1, t6m
474*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
475*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t1+wq]
476*c0909341SAndroid Build Coastguard Worker%endif
; Save the freshly filtered row; it is stored after [t6+wq] has been read.
477*c0909341SAndroid Build Coastguard Worker    mova       [t0+wq], m0
; Vertical pass: interleave the row sums into word pairs and multiply-
; accumulate with m15 and m14 (vertical coefficients, set up outside this
; view), add the rounding constant m10, shift right by 6 and pack.
478*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2, m5
479*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
480*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m5
481*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
482*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m3, m4
483*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m14
484*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
485*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
486*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10
487*c0909341SAndroid Build Coastguard Worker    paddd           m2, m10
488*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1
489*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
490*c0909341SAndroid Build Coastguard Worker    psrad           m0, 6
491*c0909341SAndroid Build Coastguard Worker    psrad           m2, 6
492*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m2
; Final scaling by m11 (high half of the 16x16 product) and clamp below at 0.
; packssdw/pmulhw presumably also bound the result from above -- confirm
; against the value loaded into m11.
493*c0909341SAndroid Build Coastguard Worker    pmulhw          m0, m11
494*c0909341SAndroid Build Coastguard Worker    pxor            m1, m1
495*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m1
496*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
; Advance 16 bytes (8 output words) and loop while the offset is negative.
497*c0909341SAndroid Build Coastguard Worker    add             wq, 16
498*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
; Rotate the row pointers by one row: t6<-t5, t5<-t4, t4<-t3, t3<-t2, t2<-t1,
; t1<-t0, and t0 is set equal to the new t6.
499*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
500*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
501*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
502*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
503*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
504*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
505*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
506*c0909341SAndroid Build Coastguard Worker    mov             t0, t6
507*c0909341SAndroid Build Coastguard Worker%else
; x86-32: same rotation through the memory slots, using r4 and t1 as scratch;
; wq is reloaded from wm at the end.
508*c0909341SAndroid Build Coastguard Worker    mov             r4, t5m
509*c0909341SAndroid Build Coastguard Worker    mov             t1, t4m
510*c0909341SAndroid Build Coastguard Worker    mov            t6m, r4
511*c0909341SAndroid Build Coastguard Worker    mov            t5m, t1
512*c0909341SAndroid Build Coastguard Worker    mov             r4, t3m
513*c0909341SAndroid Build Coastguard Worker    mov             t1, t2m
514*c0909341SAndroid Build Coastguard Worker    mov            t4m, r4
515*c0909341SAndroid Build Coastguard Worker    mov            t3m, t1
516*c0909341SAndroid Build Coastguard Worker    mov             r4, t1m
517*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
518*c0909341SAndroid Build Coastguard Worker    mov            t2m, r4
519*c0909341SAndroid Build Coastguard Worker    mov             t0, t6m
520*c0909341SAndroid Build Coastguard Worker    mov             wq, wm
521*c0909341SAndroid Build Coastguard Worker%endif
; Move the destination pointer to the next output row.
522*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
523*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical-only pass for one output row. No new input row is read or
; filtered horizontally. The arithmetic matches the vertical half of the
; combined pass above, except that row[t1] is used both in the pair with
; row[t5] and in the pair with row[t6] (where the combined pass uses the
; freshly filtered row).
; Writes 8 words per iteration to [dstq+wq], then rotates t2..t6 and advances
; dstq by one stride. wq is a negative byte offset counting up to zero.
524*c0909341SAndroid Build Coastguard Worker.v:
525*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
; x86-32: spill t0/t1, the registers are used as scratch pointers in the loop.
526*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
527*c0909341SAndroid Build Coastguard Worker    movif32        t1m, t1
528*c0909341SAndroid Build Coastguard Worker.v_loop:
; m1 = row[t4] + row[t2], m2 = row[t3],
; m3 = row[t1] + row[t6], m4 = row[t1] + row[t5].
529*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
530*c0909341SAndroid Build Coastguard Worker    mova            m1, [t4+wq]
531*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+wq]
532*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+wq]
533*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq]
534*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t6+wq]
535*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t5+wq]
536*c0909341SAndroid Build Coastguard Worker%else
; x86-32: the row pointers live in memory, so they are loaded pairwise into
; the t0/t1 registers before each group of accesses.
537*c0909341SAndroid Build Coastguard Worker    mov             t0, t4m
538*c0909341SAndroid Build Coastguard Worker    mov             t1, t2m
539*c0909341SAndroid Build Coastguard Worker    mova            m1, [t0+wq]
540*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+wq]
541*c0909341SAndroid Build Coastguard Worker    mov             t0, t3m
542*c0909341SAndroid Build Coastguard Worker    mov             t1, t1m
543*c0909341SAndroid Build Coastguard Worker    mova            m2, [t0+wq]
544*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq]
545*c0909341SAndroid Build Coastguard Worker    mov             t0, t6m
546*c0909341SAndroid Build Coastguard Worker    mov             t1, t5m
547*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t0+wq]
548*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t1+wq]
549*c0909341SAndroid Build Coastguard Worker%endif
; Interleave into word pairs, multiply-accumulate with m15/m14, add the
; rounding constant m10, shift right by 6 and pack with signed saturation.
550*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m2
551*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
552*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m2
553*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m15
554*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4
555*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
556*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
557*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
558*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10
559*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
560*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
561*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
562*c0909341SAndroid Build Coastguard Worker    psrad           m0, 6
563*c0909341SAndroid Build Coastguard Worker    psrad           m1, 6
564*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
; Scale by m11 (high half of the product) and clamp negative results to 0.
565*c0909341SAndroid Build Coastguard Worker    pmulhw          m0, m11
566*c0909341SAndroid Build Coastguard Worker    pxor            m1, m1
567*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m1
568*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
569*c0909341SAndroid Build Coastguard Worker    add             wq, 16
570*c0909341SAndroid Build Coastguard Worker    jl .v_loop
; Rotate t6<-t5, t5<-t4, t4<-t3, t3<-t2, t2<-t1. t1 and t0 are left as they
; were on entry (on x86-32 they are reloaded from t1m/t0m).
571*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
572*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
573*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
574*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
575*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
576*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
577*c0909341SAndroid Build Coastguard Worker%else
; NOTE(review): r4 is used as scratch here and wq is not reloaded from wm
; before returning, unlike the combined pass above -- presumably the caller
; or the next subroutine reloads it; confirm against the call sites.
578*c0909341SAndroid Build Coastguard Worker    mov             t0, t5m
579*c0909341SAndroid Build Coastguard Worker    mov             t1, t4m
580*c0909341SAndroid Build Coastguard Worker    mov             r4, t3m
581*c0909341SAndroid Build Coastguard Worker    mov            t6m, t0
582*c0909341SAndroid Build Coastguard Worker    mov            t5m, t1
583*c0909341SAndroid Build Coastguard Worker    mov            t4m, r4
584*c0909341SAndroid Build Coastguard Worker    mov             r4, t2m
585*c0909341SAndroid Build Coastguard Worker    mov             t1, t1m
586*c0909341SAndroid Build Coastguard Worker    mov             t0, t0m
587*c0909341SAndroid Build Coastguard Worker    mov            t3m, r4
588*c0909341SAndroid Build Coastguard Worker    mov            t2m, t1
589*c0909341SAndroid Build Coastguard Worker%endif
; Move the destination pointer to the next output row.
590*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
591*c0909341SAndroid Build Coastguard Worker    ret
592*c0909341SAndroid Build Coastguard Worker
; wiener_filter5_16bpc -- entry point, prologue and constant setup.
; Arguments as named in the cglobal lines: dst, stride, left, lpf, w, h, edge,
; flt; a ninth argument (r8m, "pixel_max") is read from the stack.
; After this setup:
;   m5..m7   = shuffle masks wiener_shufE / wiener_shufB / wiener_shufD
;   m8       = pd_m262128 bias for the horizontal pass
;   m9       = wiener_round entry (broadcast), m10 = second dword of the
;              wiener_shifts entry (broadcast); both indexed by pixel_max >> 11
;   m11, m12 = horizontal coefficients (x1 / x2 x3) pre-multiplied by the
;              first dword of the wiener_shifts entry
;   m13, m14 = vertical coefficients (y1 / y2 y3)
;   m15      = wiener_lshuf5 (left-edge shuffle)
;   lpfq/dstq point past the end of the row, wq = -(2*w) is a negative byte
;   offset, t1 points into the on-stack intermediate row storage.
; On x86-32 m8..m15 are not registers but 16-byte stack slots (%defines below),
; and the row pointers t2..t4 are memory slots as well.
593*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Stack budget: 384*8 bytes of row storage plus 11 (or 12, when the stack must
; be aligned manually) 16-byte slots for spilled pointers and constants.
594*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
595*c0909341SAndroid Build Coastguard Worker  %assign stack_size 12*16+384*8
596*c0909341SAndroid Build Coastguard Worker %else
597*c0909341SAndroid Build Coastguard Worker  %assign stack_size 11*16+384*8
598*c0909341SAndroid Build Coastguard Worker %endif
599*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
600*c0909341SAndroid Build Coastguard Worker                                                    lpf, w, flt
; With STACK_ALIGNMENT < 16 the stack arguments are copied into local slots
; (see the rstk+stack_offset loads below); otherwise they are read in place.
601*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
602*c0909341SAndroid Build Coastguard Worker  %define lpfm        dword [esp+calloff+4*6]
603*c0909341SAndroid Build Coastguard Worker  %define wm          dword [esp+calloff+4*7]
604*c0909341SAndroid Build Coastguard Worker  %define hd          dword [esp+calloff+16*10+0]
605*c0909341SAndroid Build Coastguard Worker  %define edgeb        byte [esp+calloff+16*10+4]
606*c0909341SAndroid Build Coastguard Worker  %define edged       dword [esp+calloff+16*10+4]
607*c0909341SAndroid Build Coastguard Worker %else
608*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
609*c0909341SAndroid Build Coastguard Worker  %define edgeb byte r7m
610*c0909341SAndroid Build Coastguard Worker %endif
; calloff is added to every esp-relative slot; it is re-assigned further down
; for code that runs inside call'ed subroutines.
611*c0909341SAndroid Build Coastguard Worker %define PICmem dword [esp+calloff+4*0]
612*c0909341SAndroid Build Coastguard Worker %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
613*c0909341SAndroid Build Coastguard Worker %define t1m    dword [esp+calloff+4*2]
614*c0909341SAndroid Build Coastguard Worker %define t2m    dword [esp+calloff+4*3]
615*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*4]
616*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*5]
617*c0909341SAndroid Build Coastguard Worker %define t2 t2m
618*c0909341SAndroid Build Coastguard Worker %define t3 t3m
619*c0909341SAndroid Build Coastguard Worker %define t4 t4m
620*c0909341SAndroid Build Coastguard Worker %define  m8 [esp+calloff+16*2]
621*c0909341SAndroid Build Coastguard Worker %define  m9 [esp+calloff+16*3]
622*c0909341SAndroid Build Coastguard Worker %define m10 [esp+calloff+16*4]
623*c0909341SAndroid Build Coastguard Worker %define m11 [esp+calloff+16*5]
624*c0909341SAndroid Build Coastguard Worker %define m12 [esp+calloff+16*6]
625*c0909341SAndroid Build Coastguard Worker %define m13 [esp+calloff+16*7]
626*c0909341SAndroid Build Coastguard Worker %define m14 [esp+calloff+16*8]
627*c0909341SAndroid Build Coastguard Worker %define m15 [esp+calloff+16*9]
; Constants are addressed relative to wiener_shifts, whose address is kept in
; t0 (and saved in PICmem below).
628*c0909341SAndroid Build Coastguard Worker %define base t0-wiener_shifts
629*c0909341SAndroid Build Coastguard Worker %assign calloff 0
630*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
; Copy w, h and edge from the incoming stack arguments into the local slots.
631*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
632*c0909341SAndroid Build Coastguard Worker    mov             wm, wd
633*c0909341SAndroid Build Coastguard Worker    mov             r5, [rstk+stack_offset+24]
634*c0909341SAndroid Build Coastguard Worker    mov             hd, r5
635*c0909341SAndroid Build Coastguard Worker    mov             r5, [rstk+stack_offset+32]
636*c0909341SAndroid Build Coastguard Worker    mov          edged, r5 ; edge
637*c0909341SAndroid Build Coastguard Worker %endif
638*c0909341SAndroid Build Coastguard Worker%else
639*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
640*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, flt
641*c0909341SAndroid Build Coastguard Worker %define base
642*c0909341SAndroid Build Coastguard Worker%endif
643*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
644*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
645*c0909341SAndroid Build Coastguard Worker%endif
; Load the filter coefficients: 4 words at [flt] (horizontal) and 4 words at
; [flt+16] (vertical), plus pixel_max.
646*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
647*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
648*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
649*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
650*c0909341SAndroid Build Coastguard Worker    mov            t3d, r8m ; pixel_max
651*c0909341SAndroid Build Coastguard Worker    movq           m12, [fltq]
652*c0909341SAndroid Build Coastguard Worker    movq           m14, [fltq+16]
653*c0909341SAndroid Build Coastguard Worker%else
654*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
; Here t0 temporarily holds the flt pointer before it becomes the PIC base.
655*c0909341SAndroid Build Coastguard Worker    mov             t0, [rstk+stack_offset+28]
656*c0909341SAndroid Build Coastguard Worker    mov             t1, [rstk+stack_offset+36] ; pixel_max
657*c0909341SAndroid Build Coastguard Worker    movq            m1, [t0]    ; fx
658*c0909341SAndroid Build Coastguard Worker    movq            m3, [t0+16] ; fy
659*c0909341SAndroid Build Coastguard Worker    LEA             t0, wiener_shifts
660*c0909341SAndroid Build Coastguard Worker %else
661*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6m
662*c0909341SAndroid Build Coastguard Worker    movq            m1, [fltq]
663*c0909341SAndroid Build Coastguard Worker    movq            m3, [fltq+16]
664*c0909341SAndroid Build Coastguard Worker    LEA             t0, wiener_shifts
665*c0909341SAndroid Build Coastguard Worker    mov             t1, r8m ; pixel_max
666*c0909341SAndroid Build Coastguard Worker %endif
; Remember the constant-table base so subroutines can reload it into t0.
667*c0909341SAndroid Build Coastguard Worker    mov         PICmem, t0
668*c0909341SAndroid Build Coastguard Worker%endif
669*c0909341SAndroid Build Coastguard Worker    mova            m5, [base+wiener_shufE]
670*c0909341SAndroid Build Coastguard Worker    mova            m6, [base+wiener_shufB]
671*c0909341SAndroid Build Coastguard Worker    mova            m7, [base+wiener_shufD]
672*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
673*c0909341SAndroid Build Coastguard Worker    lea             t4, [wiener_shifts]
; Width in bytes (2 bytes per pixel).
674*c0909341SAndroid Build Coastguard Worker    add             wd, wd
; Broadcast coefficient word 1 alone (x1 / y1) and the word pair 2,3
; (x2 x3 / y2 y3) across the vectors.
675*c0909341SAndroid Build Coastguard Worker    punpcklwd      m11, m12, m12
676*c0909341SAndroid Build Coastguard Worker    pshufd         m11, m11, q1111 ; x1
677*c0909341SAndroid Build Coastguard Worker    pshufd         m12, m12, q1111 ; x2 x3
678*c0909341SAndroid Build Coastguard Worker    punpcklwd      m13, m14, m14
679*c0909341SAndroid Build Coastguard Worker    pshufd         m13, m13, q1111 ; y1
680*c0909341SAndroid Build Coastguard Worker    pshufd         m14, m14, q1111 ; y2 y3
; t3 = pixel_max >> 11, used as the index into wiener_round/wiener_shifts
; (e.g. 0 for pixel_max 1023, 1 for 4095).
681*c0909341SAndroid Build Coastguard Worker    shr            t3d, 11
682*c0909341SAndroid Build Coastguard Worker    mova            m8, [pd_m262128] ; (1 << 4) - (1 << 18)
; Point lpfq/dstq/t1 past the end of the row and negate w, so that loops can
; index with a negative offset that counts up to zero.
683*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
684*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+16]
685*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
686*c0909341SAndroid Build Coastguard Worker    neg             wq
; Temporarily address the tables via t4 so an index register can be added.
687*c0909341SAndroid Build Coastguard Worker %define base t4-wiener_shifts
688*c0909341SAndroid Build Coastguard Worker    movd            m9, [base+wiener_round+t3*4]
689*c0909341SAndroid Build Coastguard Worker    movq           m10, [base+wiener_shifts+t3*8]
690*c0909341SAndroid Build Coastguard Worker    pshufd          m9, m9, q0000
691*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m10, q0000
692*c0909341SAndroid Build Coastguard Worker    pshufd         m10, m10, q1111
693*c0909341SAndroid Build Coastguard Worker    mova           m15, [wiener_lshuf5]
; Fold the first wiener_shifts factor into the horizontal coefficients.
694*c0909341SAndroid Build Coastguard Worker    pmullw         m11, m0
695*c0909341SAndroid Build Coastguard Worker    pmullw         m12, m0
; Re-map the argument names: w moves to the 11th register slot, while the
; negated width stays in r4 (the subroutines reload wq from r4 on x86-64).
696*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
697*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
698*c0909341SAndroid Build Coastguard Worker %define base
699*c0909341SAndroid Build Coastguard Worker%else
; x86-32: same setup, computed in m0..m4 and then stored into the m8..m15
; stack slots because only eight vector registers are available.
700*c0909341SAndroid Build Coastguard Worker    add             wd, wd
701*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m1
702*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m0, q1111 ; x1
703*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m1, q1111 ; x2 x3
704*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m3
705*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m2, q1111 ; y1
706*c0909341SAndroid Build Coastguard Worker    pshufd          m3, m3, q1111 ; y2 y3
707*c0909341SAndroid Build Coastguard Worker    mova            m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
708*c0909341SAndroid Build Coastguard Worker    mova           m13, m2
709*c0909341SAndroid Build Coastguard Worker    mova           m14, m3
710*c0909341SAndroid Build Coastguard Worker    mova            m8, m4
; t1 = pixel_max >> 11, the table index; t1 is repurposed as the row-buffer
; pointer right after the two table loads.
711*c0909341SAndroid Build Coastguard Worker    shr             t1, 11
712*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
713*c0909341SAndroid Build Coastguard Worker    movd            m2, [base+wiener_round+t1*4]
714*c0909341SAndroid Build Coastguard Worker    movq            m3, [base+wiener_shifts+t1*8]
; Row storage starts after the 16-byte spill slots (one more slot when the
; stack is aligned manually).
715*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
716*c0909341SAndroid Build Coastguard Worker    lea             t1, [esp+16*11+wq+16]
717*c0909341SAndroid Build Coastguard Worker %else
718*c0909341SAndroid Build Coastguard Worker    lea             t1, [esp+16*10+wq+16]
719*c0909341SAndroid Build Coastguard Worker %endif
720*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
721*c0909341SAndroid Build Coastguard Worker    neg             wq
722*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m2, q0000
723*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m3, q0000
724*c0909341SAndroid Build Coastguard Worker    pshufd          m3, m3, q1111
; Keep the negated byte width in wm; subroutines reload wq from it.
725*c0909341SAndroid Build Coastguard Worker    mov             wm, wq
726*c0909341SAndroid Build Coastguard Worker    pmullw          m0, m4
727*c0909341SAndroid Build Coastguard Worker    pmullw          m1, m4
728*c0909341SAndroid Build Coastguard Worker    mova            m4, [base+wiener_lshuf5]
729*c0909341SAndroid Build Coastguard Worker    mova            m9, m2
730*c0909341SAndroid Build Coastguard Worker    mova           m10, m3
731*c0909341SAndroid Build Coastguard Worker    mova           m11, m0
732*c0909341SAndroid Build Coastguard Worker    mova           m12, m1
733*c0909341SAndroid Build Coastguard Worker    mova           m15, m4
734*c0909341SAndroid Build Coastguard Worker%endif
; Row scheduling for wiener_filter5_16bpc. The subroutines used here are:
;   .h_top / .h : horizontal-only pass of the row at lpfq into the buffer at t1
;   .hv / .hv_bottom : horizontal pass of a new row plus vertical output
;   .v : vertical-only output from rows already buffered
; Consecutive intermediate row buffers are 384*2 bytes apart. hd counts the
; rows still to be read from the block (the image rows, read via dstq).
;
; With LR_HAVE_TOP: the two rows at lpfq are filtered first (into t4 and t3),
; then the first one or two image rows (into t2 and t1).
735*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
736*c0909341SAndroid Build Coastguard Worker    jz .no_top
737*c0909341SAndroid Build Coastguard Worker    call .h_top
738*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
739*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
740*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
741*c0909341SAndroid Build Coastguard Worker    call .h_top
; r10 = lpfq + 5*stride at this point; saved in lpfm as the address of the
; rows used after the last image row ("below"). lpfq switches to the image.
742*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
743*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
744*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
745*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
746*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
747*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
748*c0909341SAndroid Build Coastguard Worker    call .h
749*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
; Only one image row: finish with a single vertical-only pass.
750*c0909341SAndroid Build Coastguard Worker    dec             hd
751*c0909341SAndroid Build Coastguard Worker    jz .v1
752*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
753*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
754*c0909341SAndroid Build Coastguard Worker    call .h
; Only two image rows: finish with two vertical-only passes.
755*c0909341SAndroid Build Coastguard Worker    dec             hd
756*c0909341SAndroid Build Coastguard Worker    jz .v2
; Steady state: one .hv call per remaining image row.
757*c0909341SAndroid Build Coastguard Worker.main:
758*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
759*c0909341SAndroid Build Coastguard Worker.main_loop:
760*c0909341SAndroid Build Coastguard Worker    call .hv
761*c0909341SAndroid Build Coastguard Worker    dec             hd
762*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
; After the last image row: with LR_HAVE_BOTTOM two more rows are read from
; the saved lpfm pointer via .hv_bottom, otherwise .v2 emits the last outputs
; from the rows already buffered.
763*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
764*c0909341SAndroid Build Coastguard Worker    jz .v2
765*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
766*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
767*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
768*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
769*c0909341SAndroid Build Coastguard Worker.end:
770*c0909341SAndroid Build Coastguard Worker    RET
; Without LR_HAVE_TOP: no rows are read from above; the first filtered image
; row is used for t4, t3 and t2 alike. The "below" pointer saved in lpfm is
; lpfq + 6*stride here.
771*c0909341SAndroid Build Coastguard Worker.no_top:
772*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
773*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
774*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
775*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
776*c0909341SAndroid Build Coastguard Worker    call .h
777*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
778*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
779*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
780*c0909341SAndroid Build Coastguard Worker    dec             hd
781*c0909341SAndroid Build Coastguard Worker    jz .v1
782*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
783*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
784*c0909341SAndroid Build Coastguard Worker    call .h
785*c0909341SAndroid Build Coastguard Worker    dec             hd
786*c0909341SAndroid Build Coastguard Worker    jz .v2
; The next two .hv calls are given explicit buffers through t0 (t1+384*2,
; then a further 384*6 bytes on); after them the code joins .main if rows
; remain, else falls through to .v2.
787*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
788*c0909341SAndroid Build Coastguard Worker    call .hv
789*c0909341SAndroid Build Coastguard Worker    dec             hd
790*c0909341SAndroid Build Coastguard Worker    jz .v2
791*c0909341SAndroid Build Coastguard Worker    add             t0, 384*6
792*c0909341SAndroid Build Coastguard Worker    call .hv
793*c0909341SAndroid Build Coastguard Worker    dec             hd
794*c0909341SAndroid Build Coastguard Worker    jnz .main
; .v2: two vertical-only outputs. Between them the row pointers are shifted
; by hand (t4<-t3, t3<-t2, t2<-t1) and dstq is advanced one stride.
795*c0909341SAndroid Build Coastguard Worker.v2:
796*c0909341SAndroid Build Coastguard Worker    call .v
797*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
798*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
799*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
800*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
801*c0909341SAndroid Build Coastguard Worker%else
; x86-32: the same shift through the memory slots (t0 and r4 as scratch),
; then wq is reloaded from wm.
802*c0909341SAndroid Build Coastguard Worker    mov             t0, t3m
803*c0909341SAndroid Build Coastguard Worker    mov             r4, t2m
804*c0909341SAndroid Build Coastguard Worker    mov             t1, t1m
805*c0909341SAndroid Build Coastguard Worker    mov            t4m, t0
806*c0909341SAndroid Build Coastguard Worker    mov            t3m, r4
807*c0909341SAndroid Build Coastguard Worker    mov            t2m, t1
808*c0909341SAndroid Build Coastguard Worker    mov             wq, wm
809*c0909341SAndroid Build Coastguard Worker%endif
810*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
; .v1: one vertical-only output, then return through the common epilogue.
811*c0909341SAndroid Build Coastguard Worker.v1:
812*c0909341SAndroid Build Coastguard Worker    call .v
813*c0909341SAndroid Build Coastguard Worker    jmp .end
; .extend_right: right-edge padding helper for wiener_filter5_16bpc.
; In:  wd = current (negative) byte offset, m3/m4 = the two source windows.
; Out: m3/m4 re-shuffled in place; m0, m1, m2 are clobbered (and t0 on x86-32,
;      which is reloaded with the constant-table base from PICmem).
; It builds two byte-shuffle masks: the low byte of wd is broadcast and
; subtracted from pb_2_3 / pb_m6_m5, and each result is clamped (unsigned min)
; against the identity mask pb_0to15. Lanes whose clamp value is not below
; their own index keep their byte; the remaining lanes all read the byte at
; the clamp index. The pb_* tables are defined outside this view; presumably
; the effect is to replicate the last valid pixel past the right edge.
814*c0909341SAndroid Build Coastguard Worker.extend_right:
; Assemble-time bookkeeping: esp-relative slots are addressed with an extra
; offset of 8 inside this helper -- presumably two return addresses on x86-32,
; since it is called from within another call'ed subroutine; confirm.
815*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8
816*c0909341SAndroid Build Coastguard Worker%assign calloff 8
817*c0909341SAndroid Build Coastguard Worker    movif32         t0, PICmem
; pshufb with an all-zero mask broadcasts byte 0 of m2 (low byte of wd).
818*c0909341SAndroid Build Coastguard Worker    pxor            m1, m1
819*c0909341SAndroid Build Coastguard Worker    movd            m2, wd
820*c0909341SAndroid Build Coastguard Worker    mova            m0, [base+pb_2_3]
821*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m1
822*c0909341SAndroid Build Coastguard Worker    mova            m1, [base+pb_m6_m5]
823*c0909341SAndroid Build Coastguard Worker    psubb           m0, m2
824*c0909341SAndroid Build Coastguard Worker    psubb           m1, m2
825*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+pb_0to15]
826*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
827*c0909341SAndroid Build Coastguard Worker    pminub          m1, m2
828*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m0
829*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m1
830*c0909341SAndroid Build Coastguard Worker    ret
; The subroutines that follow are assembled with calloff 4 (net stack_offset
; +4 relative to the function body).
831*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4
832*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; .h / .h_top: horizontal-only pass of wiener_filter5_16bpc over one row.
; In:  lpfq = end of the source row, t1 = end of the destination row buffer,
;      leftq = left-edge pixel buffer (.h only), edgeb = edge flags.
; Out: filtered words stored at [t1+wq] for the whole row; wq is reloaded from
;      its saved negative byte width at entry (and again on exit on x86-32).
; .h takes the pixels left of the row from leftq (and advances leftq by 8);
; .h_top reads them directly from the row's own memory at [lpfq+wq-4].
; Without LR_HAVE_LEFT both use the m15 shuffle instead.
833*c0909341SAndroid Build Coastguard Worker.h:
834*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
835*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
836*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
837*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; m3 = 4 bytes (two pixels) from [leftq+4] followed by the first 12 bytes of
; the row: the row vector is shifted up by 4 bytes and OR'ed with the dword.
838*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
839*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4]
840*c0909341SAndroid Build Coastguard Worker    pslldq          m4, 4
841*c0909341SAndroid Build Coastguard Worker    por             m3, m4
842*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
843*c0909341SAndroid Build Coastguard Worker    jmp .h_main
844*c0909341SAndroid Build Coastguard Worker.h_extend_left:
845*c0909341SAndroid Build Coastguard Worker    mova            m3, [lpfq+wq] ; avoid accessing memory located
846*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m15       ; before the start of the buffer
847*c0909341SAndroid Build Coastguard Worker    jmp .h_main
848*c0909341SAndroid Build Coastguard Worker.h_top:
849*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
850*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
851*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
852*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Per iteration: m3 = window starting 4 bytes before the current position,
; m4 = window starting 4 bytes after it.
853*c0909341SAndroid Build Coastguard Worker.h_loop:
854*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+wq-4]
855*c0909341SAndroid Build Coastguard Worker.h_main:
856*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq+4]
; Pad on the right only when LR_HAVE_RIGHT is clear and at most 18 bytes of
; the row remain (wd >= -18).
857*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
858*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
859*c0909341SAndroid Build Coastguard Worker    cmp             wd, -18
860*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
861*c0909341SAndroid Build Coastguard Worker    call .extend_right
862*c0909341SAndroid Build Coastguard Worker.h_have_right:
; m0/m1: windows shuffled with m5 (wiener_shufE) times m11 (x1 coefficient).
; m2/m3: sum of the m6- and m7-shuffled windows times m12 (x2 x3).
; m0 and m2 give the low four outputs, m1 and m3 the high four.
863*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m5
864*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11
865*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m5
866*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
867*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m3, m6
868*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m7
869*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3
870*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m4, m6
871*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m12
872*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m7
873*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
874*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m12
; Add the bias constant m8, combine the partial sums, shift right by 4, pack
; to words with signed saturation and shift right by one more bit.
875*c0909341SAndroid Build Coastguard Worker    paddd           m0, m8
876*c0909341SAndroid Build Coastguard Worker    paddd           m1, m8
877*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
878*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
879*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
880*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
881*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
882*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
883*c0909341SAndroid Build Coastguard Worker    mova       [t1+wq], m0
; 16 bytes (8 words) per iteration; loop while the offset is still negative.
884*c0909341SAndroid Build Coastguard Worker    add             wq, 16
885*c0909341SAndroid Build Coastguard Worker    jl .h_loop
886*c0909341SAndroid Build Coastguard Worker    movif32         wq, wm
887*c0909341SAndroid Build Coastguard Worker    ret
888*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv / .hv_bottom: filter one source row horizontally, combine it with the
; rows already buffered at t1..t4 vertically, and store one output row.
;
; Entry points:
;   .hv        - advances lpfq by strideq first; when LR_HAVE_LEFT is set the
;                first vector is assembled from 4 bytes at [leftq+4] plus the
;                start of the row, and leftq is advanced by 8.
;   .hv_bottom - does not advance lpfq and never reads leftq; with
;                LR_HAVE_LEFT set it loads directly from [lpfq+wq-4].
; Both fall back to .hv_extend_left when LR_HAVE_LEFT is clear.
;
; wq is a negative byte offset that counts up to zero, 16 bytes per iteration.
; The constants in m5-m15 are set up outside this routine.
; NOTE(review): m5/m6/m7 look like tap-selection shuffles and m11-m14 like
; filter coefficient pairs - confirm against the function prologue.
;
; On return the row pointers have been rotated by one row and dstq has been
; advanced by strideq. x86-32 keeps t2/t3/t4 (and copies of t0/t1) in the
; stack slots t0m..t4m, using t0/t1/r4 as scratch registers.
889*c0909341SAndroid Build Coastguard Worker.hv:
890*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
891*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
892*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
893*c0909341SAndroid Build Coastguard Worker    movif32        t1m, t1
894*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
895*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
    ; m3 = 4 bytes from [leftq+4] in the low dword, followed by the first
    ; 12 bytes of the row (row vector shifted up by 4 bytes, then OR-ed in).
896*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
897*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4]
898*c0909341SAndroid Build Coastguard Worker    pslldq          m4, 4
899*c0909341SAndroid Build Coastguard Worker    por             m3, m4
900*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
901*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
902*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
    ; No left neighbours: build the first vector from the row start using the
    ; shuffle in m15 (presumably replicates the first pixel - TODO confirm).
903*c0909341SAndroid Build Coastguard Worker    mova            m3, [lpfq+wq]
904*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m15
905*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
906*c0909341SAndroid Build Coastguard Worker.hv_bottom:
907*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
908*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
909*c0909341SAndroid Build Coastguard Worker    movif32        t1m, t1
910*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
911*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
912*c0909341SAndroid Build Coastguard Worker.hv_loop:
913*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+wq-4]
914*c0909341SAndroid Build Coastguard Worker.hv_main:
915*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq+4]
    ; Right-edge padding is only applied when LR_HAVE_RIGHT is clear and the
    ; loop is within the last 18 bytes of the row (wd >= -18).
916*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
917*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
918*c0909341SAndroid Build Coastguard Worker    cmp             wd, -18
919*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
920*c0909341SAndroid Build Coastguard Worker    call .extend_right
921*c0909341SAndroid Build Coastguard Worker.hv_have_right:
    ; x86-32: reload t1, and borrow the t0 register for the t3 row pointer.
922*c0909341SAndroid Build Coastguard Worker    movif32         t1, t1m
923*c0909341SAndroid Build Coastguard Worker    movif32         t0, t3m
    ; Horizontal pass: m0/m1 accumulate the low/high halves of the output as
    ; dwords (pshufb tap selection + pmaddwd), with m8 added before the >> 4.
924*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m5
925*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11
926*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m5
927*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
928*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m3, m6
929*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m7
930*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3
931*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m4, m6
932*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m12
933*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m7
934*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
935*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m12
936*c0909341SAndroid Build Coastguard Worker    paddd           m0, m8
937*c0909341SAndroid Build Coastguard Worker    paddd           m1, m8
938*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
    ; Vertical pass operands, interleaved with the tail of the horizontal
    ; pass: m2 = row[t3] + row[t1], m4 = row[t2].
939*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
940*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+wq]
941*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t1+wq]
942*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
943*c0909341SAndroid Build Coastguard Worker    mova            m4, [t2+wq]
944*c0909341SAndroid Build Coastguard Worker%else
    ; t0 holds t3 here, then t2, and is finally restored from t0m;
    ; t1 is replaced by the t4 row pointer for the load further down.
945*c0909341SAndroid Build Coastguard Worker    mova            m2, [t0+wq]
946*c0909341SAndroid Build Coastguard Worker    mov             t0, t2m
947*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t1+wq]
948*c0909341SAndroid Build Coastguard Worker    mov             t1, t4m
949*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
950*c0909341SAndroid Build Coastguard Worker    mova            m4, [t0+wq]
951*c0909341SAndroid Build Coastguard Worker    mov             t0, t0m
952*c0909341SAndroid Build Coastguard Worker%endif
953*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m2, m4
954*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
955*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4
    ; m4 = row[t4]. This load must stay ahead of the store to [t0+wq] below:
    ; after the pointer rotation at the end of this routine t0 equals t4, so
    ; a later call overwrites that row in place.
956*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
957*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+wq]
958*c0909341SAndroid Build Coastguard Worker%else
959*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq]
960*c0909341SAndroid Build Coastguard Worker%endif
961*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
962*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
963*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
964*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
965*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
    ; Save the horizontally filtered row for use by later calls.
966*c0909341SAndroid Build Coastguard Worker    mova       [t0+wq], m0
    ; The new row is paired with row[t4] and multiplied by m13; the
    ; (t3+t1, t2) pairs were multiplied by m14 above.
967*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m4
968*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
969*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4
970*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13
971*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
972*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
973*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
974*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
975*c0909341SAndroid Build Coastguard Worker    psrad           m1, 6
976*c0909341SAndroid Build Coastguard Worker    psrad           m0, 6
977*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
    ; Final scaling by m10 (high half of the 16x16 product), then clamp
    ; negative results to zero before storing the output pixels.
978*c0909341SAndroid Build Coastguard Worker    pmulhw          m0, m10
979*c0909341SAndroid Build Coastguard Worker    pxor            m1, m1
980*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m1
981*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
982*c0909341SAndroid Build Coastguard Worker    add             wq, 16
983*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
    ; Rotate the row pointers by one row:
    ;   t4 <- t3, t3 <- t2, t2 <- t1, t1 <- t0, t0 <- new t4 (the old t3).
984*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
985*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
986*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
987*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
988*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
989*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
990*c0909341SAndroid Build Coastguard Worker%else
    ; Same rotation performed through the stack slots, with r4 and t1 as
    ; scratch; wq is reloaded from wm afterwards because r4 was clobbered.
991*c0909341SAndroid Build Coastguard Worker    mov             r4, t3m
992*c0909341SAndroid Build Coastguard Worker    mov             t1, t2m
993*c0909341SAndroid Build Coastguard Worker    mov            t4m, r4
994*c0909341SAndroid Build Coastguard Worker    mov            t3m, t1
995*c0909341SAndroid Build Coastguard Worker    mov             r4, t1m
996*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
997*c0909341SAndroid Build Coastguard Worker    mov            t2m, r4
998*c0909341SAndroid Build Coastguard Worker    mov             t0, t4m
999*c0909341SAndroid Build Coastguard Worker    mov             wq, wm
1000*c0909341SAndroid Build Coastguard Worker%endif
1001*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1002*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical-only pass. Produces one output row at [dstq+wq] from the rows
; already buffered at t1..t4; no source row is read and nothing is filtered
; horizontally.
;
; The row at t1 is used twice: once summed with the t3 row (paired with the
; t2 row, multiplied by m14) and once paired with the t4 row (multiplied by
; m13). The latter is the slot that .hv fills with its freshly filtered row.
;
; This routine neither advances dstq nor rotates the row pointers.
; On x86-32, t0 and t1 are used as scratch pointers inside the loop; t1 is
; reloaded from t1m before every further iteration but not after the last.
1003*c0909341SAndroid Build Coastguard Worker.v:
1004*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
1005*c0909341SAndroid Build Coastguard Worker    movif32        t1m, t1
1006*c0909341SAndroid Build Coastguard Worker.v_loop:
    ; m0 = row[t1], m2 = row[t1] + row[t3], m1 = row[t2], m4 = row[t4]
1007*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1008*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq]
1009*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0, [t3+wq]
1010*c0909341SAndroid Build Coastguard Worker    mova            m1, [t2+wq]
1011*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+wq]
1012*c0909341SAndroid Build Coastguard Worker%else
1013*c0909341SAndroid Build Coastguard Worker    mov             t0, t3m
1014*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq]
1015*c0909341SAndroid Build Coastguard Worker    mov             t1, t2m
1016*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0, [t0+wq]
1017*c0909341SAndroid Build Coastguard Worker    mov             t0, t4m
1018*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq]
1019*c0909341SAndroid Build Coastguard Worker    mova            m4, [t0+wq]
1020*c0909341SAndroid Build Coastguard Worker%endif
    ; Interleave row pairs into words and multiply-accumulate into dwords.
1021*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m2, m1
1022*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
1023*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m1
1024*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
1025*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m4
1026*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
1027*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4
1028*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13
    ; Add m9, shift right by 6 and pack back to words.
1029*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
1030*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
1031*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1032*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
1033*c0909341SAndroid Build Coastguard Worker    psrad           m1, 6
1034*c0909341SAndroid Build Coastguard Worker    psrad           m0, 6
1035*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
    ; Scale by m10 (high half of the product) and clamp negatives to zero.
1036*c0909341SAndroid Build Coastguard Worker    pmulhw          m0, m10
1037*c0909341SAndroid Build Coastguard Worker    pxor            m1, m1
1038*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m1
1039*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
    ; wq counts up from a negative offset; the loop ends when it reaches zero.
1040*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1041*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1042*c0909341SAndroid Build Coastguard Worker    jl .v_loop
1043*c0909341SAndroid Build Coastguard Worker%else
1044*c0909341SAndroid Build Coastguard Worker    jge .v_end
    ; t1 was overwritten with the t2 pointer above; restore it for the next
    ; iteration.
1045*c0909341SAndroid Build Coastguard Worker    mov             t1, t1m
1046*c0909341SAndroid Build Coastguard Worker    jmp .v_loop
1047*c0909341SAndroid Build Coastguard Worker.v_end:
1048*c0909341SAndroid Build Coastguard Worker%endif
1049*c0909341SAndroid Build Coastguard Worker    ret
1050*c0909341SAndroid Build Coastguard Worker
; GATHERDD dst, src, tmp
; Emulated 4-lane table gather for targets without a gather instruction.
;   %1 (dst) - vector register receiving the gathered data
;   %2 (src) - vector register holding four dword indices
;   %3 (tmp) - general-purpose scratch register (clobbered)
; The table base is r13 on x86-64 and base+sgr_x_by_x-0xf03 on x86-32.
;
; Lane 0 is fetched with a full 4-byte load at base+index. For lanes 1-3 only
; the 16-bit word at base+index+2 is fetched and inserted into the HIGH word
; of the matching dword; the low words of those lanes stay zero from the
; initial movd. Only the upper 16 bits of each dword are therefore meaningful
; in all four lanes.
;
; pextrw reads just the low 16 bits of each index dword, so indices for lanes
; 1-3 are taken modulo 2^16, while lane 0 uses the full low dword of src.
1051*c0909341SAndroid Build Coastguard Worker%macro GATHERDD 3 ; dst, src, tmp
1052*c0909341SAndroid Build Coastguard Worker    movd           %3d, %2
1053*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
1054*c0909341SAndroid Build Coastguard Worker    movd            %1, [r13+%3]
1055*c0909341SAndroid Build Coastguard Worker    pextrw         %3d, %2, 2
1056*c0909341SAndroid Build Coastguard Worker    pinsrw          %1, [r13+%3+2], 3
1057*c0909341SAndroid Build Coastguard Worker    pextrw         %3d, %2, 4
1058*c0909341SAndroid Build Coastguard Worker    pinsrw          %1, [r13+%3+2], 5
1059*c0909341SAndroid Build Coastguard Worker    pextrw         %3d, %2, 6
1060*c0909341SAndroid Build Coastguard Worker    pinsrw          %1, [r13+%3+2], 7
1061*c0909341SAndroid Build Coastguard Worker %else
1062*c0909341SAndroid Build Coastguard Worker    movd            %1, [base+sgr_x_by_x-0xf03+%3]
1063*c0909341SAndroid Build Coastguard Worker    pextrw          %3, %2, 2
1064*c0909341SAndroid Build Coastguard Worker    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
1065*c0909341SAndroid Build Coastguard Worker    pextrw          %3, %2, 4
1066*c0909341SAndroid Build Coastguard Worker    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
1067*c0909341SAndroid Build Coastguard Worker    pextrw          %3, %2, 6
1068*c0909341SAndroid Build Coastguard Worker    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
1069*c0909341SAndroid Build Coastguard Worker %endif
1070*c0909341SAndroid Build Coastguard Worker%endmacro
1071*c0909341SAndroid Build Coastguard Worker
; GATHER_X_BY_X dst, src0, src1, tmp32, tmp32_restore
; Looks up eight byte-sized table entries and returns them as eight words.
;   %1 (dst)           - result: words 0-3 come from the indices in src0,
;                        words 4-7 from the indices in src1
;   %2 (src0)          - four dword indices; CLOBBERED (reused as the gather
;                        destination for src1)
;   %3 (src1)          - four dword indices
;   %4 (tmp32)         - scratch general-purpose register used on x86-32
;   %5 (tmp32_restore) - operand from which %4 is reloaded on x86-32
; On x86-64 the scratch register is r14 instead, and %4/%5 are only consumed
; by movif32 (presumably a no-op on x86-64 - defined outside this chunk).
;
; Each GATHERDD leaves the byte at base+index+3 in the top byte of every
; dword; the shift right by 24 isolates that byte before the two halves are
; packed to words.
1072*c0909341SAndroid Build Coastguard Worker%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
1073*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
1074*c0909341SAndroid Build Coastguard Worker  %define tmp r14
1075*c0909341SAndroid Build Coastguard Worker %else
1076*c0909341SAndroid Build Coastguard Worker  %define tmp %4
1077*c0909341SAndroid Build Coastguard Worker %endif
1078*c0909341SAndroid Build Coastguard Worker    GATHERDD        %1, %2, tmp
1079*c0909341SAndroid Build Coastguard Worker    GATHERDD        %2, %3, tmp
1080*c0909341SAndroid Build Coastguard Worker    movif32         %4, %5
1081*c0909341SAndroid Build Coastguard Worker    psrld           %1, 24
1082*c0909341SAndroid Build Coastguard Worker    psrld           %2, 24
1083*c0909341SAndroid Build Coastguard Worker    packssdw        %1, %2
1084*c0909341SAndroid Build Coastguard Worker%endmacro
1085*c0909341SAndroid Build Coastguard Worker
; MAXSD dst, src, tmp [, restore_tmp = 0]
; Per-dword signed maximum built from compare-and-blend.
; NOTE(review): presumably used because pmaxsd is unavailable at the targeted
; instruction-set level - confirm.
;   %1 (dst)         - first operand; receives max(dst, src) per dword
;   %2 (src)         - second operand
;   %3 (tmp)         - scratch vector register for the comparison mask
;   %4 (restore_tmp) - optional, defaults to 0; when 1, tmp is zeroed
;                      afterwards (for callers that keep an all-zero register
;                      in tmp)
1086*c0909341SAndroid Build Coastguard Worker%macro MAXSD 3-4 0 ; dst, src, tmp, restore_tmp
    ; tmp = all-ones in every lane where dst > src (signed)
1087*c0909341SAndroid Build Coastguard Worker    pcmpgtd         %3, %1, %2
    ; keep dst where it is larger, take src elsewhere, then merge
1088*c0909341SAndroid Build Coastguard Worker    pand            %1, %3
1089*c0909341SAndroid Build Coastguard Worker    pandn           %3, %2
1090*c0909341SAndroid Build Coastguard Worker    por             %1, %3
1091*c0909341SAndroid Build Coastguard Worker %if %4 == 1
1092*c0909341SAndroid Build Coastguard Worker    pxor            %3, %3
1093*c0909341SAndroid Build Coastguard Worker %endif
1094*c0909341SAndroid Build Coastguard Worker%endmacro
1095*c0909341SAndroid Build Coastguard Worker
; MULLD dst, src, tmp
; Per-dword multiply of dst by a 16-bit factor, built from 16-bit multiplies.
;   %1 (dst) - dword lanes to multiply; receives the low 32 bits of the product
;   %2 (src) - multiplier
;   %3 (tmp) - scratch vector register (clobbered)
;
; For each dword, with dst = (a_hi:a_lo) and src = (b_hi:b_lo) as words:
;   result.low  = lo16(a_lo * b_lo)
;   result.high = lo16(a_hi * b_hi) + hi16(a_lo * b_lo)
; This equals (dst * b) mod 2^32 only when both words of every src dword hold
; the same unsigned 16-bit value b, so callers must provide src in that
; replicated form.
1096*c0909341SAndroid Build Coastguard Worker%macro MULLD 3 ; dst, src, tmp
    ; tmp = high 16 bits of each unsigned 16x16 product
1097*c0909341SAndroid Build Coastguard Worker    pmulhuw         %3, %1, %2
    ; dst = low 16 bits of each 16x16 product
1098*c0909341SAndroid Build Coastguard Worker    pmullw          %1, %2
    ; move hi16(a_lo * b_lo) into the upper word of its dword; the high-word
    ; product's upper half is shifted out and discarded
1099*c0909341SAndroid Build Coastguard Worker    pslld           %3, 16
1100*c0909341SAndroid Build Coastguard Worker    paddd           %1, %3
1101*c0909341SAndroid Build Coastguard Worker%endmacro
1102*c0909341SAndroid Build Coastguard Worker
1103*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1104*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 0, 1, 2, 3, 5
1105*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1106*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 5*16
1107*c0909341SAndroid Build Coastguard Worker %else
1108*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 3*16
1109*c0909341SAndroid Build Coastguard Worker %endif
1110*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
1111*c0909341SAndroid Build Coastguard Worker                              dst, stride, left, lpf, w
1112*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1113*c0909341SAndroid Build Coastguard Worker  %define dstm         dword [esp+calloff+16*0+4*6]
1114*c0909341SAndroid Build Coastguard Worker  %define stridemp     dword [esp+calloff+16*0+4*7]
1115*c0909341SAndroid Build Coastguard Worker  %define leftm        dword [esp+calloff+16*3+4*0]
1116*c0909341SAndroid Build Coastguard Worker  %define lpfm         dword [esp+calloff+16*3+4*1]
1117*c0909341SAndroid Build Coastguard Worker  %define w0m          dword [esp+calloff+16*3+4*2]
1118*c0909341SAndroid Build Coastguard Worker  %define hd           dword [esp+calloff+16*3+4*3]
1119*c0909341SAndroid Build Coastguard Worker  %define edgeb         byte [esp+calloff+16*3+4*4]
1120*c0909341SAndroid Build Coastguard Worker  %define edged        dword [esp+calloff+16*3+4*4]
1121*c0909341SAndroid Build Coastguard Worker  %define leftmp leftm
1122*c0909341SAndroid Build Coastguard Worker %else
1123*c0909341SAndroid Build Coastguard Worker  %define w0m wm
1124*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
1125*c0909341SAndroid Build Coastguard Worker  %define edgeb  byte r7m
1126*c0909341SAndroid Build Coastguard Worker  %define edged dword r7m
1127*c0909341SAndroid Build Coastguard Worker %endif
1128*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0]
1129*c0909341SAndroid Build Coastguard Worker %define w1m    dword [esp+calloff+4*1]
1130*c0909341SAndroid Build Coastguard Worker %define t0m    dword [esp+calloff+4*2]
1131*c0909341SAndroid Build Coastguard Worker %define t2m    dword [esp+calloff+4*3]
1132*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*4]
1133*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*5]
1134*c0909341SAndroid Build Coastguard Worker %define  m8 [base+pd_8]
1135*c0909341SAndroid Build Coastguard Worker %define  m9 [base+pd_0xfffffff0]
1136*c0909341SAndroid Build Coastguard Worker %define m10 [esp+calloff+16*2]
1137*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0xf00800a4]
1138*c0909341SAndroid Build Coastguard Worker %define m12 [base+sgr_lshuf5]
1139*c0909341SAndroid Build Coastguard Worker %define m13 [base+pd_34816]
1140*c0909341SAndroid Build Coastguard Worker %define m14 [base+pw_1023]
1141*c0909341SAndroid Build Coastguard Worker %define r10 r4
1142*c0909341SAndroid Build Coastguard Worker %define base r6-$$
1143*c0909341SAndroid Build Coastguard Worker %assign calloff 0
1144*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1145*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rstk+stack_offset+ 8]
1146*c0909341SAndroid Build Coastguard Worker    mov          leftq, [rstk+stack_offset+12]
1147*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rstk+stack_offset+16]
1148*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
1149*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1150*c0909341SAndroid Build Coastguard Worker    mov       stridemp, strideq
1151*c0909341SAndroid Build Coastguard Worker    mov          leftm, leftq
1152*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+24]
1153*c0909341SAndroid Build Coastguard Worker    mov             r2, [rstk+stack_offset+32]
1154*c0909341SAndroid Build Coastguard Worker    mov           lpfm, lpfq
1155*c0909341SAndroid Build Coastguard Worker    mov             hd, r1
1156*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
1157*c0909341SAndroid Build Coastguard Worker %endif
1158*c0909341SAndroid Build Coastguard Worker%else
1159*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
1160*c0909341SAndroid Build Coastguard Worker                                                     w, h, edge, params
1161*c0909341SAndroid Build Coastguard Worker%endif
1162*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
1163*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
1164*c0909341SAndroid Build Coastguard Worker%endif
1165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1166*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1167*c0909341SAndroid Build Coastguard Worker    lea            r13, [sgr_x_by_x-0xf03]
1168*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1169*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1170*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1171*c0909341SAndroid Build Coastguard Worker    movu           m10, [paramsq]
1172*c0909341SAndroid Build Coastguard Worker    mova           m12, [sgr_lshuf5]
1173*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1174*c0909341SAndroid Build Coastguard Worker    mova            m8, [pd_8]
1175*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+20]
1176*c0909341SAndroid Build Coastguard Worker    mova            m9, [pd_0xfffffff0]
1177*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1178*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+400*12+16]
1179*c0909341SAndroid Build Coastguard Worker    mova           m11, [pd_0xf00800a4]
1180*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+400*20+16]
1181*c0909341SAndroid Build Coastguard Worker    pshufhw         m7, m10, q0000
1182*c0909341SAndroid Build Coastguard Worker    pshufb         m10, [pw_256]  ; s0
1183*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m7, m7        ; w0
1184*c0909341SAndroid Build Coastguard Worker    neg             wq
1185*c0909341SAndroid Build Coastguard Worker    mova           m13, [pd_34816]  ; (1 << 11) + (1 << 15)
1186*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1187*c0909341SAndroid Build Coastguard Worker    mova           m14, [pw_1023]
1188*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1189*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
1190*c0909341SAndroid Build Coastguard Worker %define lpfm        [rsp]
1191*c0909341SAndroid Build Coastguard Worker%else
1192*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+28] ; params
1193*c0909341SAndroid Build Coastguard Worker    LEA             r6, $$
1194*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1195*c0909341SAndroid Build Coastguard Worker    movu            m1, [r1]
1196*c0909341SAndroid Build Coastguard Worker    add           lpfm, wq
1197*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+extra_stack+wq+20]
1198*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1199*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
1200*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1201*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+extra_stack+wq+400*20+16]
1202*c0909341SAndroid Build Coastguard Worker    mov            t3m, t3
1203*c0909341SAndroid Build Coastguard Worker    pshufhw         m7, m1, q0000
1204*c0909341SAndroid Build Coastguard Worker    mov            t4m, t4
1205*c0909341SAndroid Build Coastguard Worker    pshufb          m1, [base+pw_256] ; s0
1206*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m7, m7            ; w0
1207*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1208*c0909341SAndroid Build Coastguard Worker    neg             wq
1209*c0909341SAndroid Build Coastguard Worker    mova           m10, m1
1210*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1211*c0909341SAndroid Build Coastguard Worker    mov            w1m, wd
1212*c0909341SAndroid Build Coastguard Worker    sub             wd, 4
1213*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1214*c0909341SAndroid Build Coastguard Worker    mov            w0m, wd
1215*c0909341SAndroid Build Coastguard Worker %define strideq r5
1216*c0909341SAndroid Build Coastguard Worker%endif
1217*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1218*c0909341SAndroid Build Coastguard Worker    jz .no_top
1219*c0909341SAndroid Build Coastguard Worker    call .h_top
1220*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1221*c0909341SAndroid Build Coastguard Worker    movif32        t2m, t1
1222*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1223*c0909341SAndroid Build Coastguard Worker    call .top_fixup
1224*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6
1225*c0909341SAndroid Build Coastguard Worker    call .h_top
1226*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1227*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1228*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1229*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1230*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
1231*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t2
1232*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1233*c0909341SAndroid Build Coastguard Worker    dec             hd
1234*c0909341SAndroid Build Coastguard Worker    jz .height1
1235*c0909341SAndroid Build Coastguard Worker    or           edged, 16
1236*c0909341SAndroid Build Coastguard Worker    call .h
1237*c0909341SAndroid Build Coastguard Worker.main:
1238*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1239*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1240*c0909341SAndroid Build Coastguard Worker    call .hv
1241*c0909341SAndroid Build Coastguard Worker    call .prep_n
1242*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1243*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
1244*c0909341SAndroid Build Coastguard Worker.main_loop:
1245*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1246*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1247*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1248*c0909341SAndroid Build Coastguard Worker    test            hb, hb
1249*c0909341SAndroid Build Coastguard Worker%else
1250*c0909341SAndroid Build Coastguard Worker    mov             r4, hd
1251*c0909341SAndroid Build Coastguard Worker    test            r4, r4
1252*c0909341SAndroid Build Coastguard Worker%endif
1253*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1254*c0909341SAndroid Build Coastguard Worker    call .h
1255*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1256*c0909341SAndroid Build Coastguard Worker    call .hv
1257*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1258*c0909341SAndroid Build Coastguard Worker    call .n0
1259*c0909341SAndroid Build Coastguard Worker    call .n1
1260*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1261*c0909341SAndroid Build Coastguard Worker    movif32         t0, t0m
1262*c0909341SAndroid Build Coastguard Worker    jge .main_loop
1263*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1264*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1265*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1266*c0909341SAndroid Build Coastguard Worker    call .h_top
1267*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1268*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
1269*c0909341SAndroid Build Coastguard Worker.end:
1270*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1271*c0909341SAndroid Build Coastguard Worker    call .n0
1272*c0909341SAndroid Build Coastguard Worker    call .n1
1273*c0909341SAndroid Build Coastguard Worker.end2:
1274*c0909341SAndroid Build Coastguard Worker    RET
1275*c0909341SAndroid Build Coastguard Worker.height1:
1276*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1277*c0909341SAndroid Build Coastguard Worker    call .hv
1278*c0909341SAndroid Build Coastguard Worker    call .prep_n
1279*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
1280*c0909341SAndroid Build Coastguard Worker.odd_height:
1281*c0909341SAndroid Build Coastguard Worker    call .hv
1282*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1283*c0909341SAndroid Build Coastguard Worker    call .n0
1284*c0909341SAndroid Build Coastguard Worker    call .n1
1285*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1286*c0909341SAndroid Build Coastguard Worker    call .v
1287*c0909341SAndroid Build Coastguard Worker    movif32       dstq, dstm
1288*c0909341SAndroid Build Coastguard Worker    call .n0
1289*c0909341SAndroid Build Coastguard Worker    jmp .end2
1290*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1291*c0909341SAndroid Build Coastguard Worker    call .v
1292*c0909341SAndroid Build Coastguard Worker    jmp .end
1293*c0909341SAndroid Build Coastguard Worker.no_top:
1294*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1295*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1296*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1297*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1298*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
1299*c0909341SAndroid Build Coastguard Worker    call .h
1300*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
1301*c0909341SAndroid Build Coastguard Worker    movif32        t2m, t2
1302*c0909341SAndroid Build Coastguard Worker    call .top_fixup
1303*c0909341SAndroid Build Coastguard Worker    dec             hd
1304*c0909341SAndroid Build Coastguard Worker    jz .no_top_height1
1305*c0909341SAndroid Build Coastguard Worker    or           edged, 16
1306*c0909341SAndroid Build Coastguard Worker    mov             t0, t1
1307*c0909341SAndroid Build Coastguard Worker    mov             t1, t2
1308*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
1309*c0909341SAndroid Build Coastguard Worker    jmp .main
1310*c0909341SAndroid Build Coastguard Worker.no_top_height1:
1311*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
1312*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1313*c0909341SAndroid Build Coastguard Worker    call .v
1314*c0909341SAndroid Build Coastguard Worker    call .prep_n
1315*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; .extend_right: right-edge padding for the vectors in m4 and m5.
; Bytes of m4/m5 selected by a width-dependent mask are kept; every other
; byte is replaced by the 16-bit value loaded from [lpfq-2], replicated.
;   in:       m4, m5 = two loaded source vectors
;             wd     = current (negative) column offset; only its low byte
;                      feeds the mask computation
;             lpfq   = row pointer
;             m6     = must be all-zero (it is zeroed in the function
;                      prologue; assumed unchanged since - TODO confirm)
;   out:      m4, m5 with the out-of-range bytes overwritten
;   clobbers: m0, m1, m2, m3
; NOTE(review): pw_256, pb_m14_m13 and pb_0to15 are defined outside this
; chunk. From its name, pw_256 used as a pshufb mask should replicate word 0
; across the vector - confirm.
1316*c0909341SAndroid Build Coastguard Worker.extend_right:
1317*c0909341SAndroid Build Coastguard Worker    movd            m0, wd
1318*c0909341SAndroid Build Coastguard Worker    movd            m1, [lpfq-2]
1319*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+pw_256]
1320*c0909341SAndroid Build Coastguard Worker    mova            m3, [base+pb_m14_m13]
    ; Broadcast the low byte of wd to every byte of m0 (shuffle with a zero
    ; mask), and the padding pixel to every lane of m1.
1321*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m6
1322*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m2
    ; Per-byte thresholds for m4 (from pw_256) and m5 (from pb_m14_m13).
1323*c0909341SAndroid Build Coastguard Worker    psubb           m2, m0
1324*c0909341SAndroid Build Coastguard Worker    psubb           m3, m0
1325*c0909341SAndroid Build Coastguard Worker    mova            m0, [base+pb_0to15]
    ; Mask byte = 0xff where threshold > byte position (signed compare),
    ; i.e. where the original data is kept.
1326*c0909341SAndroid Build Coastguard Worker    pcmpgtb         m2, m0
1327*c0909341SAndroid Build Coastguard Worker    pcmpgtb         m3, m0
    ; Blend: (data & mask) | (padding & ~mask)
1328*c0909341SAndroid Build Coastguard Worker    pand            m4, m2
1329*c0909341SAndroid Build Coastguard Worker    pand            m5, m3
1330*c0909341SAndroid Build Coastguard Worker    pandn           m2, m1
1331*c0909341SAndroid Build Coastguard Worker    pandn           m3, m1
1332*c0909341SAndroid Build Coastguard Worker    por             m4, m2
1333*c0909341SAndroid Build Coastguard Worker    por             m5, m3
1334*c0909341SAndroid Build Coastguard Worker    ret
1335*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+4
1336*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; The two %assign lines above raise stack_offset by 4 and set calloff to 4 for
; the call-entered subroutines that follow. Presumably this accounts for the
; return address pushed by `call` on x86-32; calloff is used in esp-relative
; %defines further down the file.
1337*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
; For the row at lpfq, computes per output position the sum of 5 horizontally
; adjacent 16-bit pixels and the sum of their squares, and stores them in the
; row buffer at t1:
;   [t1+wq+400*0] = sums (words)
;   [t1+wq+400*2] = squared sums, low four lanes (dwords)
;   [t1+wq+400*4] = squared sums, high four lanes (dwords)
; If edge bit 16 ("y > 0") is set, the results are added to what t1 already
; holds instead of replacing it.
; Entry points: .h takes the leftmost pixels from the left buffer, .h_top reads
; them from the row itself. Both fall back to .h_extend_left when LR_HAVE_LEFT
; is clear.
; Uses m0-m5 as scratch; m6 (presumably zero) and m12 come from outside this view.
1338*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1339*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]             ; start offset; negative, counts up towards 0
1340*c0909341SAndroid Build Coastguard Worker%else
1341*c0909341SAndroid Build Coastguard Worker %define leftq r4
1342*c0909341SAndroid Build Coastguard Worker%endif
1343*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1344*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1345*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
1346*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq]            ; 8 bytes (4 words) from the left buffer, duplicated into both halves
1347*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1348*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
1349*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8                  ; advance the left pointer for the next row
1350*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10             ; = last 3 left words followed by the first 5 words of the load above
1351*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1352*c0909341SAndroid Build Coastguard Worker.h_extend_left: ; no left edge: synthesize the leftmost pixels from the row
1353*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1354*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
1355*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m12                ; m12 = shuffle mask set up outside this view (presumably replicates the first pixel leftwards)
1356*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1357*c0909341SAndroid Build Coastguard Worker.h_top: ; same as .h, but left pixels are read from the row itself rather than the left buffer
1358*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1359*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
1360*c0909341SAndroid Build Coastguard Worker%endif
1361*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1362*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1363*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1364*c0909341SAndroid Build Coastguard Worker.h_loop: ; one iteration = 16 bytes of wq
1365*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq- 2]       ; words at offsets 0..7 of the current window
1366*c0909341SAndroid Build Coastguard Worker.h_main:
1367*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+14]       ; the next 8 words (offsets 8..15)
1368*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1369*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
1370*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
1371*c0909341SAndroid Build Coastguard Worker    jl .h_have_right                       ; wd < -20: no padding applied for this iteration
1372*c0909341SAndroid Build Coastguard Worker    call .extend_right                     ; pads m4/m5, clobbers m0-m3
1373*c0909341SAndroid Build Coastguard Worker.h_have_right:
1374*c0909341SAndroid Build Coastguard Worker    palignr         m2, m5, m4, 2          ; window shifted by 1 pixel
1375*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m2             ; x0 + x1
1376*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 6          ; window shifted by 3 pixels
1377*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3                 ; + x3
1378*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3             ; interleave x1/x3 so pmaddwd yields x1^2 + x3^2
1379*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1380*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
1381*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1382*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8              ; window shifted by 4 pixels
1383*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5                 ; + x4
1384*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5             ; x0^2 + x4^2, low lanes
1385*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1386*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1387*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m5             ; x0^2 + x4^2, high lanes (added to m2 below)
1388*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1389*c0909341SAndroid Build Coastguard Worker    shufps          m4, m5, q2121          ; dwords 1,2 of each source = window shifted by 2 pixels
1390*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4             ; sum
1391*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6             ; pair x2 with m6 (presumably zero) so pmaddwd yields x2^2
1392*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1393*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
1394*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1395*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
1396*c0909341SAndroid Build Coastguard Worker    test         edgeb, 16             ; y > 0
1397*c0909341SAndroid Build Coastguard Worker    jz .h_loop_end
1398*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t1+wq+400*0]      ; accumulate onto the sums already in t1
1399*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t1+wq+400*2]
1400*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+wq+400*4]
1401*c0909341SAndroid Build Coastguard Worker.h_loop_end:
1402*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5             ; sumsq
1403*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
1404*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*0], m0
1405*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*2], m1
1406*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*4], m2
1407*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1408*c0909341SAndroid Build Coastguard Worker    jl .h_loop                             ; loop while the offset is still negative
1409*c0909341SAndroid Build Coastguard Worker    ret
1410*c0909341SAndroid Build Coastguard Worker.top_fixup: ; write 2x the row sums held in t1 into the row at t2
; Reads the sums (+400*0) and squared sums (+400*2, +400*4) at t1, doubles
; them, and stores the result at the same offsets of t2. t1 is left unchanged.
; Uses m0-m2 and wq.
1411*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1412*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]             ; start offset; negative, counts up towards 0
1413*c0909341SAndroid Build Coastguard Worker%else
1414*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m                ; x86-32: start offset comes from the w0m stack slot
1415*c0909341SAndroid Build Coastguard Worker%endif
1416*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row need to be doubled
1417*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400*0]
1418*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq+400*2]
1419*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq+400*4]
1420*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0                 ; sum * 2 (words)
1421*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1                 ; sumsq * 2 (dwords)
1422*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1423*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m0
1424*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m1
1425*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m2
1426*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1427*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop                     ; loop while the offset is still negative
1428*c0909341SAndroid Build Coastguard Worker    ret
1429*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1430*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
; Per 16-byte step:
;   1. computes the 5-tap horizontal sum / squared sum of the row at lpfq
;      (same arithmetic as .h) and stores it in the row at t0;
;   2. adds the stored row sums at t1 and t2 to form the vertical ("hv") sums;
;   3. derives the per-pixel coefficients from them and stores them at
;      [t4+wq+4] (words) and [t3+wq*2+8], [t3+wq*2+24] (dwords).
; When hd == 0 the detour through .hv_last_row is taken (see there).
; After the loop the row pointers are rotated: t2 <- old t1, t1 <- old t0,
; t0 <- old t1.
; Entry points: .hv takes the leftmost pixels from the left buffer, .hv_bottom
; reads them from the row itself.
; Constants m6 and m8-m13 are set up outside this view; their roles below are
; taken from the existing inline comments. MAXSD, MULLD and GATHER_X_BY_X are
; macros defined outside this view.
1431*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1432*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]             ; start offset; negative, counts up towards 0
1433*c0909341SAndroid Build Coastguard Worker%else
1434*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq               ; x86-32: save the row pointer; it is reloaded at .hv_loop
1435*c0909341SAndroid Build Coastguard Worker%endif
1436*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1437*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1438*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
1439*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq]            ; 8 bytes (4 words) from the left buffer, duplicated into both halves
1440*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1441*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
1442*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8                  ; advance the left pointer for the next row
1443*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10             ; = last 3 left words followed by the first 5 words of the load above
1444*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1445*c0909341SAndroid Build Coastguard Worker.hv_extend_left: ; no left edge: synthesize the leftmost pixels from the row
1446*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1447*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
1448*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m12                ; m12 = shuffle mask set up outside this view (presumably replicates the first pixel leftwards)
1449*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1450*c0909341SAndroid Build Coastguard Worker.hv_bottom: ; same as .hv, but left pixels are read from the row itself rather than the left buffer
1451*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1452*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
1453*c0909341SAndroid Build Coastguard Worker%else
1454*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
1455*c0909341SAndroid Build Coastguard Worker%endif
1456*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1457*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1458*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1459*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1460*c0909341SAndroid Build Coastguard Worker    jmp .hv_loop_start                     ; lpfq is still live here, so skip the reload
1461*c0909341SAndroid Build Coastguard Worker%endif
1462*c0909341SAndroid Build Coastguard Worker.hv_loop:
1463*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm             ; x86-32: restore the row pointer (presumably its register is reused in the loop body)
1464*c0909341SAndroid Build Coastguard Worker.hv_loop_start:
1465*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq- 2]       ; words at offsets 0..7 of the current window
1466*c0909341SAndroid Build Coastguard Worker.hv_main:
1467*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+14]       ; the next 8 words (offsets 8..15)
1468*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1469*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
1470*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
1471*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right                      ; wd < -20: no padding applied for this iteration
1472*c0909341SAndroid Build Coastguard Worker    call .extend_right                     ; pads m4/m5, clobbers m0-m3
1473*c0909341SAndroid Build Coastguard Worker.hv_have_right:
1474*c0909341SAndroid Build Coastguard Worker    movif32         t3, hd                 ; x86-32: fetch hd for the `test t3, t3` below
1475*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2          ; window shifted by 1 pixel
1476*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m3             ; x0 + x1
1477*c0909341SAndroid Build Coastguard Worker    palignr         m1, m5, m4, 6          ; window shifted by 3 pixels
1478*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1                 ; + x3
1479*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m1             ; interleave x1/x3 so pmaddwd yields x1^2 + x3^2
1480*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1481*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1
1482*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1483*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8              ; window shifted by 4 pixels
1484*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5                 ; + x4
1485*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m4, m5             ; x0^2 + x4^2, low lanes
1486*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1487*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1
1488*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m5             ; x0^2 + x4^2, high lanes (added to m3 below)
1489*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1490*c0909341SAndroid Build Coastguard Worker    shufps          m4, m5, q2121          ; dwords 1,2 of each source = window shifted by 2 pixels
1491*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4            ; h sum
1492*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6             ; pair x2 with m6 (presumably zero) so pmaddwd yields x2^2
1493*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1494*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
1495*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1496*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1
1497*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5            ; h sumsq
1498*c0909341SAndroid Build Coastguard Worker    paddd           m3, m4
1499*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t1+wq+400*0]  ; current row + row sums stored at t1
1500*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+wq+400*2]
1501*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+wq+400*4]
1502*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1503*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1504*c0909341SAndroid Build Coastguard Worker%else
1505*c0909341SAndroid Build Coastguard Worker    test            t3, t3                 ; t3 holds hd here (loaded at .hv_have_right)
1506*c0909341SAndroid Build Coastguard Worker%endif
1507*c0909341SAndroid Build Coastguard Worker    jz .hv_last_row
1508*c0909341SAndroid Build Coastguard Worker.hv_main2:
1509*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+wq+400*0] ; hv sum
1510*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t2+wq+400*2] ; hv sumsq
1511*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t2+wq+400*4]
1512*c0909341SAndroid Build Coastguard Worker    mova [t0+wq+400*0], m0                 ; keep this row's horizontal sums in t0
1513*c0909341SAndroid Build Coastguard Worker    mova [t0+wq+400*2], m2
1514*c0909341SAndroid Build Coastguard Worker    mova [t0+wq+400*4], m3
1515*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1516*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
1517*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1518*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
1519*c0909341SAndroid Build Coastguard Worker    pand            m4, m9             ; ((a + 8) >> 4) << 4
1520*c0909341SAndroid Build Coastguard Worker    pand            m5, m9
1521*c0909341SAndroid Build Coastguard Worker    psrld           m2, m4, 4              ; v/16 ...
1522*c0909341SAndroid Build Coastguard Worker    psrld           m0, m5, 4
1523*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4                 ; ... + v ...
1524*c0909341SAndroid Build Coastguard Worker    psrld           m4, 1                  ; ... + v/2 = v * 25/16, exact since v is a multiple of 16
1525*c0909341SAndroid Build Coastguard Worker    paddd           m0, m5
1526*c0909341SAndroid Build Coastguard Worker    psrld           m5, 1
1527*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 25
1528*c0909341SAndroid Build Coastguard Worker    paddd           m5, m0
1529*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1530*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1531*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; b * b
1532*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1533*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1534*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1535*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m6             ; presumably m4 = max(m4, m2), so p below cannot go negative
1536*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m6, 1
1537*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1538*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1539*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m10, m2        ; p * s
1540*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m10, m2
1541*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 164
1542*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1543*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11                ; saturating word add; with the shift below it yields the clamp noted there
1544*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
1545*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
1546*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m                ; x86-32: restore t3 (it held hd since .hv_have_right)
1547*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1548*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, t2, t2m    ; presumably a table lookup of x indexed by z; t2/t2m look like scratch/restore operands - TODO confirm
1549*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
1550*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
1551*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m2
1552*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m2
1553*c0909341SAndroid Build Coastguard Worker    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
1554*c0909341SAndroid Build Coastguard Worker    paddd           m1, m13
1555*c0909341SAndroid Build Coastguard Worker    mova     [t4+wq+4], m3                 ; store x (words)
1556*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12             ; b
1557*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
1558*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*2+ 8], m0                 ; store b (dwords), low then high four lanes
1559*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*2+24], m1
1560*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1561*c0909341SAndroid Build Coastguard Worker    jl .hv_loop                            ; loop while the offset is still negative
1562*c0909341SAndroid Build Coastguard Worker    mov             t2, t1                 ; rotate row pointers: t2 <- old t1
1563*c0909341SAndroid Build Coastguard Worker    mov             t1, t0                 ; t1 <- old t0 (the row just written)
1564*c0909341SAndroid Build Coastguard Worker    mov             t0, t2                 ; t0 <- old t1 (t0 and t2 are equal on return)
1565*c0909341SAndroid Build Coastguard Worker    movif32        t2m, t2                 ; x86-32: mirror the new pointers into their stack slots
1566*c0909341SAndroid Build Coastguard Worker    movif32        t0m, t0
1567*c0909341SAndroid Build Coastguard Worker    ret
1568*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights
; Taken when hd == 0: stores (current row + t1) back into t1, then adds the
; current row's sums a second time before rejoining .hv_main2.
1569*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*0], m1
1570*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0
1571*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*2], m4
1572*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2
1573*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*4], m5
1574*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1575*c0909341SAndroid Build Coastguard Worker    jmp .hv_main2
1576*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
; No pixel row is read here. The vertical sums are built only from the stored
; row sums as t2 + 3 * t1, followed by the same coefficient computation as in
; .hv. Results go to [t4+wq+4] (words) and [t3+wq*2+8], [t3+wq*2+24] (dwords).
; t0/t1/t2 are not rotated and nothing is written to the row buffers.
; Constants m6 and m8-m13 are set up outside this view; MAXSD, MULLD and
; GATHER_X_BY_X are macros defined outside this view.
1577*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1578*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]             ; start offset; negative, counts up towards 0
1579*c0909341SAndroid Build Coastguard Worker%else
1580*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m                ; x86-32: start offset comes from the w0m stack slot
1581*c0909341SAndroid Build Coastguard Worker%endif
1582*c0909341SAndroid Build Coastguard Worker.v_loop:
1583*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400*0]
1584*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq+400*2]
1585*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+wq+400*4]
1586*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400*0]  ; t1 + t2
1587*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+wq+400*2]
1588*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+wq+400*4]
1589*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0                 ; 2 * t1
1590*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1591*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
1592*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; hv sum
1593*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; hv sumsq
1594*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1595*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1596*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
1597*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1598*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
1599*c0909341SAndroid Build Coastguard Worker    pand            m4, m9             ; ((a + 8) >> 4) << 4
1600*c0909341SAndroid Build Coastguard Worker    pand            m5, m9
1601*c0909341SAndroid Build Coastguard Worker    psrld           m2, m4, 4              ; v/16 ...
1602*c0909341SAndroid Build Coastguard Worker    psrld           m0, m5, 4
1603*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4                 ; ... + v ...
1604*c0909341SAndroid Build Coastguard Worker    psrld           m4, 1                  ; ... + v/2 = v * 25/16, exact since v is a multiple of 16
1605*c0909341SAndroid Build Coastguard Worker    paddd           m0, m5
1606*c0909341SAndroid Build Coastguard Worker    psrld           m5, 1
1607*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; a * 25
1608*c0909341SAndroid Build Coastguard Worker    paddd           m5, m0
1609*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1610*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1611*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; b * b
1612*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1613*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1614*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1615*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m6             ; presumably m4 = max(m4, m2), so p below cannot go negative
1616*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m6, 1
1617*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1618*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1619*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m10, m2        ; p * s
1620*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m10, m2
1621*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 164
1622*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1623*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11                ; saturating word add; with the shift below it yields the clamp noted there
1624*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
1625*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
1626*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1627*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, t2, t2m    ; presumably a table lookup of x indexed by z; t2/t2m look like scratch/restore operands - TODO confirm
1628*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
1629*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
1630*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m2
1631*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m2
1632*c0909341SAndroid Build Coastguard Worker    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
1633*c0909341SAndroid Build Coastguard Worker    paddd           m1, m13
1634*c0909341SAndroid Build Coastguard Worker    mova     [t4+wq+4], m3                 ; store x (words)
1635*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12             ; b
1636*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
1637*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*2+ 8], m0                 ; store b (dwords), low then high four lanes
1638*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*2+24], m1
1639*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1640*c0909341SAndroid Build Coastguard Worker    jl .v_loop                             ; loop while the offset is still negative
1641*c0909341SAndroid Build Coastguard Worker    ret
1642*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
; For each position, combines three horizontally adjacent coefficients from the
; t4 (words) and t3 (dwords) buffers with weights 5, 6, 5 (left, centre, right)
; and stores the result at +400*2 (t4) and +400*4 (t3) for later use by the
; .n0/.n1 output passes. Uses m0-m5 and wq.
1643*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4                 ; start offset (x86-64)
1644*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m                ; start offset (x86-32)
1645*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
1646*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+ 2]       ; a, centre
1647*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+ 4]       ; a, right
1648*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+ 4]       ; b, centre (low four lanes)
1649*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+ 8]       ; b, right
1650*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+20]       ; b, centre (high four lanes)
1651*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+24]
1652*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0
1653*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
1654*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
1655*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+ 0]       ; + left: m3 = l + c + r
1656*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+ 0]
1657*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+16]
1658*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3                 ; c + (l + c + r)
1659*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                  ; 4 * (l + c + r)
1660*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1661*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
1662*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1663*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
1664*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1665*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4             ; b 565
1666*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1667*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*2+ 0], m0
1668*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+ 0], m1
1669*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+16], m2
1670*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1671*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop                        ; loop while the offset is still negative
1672*c0909341SAndroid Build Coastguard Worker    ret
1673*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1674*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
; Per 16-byte step:
;   1. forms the 5/6/5-weighted a and b of the current coefficient row;
;   2. adds the 5/6/5 values previously saved at +400*2 (t4) / +400*4 (t3),
;      then overwrites those slots with the new values;
;   3. filters the pixels at [dstq+wq] in place: computes
;      (b - a * src) >> 9, scales it with pmulhrsw by m7, adds it to src and
;      clamps the result to [m6, m14].
; Finally advances dstq by the stride.
; m6, m7 and m14 are set up outside this view (presumably zero, the filter
; weight, and the maximum pixel value).
1675*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4                 ; start offset (x86-64)
1676*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m                ; start offset (x86-32)
1677*c0909341SAndroid Build Coastguard Worker.n0_loop:
1678*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+ 2]       ; a, centre
1679*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+ 4]       ; a, right
1680*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+ 4]       ; b, centre (low four lanes)
1681*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+ 8]       ; b, right
1682*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+20]       ; b, centre (high four lanes)
1683*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+24]
1684*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0
1685*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
1686*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
1687*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+ 0]       ; + left: m3 = l + c + r
1688*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+ 0]
1689*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+16]
1690*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3                 ; c + (l + c + r)
1691*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                  ; 4 * (l + c + r)
1692*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1693*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
1694*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1695*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
1696*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1697*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4             ; b 565
1698*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1699*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+wq*1+400*2+ 0] ; new 565 + previously saved 565
1700*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+wq*2+400*4+ 0]
1701*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*2+400*4+16]
1702*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*2+ 0], m0            ; save the new 565 values for the next pass
1703*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+ 0], m1
1704*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+16], m2
1705*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+wq]
1706*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1707*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1708*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1              ; a * src
1709*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1710*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1711*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1712*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2              ; b - a * src + (1 << 8)
1713*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1714*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
1715*c0909341SAndroid Build Coastguard Worker    psrad           m5, 9
1716*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5              ; back to 8 signed words
1717*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7              ; scale by m7 (presumably the filter weight)
1718*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4              ; src + correction
1719*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6              ; clamp below by m6
1720*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14             ; clamp above by m14
1721*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
1722*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1723*c0909341SAndroid Build Coastguard Worker    jl .n0_loop                         ; loop while the offset is still negative
1724*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp        ; advance to the next output row
1725*c0909341SAndroid Build Coastguard Worker    ret
1726*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1727*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
; Filters the pixels at [dstq+wq] in place using only the 5/6/5 values already
; saved at +400*2 (t4) / +400*4 (t3); nothing is recomputed or written back to
; those buffers. The shift is 8 here, where .n0 uses 9. Finally advances dstq
; by the stride and, on x86-32, writes it back to dstm.
; m6, m7 and m14 are set up outside this view (presumably zero, the filter
; weight, and the maximum pixel value).
1728*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4                 ; start offset (x86-64)
1729*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m                ; start offset (x86-32)
1730*c0909341SAndroid Build Coastguard Worker.n1_loop:
1731*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+wq]
1732*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+wq*1+400*2+ 0] ; saved a 565
1733*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+wq*2+400*4+ 0] ; saved b 565, low four lanes
1734*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+wq*2+400*4+16] ; saved b 565, high four lanes
1735*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1736*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1737*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1              ; a * src
1738*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1739*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1740*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1741*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2              ; b - a * src + (1 << 7)
1742*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1743*c0909341SAndroid Build Coastguard Worker    psrad           m4, 8
1744*c0909341SAndroid Build Coastguard Worker    psrad           m5, 8
1745*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5              ; back to 8 signed words
1746*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7              ; scale by m7 (presumably the filter weight)
1747*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4              ; src + correction
1748*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6              ; clamp below by m6
1749*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14             ; clamp above by m14
1750*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
1751*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1752*c0909341SAndroid Build Coastguard Worker    jl .n1_loop                         ; loop while the offset is still negative
1753*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp        ; advance to the next output row
1754*c0909341SAndroid Build Coastguard Worker    movif32       dstm, dstq            ; x86-32: persist the updated pointer in its stack slot
1755*c0909341SAndroid Build Coastguard Worker    ret
1756*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; sgr_filter_3x3_16bpc: function declaration plus the x86-32 name aliases.
;
; Two cglobal declarations exist, one per architecture. The x86-32 one passes
; 8 as its third numeric argument (15 on x86-64), and the names m8..m13 are
; %defined below as memory operands (constants addressed through `base`, and
; a stack slot for m9) instead of registers. m14 is an alias of m6 there.
; NOTE(review): cglobal's numeric arguments follow the x86inc convention,
; which is defined outside this chunk -- confirm their exact meaning there.
; ---------------------------------------------------------------------------
1757*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; extra_stack is added to the stack reservation and to the t1/t3/t4 offsets
; in the x86-32 setup code; it is larger when STACK_ALIGNMENT < 16.
1758*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1759*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 4*16
1760*c0909341SAndroid Build Coastguard Worker %else
1761*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 2*16
1762*c0909341SAndroid Build Coastguard Worker %endif
1763*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
1764*c0909341SAndroid Build Coastguard Worker                              dst, stride, left, lpf, w
; With STACK_ALIGNMENT < 16 the argument names below refer to private
; esp-relative slots (filled by the mov sequence further down). Otherwise
; w0m/hd/edge alias the wm/r5m/r7m operands directly.
; Every slot address includes `calloff`, which is 0 here and is re-assigned
; to 4 right before the `.h` helper.
1765*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1766*c0909341SAndroid Build Coastguard Worker  %define dstm         dword [esp+calloff+16*2+4*0]
1767*c0909341SAndroid Build Coastguard Worker  %define stridemp     dword [esp+calloff+16*2+4*1]
1768*c0909341SAndroid Build Coastguard Worker  %define leftm        dword [esp+calloff+16*2+4*2]
1769*c0909341SAndroid Build Coastguard Worker  %define lpfm         dword [esp+calloff+16*2+4*3]
1770*c0909341SAndroid Build Coastguard Worker  %define w0m          dword [esp+calloff+16*2+4*4]
1771*c0909341SAndroid Build Coastguard Worker  %define hd           dword [esp+calloff+16*2+4*5]
; edgeb and edged are byte- and dword-sized views of the same slot.
1772*c0909341SAndroid Build Coastguard Worker  %define edgeb         byte [esp+calloff+16*2+4*6]
1773*c0909341SAndroid Build Coastguard Worker  %define edged        dword [esp+calloff+16*2+4*6]
1774*c0909341SAndroid Build Coastguard Worker  %define leftmp leftm
1775*c0909341SAndroid Build Coastguard Worker %else
1776*c0909341SAndroid Build Coastguard Worker  %define w0m wm
1777*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
1778*c0909341SAndroid Build Coastguard Worker  %define edgeb  byte r7m
1779*c0909341SAndroid Build Coastguard Worker  %define edged dword r7m
1780*c0909341SAndroid Build Coastguard Worker %endif
; Four dword scratch slots at the bottom of the frame:
;   hvsrcm - lpfq as saved on entry to the hv0/hv1 helpers (and in .no_top)
;   w1m    - wd as it is right after `neg wq` in the setup code
;   t3m    - saved t3 pointer
;   t4m    - saved t4 pointer
1781*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0]
1782*c0909341SAndroid Build Coastguard Worker %define w1m    dword [esp+calloff+4*1]
1783*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*2]
1784*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*3]
; "Registers" m8..m13 are memory operands on x86-32; m9 is a stack slot that
; the setup code fills with the value commented as s1.
1785*c0909341SAndroid Build Coastguard Worker %define  m8 [base+pd_8]
1786*c0909341SAndroid Build Coastguard Worker %define  m9 [esp+calloff+16*1]
1787*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0xf00801c7]
1788*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_34816]
1789*c0909341SAndroid Build Coastguard Worker %define m12 [base+sgr_lshuf3]
1790*c0909341SAndroid Build Coastguard Worker %define m13 [base+pw_1023]
; m14 shares a register with m6 (the zero register); .hv0 re-zeroes m6 after
; the macro invocations that are handed m14.
1791*c0909341SAndroid Build Coastguard Worker %define m14 m6
; Constants are addressed relative to r6, which the setup code loads with $$.
1792*c0909341SAndroid Build Coastguard Worker %define base r6-$$
1793*c0909341SAndroid Build Coastguard Worker %assign calloff 0
; STACK_ALIGNMENT < 16: fetch the remaining arguments from the caller's frame
; (offsets +8 stride, +12 left, +16 lpf, +20 w, +24 -> hd, +32 -> edged) and
; copy them into the private slots defined above.
1794*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
1795*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rstk+stack_offset+ 8]
1796*c0909341SAndroid Build Coastguard Worker    mov          leftq, [rstk+stack_offset+12]
1797*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rstk+stack_offset+16]
1798*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
1799*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1800*c0909341SAndroid Build Coastguard Worker    mov       stridemp, strideq
1801*c0909341SAndroid Build Coastguard Worker    mov          leftm, leftq
1802*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+24]
1803*c0909341SAndroid Build Coastguard Worker    mov             r2, [rstk+stack_offset+32]
1804*c0909341SAndroid Build Coastguard Worker    mov           lpfm, lpfq
1805*c0909341SAndroid Build Coastguard Worker    mov             hd, r1
1806*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
1807*c0909341SAndroid Build Coastguard Worker %endif
1808*c0909341SAndroid Build Coastguard Worker%else
1809*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
1810*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, params
1811*c0909341SAndroid Build Coastguard Worker%endif
; ---------------------------------------------------------------------------
; Per-architecture setup for sgr_filter_3x3_16bpc.
;
; Both paths do the same things in a different order:
;   * double w (add wd, wd), add it to the lpf and dst pointers, then negate
;     it, so the inner loops index with a negative offset that counts up
;     towards zero (`add wq, 16` / `jl`);
;   * point t1/t3/t4 into the stack scratch area, biased by the same width;
;   * load the qword at params+4, keep a pshufb'ed copy as s1 (m9) and word 3
;     broadcast and shifted left by 4 as w1 (m7);
;   * zero m6.
; NOTE(review): the doubling presumably converts a pixel count to bytes for
; 16-bit pixels (per the _16bpc suffix) -- confirm against the C prototype.
; ---------------------------------------------------------------------------
1812*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
1813*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
1814*c0909341SAndroid Build Coastguard Worker%endif
1815*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1816*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
; r13 = biased address of the sgr_x_by_x table.
; NOTE(review): presumably consumed by GATHER_X_BY_X, which is defined
; outside this chunk -- confirm.
1817*c0909341SAndroid Build Coastguard Worker    lea            r13, [sgr_x_by_x-0xf03]
1818*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1819*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1820*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1821*c0909341SAndroid Build Coastguard Worker    movq            m9, [paramsq+4]
1822*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1823*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+12]
1824*c0909341SAndroid Build Coastguard Worker    mova            m8, [pd_8]
1825*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1826*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+400*12+8]
1827*c0909341SAndroid Build Coastguard Worker    mova           m10, [pd_0xf00801c7]
1828*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+400*32+8]
1829*c0909341SAndroid Build Coastguard Worker    mova           m11, [pd_34816]
; m7 = word 3 of the params qword, copied into all eight word lanes by the
; pshuflw/punpcklqdq pair.
1830*c0909341SAndroid Build Coastguard Worker    pshuflw         m7, m9, q3333
1831*c0909341SAndroid Build Coastguard Worker    pshufb          m9, [pw_256]  ; s1
1832*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m7, m7        ; w1
; From here on the width register holds the negated, doubled width.
1833*c0909341SAndroid Build Coastguard Worker    neg             wq
; m6 stays zero; it is used as the zero operand for unpacks and clamping.
1834*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1835*c0909341SAndroid Build Coastguard Worker    mova           m13, [pw_1023]
1836*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1837*c0909341SAndroid Build Coastguard Worker    mova           m12, [sgr_lshuf3]
; Re-map the argument names: `w` moves to the 11th slot, so the negated width
; computed above remains in r4; the helpers reload wq from it with
; `lea wq, [r4-4]`. lpfm becomes the stack slot at [rsp].
1838*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
1839*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
1840*c0909341SAndroid Build Coastguard Worker%else
1841*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+28] ; params
; r6 = $$, the anchor for every [base+...] constant reference.
1842*c0909341SAndroid Build Coastguard Worker    LEA             r6, $$
1843*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1844*c0909341SAndroid Build Coastguard Worker    movq            m1, [r1+4]
1845*c0909341SAndroid Build Coastguard Worker    add           lpfm, wq
1846*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+extra_stack+wq+20]
1847*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1848*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
1849*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
1850*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+extra_stack+wq+400*32+16]
; t3 and t4 are also saved to stack slots; later code reloads them with
; `movif32 t3, t3m` / `movif32 t4, t4m`.
1851*c0909341SAndroid Build Coastguard Worker    mov            t3m, t3
1852*c0909341SAndroid Build Coastguard Worker    pshuflw         m7, m1, q3333
1853*c0909341SAndroid Build Coastguard Worker    mov            t4m, t4
1854*c0909341SAndroid Build Coastguard Worker    pshufb          m1, [base+pw_256] ; s1
1855*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m7, m7            ; w1
1856*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1857*c0909341SAndroid Build Coastguard Worker    neg             wq
; m9 is a stack slot on x86-32, so s1 is stored to memory here.
1858*c0909341SAndroid Build Coastguard Worker    mova            m9, m1
1859*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
; w1m = negated doubled width; w0m = the same value minus 4. w0m is the
; starting loop offset loaded by the helpers, matching the x86-64
; `lea wq, [r4-4]`.
1860*c0909341SAndroid Build Coastguard Worker    mov            w1m, wd
1861*c0909341SAndroid Build Coastguard Worker    sub             wd, 4
1862*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1863*c0909341SAndroid Build Coastguard Worker    mov            w0m, wd
1864*c0909341SAndroid Build Coastguard Worker %define strideq r5
1865*c0909341SAndroid Build Coastguard Worker%endif
; ---------------------------------------------------------------------------
; Top-level row driver of sgr_filter_3x3_16bpc.
;
; Helpers called from here:
;   .h / .h_top         horizontal boxsum of one row into the t1 buffers
;   .hv0 / .hv0_bottom  horizontal + vertical boxsum + ab (even rows)
;   .hv1 / .hv1_bottom  horizontal + vertical boxsums + ab (odd rows)
;   .v0 .v1 .prep_n .n0 .n1   defined outside this chunk
; The edge flag bits tested in this chunk are 1 = LR_HAVE_LEFT,
; 2 = LR_HAVE_RIGHT, 4 = LR_HAVE_TOP and 8 = LR_HAVE_BOTTOM.
; ---------------------------------------------------------------------------
1866*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1867*c0909341SAndroid Build Coastguard Worker    jz .no_top
; Top rows available: run the horizontal pass on the row at lpfq and on the
; next row (lpfq + stride). Between the two calls, t2 takes the old t1 and t1
; advances by 400*6, so each row lands in its own buffer.
1868*c0909341SAndroid Build Coastguard Worker    call .h_top
1869*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1870*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1871*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6
1872*c0909341SAndroid Build Coastguard Worker    call .h_top
1873*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
; lpfm = lpfq + stride*5 (stride*4, then one more stride): the rows used
; later by the LR_HAVE_BOTTOM path. lpfq is switched to the first dst row.
1874*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1875*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1876*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1877*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
1878*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1879*c0909341SAndroid Build Coastguard Worker    call .hv0
1880*c0909341SAndroid Build Coastguard Worker.main:
; After the decrement, a zero h means this was the only row (.height1).
1881*c0909341SAndroid Build Coastguard Worker    dec             hd
1882*c0909341SAndroid Build Coastguard Worker    jz .height1
; On x86-32 lpfq is reloaded from hvsrcm (saved on entry to the hv helpers)
; before stepping to the next row.
1883*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1884*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1885*c0909341SAndroid Build Coastguard Worker    call .hv1
1886*c0909341SAndroid Build Coastguard Worker    call .prep_n
1887*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1888*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
1889*c0909341SAndroid Build Coastguard Worker.main_loop:
; Each iteration handles two rows: .hv0 on the next row, then (unless h has
; reached zero) .hv1 on the row after it, followed by .n0 and .n1.
1890*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1891*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1892*c0909341SAndroid Build Coastguard Worker    call .hv0
; x86-64 tests only the low byte of h; x86-32 loads the hd slot into r4 and
; tests the full dword.
1893*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1894*c0909341SAndroid Build Coastguard Worker    test            hb, hb
1895*c0909341SAndroid Build Coastguard Worker%else
1896*c0909341SAndroid Build Coastguard Worker    mov             r4, hd
1897*c0909341SAndroid Build Coastguard Worker    test            r4, r4
1898*c0909341SAndroid Build Coastguard Worker%endif
1899*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1900*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1901*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1902*c0909341SAndroid Build Coastguard Worker    call .hv1
1903*c0909341SAndroid Build Coastguard Worker    call .n0
1904*c0909341SAndroid Build Coastguard Worker    call .n1
1905*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1906*c0909341SAndroid Build Coastguard Worker    jge .main_loop
; Rows exhausted. With LR_HAVE_BOTTOM, the two rows starting at lpfm go
; through the *_bottom entry points; otherwise control goes to
; .extend_bottom, which calls .v0/.v1 instead.
1907*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1908*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1909*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
1910*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1911*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
1912*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
1913*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1914*c0909341SAndroid Build Coastguard Worker.end:
1915*c0909341SAndroid Build Coastguard Worker    call .n0
1916*c0909341SAndroid Build Coastguard Worker    call .n1
1917*c0909341SAndroid Build Coastguard Worker.end2:
; Upper-case RET is the function-level return macro; the helpers return with
; a plain `ret`.
1918*c0909341SAndroid Build Coastguard Worker    RET
1919*c0909341SAndroid Build Coastguard Worker.height1:
; Single-row case (h became 0 in .main): .v1 and .prep_n, then the tail
; shared with .odd_height.
1920*c0909341SAndroid Build Coastguard Worker    call .v1
1921*c0909341SAndroid Build Coastguard Worker    call .prep_n
1922*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
1923*c0909341SAndroid Build Coastguard Worker.odd_height:
; Reached from .main_loop when h is zero right after the .hv0 call.
1924*c0909341SAndroid Build Coastguard Worker    call .v1
1925*c0909341SAndroid Build Coastguard Worker    call .n0
1926*c0909341SAndroid Build Coastguard Worker    call .n1
1927*c0909341SAndroid Build Coastguard Worker.odd_height_end:
; Only .n0 is called before returning on this path (no .n1).
1928*c0909341SAndroid Build Coastguard Worker    call .v0
1929*c0909341SAndroid Build Coastguard Worker    call .v1
1930*c0909341SAndroid Build Coastguard Worker    call .n0
1931*c0909341SAndroid Build Coastguard Worker    jmp .end2
1932*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1933*c0909341SAndroid Build Coastguard Worker    call .v0
1934*c0909341SAndroid Build Coastguard Worker    call .v1
1935*c0909341SAndroid Build Coastguard Worker    jmp .end
1936*c0909341SAndroid Build Coastguard Worker.no_top:
; No top rows: lpfm = lpfq + stride*6 (stride*4 + stride*2), lpfq = dst.
; Run the horizontal pass on the first dst row, duplicate its three t1
; buffers into t2 = t1 + 400*6, then call .v0 and join .main.
1937*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
1938*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1939*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1940*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1941*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
1942*c0909341SAndroid Build Coastguard Worker    call .h
; Reset the loop offset for the copy below. On x86-32 lpfq is also saved to
; hvsrcm here, because .h (unlike .hv0/.hv1) does not store it.
1943*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1944*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
1945*c0909341SAndroid Build Coastguard Worker%else
1946*c0909341SAndroid Build Coastguard Worker    mov             wq, w0m
1947*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
1948*c0909341SAndroid Build Coastguard Worker%endif
1949*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
1950*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
; Copy 16 bytes per iteration from each of the three t1 buffers (offsets
; 400*0, 400*2, 400*4) to the same offsets in t2.
1951*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400*0]
1952*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq+400*2]
1953*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq+400*4]
1954*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m0
1955*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m1
1956*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m2
1957*c0909341SAndroid Build Coastguard Worker    add             wq, 16
1958*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1959*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
1960*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
1961*c0909341SAndroid Build Coastguard Worker    call .v0
1962*c0909341SAndroid Build Coastguard Worker    jmp .main
; ---------------------------------------------------------------------------
; .extend_right: pad the right edge of the current 16-byte pixel vector.
;
; Called by the .h/.hv0/.hv1 loops when LR_HAVE_RIGHT is clear and
; wd >= -18.
;   In:    m4   = pixel vector to patch
;          wd   = current (negative) loop offset; only its low byte is used
;          lpfq = row pointer; the word at [lpfq-2] is the padding pixel
;          m6   = zero
;   Out:   m4   = byte lanes selected by the mask keep their value, the rest
;                 are replaced by the padding pixel
;          m5   = padding pixel in every word lane (callers go on to use m5
;                 as the following 16 bytes of the row)
;   Clobb: m1, m2, m3
; NOTE(review): [lpfq-2] is presumably the last valid pixel of the row, since
; setup advanced lpfq by the doubled width -- confirm.
; ---------------------------------------------------------------------------
1963*c0909341SAndroid Build Coastguard Worker.extend_right:
1964*c0909341SAndroid Build Coastguard Worker    movd            m1, wd
1965*c0909341SAndroid Build Coastguard Worker    movd            m5, [lpfq-2]
1966*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+pw_256]
1967*c0909341SAndroid Build Coastguard Worker    mova            m3, [base+pb_0to15]
; pshufb with the all-zero m6 copies byte 0 of wd into every byte lane.
1968*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m6
; NOTE(review): pw_256 used as a shuffle mask presumably broadcasts the low
; word of m5 -- the constant is defined outside this chunk.
1969*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m2
; Per-byte mask: (pw_256 bytes - low byte of wd) > lane index from pb_0to15.
1970*c0909341SAndroid Build Coastguard Worker    psubb           m2, m1
1971*c0909341SAndroid Build Coastguard Worker    pcmpgtb         m2, m3
; Blend: m4 = (m4 & mask) | (m5 & ~mask).
1972*c0909341SAndroid Build Coastguard Worker    pand            m4, m2
1973*c0909341SAndroid Build Coastguard Worker    pandn           m2, m5
1974*c0909341SAndroid Build Coastguard Worker    por             m4, m2
1975*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .h / .h_top: horizontal 3-tap box sums for one row.
;
; For every 16-byte step of the row this stores, at offset wq:
;   [t1+400*0]  sum of three horizontally adjacent pixels (words)
;   [t1+400*2]  sum of their squares, low four lanes (dwords)
;   [t1+400*4]  sum of their squares, high four lanes (dwords)
; Entry points:
;   .h      with LR_HAVE_LEFT, the leading pixels come from the `left` buffer
;           and the left pointer advances by 8 bytes
;   .h_top  with LR_HAVE_LEFT, the pixels are read straight from the row,
;           starting 4 bytes before [lpfq+wq+4]
; Both fall back to .h_extend_left when LR_HAVE_LEFT is clear.
;   In:    lpfq = row pointer, t1 = output buffers, m6 = zero
;   Clobb: wq, m0-m5 (plus whatever .extend_right clobbers)
; From here on calloff is 4 and stack_offset is 4 larger.
; NOTE(review): presumably this accounts for the return address pushed by
; `call` -- confirm.
; ---------------------------------------------------------------------------
1976*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+4
1977*c0909341SAndroid Build Coastguard Worker%assign calloff 4
1978*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
; x86-64: restart the loop offset from r4, which still holds the negated
; width from setup. x86-32 loads the equivalent w0m further down.
1979*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1980*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
1981*c0909341SAndroid Build Coastguard Worker%else
1982*c0909341SAndroid Build Coastguard Worker %define leftq r4
1983*c0909341SAndroid Build Coastguard Worker%endif
1984*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1985*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1986*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
; m5 = the 8 bytes at [leftq] duplicated into both halves. palignr by 12 then
; yields bytes 12..15 of m5 (the last two words of the left entry) followed
; by the first 12 bytes of the row.
1987*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq]
1988*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1989*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
1990*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8
1991*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12
1992*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1993*c0909341SAndroid Build Coastguard Worker.h_extend_left:
; No left pixels: load the start of the row and rearrange it with the
; sgr_lshuf3 shuffle (m12).
; NOTE(review): the shuffle presumably replicates the first pixel into the
; left padding lanes -- the table is defined outside this chunk.
1994*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
1995*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
1996*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m12
1997*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1998*c0909341SAndroid Build Coastguard Worker.h_top:
1999*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2000*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2001*c0909341SAndroid Build Coastguard Worker%endif
2002*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2003*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2004*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2005*c0909341SAndroid Build Coastguard Worker.h_loop:
; Steady state (and .h_top with LR_HAVE_LEFT): unaligned load at the current
; offset.
2006*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq+ 0]
2007*c0909341SAndroid Build Coastguard Worker.h_main:
; m5 = the next 16 bytes; it supplies the pixels shifted in by the palignr
; instructions below.
2008*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+16]
; Right-edge padding is applied only when LR_HAVE_RIGHT is clear and
; wd >= -18.
2009*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2010*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
2011*c0909341SAndroid Build Coastguard Worker    cmp             wd, -18
2012*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
2013*c0909341SAndroid Build Coastguard Worker    call .extend_right
2014*c0909341SAndroid Build Coastguard Worker.h_have_right:
; m0 = pixels shifted by one (2 bytes); m5 becomes pixels shifted by two
; (4 bytes).
; sum   = m4 + m0 + m5 (words)
; sumsq = pmaddwd of the (m4, m0) interleave, i.e. p0^2 + p1^2, plus pmaddwd
;         of m5 interleaved with zero, i.e. p2^2 (dwords)
2015*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 2
2016*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, m0
2017*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m0
2018*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2019*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m0
2020*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2021*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4
2022*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5             ; sum
2023*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
2024*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
2025*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2026*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2027*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; sumsq
2028*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
2029*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*0], m1
2030*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*2], m2
2031*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*4], m3
; wq is negative and counts up by 16 bytes; the loop exits once it is no
; longer negative.
2032*c0909341SAndroid Build Coastguard Worker    add             wq, 16
2033*c0909341SAndroid Build Coastguard Worker    jl .h_loop
2034*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .hv0 / .hv0_bottom: horizontal box sums of the current row, vertical
; accumulation with the t1/t2 row buffers, and the per-pixel a/b computation.
;
; Per 16-byte step:
;   1. horizontal sum / sumsq of three adjacent pixels (same sequence as .h)
;   2. add the values stored in t1 and t2, while updating both:
;      t1 <- this row's sums, t2 <- this row's sums + old t1
;   3. from the accumulated sums compute p, z and the looked-up x (see the
;      inline comments), store x as words at [t4+wq+4] and the scaled product
;      as dwords at [t3+wq*2+8] and [t3+wq*2+24]
; Entry points:
;   .hv0         with LR_HAVE_LEFT, the leading pixels come from the `left`
;                buffer
;   .hv0_bottom  with LR_HAVE_LEFT, the pixels are read straight from the row
;   In:    lpfq = row pointer, t1/t2/t3/t4 = buffers, m6 = zero,
;          m8..m11 and m14 = constants, m9 = s1
;   Clobb: wq, m0-m5; on x86-32 also r0 (passed to GATHER_X_BY_X with dstm)
;          and t3 (reloaded from t3m)
; ---------------------------------------------------------------------------
2035*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2036*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
; x86-32 saves the row pointer so that the loop (and the caller afterwards)
; can reload it from hvsrcm.
2037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2038*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2039*c0909341SAndroid Build Coastguard Worker%else
2040*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2041*c0909341SAndroid Build Coastguard Worker%endif
2042*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2043*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2044*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
; Splice the last two words of the 8-byte left entry in front of the first
; 12 bytes of the row, then advance the left pointer by 8 bytes.
2045*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq]
2046*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2047*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
2048*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8
2049*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12
2050*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2051*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
; No left pixels: rearrange the start of the row with the sgr_lshuf3 shuffle
; (m12).
2052*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2053*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
2054*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m12
2055*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2056*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
2057*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2058*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2059*c0909341SAndroid Build Coastguard Worker%else
2060*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2061*c0909341SAndroid Build Coastguard Worker%endif
2062*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2063*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2064*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
; x86-32: lpfq is still valid on first entry, so skip the reload at
; .hv0_loop.
2065*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2066*c0909341SAndroid Build Coastguard Worker    jmp .hv0_loop_start
2067*c0909341SAndroid Build Coastguard Worker%endif
2068*c0909341SAndroid Build Coastguard Worker.hv0_loop:
; NOTE(review): the per-iteration reload suggests that the register holding
; lpfq is overwritten inside the loop body on x86-32 -- confirm against the
; macro definitions.
2069*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2070*c0909341SAndroid Build Coastguard Worker.hv0_loop_start:
2071*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq+ 0]
2072*c0909341SAndroid Build Coastguard Worker.hv0_main:
2073*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+16]
; Right-edge padding is applied only when LR_HAVE_RIGHT is clear and
; wd >= -18.
2074*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2075*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
2076*c0909341SAndroid Build Coastguard Worker    cmp             wd, -18
2077*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right
2078*c0909341SAndroid Build Coastguard Worker    call .extend_right
2079*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
; Horizontal pass: m1 = p0 + p1 + p2 (words), m2/m3 = p0^2 + p1^2 + p2^2
; (dwords, low and high halves).
2080*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 2
2081*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, m0
2082*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m0
2083*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2084*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m0
2085*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2086*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4
2087*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5             ; sum
2088*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
2089*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
2090*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2091*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2092*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; sumsq
2093*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
; Vertical pass, stage 1: m0/m4/m5 = this row + contents of t1, then t1 is
; overwritten with this row's sums.
2094*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+wq+400*0]
2095*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+wq+400*2]
2096*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+wq+400*4]
2097*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*0], m1
2098*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*2], m2
2099*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*4], m3
; Stage 2: m1/m2/m3 = stage-1 sums + contents of t2 (the totals used below),
; then t2 is overwritten with the stage-1 sums.
2100*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400*0]
2101*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq+400*2]
2102*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq+400*4]
2103*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m0
2104*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m4
2105*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m5
; a = total sumsq (m2/m3), b = total sum (m1). m8 holds the rounding
; constant pd_8.
2106*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
2107*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
2108*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
2109*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
; x*9 is computed as (x << 3) + x.
2110*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2111*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2112*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2113*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
; Shift right by 1, then pavgw against zero: together a rounded shift by 2.
2114*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
2115*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
; m2/m3 = square of the reduced b, per dword lane.
2116*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
2117*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2118*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2119*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2120*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
2121*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
; Taking the maximum before the subtraction keeps p from going negative.
; NOTE(review): MAXSD and MULLD are macros defined outside this chunk; the
; third operand looks like a scratch register -- confirm.
2122*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m14
2123*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m14
2124*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2125*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2126*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m14        ; p * s
2127*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m14
; m10 = pd_0xf00801c7: its low word 0x01c7 is 455, and b was zero-extended to
; dwords, so pmaddwd yields b * 455. The same constant is then added to p * s
; with unsigned word saturation before the shift by 20.
2128*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2129*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2130*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10
2131*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2132*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2133*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2134*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
; m3 = result of the GATHER_X_BY_X macro applied to the z values in m4/m5.
; NOTE(review): presumably a lookup in the sgr_x_by_x table -- the macro is
; defined outside this chunk.
2135*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
; Duplicate each word of x to fill a dword lane, then multiply by b * 455.
2136*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
2137*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
2138*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m14
2139*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m14
; On x86-32 m14 is an alias of m6, so restore the zero value here.
2140*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2141*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
2142*c0909341SAndroid Build Coastguard Worker%endif
2143*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2144*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
; Store x (words) into the t4 buffer and the shifted product (dwords) into
; the t3 buffer; t3 is indexed with wq*2 because its entries are twice as
; wide.
2145*c0909341SAndroid Build Coastguard Worker    mova     [t4+wq+4], m3
2146*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2147*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2148*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*2+ 8], m0
2149*c0909341SAndroid Build Coastguard Worker    mova  [t3+wq*2+24], m1
2150*c0909341SAndroid Build Coastguard Worker    add             wq, 16
2151*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
2152*c0909341SAndroid Build Coastguard Worker    ret
2153*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2154*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
2155*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2156*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2157*c0909341SAndroid Build Coastguard Worker%else
2158*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2159*c0909341SAndroid Build Coastguard Worker%endif
2160*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2161*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2162*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2163*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq]
2164*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2165*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
2166*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8
2167*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12
2168*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2169*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
2170*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2171*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
2172*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m12
2173*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2174*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
2175*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2176*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2177*c0909341SAndroid Build Coastguard Worker%else
2178*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2179*c0909341SAndroid Build Coastguard Worker%endif
2180*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2181*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2182*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2183*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2184*c0909341SAndroid Build Coastguard Worker    jmp .hv1_loop_start
2185*c0909341SAndroid Build Coastguard Worker%endif
2186*c0909341SAndroid Build Coastguard Worker.hv1_loop:
2187*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2188*c0909341SAndroid Build Coastguard Worker.hv1_loop_start:
2189*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq+ 0]
2190*c0909341SAndroid Build Coastguard Worker.hv1_main:
2191*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+16]
2192*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2193*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
2194*c0909341SAndroid Build Coastguard Worker    cmp             wd, -18
2195*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
2196*c0909341SAndroid Build Coastguard Worker    call .extend_right
2197*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
2198*c0909341SAndroid Build Coastguard Worker    palignr         m1, m5, m4, 2
2199*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m1
2200*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m1
2201*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2202*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m1
2203*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2204*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4
2205*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5             ; h sum
2206*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m5, m6
2207*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
2208*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2209*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2210*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1             ; h sumsq
2211*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
2212*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400*0]
2213*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+wq+400*2]
2214*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+wq+400*4]
2215*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m0
2216*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m2
2217*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m3
2218*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
2219*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
2220*c0909341SAndroid Build Coastguard Worker    psrld           m4, 4              ; (a + 8) >> 4
2221*c0909341SAndroid Build Coastguard Worker    psrld           m5, 4
2222*c0909341SAndroid Build Coastguard Worker    pslld           m2, m4, 3
2223*c0909341SAndroid Build Coastguard Worker    pslld           m3, m5, 3
2224*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2225*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2226*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
2227*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
2228*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
2229*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2230*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2231*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2232*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
2233*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2234*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m14
2235*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m14
2236*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2237*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2238*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m14        ; p * s
2239*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m14
2240*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2241*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2242*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10
2243*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2244*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2245*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2246*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2247*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
2248*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
2249*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
2250*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m14
2251*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m14
2252*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2253*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
2254*c0909341SAndroid Build Coastguard Worker%endif
2255*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2256*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2257*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*2 +4], m3
2258*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2259*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2260*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+ 8], m0
2261*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+24], m1
2262*c0909341SAndroid Build Coastguard Worker    add             wq, 16
2263*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
2264*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
2265*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2266*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2267*c0909341SAndroid Build Coastguard Worker    ret
2268*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab (even rows)
; Walks the row in 16-byte steps of wq. Each step doubles the box sums read
; from t1, adds the sums stored in t2 to form the vertical totals, and writes
; the doubled t1 sums back to t2. The totals are then turned into the two
; coefficient arrays, stored at [t4+400*0+4] (words) and [t3+400*0+8] (dwords).
; Buffer layout used here: +400*0 = word sums (b), +400*2 / +400*4 = low /
; high halves of the dword sums of squares (a).
; NOTE(review): MAXSD, MULLD and GATHER_X_BY_X are macros defined outside this
; excerpt, as are the values held in m8-m11 and m14; the notes below describe
; them only as far as the surrounding code and existing comments imply.
2269*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2270*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]         ; wq = r4 - 4 (presumably negative: the loop runs until wq >= 0)
2271*c0909341SAndroid Build Coastguard Worker%else
2272*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m            ; x86-32: start offset is reloaded from w0m
2273*c0909341SAndroid Build Coastguard Worker%endif
2274*c0909341SAndroid Build Coastguard Worker.v0_loop:
2275*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400*0]  ; t1: word sums
2276*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq+400*2]  ; t1: dword sums of squares, low half
2277*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq+400*4]  ; t1: dword sums of squares, high half
2278*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0             ; the t1 row is counted twice
2279*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2280*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
2281*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400*0] ; m1 = b = 2*t1 + t2
2282*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq+400*2] ; m2/m3 = a = 2*t1 + t2
2283*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq+400*4]
2284*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m0             ; t2 now holds the doubled t1 sums
2285*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m4
2286*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m5
2287*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8             ; rounding term for the shift below
2288*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
2289*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
2290*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2291*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3          ; x*8, so that x*8 + x = x*9
2292*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2293*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2294*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2295*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1          ; b >> 1; the pavgw against zero finishes the rounding
2296*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
2297*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6         ; widen rounded b to dwords (m6 is used as zero)
2298*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; rounded b squared, low half
2299*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2300*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3             ; rounded b squared, high half
2301*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
2302*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2303*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m14        ; presumably m4 = max(m4, m2), keeping p below non-negative
2304*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m14
2305*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2306*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2307*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m14        ; p * s
2308*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m14
2309*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2310*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2311*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10            ; saturating word add ahead of the shift (see min(z, 255) below)
2312*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2313*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2314*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2315*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm ; presumably looks up x for each z into m3 (words) - confirm in the macro
2316*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3         ; duplicate each x word across a dword
2317*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
2318*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m14        ; presumably m0 = (b * 455) * x
2319*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m14
2320*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2321*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6             ; re-zero m6 (presumably clobbered by the macros above on x86-32)
2322*c0909341SAndroid Build Coastguard Worker%endif
2323*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2324*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2325*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*0+ 4], m3        ; store x (words) for the even row
2326*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2327*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2328*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+ 8], m0        ; store the shifted products (dwords) for the even row
2329*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+24], m1
2330*c0909341SAndroid Build Coastguard Worker    add             wq, 16             ; next 16-byte step
2331*c0909341SAndroid Build Coastguard Worker    jl .v0_loop                        ; loop while wq is still negative
2332*c0909341SAndroid Build Coastguard Worker    ret
2333*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
; Same computation as the even-row variant, except that the t1 sums are used
; once (not doubled) and are copied unchanged into t2. Results go to the
; odd-row slots [t4+400*2+4] (words) and [t3+400*4+8] (dwords). After the loop
; the t1 and t2 pointers are exchanged through r10.
; NOTE(review): MAXSD, MULLD and GATHER_X_BY_X are macros defined outside this
; excerpt, as are the values held in m8-m11 and m14; the notes below describe
; them only as far as the surrounding code and existing comments imply.
2334*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2335*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]         ; wq = r4 - 4 (presumably negative: the loop runs until wq >= 0)
2336*c0909341SAndroid Build Coastguard Worker%else
2337*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m            ; x86-32: start offset is reloaded from w0m
2338*c0909341SAndroid Build Coastguard Worker%endif
2339*c0909341SAndroid Build Coastguard Worker.v1_loop:
2340*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400*0]  ; t1: word sums
2341*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq+400*2]  ; t1: dword sums of squares, low half
2342*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq+400*4]  ; t1: dword sums of squares, high half
2343*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400*0] ; m1 = b = t1 + t2
2344*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq+400*2] ; m2/m3 = a = t1 + t2
2345*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq+400*4]
2346*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m0             ; t2 now holds a copy of the t1 sums
2347*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m4
2348*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m5
2349*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8             ; rounding term for the shift below
2350*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
2351*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
2352*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2353*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3          ; x*8, so that x*8 + x = x*9
2354*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2355*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2356*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2357*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1          ; b >> 1; the pavgw against zero finishes the rounding
2358*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
2359*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6         ; widen rounded b to dwords (m6 is used as zero)
2360*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; rounded b squared, low half
2361*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2362*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3             ; rounded b squared, high half
2363*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
2364*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
2365*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m14        ; presumably m4 = max(m4, m2), keeping p below non-negative
2366*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m14
2367*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
2368*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2369*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m9, m14        ; p * s
2370*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m9, m14
2371*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10            ; b * 455
2372*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
2373*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m10            ; saturating word add ahead of the shift (see min(z, 255) below)
2374*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m10
2375*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z, 255)
2376*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2377*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm ; presumably looks up x for each z into m3 (words) - confirm in the macro
2378*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3         ; duplicate each x word across a dword
2379*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
2380*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m14        ; presumably m0 = (b * 455) * x
2381*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m14
2382*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2383*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6             ; re-zero m6 (presumably clobbered by the macros above on x86-32)
2384*c0909341SAndroid Build Coastguard Worker%endif
2385*c0909341SAndroid Build Coastguard Worker    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2386*c0909341SAndroid Build Coastguard Worker    paddd           m1, m11
2387*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*2+ 4], m3        ; store x (words) for the odd row
2388*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2389*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2390*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+ 8], m0        ; store the shifted products (dwords) for the odd row
2391*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+24], m1
2392*c0909341SAndroid Build Coastguard Worker    add             wq, 16             ; next 16-byte step
2393*c0909341SAndroid Build Coastguard Worker    jl .v1_loop                        ; loop while wq is still negative
2394*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; swap t1 <-> t2 using r10 as the temporary
2395*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2396*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2397*c0909341SAndroid Build Coastguard Worker    ret
2398*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
; Precomputes horizontally weighted sums of the coefficient rows so that the
; .n0/.n1 output passes only have to add one new row each.
;   "444" = 4 * (left + centre + right)
;   "343" = 444 - (left + right) = 3*left + 4*centre + 3*right
; Neighbours are 2 bytes apart in t4 (words) and 4 bytes apart in t3 (dwords).
; Row at t4+400*0 / t3+400*0: 343 sums -> t4+400*4 / t3+400*8.
; Row at t4+400*2 / t3+400*4: 444 sums -> t4+400*6 / t3+400*12,
;                             343 sums -> t4+400*8 / t3+400*16.
2399*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4               ; start offset: r4 on x86-64 ...
2400*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m              ; ... w1m on x86-32 (movif64/movif32 are presumably arch-conditional moves)
2401*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
2402*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+400*0+ 4] ; a: right neighbour
2403*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*0+ 8] ; b: right neighbour, low half
2404*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+400*0+24] ; b: right neighbour, high half
2405*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*0+ 2] ; a: centre
2406*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+400*0+ 4] ; b: centre, low half
2407*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*0+20] ; b: centre, high half
2408*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*1+400*0+ 0] ; left + right
2409*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*0+ 0]
2410*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*2+400*0+16]
2411*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0               ; left + centre + right
2412*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
2413*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
2414*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                ; a[-1] 444
2415*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                ; b[-1] 444
2416*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
2417*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0               ; a[-1] 343
2418*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1               ; b[-1] 343
2419*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
2420*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*4], m3             ; only the 343 form of this row is kept
2421*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*8+ 0], m4
2422*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*8+16], m5
2423*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+400*2+ 4] ; second row: same neighbour pattern
2424*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*4+ 8]
2425*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+400*4+24]
2426*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*2+ 2]
2427*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+400*4+ 4]
2428*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*4+20]
2429*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*1+400*2+ 0] ; left + right
2430*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*4+ 0]
2431*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*2+400*4+16]
2432*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0                ; left + centre + right
2433*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
2434*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
2435*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                 ; a[ 0] 444
2436*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                 ; b[ 0] 444
2437*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
2438*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400* 6], m3             ; keep the 444 form of this row ...
2439*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+ 0], m4
2440*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+16], m5
2441*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0                ; a[ 0] 343
2442*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1                ; b[ 0] 343
2443*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
2444*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400* 8], m3             ; ... and its 343 form as well
2445*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+ 0], m4
2446*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+16], m5
2447*c0909341SAndroid Build Coastguard Worker    add             wq, 16                ; next 16-byte step (8 words in t4, 8 dwords in t3)
2448*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop                       ; loop while wq is still negative
2449*c0909341SAndroid Build Coastguard Worker    ret
2450*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2451*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
; For each 16-byte step: forms the 444 and 343 horizontal sums of the row at
; t4+400*0 / t3+400*0, adds the new 343 sum to the values already stored at
; t4+400*4 and t4+400*6 (t3+400*8 and t3+400*12), then overwrites those slots
; with the new 343 and 444 sums. The combined a/b values are applied to the
; pixels at dstq, which are rewritten in place; dstq then advances by one
; stride.
;   "444" = 4 * (left + centre + right), "343" = 444 - (left + right)
2452*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4               ; start offset: r4 on x86-64 ...
2453*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m              ; ... w1m on x86-32 (movif64/movif32 are presumably arch-conditional moves)
2454*c0909341SAndroid Build Coastguard Worker.n0_loop:
2455*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*0+4] ; a: right neighbour
2456*c0909341SAndroid Build Coastguard Worker    movu            m1, [t4+wq*1+400*0+2] ; a: centre
2457*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*0+0] ; left + right
2458*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3               ; left + centre + right
2459*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444
2460*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343
2461*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+wq*1+400*4] ; new 343 + stored value at +400*4
2462*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*6]  ; + stored value at +400*6 -> combined a
2463*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*4], m2             ; slot +400*4 <- new 343
2464*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*6], m1             ; slot +400*6 <- new 444
2465*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+400*0+8] ; b, low half: right neighbour
2466*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*0+4] ; centre
2467*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+400*0+0] ; left + right
2468*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2469*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
2470*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m4           ; b[ 1] 343
2471*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t3+wq*2+400* 8+ 0] ; new 343 + stored value at +400*8
2472*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+400*12+ 0]     ; + stored value at +400*12 -> combined b, low half
2473*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400* 8+ 0], m2         ; slot +400*8 <- new 343
2474*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+ 0], m1         ; slot +400*12 <- new 444
2475*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*0+24] ; b, high half: same steps 16 bytes further on
2476*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*0+20]
2477*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+400*0+16]
2478*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5
2479*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2
2480*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m5
2481*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*2+400* 8+16]
2482*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+400*12+16]
2483*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400* 8+16], m2
2484*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+16], m1
2485*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+wq]        ; source pixels (8 words), overwritten below
2486*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6           ; low 4 pixels widened to dwords (m6 is used as zero)
2487*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6           ; low 4 combined-a values widened likewise
2488*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
2489*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6           ; high 4 pixels
2490*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2491*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
2492*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; b - a * src + (1 << 8)
2493*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2494*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9                ; arithmetic shift: the difference may be negative
2495*c0909341SAndroid Build Coastguard Worker    psrad           m5, 9
2496*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5               ; back to 8 words with signed saturation
2497*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7               ; rounded high multiply by m7 (presumably the filter weight; set outside this excerpt)
2498*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; add the correction to the source pixels
2499*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6               ; clamp below at m6 (zero)
2500*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m13              ; clamp above at m13 (presumably the maximum pixel value - confirm)
2501*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
2502*c0909341SAndroid Build Coastguard Worker    add             wq, 16               ; next 8 pixels
2503*c0909341SAndroid Build Coastguard Worker    jl .n0_loop                          ; loop while wq is still negative
2504*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp         ; advance dst to the next row
2505*c0909341SAndroid Build Coastguard Worker    ret
2506*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2507*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
; Odd-row counterpart of the even-row output pass. It reads the row at
; t4+400*2 / t3+400*4, adds its new 343 sum to the values stored at t4+400*6
; and t4+400*8 (t3+400*12 and t3+400*16), then overwrites those slots with the
; new 444 and 343 sums respectively. The pixels at dstq are rewritten in
; place and dstq advances by one stride; the movif32 at the end writes the
; advanced pointer back to dstm.
;   "444" = 4 * (left + centre + right), "343" = 444 - (left + right)
2508*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4               ; start offset: r4 on x86-64 ...
2509*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m              ; ... w1m on x86-32 (movif64/movif32 are presumably arch-conditional moves)
2510*c0909341SAndroid Build Coastguard Worker.n1_loop:
2511*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*2+4] ; a: right neighbour
2512*c0909341SAndroid Build Coastguard Worker    movu            m1, [t4+wq*1+400*2+2] ; a: centre
2513*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*2+0] ; left + right
2514*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3               ; left + centre + right
2515*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444
2516*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343
2517*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+wq*1+400*6] ; new 343 + stored value at +400*6
2518*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*8]  ; + stored value at +400*8 -> combined a
2519*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*6], m1             ; slot +400*6 <- new 444
2520*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*8], m2             ; slot +400*8 <- new 343
2521*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+400*4+8] ; b, low half: right neighbour
2522*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*4+4] ; centre
2523*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+400*4+0] ; left + right
2524*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2525*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
2526*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m4           ; b[ 1] 343
2527*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t3+wq*2+400*12+ 0] ; new 343 + stored value at +400*12
2528*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+400*16+ 0]     ; + stored value at +400*16 -> combined b, low half
2529*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+ 0], m1         ; slot +400*12 <- new 444
2530*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+ 0], m2         ; slot +400*16 <- new 343
2531*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*4+24] ; b, high half: same steps 16 bytes further on
2532*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*4+20]
2533*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+400*4+16]
2534*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5
2535*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2
2536*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m5
2537*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*2+400*12+16]
2538*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+400*16+16]
2539*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+16], m1
2540*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+16], m2
2541*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+wq]        ; source pixels (8 words), overwritten below
2542*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6           ; low 4 pixels widened to dwords (m6 is used as zero)
2543*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6           ; low 4 combined-a values widened likewise
2544*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
2545*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6           ; high 4 pixels
2546*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2547*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
2548*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; b - a * src + (1 << 8)
2549*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2550*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9                ; arithmetic shift: the difference may be negative
2551*c0909341SAndroid Build Coastguard Worker    psrad           m5, 9
2552*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5               ; back to 8 words with signed saturation
2553*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m4, m7               ; rounded high multiply by m7 (presumably the filter weight; set outside this excerpt)
2554*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; add the correction to the source pixels
2555*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6               ; clamp below at m6 (zero)
2556*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m13              ; clamp above at m13 (presumably the maximum pixel value - confirm)
2557*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
2558*c0909341SAndroid Build Coastguard Worker    add             wq, 16               ; next 8 pixels
2559*c0909341SAndroid Build Coastguard Worker    jl .n1_loop                          ; loop while wq is still negative
2560*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp         ; advance dst to the next row
2561*c0909341SAndroid Build Coastguard Worker    movif32       dstm, dstq             ; write the advanced pointer back to dstm (presumably x86-32 only)
2562*c0909341SAndroid Build Coastguard Worker    ret
2563*c0909341SAndroid Build Coastguard Worker
2564*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2565*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
2566*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 10*16
2567*c0909341SAndroid Build Coastguard Worker %else
2568*c0909341SAndroid Build Coastguard Worker  %assign extra_stack 8*16
2569*c0909341SAndroid Build Coastguard Worker %endif
2570*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
2571*c0909341SAndroid Build Coastguard Worker                              dst, stride, left, lpf, w
2572*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
2573*c0909341SAndroid Build Coastguard Worker  %define dstm         dword [esp+calloff+16*8+4*0]
2574*c0909341SAndroid Build Coastguard Worker  %define stridemp     dword [esp+calloff+16*8+4*1]
2575*c0909341SAndroid Build Coastguard Worker  %define leftm        dword [esp+calloff+16*8+4*2]
2576*c0909341SAndroid Build Coastguard Worker  %define lpfm         dword [esp+calloff+16*8+4*3]
2577*c0909341SAndroid Build Coastguard Worker  %define w0m          dword [esp+calloff+16*8+4*4]
2578*c0909341SAndroid Build Coastguard Worker  %define hd           dword [esp+calloff+16*8+4*5]
2579*c0909341SAndroid Build Coastguard Worker  %define edgeb         byte [esp+calloff+16*8+4*6]
2580*c0909341SAndroid Build Coastguard Worker  %define edged        dword [esp+calloff+16*8+4*6]
2581*c0909341SAndroid Build Coastguard Worker  %define leftmp leftm
2582*c0909341SAndroid Build Coastguard Worker %else
2583*c0909341SAndroid Build Coastguard Worker  %define w0m wm
2584*c0909341SAndroid Build Coastguard Worker  %define hd dword r5m
2585*c0909341SAndroid Build Coastguard Worker  %define edgeb  byte r7m
2586*c0909341SAndroid Build Coastguard Worker  %define edged dword r7m
2587*c0909341SAndroid Build Coastguard Worker %endif
2588*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0]
2589*c0909341SAndroid Build Coastguard Worker %define w1m    dword [esp+calloff+4*1]
2590*c0909341SAndroid Build Coastguard Worker %define t3m    dword [esp+calloff+4*2]
2591*c0909341SAndroid Build Coastguard Worker %define t4m    dword [esp+calloff+4*3]
2592*c0909341SAndroid Build Coastguard Worker %xdefine m8 m6
2593*c0909341SAndroid Build Coastguard Worker %define  m9 [base+pd_8]
2594*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_34816]
2595*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0xf00801c7]
2596*c0909341SAndroid Build Coastguard Worker %define m12 [base+pd_0xf00800a4]
2597*c0909341SAndroid Build Coastguard Worker %define m13 [esp+calloff+16*4]
2598*c0909341SAndroid Build Coastguard Worker %define m14 [esp+calloff+16*5]
2599*c0909341SAndroid Build Coastguard Worker %define m15 [esp+calloff+16*6]
2600*c0909341SAndroid Build Coastguard Worker %define  m6 [esp+calloff+16*7]
2601*c0909341SAndroid Build Coastguard Worker %define base r6-$$
2602*c0909341SAndroid Build Coastguard Worker %assign calloff 0
2603*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16
2604*c0909341SAndroid Build Coastguard Worker    mov        strideq, [rstk+stack_offset+ 8]
2605*c0909341SAndroid Build Coastguard Worker    mov          leftq, [rstk+stack_offset+12]
2606*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rstk+stack_offset+16]
2607*c0909341SAndroid Build Coastguard Worker    mov             wd, [rstk+stack_offset+20]
2608*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
2609*c0909341SAndroid Build Coastguard Worker    mov       stridemp, strideq
2610*c0909341SAndroid Build Coastguard Worker    mov          leftm, leftq
2611*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+24]
2612*c0909341SAndroid Build Coastguard Worker    mov             r2, [rstk+stack_offset+32]
2613*c0909341SAndroid Build Coastguard Worker    mov           lpfm, lpfq
2614*c0909341SAndroid Build Coastguard Worker    mov             hd, r1
2615*c0909341SAndroid Build Coastguard Worker    mov          edged, r2
2616*c0909341SAndroid Build Coastguard Worker %endif
2617*c0909341SAndroid Build Coastguard Worker%else
2618*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
2619*c0909341SAndroid Build Coastguard Worker                                                     w, h, edge, params
2620*c0909341SAndroid Build Coastguard Worker%endif
2621*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
2622*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
2623*c0909341SAndroid Build Coastguard Worker%endif
2624*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2625*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
2626*c0909341SAndroid Build Coastguard Worker    lea            r13, [sgr_x_by_x-0xf03]
2627*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
2628*c0909341SAndroid Build Coastguard Worker    add             wd, wd
2629*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
2630*c0909341SAndroid Build Coastguard Worker    mova           m14, [paramsq]
2631*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
2632*c0909341SAndroid Build Coastguard Worker    mova            m9, [pd_8]
2633*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+44]
2634*c0909341SAndroid Build Coastguard Worker    mova           m10, [pd_34816]
2635*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
2636*c0909341SAndroid Build Coastguard Worker    mova           m11, [pd_0xf00801c7]
2637*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+400*24+40]
2638*c0909341SAndroid Build Coastguard Worker    mova           m12, [pd_0xf00800a4]
2639*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+400*52+40]
2640*c0909341SAndroid Build Coastguard Worker    neg             wq
2641*c0909341SAndroid Build Coastguard Worker    pshufd         m15, m14, q2222 ; w0 w1
2642*c0909341SAndroid Build Coastguard Worker    punpcklwd      m14, m14
2643*c0909341SAndroid Build Coastguard Worker    pshufd         m13, m14, q0000 ; s0
2644*c0909341SAndroid Build Coastguard Worker    pshufd         m14, m14, q2222 ; s1
2645*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
2646*c0909341SAndroid Build Coastguard Worker    psllw          m15, 2
2647*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
2648*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp]
2649*c0909341SAndroid Build Coastguard Worker%else
2650*c0909341SAndroid Build Coastguard Worker    mov             r1, [rstk+stack_offset+28] ; params
2651*c0909341SAndroid Build Coastguard Worker    LEA             r6, $$
2652*c0909341SAndroid Build Coastguard Worker    add             wd, wd
2653*c0909341SAndroid Build Coastguard Worker    mova            m2, [r1]
2654*c0909341SAndroid Build Coastguard Worker    add           lpfm, wq
2655*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+extra_stack+wq+52]
2656*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
2657*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+extra_stack+wq*2+400*24+48]
2658*c0909341SAndroid Build Coastguard Worker    mov           dstm, dstq
2659*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+extra_stack+wq+400*52+48]
2660*c0909341SAndroid Build Coastguard Worker    mov            t3m, t3
2661*c0909341SAndroid Build Coastguard Worker    mov            t4m, t4
2662*c0909341SAndroid Build Coastguard Worker    neg             wq
2663*c0909341SAndroid Build Coastguard Worker    pshuflw         m0, m2, q0000
2664*c0909341SAndroid Build Coastguard Worker    pshuflw         m1, m2, q2222
2665*c0909341SAndroid Build Coastguard Worker    pshufhw         m2, m2, q1010
2666*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m0, m0 ; s0
2667*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m1, m1 ; s1
2668*c0909341SAndroid Build Coastguard Worker    punpckhqdq      m2, m2 ; w0 w1
2669*c0909341SAndroid Build Coastguard Worker    mov            w1m, wd
2670*c0909341SAndroid Build Coastguard Worker    pxor            m3, m3
2671*c0909341SAndroid Build Coastguard Worker    psllw           m2, 2
2672*c0909341SAndroid Build Coastguard Worker    mova           m13, m0
2673*c0909341SAndroid Build Coastguard Worker    mova           m14, m1
2674*c0909341SAndroid Build Coastguard Worker    sub             wd, 4
2675*c0909341SAndroid Build Coastguard Worker    mova           m15, m2
2676*c0909341SAndroid Build Coastguard Worker    mova            m6, m3
2677*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm
2678*c0909341SAndroid Build Coastguard Worker    mov            w0m, wd
2679*c0909341SAndroid Build Coastguard Worker %define strideq r5
2680*c0909341SAndroid Build Coastguard Worker%endif
2681*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
2682*c0909341SAndroid Build Coastguard Worker    jz .no_top
2683*c0909341SAndroid Build Coastguard Worker    call .h_top
2684*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2685*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2686*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2687*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
2688*c0909341SAndroid Build Coastguard Worker%else
2689*c0909341SAndroid Build Coastguard Worker    mov             wq, w0m
2690*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
2691*c0909341SAndroid Build Coastguard Worker%endif
2692*c0909341SAndroid Build Coastguard Worker    add             t1, 400*12
2693*c0909341SAndroid Build Coastguard Worker    call .h_top
2694*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
2695*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
2696*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
2697*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
2698*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10 ; below
2699*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
2700*c0909341SAndroid Build Coastguard Worker    call .hv0
; ---------------------------------------------------------------------------
; Row driver. hd counts the rows still to be read; each pass of .main_loop
; advances lpfq by one stride and runs .hv0, then advances again and runs
; .hv1. At the bottom edge the two extra rows come either from the pointer
; saved in lpfm (LR_HAVE_BOTTOM set -> .hv0_bottom/.hv1_bottom) or from
; .v0/.v1 (.extend_bottom).
; .hv1, .v0, .v1, .n0, .n1 and .prep_n are defined outside this part of the
; file. NOTE(review): presumably .v0/.v1 are vertical-only counterparts of
; .hv0/.hv1 and .n0/.n1 emit the output rows -- confirm at their definitions.
; ---------------------------------------------------------------------------
2701*c0909341SAndroid Build Coastguard Worker.main:
2702*c0909341SAndroid Build Coastguard Worker    dec             hd
2703*c0909341SAndroid Build Coastguard Worker    jz .height1 ; hd was 1: single-row case
; On x86-32 lpfq is reloaded from hvsrcm (the row pointer saved by the
; previous .hv0/.hv1 call) before stepping to the next row.
2704*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2705*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2706*c0909341SAndroid Build Coastguard Worker    call .hv1
2707*c0909341SAndroid Build Coastguard Worker    call .prep_n
2708*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
2709*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom ; no rows left for a full .hv0/.hv1 pair
2710*c0909341SAndroid Build Coastguard Worker.main_loop:
2711*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2712*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2713*c0909341SAndroid Build Coastguard Worker    call .hv0
; hd == 0 here means the row just processed by .hv0 was the last one.
; The x86-32 branch copies hd into r4 before testing it (hd is %defined as a
; memory operand in at least one x86-32 configuration of this function).
2714*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2715*c0909341SAndroid Build Coastguard Worker    test            hd, hd
2716*c0909341SAndroid Build Coastguard Worker%else
2717*c0909341SAndroid Build Coastguard Worker    mov             r4, hd
2718*c0909341SAndroid Build Coastguard Worker    test            r4, r4
2719*c0909341SAndroid Build Coastguard Worker%endif
2720*c0909341SAndroid Build Coastguard Worker    jz .odd_height
2721*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2722*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2723*c0909341SAndroid Build Coastguard Worker    call .hv1
2724*c0909341SAndroid Build Coastguard Worker    call .n0
2725*c0909341SAndroid Build Coastguard Worker    call .n1
2726*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
2727*c0909341SAndroid Build Coastguard Worker    jge .main_loop
; All rows consumed: fetch the two rows below the block if they exist.
2728*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
2729*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
2730*c0909341SAndroid Build Coastguard Worker    mov           lpfq, lpfm ; pointer saved at function entry ("below")
2731*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
2732*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2733*c0909341SAndroid Build Coastguard Worker    add           lpfq, stridemp
2734*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
2735*c0909341SAndroid Build Coastguard Worker.end:
2736*c0909341SAndroid Build Coastguard Worker    call .n0
2737*c0909341SAndroid Build Coastguard Worker    call .n1
2738*c0909341SAndroid Build Coastguard Worker.end2:
2739*c0909341SAndroid Build Coastguard Worker    RET
; Single-row input: no .hv1 row exists, so .v1 and .prep_n run instead and
; the tail is shared with the odd-height case.
2740*c0909341SAndroid Build Coastguard Worker.height1:
2741*c0909341SAndroid Build Coastguard Worker    call .v1
2742*c0909341SAndroid Build Coastguard Worker    call .prep_n
2743*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; Reached from .main_loop when the last row was consumed by .hv0, i.e. there
; is no source row left for the matching .hv1.
2744*c0909341SAndroid Build Coastguard Worker.odd_height:
2745*c0909341SAndroid Build Coastguard Worker    call .v1
2746*c0909341SAndroid Build Coastguard Worker    call .n0
2747*c0909341SAndroid Build Coastguard Worker    call .n1
2748*c0909341SAndroid Build Coastguard Worker.odd_height_end:
2749*c0909341SAndroid Build Coastguard Worker    call .v0
2750*c0909341SAndroid Build Coastguard Worker    call .v1
2751*c0909341SAndroid Build Coastguard Worker    call .n0 ; only .n0 here, unlike .end which also calls .n1
2752*c0909341SAndroid Build Coastguard Worker    jmp .end2
; Bottom rows are not read from memory (LR_HAVE_BOTTOM clear, or hd ran out
; right after the first .hv1): .v0/.v1 take the place of
; .hv0_bottom/.hv1_bottom, then the common .end tail runs.
2753*c0909341SAndroid Build Coastguard Worker.extend_bottom:
2754*c0909341SAndroid Build Coastguard Worker    call .v0
2755*c0909341SAndroid Build Coastguard Worker    call .v1
2756*c0909341SAndroid Build Coastguard Worker    jmp .end
; ---------------------------------------------------------------------------
; .no_top: entry path taken when LR_HAVE_TOP is clear. Instead of reading
; rows above the block, the first block row is summed horizontally into t1
; and a second row buffer t2 is derived from it (.top_fixup_loop) before
; joining the common driver at .main.
; ---------------------------------------------------------------------------
2757*c0909341SAndroid Build Coastguard Worker.no_top:
2758*c0909341SAndroid Build Coastguard Worker    movif32    strideq, stridemp
; lpfm = lpfq + 6*stride (built as *4 then *2); it is read back by the driver
; when LR_HAVE_BOTTOM is set. lpfq itself is redirected to the first dst row.
2759*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
2760*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
2761*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
2762*c0909341SAndroid Build Coastguard Worker    mov           lpfm, r10
2763*c0909341SAndroid Build Coastguard Worker    call .h
; .h left wq >= 0, so restart the column counter at -(2*w)-4. On x86-32 the
; row pointer is also saved to hvsrcm so .main can reload it.
2764*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2765*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2766*c0909341SAndroid Build Coastguard Worker%else
2767*c0909341SAndroid Build Coastguard Worker    mov             wq, w0m
2768*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2769*c0909341SAndroid Build Coastguard Worker%endif
2770*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*12] ; t2 = row buffer 400*12 bytes after t1
; Copy the six sum planes of t1 into t2, 16 bytes per iteration. The planes
; at 400*0/2/4 (sum5 and sumsq5 as written by .h) are doubled on the way;
; the planes at 400*6/8/10 (sum3 and sumsq3) are copied unchanged.
; NOTE(review): the doubling presumably stands in for the missing rows above
; the block in the 5-row sums -- confirm against the C reference.
2771*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
2772*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400* 0]
2773*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+wq+400* 2]
2774*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+wq+400* 4]
2775*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0 ; 2 * sum5 (words)
2776*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+wq+400* 6]
2777*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1 ; 2 * sumsq5 (dwords)
2778*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq+400* 8]
2779*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
2780*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq+400*10]
2781*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 0], m0
2782*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 2], m1
2783*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 4], m2
2784*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 6], m3
2785*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 8], m4
2786*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*10], m5
2787*c0909341SAndroid Build Coastguard Worker    add             wq, 16
2788*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
; x86-32 keeps t3/t4 in memory (t3m/t4m); reload them before .v0.
2789*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2790*c0909341SAndroid Build Coastguard Worker    movif32         t4, t4m
2791*c0909341SAndroid Build Coastguard Worker    call .v0
2792*c0909341SAndroid Build Coastguard Worker    jmp .main
; ---------------------------------------------------------------------------
; .h / .h_top: horizontal box sums for one source row (no vertical pass).
;   lpfq = row pointer already advanced by the row's byte width (the function
;          prologue adds 2*w to the row pointers), so [lpfq+wq+4] with
;          wq = -(2*w)-4 is the first pixel of the row
;   t1   = row buffer that receives the sums
;   m6   = zero
; The loop handles 8 pixels (16 bytes of wq) per iteration while wq < 0 and
; stores at [t1+wq+400*k]:
;   k=0  sum5   (words)     k=2 / k=4   sumsq5 (dwords, low / high 4 lanes)
;   k=6  sum3   (words)     k=8 / k=10  sumsq3 (dwords, low / high 4 lanes)
; where sum5 covers 5 consecutive pixels and sum3 the inner 3 of those.
; .h takes its left-edge pixels from the left[] array (when LR_HAVE_LEFT);
; .h_top reads them from memory directly in front of the row.
; ---------------------------------------------------------------------------
2793*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
; Assembly-time bookkeeping for code entered via `call`: the esp-relative
; operand %defines of this function are written as [esp+calloff+...].
; NOTE(review): presumably the +4 is the x86-32 return address -- confirm.
2794*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+4
2795*c0909341SAndroid Build Coastguard Worker%assign calloff 4
; x86-64: r4 holds the negated byte width (-2*w), giving wq = -(2*w)-4.
; x86-32: r4 serves as the left pointer; wq is loaded from w0m further down.
2796*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2797*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2798*c0909341SAndroid Build Coastguard Worker%else
2799*c0909341SAndroid Build Coastguard Worker %define leftq r4
2800*c0909341SAndroid Build Coastguard Worker%endif
2801*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2802*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2803*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2804*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq] ; 4 left pixels, duplicated into both halves
2805*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2806*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4] ; first 8 pixels of the row
2807*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8 ; advance the left pointer by 8 bytes for the next row
2808*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10 ; m4 = last 3 left pixels, then row pixels 0..4
2809*c0909341SAndroid Build Coastguard Worker    jmp .h_main
2810*c0909341SAndroid Build Coastguard Worker.h_extend_left:
2811*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2812*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
; sgr_lshuf5 is defined outside this view. NOTE(review): presumably it
; replicates the first pixel into the three left-border lanes -- confirm.
2813*c0909341SAndroid Build Coastguard Worker    pshufb          m4, [base+sgr_lshuf5]
2814*c0909341SAndroid Build Coastguard Worker    jmp .h_main
2815*c0909341SAndroid Build Coastguard Worker.h_top:
2816*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2817*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2818*c0909341SAndroid Build Coastguard Worker%endif
2819*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2820*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
2821*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2822*c0909341SAndroid Build Coastguard Worker.h_loop:
2823*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq- 2] ; 8 pixels starting 3 pixels before [lpfq+wq+4]
2824*c0909341SAndroid Build Coastguard Worker.h_main:
2825*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+14] ; the 8 pixels following those in m4
; Right-edge padding is needed only when LR_HAVE_RIGHT is clear and the loop
; has reached the tail of the row (wq >= -20).
2826*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2827*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
2828*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
2829*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
; On x86-32 m8 names a real register while m6 names a stack slot, so the
; register is cleared before the shared helper is called.
; NOTE(review): presumably .extend_right expects that register to be zero.
2830*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2831*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
2832*c0909341SAndroid Build Coastguard Worker%endif
2833*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
; m5:m4 is treated as a 16-pixel window; palignr by 2/4/6/8 bytes yields the
; window shifted by 1/2/3/4 pixels. Shifts 1..3 form the 3-tap sums, shifts
; 0 and 4 are the two outer taps of the 5-tap sums.
2834*c0909341SAndroid Build Coastguard Worker.h_have_right:
2835*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2
2836*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 4
2837*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
2838*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
2839*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; shift1^2 + shift2^2, low 4 lanes
2840*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
2841*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3 ; same, high 4 lanes
2842*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 6
2843*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; sum3
2844*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m0, m6 ; interleave with zero so pmaddwd squares one pixel per lane
2845*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
2846*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m6
2847*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
2848*c0909341SAndroid Build Coastguard Worker    paddd           m2, m7             ; sumsq3
2849*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
2850*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m5, m4
2851*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m5 ; outer taps: shift 0 + shift 4
2852*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7 ; shift0^2 + shift4^2, low 4 lanes
2853*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4
2854*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5 ; same, high 4 lanes
2855*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0 ; sumsq3, high 4 lanes
2856*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 6], m1
2857*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 8], m2
2858*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*10], m3
2859*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1             ; sum5
2860*c0909341SAndroid Build Coastguard Worker    paddd           m7, m2             ; sumsq5
2861*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2862*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 0], m8
2863*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 2], m7
2864*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 4], m5
2865*c0909341SAndroid Build Coastguard Worker    add             wq, 16 ; next 8 pixels
2866*c0909341SAndroid Build Coastguard Worker    jl .h_loop
2867*c0909341SAndroid Build Coastguard Worker    ret
2868*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---------------------------------------------------------------------------
; .hv0 / .hv0_bottom: per-row worker for even rows. For each group of 8
; pixels it
;   1. computes the same horizontal sum3/sumsq3/sum5/sumsq5 as .h,
;   2. folds them into the running sums kept in the t1/t2 row buffers,
;   3. derives the 3x3 coefficients from the 3-row sums and stores them to
;      the t4 (x3, words) and t3 (scaled b3, dwords) buffers.
; Constants set up by the function prologue: m9 = pd_8, m10 = pd_34816,
; m11 = pd_0xf00801c7, m14 = s1, m6 = zero.
; Buffer traffic per iteration:
;   [t3+wq*2+400*8+8], [t3+wq*2+400*0+8/24] <- this row's horizontal
;                                              sum5 / sumsq5
;   [t1+wq+400*0/2/4]    += this row's sum5 / sumsq5
;   [t1+wq+400*6/8/10]   <- this row's sum3 / sumsq3
;   [t2+wq+400*6/8/10]   <- old t1 value + this row's sum3 / sumsq3
;   [t4+wq+400*2+4]      <- x3
;   [t3+wq*2+400*4+8/24] <- (x3 * b3 * 455 + 34816) >> 12
; .hv0 takes left-edge pixels from the left[] array; .hv0_bottom reads them
; from memory directly in front of the row.
; ---------------------------------------------------------------------------
2869*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
; x86-32 saves the row pointer in hvsrcm; it is reloaded into lpfq at the top
; of every loop iteration (.hv0_loop) and by the caller for the next row.
2870*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2871*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2872*c0909341SAndroid Build Coastguard Worker%else
2873*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2874*c0909341SAndroid Build Coastguard Worker%endif
2875*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2876*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2877*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
2878*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq] ; 4 left pixels, duplicated into both halves
2879*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2880*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
2881*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8
2882*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10 ; m4 = last 3 left pixels, then row pixels 0..4
2883*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2884*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
2885*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
2886*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
; sgr_lshuf5 is defined outside this view. NOTE(review): presumably it
; replicates the first pixel into the three left-border lanes -- confirm.
2887*c0909341SAndroid Build Coastguard Worker    pshufb          m4, [base+sgr_lshuf5]
2888*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2889*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
2890*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2891*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
2892*c0909341SAndroid Build Coastguard Worker%else
2893*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
2894*c0909341SAndroid Build Coastguard Worker%endif
2895*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2896*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2897*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
; lpfq is still valid on first entry, so x86-32 skips the reload below.
2898*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2899*c0909341SAndroid Build Coastguard Worker    jmp .hv0_loop_start
2900*c0909341SAndroid Build Coastguard Worker%endif
2901*c0909341SAndroid Build Coastguard Worker.hv0_loop:
2902*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
2903*c0909341SAndroid Build Coastguard Worker.hv0_loop_start:
2904*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq- 2]
2905*c0909341SAndroid Build Coastguard Worker.hv0_main:
2906*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+14] ; the 8 pixels following those in m4
; Right-edge padding only when LR_HAVE_RIGHT is clear and wq >= -20.
2907*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2908*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
2909*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
2910*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right
; On x86-32 m8 names a real register while m6 names a stack slot.
; NOTE(review): presumably .extend_right expects that register to be zero.
2911*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2912*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
2913*c0909341SAndroid Build Coastguard Worker%endif
2914*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
; --- step 1: horizontal sums (same arithmetic as .h) ---
2915*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
2916*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2
2917*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 4
2918*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
2919*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
2920*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
2921*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
2922*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
2923*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2924*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 6
2925*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; h sum3
2926*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m0, m6
2927*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
2928*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m6
2929*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
2930*c0909341SAndroid Build Coastguard Worker    paddd           m2, m7             ; h sumsq3
2931*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
2932*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m5, m4
2933*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m5
2934*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
2935*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4
2936*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2937*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0
2938*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1             ; h sum5
2939*c0909341SAndroid Build Coastguard Worker    paddd           m7, m2             ; h sumsq5
2940*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
; --- step 2: update the running sums ---
; This row's horizontal 5-tap sums are written to t3 and also accumulated
; into the 5-tap planes of t1.
2941*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*8+ 8], m8
2942*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+ 8], m7
2943*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+24], m5
2944*c0909341SAndroid Build Coastguard Worker    paddw           m8, [t1+wq+400* 0]
2945*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t1+wq+400* 2]
2946*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t1+wq+400* 4]
2947*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 0], m8
2948*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 2], m7
2949*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 4], m5
; 3-tap planes: m0/m4/m5 = this row + old t1 (two rows); t1 then takes this
; row alone. m1/m2/m3 = two-row sum + old t2 (three rows); t2 then takes the
; two-row sum.
2950*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+wq+400* 6]
2951*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+wq+400* 8]
2952*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+wq+400*10]
2953*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 6], m1
2954*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400* 8], m2
2955*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*10], m3
2956*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400* 6]
2957*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq+400* 8]
2958*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq+400*10]
2959*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 6], m0
2960*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 8], m4
2961*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*10], m5
; --- step 3: 3x3 coefficients from m1 (b3, words) and m2/m3 (a3, dwords) ---
2962*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
2963*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2964*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
2965*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
; A zero vector is needed under the name m7: x86-32 clears m7 outright,
; x86-64 uses the x86inc SWAP macro so that m7 names the zero register.
; NOTE(review): SWAP is assumed to be an assembly-time rename that emits no
; instructions, as in x86inc -- confirm.
2966*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2967*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
2968*c0909341SAndroid Build Coastguard Worker%else
2969*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
2970*c0909341SAndroid Build Coastguard Worker%endif
2971*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2972*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2973*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
2974*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
; (b3 >> 1) averaged with zero rounds up: ((b3 >> 1) + 1) >> 1.
2975*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
2976*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m7             ; (b3 + 2) >> 2
2977*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m7
2978*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; square of the rounded b3, low 4 lanes
2979*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
2980*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3 ; same, high 4 lanes
2981*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
2982*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
; Undo the rename so m6 names the zero register again.
2983*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2984*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
2985*c0909341SAndroid Build Coastguard Worker%endif
; MAXSD and MULLD are macros defined outside this view, each taking a
; temporary as third operand. NOTE(review): presumably signed dword max and
; low 32-bit multiply; the max followed by psubd would then clamp the
; difference at zero.
2986*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m7
2987*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m7
2988*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
2989*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2990*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m14, m7        ; p3 * s1
2991*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m14, m7
; The upper word of every b3 dword is zero, so pmaddwd only uses the low word
; of the constant (0x01c7 = 455).
2992*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
2993*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
2994*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11
2995*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
2996*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z3, 255)
2997*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
; GATHER_X_BY_X is a macro defined outside this view. NOTE(review):
; presumably a table lookup of x3 for each z3 in m4/m5 into m3 (words), with
; r0 and dstm as scratch -- confirm at the macro definition.
2998*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
2999*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3 ; duplicate each x3 word to fill a dword lane
3000*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
3001*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m7
3002*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m7
3003*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3004*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
3005*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*2+ 4], m3
3006*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3007*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
3008*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+ 8], m0
3009*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+24], m1
3010*c0909341SAndroid Build Coastguard Worker    add             wq, 16 ; next 8 pixels
3011*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
3012*c0909341SAndroid Build Coastguard Worker    ret
3013*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3014*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
3015*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3016*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
3017*c0909341SAndroid Build Coastguard Worker%else
3018*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
3019*c0909341SAndroid Build Coastguard Worker%endif
3020*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
3021*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
3022*c0909341SAndroid Build Coastguard Worker    movif32      leftq, leftm
3023*c0909341SAndroid Build Coastguard Worker    movddup         m5, [leftq]
3024*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
3025*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
3026*c0909341SAndroid Build Coastguard Worker    add         leftmp, 8
3027*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10
3028*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
3029*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
3030*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
3031*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq+4]
3032*c0909341SAndroid Build Coastguard Worker    pshufb          m4, [base+sgr_lshuf5]
3033*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
3034*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
3035*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3036*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
3037*c0909341SAndroid Build Coastguard Worker%else
3038*c0909341SAndroid Build Coastguard Worker    mov         hvsrcm, lpfq
3039*c0909341SAndroid Build Coastguard Worker%endif
3040*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
3041*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
3042*c0909341SAndroid Build Coastguard Worker    movif32         wq, w0m
3043*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3044*c0909341SAndroid Build Coastguard Worker    jmp .hv1_loop_start
3045*c0909341SAndroid Build Coastguard Worker%endif
3046*c0909341SAndroid Build Coastguard Worker.hv1_loop:
3047*c0909341SAndroid Build Coastguard Worker    movif32       lpfq, hvsrcm
3048*c0909341SAndroid Build Coastguard Worker.hv1_loop_start:
3049*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+wq- 2]
3050*c0909341SAndroid Build Coastguard Worker.hv1_main:
3051*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+wq+14]
3052*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
3053*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
3054*c0909341SAndroid Build Coastguard Worker    cmp             wd, -20
3055*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
3056*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3057*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
3058*c0909341SAndroid Build Coastguard Worker%endif
3059*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
3060*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
3061*c0909341SAndroid Build Coastguard Worker    palignr         m7, m5, m4, 2
3062*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 4
3063*c0909341SAndroid Build Coastguard Worker    paddw           m2, m7, m3
3064*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m7, m3
3065*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
3066*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m3
3067*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
3068*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 6
3069*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3             ; h sum3
3070*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m3, m6
3071*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
3072*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
3073*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
3074*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1             ; h sumsq3
3075*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 8
3076*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m5
3077*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m5
3078*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
3079*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5
3080*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
3081*c0909341SAndroid Build Coastguard Worker    paddd           m7, m3
3082*c0909341SAndroid Build Coastguard Worker    paddw           m5, m2, [t2+wq+400* 6]
3083*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 6], m2
3084*c0909341SAndroid Build Coastguard Worker    paddw           m8, m2             ; h sum5
3085*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t2+wq+400* 8]
3086*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7, [t2+wq+400*10]
3087*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 8], m0
3088*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*10], m7
3089*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0             ; h sumsq5
3090*c0909341SAndroid Build Coastguard Worker    paddd           m1, m7
3091*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
3092*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
3093*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
3094*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
3095*c0909341SAndroid Build Coastguard Worker    pslld           m0, m2, 3
3096*c0909341SAndroid Build Coastguard Worker    pslld           m7, m3, 3
3097*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
3098*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7
3099*c0909341SAndroid Build Coastguard Worker    psrlw           m7, m5, 1
3100*c0909341SAndroid Build Coastguard Worker    pavgw           m7, m6             ; (b3 + 2) >> 2
3101*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m7, m6
3102*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
3103*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m6
3104*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
3105*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3106*c0909341SAndroid Build Coastguard Worker    mova      [esp+20], m8
3107*c0909341SAndroid Build Coastguard Worker%else
3108*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3109*c0909341SAndroid Build Coastguard Worker%endif
3110*c0909341SAndroid Build Coastguard Worker    MAXSD           m2, m0, m8
3111*c0909341SAndroid Build Coastguard Worker    MAXSD           m3, m7, m8
3112*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
3113*c0909341SAndroid Build Coastguard Worker    psubd           m2, m0             ; p3
3114*c0909341SAndroid Build Coastguard Worker    psubd           m3, m7
3115*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m5, m8         ; b3
3116*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m8
3117*c0909341SAndroid Build Coastguard Worker    MULLD           m2, m14, m8        ; p3 * s1
3118*c0909341SAndroid Build Coastguard Worker    MULLD           m3, m14, m8
3119*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3120*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m11
3121*c0909341SAndroid Build Coastguard Worker    paddusw         m2, m11
3122*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m11
3123*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; min(z3, 255)
3124*c0909341SAndroid Build Coastguard Worker    movif32         t3, t3m
3125*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
3126*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m8, m2, m3, r0, dstm
3127*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m8, m8
3128*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m8, m8
3129*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m2, m7
3130*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m3, m7
3131*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3132*c0909341SAndroid Build Coastguard Worker    paddd           m5, m10
3133*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3134*c0909341SAndroid Build Coastguard Worker    psrld           m5, 12
3135*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*4+4], m8
3136*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*8+ 8], m0
3137*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*8+24], m5
3138*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3139*c0909341SAndroid Build Coastguard Worker    mova            m8, [esp+20]
3140*c0909341SAndroid Build Coastguard Worker%else
3141*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m8
3142*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
3143*c0909341SAndroid Build Coastguard Worker%endif
3144*c0909341SAndroid Build Coastguard Worker    paddw           m5, m8, [t2+wq+400*0]
3145*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq+400*2]
3146*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t2+wq+400*4]
3147*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t1+wq+400*0]
3148*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+wq+400*2]
3149*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+wq+400*4]
3150*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m8
3151*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
3152*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
3153*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a5 + 8) >> 4
3154*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
3155*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m4
3156*c0909341SAndroid Build Coastguard Worker    pslld           m8, m2, 4
3157*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m1
3158*c0909341SAndroid Build Coastguard Worker    pslld           m4, m3, 4
3159*c0909341SAndroid Build Coastguard Worker    paddd           m8, m2
3160*c0909341SAndroid Build Coastguard Worker    pslld           m2, 3
3161*c0909341SAndroid Build Coastguard Worker    paddd           m4, m3
3162*c0909341SAndroid Build Coastguard Worker    pslld           m3, 3
3163*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8             ; ((a5 + 8) >> 4) * 25
3164*c0909341SAndroid Build Coastguard Worker    paddd           m3, m4
3165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3166*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3167*c0909341SAndroid Build Coastguard Worker%else
3168*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3169*c0909341SAndroid Build Coastguard Worker%endif
3170*c0909341SAndroid Build Coastguard Worker    psrlw           m1, m5, 1
3171*c0909341SAndroid Build Coastguard Worker    pavgw           m1, m7             ; (b5 + 2) >> 2
3172*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m1, m7
3173*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
3174*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
3175*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
3176*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m5, m7         ; b5
3177*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m7
3178*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3179*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3180*c0909341SAndroid Build Coastguard Worker%endif
3181*c0909341SAndroid Build Coastguard Worker    MAXSD           m2, m4, m7
3182*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4             ; p5
3183*c0909341SAndroid Build Coastguard Worker    MAXSD           m3, m1, m7
3184*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1
3185*c0909341SAndroid Build Coastguard Worker    MULLD           m2, m13, m7        ; p5 * s0
3186*c0909341SAndroid Build Coastguard Worker    MULLD           m3, m13, m7
3187*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12             ; b5 * 164
3188*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m12
3189*c0909341SAndroid Build Coastguard Worker    paddusw         m2, m12
3190*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
3191*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; min(z5, 255)
3192*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
3193*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m1, m2, m3, r0, dstm
3194*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m1, m1
3195*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1, m1
3196*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m2, m7
3197*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m3, m7
3198*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
3199*c0909341SAndroid Build Coastguard Worker    paddd           m5, m10
3200*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*0+ 4], m1
3201*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3202*c0909341SAndroid Build Coastguard Worker    psrld           m5, 12
3203*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+ 8], m0
3204*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+24], m5
3205*c0909341SAndroid Build Coastguard Worker    add             wq, 16
3206*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
3207*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
3208*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
3209*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
3210*c0909341SAndroid Build Coastguard Worker    ret
3211*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows)
; Vertical-only pass for an even row: nothing is read from the picture here,
; only the row buffers addressed through t1/t2 plus the t3/t4 outputs.
; Each loop iteration:
;   1. sums t1 rows 6/8/10 (counted twice) with t2 rows 6/8/10 to get
;      b3 (words) and a3 (two dword vectors), and writes the doubled t1
;      values back into the t2 rows;
;   2. turns a3/b3 into x3 and the scaled b3 term; x3 goes to [t4+400*2],
;      the dword results to [t3+400*4];
;   3. copies t1 rows 0/2/4 into t3 and doubles them in place in t1.
; wq is a negative byte offset stepped by 16 until it is no longer negative.
; m9, m10, m11 and m14 must already hold the constants named in the inline
; comments; they are loaded outside this view.
; NOTE(review): MAXSD, MULLD, GATHER_X_BY_X and SWAP are macros defined
; elsewhere; what is said about them below is inferred from their use here.
3212*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3213*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
3214*c0909341SAndroid Build Coastguard Worker%else
3215*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m
3216*c0909341SAndroid Build Coastguard Worker%endif
3217*c0909341SAndroid Build Coastguard Worker.v0_loop:
    ; Row 6 holds word sums, rows 8 and 10 hold dword sums of squares.
3218*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+wq+400* 6]
3219*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq+400* 8]
3220*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq+400*10]
    ; Count the t1 row twice - presumably it stands in for a row that is not
    ; available; confirm against the caller.
3221*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
3222*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
3223*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
    ; m1 = b3 (sum), m2/m3 = a3 (sum of squares, low/high half).
3224*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+wq+400* 6]
3225*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+wq+400* 8]
3226*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+wq+400*10]
    ; The doubled t1 values replace the t2 entries.
3227*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 6], m0
3228*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 8], m4
3229*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*10], m5
3230*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
3231*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
3232*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
3233*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
    ; m7 must read as zero for the pavgw/punpck steps below. x86-32 clears it;
    ; x86-64 renames m6 to m7 instead (SWAP presumably emits no instruction and
    ; m6 is presumably kept zero by the surrounding code - TODO confirm).
3234*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3235*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3236*c0909341SAndroid Build Coastguard Worker%else
3237*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3238*c0909341SAndroid Build Coastguard Worker%endif
    ; x*9 is built as (x << 3) + x.
3239*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
3240*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
3241*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
3242*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
    ; Halve, then average with zero (pavgw rounds up): two steps give the
    ; rounded shift by 2.
3243*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
3244*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m7             ; (b3 + 2) >> 2
    ; Widen each word to a dword with a zero high half; pmaddwd of the vector
    ; with itself then yields the square of each value.
3245*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m7
3246*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
3247*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
3248*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
    ; Unrounded b3 widened to dwords (m0 = low half, m1 = high half).
3249*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
3250*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
    ; Undo the renaming; from here on m7 is only passed to the macros as their
    ; last operand (apparently a scratch register).
3251*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3252*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3253*c0909341SAndroid Build Coastguard Worker%endif
    ; max() before the subtraction keeps p3 from going negative
    ; (assuming MAXSD is a signed dword maximum).
3254*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m7
3255*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m7
3256*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
3257*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
3258*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m14, m7        ; p3 * s1
3259*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m14, m7
    ; The high word of every b3 dword is zero, so only the low word of each
    ; m11 dword contributes to this product.
3260*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3261*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
    ; m11 is reused as the addend of a saturating word add; together with the
    ; shift by 20 this produces the clamped z3 noted below.
3262*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11
3263*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
3264*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z3, 255)
3265*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
    ; Presumably a table lookup indexed by z3; the result in m3 is handled as
    ; eight packed words below. r0 and dstm look like scratch operands.
3266*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    ; Duplicate every x3 word into both halves of a dword for the multiply.
3267*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
3268*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
3269*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m7
3270*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m7
3271*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3272*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
    ; x3 words are stored to the t4 row at 400*2.
3273*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*2+4], m3
3274*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3275*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
    ; Rows 0/2/4 of t1 (word sums, then two dword vectors) are parked in t3;
    ; .v1 reads these three slots back before it overwrites them.
3276*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+wq+400*0]
3277*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq+400*2]
3278*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq+400*4]
3279*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*8+ 8], m3
3280*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+ 8], m4
3281*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+24], m5
    ; The same rows are then doubled in place in t1.
3282*c0909341SAndroid Build Coastguard Worker    paddw           m3, m3 ; cc5
3283*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
3284*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
3285*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*0], m3
3286*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*2], m4
3287*c0909341SAndroid Build Coastguard Worker    mova [t1+wq+400*4], m5
    ; Shifted x3*b3 dwords (low/high half) go to the t3 row at 400*4.
3288*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+ 8], m0
3289*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*4+24], m1
    ; Advance 16 bytes; keep looping while the offset is still negative.
3290*c0909341SAndroid Build Coastguard Worker    add             wq, 16
3291*c0909341SAndroid Build Coastguard Worker    jl .v0_loop
3292*c0909341SAndroid Build Coastguard Worker    ret
3293*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
; Vertical-only pass for an odd row: nothing is read from the picture here,
; only the row buffers addressed through t1/t2 plus the t3/t4 buffers.
; Each loop iteration:
;   1. adds t1 rows 6/8/10 to t2 rows 6/8/10 to get b3 (words) and a3 (two
;      dword vectors), and copies the t1 values into the t2 rows;
;   2. turns a3/b3 into x3 and the scaled b3 term; x3 goes to [t4+400*4],
;      the dword results to [t3+400*8];
;   3. reads three vectors back from t3 (the slots .v0 fills from t1 rows
;      0/2/4), adds t2 and t1 rows 0/2/4 to get b5/a5, and stores the t3
;      values into the t2 rows;
;   4. turns a5/b5 into x5 and the scaled b5 term; x5 goes to [t4+400*0],
;      the dword results to [t3+400*0].
; wq is a negative byte offset stepped by 16 until it is no longer negative.
; After the loop t1 and t2 are exchanged, using r10 as the temporary.
; m9, m10, m11, m12, m13 and m14 must already hold the constants named in the
; inline comments; they are loaded outside this view.
; NOTE(review): MAXSD, MULLD, GATHER_X_BY_X and SWAP are macros defined
; elsewhere; what is said about them below is inferred from their use here.
3294*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3295*c0909341SAndroid Build Coastguard Worker    lea             wq, [r4-4]
3296*c0909341SAndroid Build Coastguard Worker%else
3297*c0909341SAndroid Build Coastguard Worker    mov             wd, w0m
3298*c0909341SAndroid Build Coastguard Worker%endif
3299*c0909341SAndroid Build Coastguard Worker.v1_loop:
    ; Row 6 holds word sums, rows 8 and 10 hold dword sums of squares.
3300*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+wq+400* 6]
3301*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+wq+400* 8]
3302*c0909341SAndroid Build Coastguard Worker    mova            m7, [t1+wq+400*10]
    ; m1 = b3 (sum), m2/m3 = a3 (sum of squares, low/high half).
3303*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+wq+400* 6]
3304*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+wq+400* 8]
3305*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7, [t2+wq+400*10]
    ; The unmodified t1 values replace the t2 entries.
3306*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 6], m4
3307*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400* 8], m5
3308*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*10], m7
3309*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
3310*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
3311*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
3312*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
    ; m7 must read as zero for the pavgw/punpck steps below. x86-32 clears it;
    ; x86-64 renames m6 to m7 instead (SWAP presumably emits no instruction and
    ; m6 is presumably kept zero by the surrounding code - TODO confirm).
3313*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3314*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3315*c0909341SAndroid Build Coastguard Worker%else
3316*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3317*c0909341SAndroid Build Coastguard Worker%endif
    ; x*9 is built as (x << 3) + x.
3318*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
3319*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
3320*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
3321*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
    ; Halve, then average with zero (pavgw rounds up): two steps give the
    ; rounded shift by 2.
3322*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
3323*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m7             ; (b3 + 2) >> 2
    ; Widen each word to a dword with a zero high half; pmaddwd of the vector
    ; with itself then yields the square of each value.
3324*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m7
3325*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
3326*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
3327*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
    ; Unrounded b3 widened to dwords (m0 = low half, m1 = high half).
3328*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
3329*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
    ; Undo the renaming; until the next zeroing m7 is only passed to the
    ; macros as their last operand (apparently a scratch register).
3330*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3331*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3332*c0909341SAndroid Build Coastguard Worker%endif
    ; max() before the subtraction keeps p3 from going negative
    ; (assuming MAXSD is a signed dword maximum).
3333*c0909341SAndroid Build Coastguard Worker    MAXSD           m4, m2, m7
3334*c0909341SAndroid Build Coastguard Worker    MAXSD           m5, m3, m7
3335*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
3336*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
3337*c0909341SAndroid Build Coastguard Worker    MULLD           m4, m14, m7        ; p3 * s1
3338*c0909341SAndroid Build Coastguard Worker    MULLD           m5, m14, m7
    ; The high word of every b3 dword is zero, so only the low word of each
    ; m11 dword contributes to this product.
3339*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b3 * 455
3340*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
    ; m11 is reused as the addend of a saturating word add; together with the
    ; shift by 20 this produces the clamped z3 noted below.
3341*c0909341SAndroid Build Coastguard Worker    paddusw         m4, m11
3342*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m11
3343*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; min(z3, 255)
3344*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
    ; Presumably a table lookup indexed by z3; the result in m3 is handled as
    ; eight packed words below. r0 and dstm look like scratch operands.
3345*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    ; Duplicate every x3 word into both halves of a dword for the multiply.
3346*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m3
3347*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3, m3
3348*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m4, m7
3349*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m5, m7
3350*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3351*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
    ; x3 words are stored to the t4 row at 400*4.
3352*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*4+4], m3
3353*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
    ; The high half goes to m8 so that m1 is free for the b5 sum below.
3354*c0909341SAndroid Build Coastguard Worker    psrld           m8, m1, 12
    ; Fetch the three vectors .v0 parked in t3. This must happen before the
    ; stores of m0/m8 further down, which reuse the 400*8 row.
3355*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+wq*2+400*8+ 8]
3356*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+wq*2+400*0+ 8]
3357*c0909341SAndroid Build Coastguard Worker    mova            m7, [t3+wq*2+400*0+24]
    ; m1 = b5 (sum), m2/m3 = a5 (sum of squares): parked values + t2 + t1.
3358*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+wq+400*0]
3359*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+wq+400*2]
3360*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7, [t2+wq+400*4]
3361*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+wq+400*0]
3362*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+wq+400*2]
3363*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+wq+400*4]
    ; The parked values replace t2 rows 0/2/4.
3364*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*0], m4
3365*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*2], m5
3366*c0909341SAndroid Build Coastguard Worker    mova [t2+wq+400*4], m7
3367*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
3368*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
3369*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a5 + 8) >> 4
3370*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
    ; The two stores of the x3*b3 dwords are interleaved with the x*25
    ; computation: (x << 4) + x + (x << 3).
3371*c0909341SAndroid Build Coastguard Worker    mova         [t3+wq*2+400*8+ 8], m0
3372*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 4
3373*c0909341SAndroid Build Coastguard Worker    mova         [t3+wq*2+400*8+24], m8
3374*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 4
3375*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2
3376*c0909341SAndroid Build Coastguard Worker    pslld           m2, 3
3377*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
3378*c0909341SAndroid Build Coastguard Worker    pslld           m3, 3
3379*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; ((a5 + 8) >> 4) * 25
3380*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
    ; Same zero-register arrangement as for the 3x3 half above.
3381*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3382*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3383*c0909341SAndroid Build Coastguard Worker%else
3384*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3385*c0909341SAndroid Build Coastguard Worker%endif
3386*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m1, 1
3387*c0909341SAndroid Build Coastguard Worker    pavgw           m5, m7             ; (b5 + 2) >> 2
    ; Square the rounded values (word widened with zero, then pmaddwd).
3388*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m7
3389*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
3390*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m7
3391*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
    ; Unrounded b5 widened to dwords (m0 = low half, m1 = high half).
3392*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b5
3393*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
3394*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3395*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3396*c0909341SAndroid Build Coastguard Worker%endif
    ; p5 = max(a5 term, squared b5 term) - squared b5 term, so never negative
    ; (assuming MAXSD is a signed dword maximum).
3397*c0909341SAndroid Build Coastguard Worker    MAXSD           m2, m4, m7
3398*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4             ; p5
3399*c0909341SAndroid Build Coastguard Worker    MAXSD           m3, m5, m7
3400*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5
3401*c0909341SAndroid Build Coastguard Worker    MULLD           m2, m13, m7        ; p5 * s0
3402*c0909341SAndroid Build Coastguard Worker    MULLD           m3, m13, m7
3403*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b5 * 164
3404*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
    ; m12 is reused as the saturating addend, mirroring m11 in the 3x3 half.
3405*c0909341SAndroid Build Coastguard Worker    paddusw         m2, m12
3406*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
3407*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; min(z5, 255)
3408*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
    ; Presumably the same table lookup as above, now indexed by z5; m4
    ; receives x5 as eight packed words.
3409*c0909341SAndroid Build Coastguard Worker    GATHER_X_BY_X   m4, m2, m3, r0, dstm
3410*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m4
3411*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m4
3412*c0909341SAndroid Build Coastguard Worker    MULLD           m0, m2, m7
3413*c0909341SAndroid Build Coastguard Worker    MULLD           m1, m3, m7
3414*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
3415*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
    ; x5 words go to the t4 row at 400*0; the shifted dwords overwrite the
    ; t3 slots at 400*0 that were read near the middle of the iteration.
3416*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*0+ 4], m4
3417*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
3418*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
3419*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+ 8], m0
3420*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*0+24], m1
    ; Advance 16 bytes; keep looping while the offset is still negative.
3421*c0909341SAndroid Build Coastguard Worker    add             wq, 16
3422*c0909341SAndroid Build Coastguard Worker    jl .v1_loop
    ; Exchange the two row-buffer pointers before returning.
3423*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
3424*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
3425*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
3426*c0909341SAndroid Build Coastguard Worker    ret
3427*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
3428*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
3429*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
3430*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
3431*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+400*0+ 2]
3432*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*0+ 4]
3433*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+400*0+20]
3434*c0909341SAndroid Build Coastguard Worker    movu            m7, [t4+wq*1+400*0+ 4]
3435*c0909341SAndroid Build Coastguard Worker    movu            m8, [t3+wq*2+400*0+ 8]
3436*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+wq*1+400*0+ 0]
3437*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+wq*2+400*0+ 0]
3438*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+wq*2+400*0+16]
3439*c0909341SAndroid Build Coastguard Worker    paddw           m3, m7
3440*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
3441*c0909341SAndroid Build Coastguard Worker    movu            m7, [t3+wq*2+400*0+24]
3442*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
3443*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
3444*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
3445*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
3446*c0909341SAndroid Build Coastguard Worker    paddd           m5, m7
3447*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
3448*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3449*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3               ; a5 565
3450*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4               ; b5 565
3451*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
3452*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400* 6+ 0], m0
3453*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+ 0], m1
3454*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+16], m2
3455*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+400*2+ 4]
3456*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*4+ 8]
3457*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+400*4+24]
3458*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*2+ 2]
3459*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+400*4+ 4]
3460*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*4+20]
3461*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*1+400*2+ 0]
3462*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*4+ 0]
3463*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*2+400*4+16]
3464*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0
3465*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3466*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
3467*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                ; a3[-1] 444
3468*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                ; b3[-1] 444
3469*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3470*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0               ; a3[-1] 343
3471*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1               ; b3[-1] 343
3472*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
3473*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400* 8+ 0], m3
3474*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+ 0], m4
3475*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+16], m5
3476*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+400*4+ 4]
3477*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*8+ 8]
3478*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+wq*2+400*8+24]
3479*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*4+ 2]
3480*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+400*8+ 4]
3481*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*8+20]
3482*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*1+400*4+ 0]
3483*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*8+ 0]
3484*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+wq*2+400*8+16]
3485*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0
3486*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3487*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
3488*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2                 ; a3[ 0] 444
3489*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2                 ; b3[ 0] 444
3490*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3491*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*10+ 0], m3
3492*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*20+ 0], m4
3493*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*20+16], m5
3494*c0909341SAndroid Build Coastguard Worker    psubw           m3, m0                ; a3[ 0] 343
3495*c0909341SAndroid Build Coastguard Worker    psubd           m4, m1                ; b3[ 0] 343
3496*c0909341SAndroid Build Coastguard Worker    psubd           m5, m2
3497*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*12+ 0], m3
3498*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*24+ 0], m4
3499*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*24+16], m5
3500*c0909341SAndroid Build Coastguard Worker    add             wq, 16
3501*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
3502*c0909341SAndroid Build Coastguard Worker    ret
3503*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n0 - neighbor weighting and final output for one even row.
;
; Per loop iteration, 16 bytes of dst (eight 16-bit pixels) are processed:
;   * t4 is read with word arithmetic (the "a" terms) and t3 with dword
;     arithmetic (the "b" terms); t3 is indexed with wq*2 and handled as two
;     16-byte halves (+0 and +16).
;   * A horizontal 5,6,5 weighting (a5/b5) of the row at offset 0 is computed,
;     added to the previously stored 565 row, and then stored in its place.
;   * A horizontal 4,4,4 and 3,4,3 weighting (a3/b3) of the row at t4+400*2 /
;     t3+400*4 is computed and combined with the stored 343 and 444 rows.
;   * The two results are combined with the source pixels, weighted by m15,
;     rounded, clamped to >= 0 and written back to dst.
; wq starts from r4 (x86-64) or w1m (x86-32) and is advanced by 16 until it
; is no longer negative, so it acts as a negative byte offset counting up.
; dstq is advanced by stridemp before returning.
; NOTE(review): m6 is used as an all-zero register below; on x86-64 that is
; re-established by the pxor in this block. Its state on entry, and how
; m8/m15 are mapped on x86-32, are defined outside this chunk - confirm.
3504*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
3505*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
3506*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
3507*c0909341SAndroid Build Coastguard Worker.n0_loop:
    ; a5: m0 = 5*a[0] + 6*a[1] + 5*a[2] (words), built as 4*sum3 + sum3 + a[1]
3508*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+wq*1+ 4]
3509*c0909341SAndroid Build Coastguard Worker    movu            m2, [t4+wq*1+ 2]
3510*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t4+wq*1+ 0]
3511*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
3512*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0
3513*c0909341SAndroid Build Coastguard Worker    psllw           m0, 2
3514*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2               ; a5
    ; b5: same 5,6,5 weighting on dwords; m4 = low half, m5 = high half
3515*c0909341SAndroid Build Coastguard Worker    movu            m4, [t3+wq*2+ 8]
3516*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+24]
3517*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+ 4]
3518*c0909341SAndroid Build Coastguard Worker    movu            m3, [t3+wq*2+20]
3519*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+wq*2+ 0]
3520*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+wq*2+16]
3521*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3522*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
3523*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
3524*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
3525*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
3526*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
3527*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1               ; b5
3528*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
    ; m2 = stored a5 row + new a5 row; the new row then replaces the stored one
3529*c0909341SAndroid Build Coastguard Worker    movu            m2, [t4+wq*1+400* 6]
3530*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0
3531*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400* 6], m0
    ; m0/m1 = stored b5 row + new b5 row (low/high); new row replaces stored
3532*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4, [t3+wq*2+400*12+ 0]
3533*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5, [t3+wq*2+400*12+16]
3534*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+ 0], m4
3535*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*12+16], m5
    ; spill the high half of the b5 sum; m1 is reused below
3536*c0909341SAndroid Build Coastguard Worker    mova [rsp+16+ARCH_X86_32*4], m1
    ; a3: m5 = 4*(a[0]+a[1]+a[2]), m4 = m5 - (a[0]+a[2]) = 3,4,3 weighting
3537*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*2+4]
3538*c0909341SAndroid Build Coastguard Worker    movu            m5, [t4+wq*1+400*2+2]
3539*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*2+0]
3540*c0909341SAndroid Build Coastguard Worker    paddw           m5, m3
3541*c0909341SAndroid Build Coastguard Worker    psllw           m5, 2                ; a3[ 1] 444
3542*c0909341SAndroid Build Coastguard Worker    psubw           m4, m5, m3           ; a3[ 1] 343
    ; m3 = stored row at 400*8 + stored row at 400*10 + new 343 row;
    ; afterwards 400*8 receives the new 343 row and 400*10 the new 444 row
3543*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400* 8]
3544*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*10]
3545*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
3546*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400* 8], m4
3547*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*10], m5
    ; b3: same 444 / 343 weighting on dwords; m5/m8 = 444 low/high
3548*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*4+ 8]
3549*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*4+ 4]
3550*c0909341SAndroid Build Coastguard Worker    movu            m7, [t3+wq*2+400*4+24]
3551*c0909341SAndroid Build Coastguard Worker    movu            m8, [t3+wq*2+400*4+20]
3552*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*4+ 0]
3553*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*2+400*4+16]
3554*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1
3555*c0909341SAndroid Build Coastguard Worker    paddd           m8, m7
3556*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
3557*c0909341SAndroid Build Coastguard Worker    pslld           m8, 2
3558*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
    ; High half: both the 444 and the 343 value are needed afterwards.
    ; x86-32 saves the 444 value to the stack and overwrites m8 with 343;
    ; x86-64 computes 343 into m6 and swaps the names, so that m8 = 343 and
    ; m6 = 444 until they are swapped back below.
    ; (SWAP is presumably the x86inc assembly-time register rename - confirm.)
3559*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3560*c0909341SAndroid Build Coastguard Worker    mova      [esp+52], m8
3561*c0909341SAndroid Build Coastguard Worker    psubd           m8, m7
3562*c0909341SAndroid Build Coastguard Worker%else
3563*c0909341SAndroid Build Coastguard Worker    psubd           m6, m8, m7
3564*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3565*c0909341SAndroid Build Coastguard Worker%endif
    ; m1/m7 = new 343 row + stored row at 400*16 + stored row at 400*20;
    ; afterwards 400*16 receives the new 343 row and 400*20 the new 444 row
3566*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+wq*2+400*16+ 0]
3567*c0909341SAndroid Build Coastguard Worker    paddd           m7, m8, [t3+wq*2+400*16+16]
3568*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*20+ 0]
3569*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*2+400*20+16]
3570*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+ 0], m4
3571*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*16+16], m8
3572*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*20+ 0], m5
    ; bring the high-half 444 value back into m8; x86-64 also re-zeroes m6
3573*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3574*c0909341SAndroid Build Coastguard Worker    mova            m8, [esp+52]
3575*c0909341SAndroid Build Coastguard Worker%else
3576*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m6
3577*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
3578*c0909341SAndroid Build Coastguard Worker%endif
3579*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*20+16], m8
    ; spill the high half of the b3 sum; m7 is reused below
3580*c0909341SAndroid Build Coastguard Worker    mova [rsp+32+ARCH_X86_32*4], m7
    ; Interleave src, a5 sum (m2) and a3 sum (m3) with m6 to widen words to
    ; dwords; with m6 = 0 each pmaddwd lane is a plain a * src product.
    ; m4/m5 = src low/high, later shifted left by 13.
3581*c0909341SAndroid Build Coastguard Worker    movu            m5, [dstq+wq]
3582*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
3583*c0909341SAndroid Build Coastguard Worker    punpcklwd       m7, m2, m6
3584*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m4               ; a5 * src
3585*c0909341SAndroid Build Coastguard Worker    punpcklwd       m8, m3, m6
3586*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m4               ; a3 * src
3587*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
3588*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m6
3589*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m5
3590*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
3591*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m5
3592*c0909341SAndroid Build Coastguard Worker    pslld           m4, 13
3593*c0909341SAndroid Build Coastguard Worker    pslld           m5, 13
3594*c0909341SAndroid Build Coastguard Worker    psubd           m0, m7               ; b5 - a5 * src + (1 << 8)
3595*c0909341SAndroid Build Coastguard Worker    psubd           m1, m8               ; b3 - a3 * src + (1 << 8)
    ; Pack both terms into one register as 16-bit pairs per dword:
    ; low word = (5-term >> 9), high word = the bits of (3-term << 7) that the
    ; mask leaves out. Assumes pd_0xffff is 0x0000ffff per dword - confirm.
3596*c0909341SAndroid Build Coastguard Worker    mova            m7, [base+pd_0xffff]
3597*c0909341SAndroid Build Coastguard Worker    psrld           m0, 9
3598*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
3599*c0909341SAndroid Build Coastguard Worker    pand            m0, m7
3600*c0909341SAndroid Build Coastguard Worker    pandn           m8, m7, m1
3601*c0909341SAndroid Build Coastguard Worker    por             m0, m8
    ; same for the high halves, reloading the two sums spilled to the stack
3602*c0909341SAndroid Build Coastguard Worker    mova            m1, [rsp+16+ARCH_X86_32*4]
3603*c0909341SAndroid Build Coastguard Worker    mova            m8, [rsp+32+ARCH_X86_32*4]
3604*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2
3605*c0909341SAndroid Build Coastguard Worker    psubd           m8, m3
3606*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+pd_4096]
3607*c0909341SAndroid Build Coastguard Worker    psrld           m1, 9
3608*c0909341SAndroid Build Coastguard Worker    pslld           m8, 7
3609*c0909341SAndroid Build Coastguard Worker    pand            m1, m7
3610*c0909341SAndroid Build Coastguard Worker    pandn           m7, m8
3611*c0909341SAndroid Build Coastguard Worker    por             m1, m7
    ; multiply each word pair by the word pair in m15 and sum per dword
    ; (m15 presumably holds the two filter weights - set outside this chunk)
3612*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
3613*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m15
    ; m7 must be zero for the pmaxsw clamp below: x86-32 clears it, x86-64
    ; renames the zero register m6 to m7 instead
3614*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3615*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
3616*c0909341SAndroid Build Coastguard Worker%else
3617*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3618*c0909341SAndroid Build Coastguard Worker%endif
    ; (src << 13) + pd_4096 + weighted sum, then arithmetic shift right by 8
3619*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2
3620*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
3621*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4
3622*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5
3623*c0909341SAndroid Build Coastguard Worker    psrad           m0, 8
3624*c0909341SAndroid Build Coastguard Worker    psrad           m1, 8
    ; signed saturation to 16 bits, clamp negatives to 0, then shift right by 5
3625*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1               ; clip
3626*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m7
3627*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 5
3628*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
3629*c0909341SAndroid Build Coastguard Worker    add             wq, 16
3630*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
3631*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp
3632*c0909341SAndroid Build Coastguard Worker    ret
    ; Placed after the ret, so never executed: undoes the m6/m7 rename made
    ; above so that code assembled after this point sees the original names.
3633*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3634*c0909341SAndroid Build Coastguard Worker    SWAP            m6, m7
3635*c0909341SAndroid Build Coastguard Worker%endif
3636*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n1 - neighbor weighting and final output for one odd row.
;
; Per loop iteration, 16 bytes of dst (eight 16-bit pixels) are processed:
;   * Only the 3-tap terms are recomputed here, from the row at t4+400*4 /
;     t3+400*8: a 4,4,4 and a 3,4,3 horizontal weighting, combined with the
;     stored rows at t4+400*12 and t4+400*10 (t3+400*24 and t3+400*20).
;   * The 5-tap terms are read back unchanged from t4+400*6 / t3+400*12 and
;     are shifted right by 8 here, where .n0 shifts its sum by 9.
;   * The two results are combined with the source pixels, weighted by m15,
;     rounded, clamped to >= 0 and written back to dst.
; wq starts from r4 (x86-64) or w1m (x86-32) and is advanced by 16 until it
; is no longer negative. dstq is advanced by stridemp before returning and,
; on x86-32, written back to dstm.
; NOTE(review): m6 is used as an all-zero register for the interleaves below;
; its state on entry, and how m8/m15 are mapped on x86-32, are defined outside
; this chunk - confirm.
3637*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
3638*c0909341SAndroid Build Coastguard Worker    movif64         wq, r4
3639*c0909341SAndroid Build Coastguard Worker    movif32         wd, w1m
3640*c0909341SAndroid Build Coastguard Worker.n1_loop:
    ; a3: m5 = 4*(a[0]+a[1]+a[2]), m4 = m5 - (a[0]+a[2]) = 3,4,3 weighting
3641*c0909341SAndroid Build Coastguard Worker    movu            m3, [t4+wq*1+400*4+4]
3642*c0909341SAndroid Build Coastguard Worker    movu            m5, [t4+wq*1+400*4+2]
3643*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*4+0]
3644*c0909341SAndroid Build Coastguard Worker    paddw           m5, m3
3645*c0909341SAndroid Build Coastguard Worker    psllw           m5, 2                ; a3[ 1] 444
3646*c0909341SAndroid Build Coastguard Worker    psubw           m4, m5, m3           ; a3[ 1] 343
    ; m3 = new 343 row + stored row at 400*12 + stored row at 400*10;
    ; afterwards 400*10 receives the new 444 row and 400*12 the new 343 row
3647*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t4+wq*1+400*12]
3648*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+wq*1+400*10]
3649*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*10], m5
3650*c0909341SAndroid Build Coastguard Worker    mova [t4+wq*1+400*12], m4
    ; b3: same weighting on dwords; m5/m8 = 444 low/high, m4/m0 = 343 low/high
3651*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+wq*2+400*8+ 8]
3652*c0909341SAndroid Build Coastguard Worker    movu            m5, [t3+wq*2+400*8+ 4]
3653*c0909341SAndroid Build Coastguard Worker    movu            m7, [t3+wq*2+400*8+24]
3654*c0909341SAndroid Build Coastguard Worker    movu            m8, [t3+wq*2+400*8+20]
3655*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*8+ 0]
3656*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*2+400*8+16]
3657*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1
3658*c0909341SAndroid Build Coastguard Worker    paddd           m8, m7
3659*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
3660*c0909341SAndroid Build Coastguard Worker    pslld           m8, 2
3661*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
3662*c0909341SAndroid Build Coastguard Worker    psubd           m0, m8, m7
    ; m1/m7 = new 343 row + stored row at 400*24 + stored row at 400*20;
    ; afterwards 400*20 receives the new 444 row and 400*24 the new 343 row
3663*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+wq*2+400*24+ 0]
3664*c0909341SAndroid Build Coastguard Worker    paddd           m7, m0, [t3+wq*2+400*24+16]
3665*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+wq*2+400*20+ 0]
3666*c0909341SAndroid Build Coastguard Worker    paddd           m7, [t3+wq*2+400*20+16]
3667*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*20+ 0], m5
3668*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*20+16], m8
3669*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*24+ 0], m4
3670*c0909341SAndroid Build Coastguard Worker    mova [t3+wq*2+400*24+16], m0
    ; Load src pixels and the stored a5 row, interleave with m6 to widen words
    ; to dwords; with m6 = 0 each pmaddwd lane is a plain a * src product.
    ; m8/m2 = a5*src low/high, m0/m3 = a3*src low/high, m4/m5 = src low/high.
3671*c0909341SAndroid Build Coastguard Worker    mova            m5, [dstq+wq]
3672*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+wq*1+400* 6]
3673*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
3674*c0909341SAndroid Build Coastguard Worker    punpcklwd       m8, m2, m6
3675*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m4               ; a5 * src
3676*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m3, m6
3677*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m4               ; a3 * src
3678*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
3679*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m6
3680*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m5
3681*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
3682*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m5
3683*c0909341SAndroid Build Coastguard Worker    psubd           m1, m0               ; b3 - a3 * src + (1 << 8)
3684*c0909341SAndroid Build Coastguard Worker    pslld           m4, 13
3685*c0909341SAndroid Build Coastguard Worker    pslld           m5, 13
    ; the stored b5 row is read back from t3+400*12 (low half, then high half)
3686*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+wq*2+400*12+ 0]
3687*c0909341SAndroid Build Coastguard Worker    psubd           m0, m8               ; b5 - a5 * src + (1 << 8)
3688*c0909341SAndroid Build Coastguard Worker    mova            m8, [t3+wq*2+400*12+16]
3689*c0909341SAndroid Build Coastguard Worker    psubd           m8, m2
3690*c0909341SAndroid Build Coastguard Worker    psubd           m7, m3
    ; Pack both terms into one register as 16-bit pairs per dword:
    ; low word = (5-term >> 8), high word = the bits of (3-term << 7) that the
    ; mask leaves out. Assumes pd_0xffff is 0x0000ffff per dword - confirm.
3691*c0909341SAndroid Build Coastguard Worker    mova            m2, [base+pd_0xffff]
3692*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
3693*c0909341SAndroid Build Coastguard Worker    psrld           m0, 8
3694*c0909341SAndroid Build Coastguard Worker    psrld           m8, 8
3695*c0909341SAndroid Build Coastguard Worker    pslld           m7, 7
3696*c0909341SAndroid Build Coastguard Worker    pand            m0, m2
3697*c0909341SAndroid Build Coastguard Worker    pandn           m3, m2, m1
3698*c0909341SAndroid Build Coastguard Worker    por             m0, m3
3699*c0909341SAndroid Build Coastguard Worker    pand            m8, m2
3700*c0909341SAndroid Build Coastguard Worker    pandn           m2, m7
3701*c0909341SAndroid Build Coastguard Worker    por             m2, m8
3702*c0909341SAndroid Build Coastguard Worker    mova            m1, [base+pd_4096]
    ; multiply each word pair by the word pair in m15 and sum per dword
    ; (m15 presumably holds the two filter weights - set outside this chunk)
3703*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
3704*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
    ; m7 must be zero for the pmaxsw clamp below; on x86-64 the name m7 is
    ; first swapped with m6, and the pxor then clears whichever register is
    ; named m7 at this point
3705*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3706*c0909341SAndroid Build Coastguard Worker    SWAP            m7, m6
3707*c0909341SAndroid Build Coastguard Worker%endif
3708*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
    ; (src << 13) + pd_4096 + weighted sum, then arithmetic shift right by 8
3709*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1
3710*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1
3711*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4
3712*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
3713*c0909341SAndroid Build Coastguard Worker    psrad           m0, 8
3714*c0909341SAndroid Build Coastguard Worker    psrad           m2, 8
    ; signed saturation to 16 bits, clamp negatives to 0, then shift right by 5
3715*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m2              ; clip
3716*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m7
3717*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 5
3718*c0909341SAndroid Build Coastguard Worker    mova     [dstq+wq], m0
3719*c0909341SAndroid Build Coastguard Worker    add             wq, 16
3720*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
3721*c0909341SAndroid Build Coastguard Worker    add           dstq, stridemp
    ; x86-32 only: write the advanced dst pointer back to dstm
3722*c0909341SAndroid Build Coastguard Worker    movif32       dstm, dstq
3723*c0909341SAndroid Build Coastguard Worker    ret
3724