xref: /aosp_15_r20/external/libdav1d/src/x86/looprestoration16_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 32
32*c0909341SAndroid Build Coastguard Worker
; Read-only constants for the 16 bpc loop-restoration kernels.
; The sgr_* shuffles and several scalar constants below (pb_m6_m5, pb_2_3,
; pw_1023, pw_164_24, pw_455_24, pd_8, pd_25, pd_4096, pd_34816, pf_256) are
; not referenced by the part of the file visible here; presumably they belong
; to routines further down.
33*c0909341SAndroid Build Coastguard Workersgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
34*c0909341SAndroid Build Coastguard Workersgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
; wiener_lshuf5 / wiener_lshuf7 are 32-byte pshufb controls. The first 16 bytes
; repeat one 16-bit word (bytes 4-5, resp. bytes 8-9) over the leading words of
; the low lane; the second 16 bytes are the identity permutation for the high
; lane. wiener_lshuf7 is applied by the *_extend_left paths of the 7-tap filter.
35*c0909341SAndroid Build Coastguard Workerwiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
36*c0909341SAndroid Build Coastguard Worker               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
37*c0909341SAndroid Build Coastguard Workerwiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
38*c0909341SAndroid Build Coastguard Worker               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
; wiener_shufA..E are 16-byte pshufb controls that gather 16-bit words into the
; pairs consumed by pmaddwd in the horizontal passes. The -1 entries in
; wiener_shufD have the top bit set, so pshufb writes zero bytes there.
39*c0909341SAndroid Build Coastguard Workerwiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
40*c0909341SAndroid Build Coastguard Workerwiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
41*c0909341SAndroid Build Coastguard Workerwiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
42*c0909341SAndroid Build Coastguard Workerwiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
43*c0909341SAndroid Build Coastguard Workerwiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
44*c0909341SAndroid Build Coastguard Worker
; Two-entry tables, one dword per entry, read with [base+<table>+t3*4] where
; t3 = pixel_max >> 11 (so 1023 selects entry 0 and 4095 selects entry 1;
; presumably 10-bit and 12-bit content - confirm against the callers).
;   wiener_hshift: word multiplier applied to the horizontal coefficients
;   wiener_vshift: pmulhuw multiplier applied to the final vertical result
;   wiener_round:  dword rounding/bias term added before the vertical shift
45*c0909341SAndroid Build Coastguard Workerwiener_hshift: dw 4, 4, 1, 1
46*c0909341SAndroid Build Coastguard Workerwiener_vshift: dw 1024, 1024, 4096, 4096
47*c0909341SAndroid Build Coastguard Workerwiener_round:  dd 1049600, 1048832
48*c0909341SAndroid Build Coastguard Worker
; pb_6_7, pb_m2_m1 and pb_m10_m9 are the per-register byte-index bases used by
; .extend_right; they are 8 apart, matching the -8 / +0 / +8 byte offsets of
; the three source loads they are applied to.
49*c0909341SAndroid Build Coastguard Workerpb_m10_m9:     times 2 db -10, -9
50*c0909341SAndroid Build Coastguard Workerpb_m6_m5:      times 2 db  -6, -5
51*c0909341SAndroid Build Coastguard Workerpb_m2_m1:      times 2 db  -2, -1
52*c0909341SAndroid Build Coastguard Workerpb_2_3:        times 2 db   2,  3
53*c0909341SAndroid Build Coastguard Workerpb_6_7:        times 2 db   6,  7
54*c0909341SAndroid Build Coastguard Workerpw_1023:       times 2 dw 1023
55*c0909341SAndroid Build Coastguard Workerpw_164_24:     dw 164, 24
56*c0909341SAndroid Build Coastguard Workerpw_455_24:     dw 455, 24
57*c0909341SAndroid Build Coastguard Workerpd_8:          dd 8
58*c0909341SAndroid Build Coastguard Workerpd_25:         dd 25
59*c0909341SAndroid Build Coastguard Workerpd_4096:       dd 4096
60*c0909341SAndroid Build Coastguard Workerpd_34816:      dd 34816
; pd_m262128 = (1 << 4) - (1 << 18); added to the horizontal sums before the
; arithmetic right shift by 4.
61*c0909341SAndroid Build Coastguard Workerpd_m262128:    dd -262128
62*c0909341SAndroid Build Coastguard Workerpf_256:        dd 256.0
63*c0909341SAndroid Build Coastguard Worker
; pw_256 aliases sgr_lshuf5: its first eight bytes are 0,1 repeated, which read
; as little-endian 16-bit words are four copies of 256. Only those first four
; words have that value.
64*c0909341SAndroid Build Coastguard Worker%define pw_256 sgr_lshuf5
65*c0909341SAndroid Build Coastguard Worker
66*c0909341SAndroid Build Coastguard Workercextern pb_0to63
67*c0909341SAndroid Build Coastguard Worker
68*c0909341SAndroid Build Coastguard WorkerSECTION .text
69*c0909341SAndroid Build Coastguard Worker
70*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
71*c0909341SAndroid Build Coastguard Worker
72*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
; wiener_filter7_16bpc (AVX2): argument setup and row scheduling for the
; 7-tap filter. The per-row work is done by the local routines .h/.h_top
; (horizontal only), .hv/.hv_bottom (horizontal + vertical) and .v (vertical
; only), which are called from here.
;
; Named arguments (from the cglobal line): dst, stride, left, lpf, w, h, edge,
; flt. flt is fetched from r6mp, edge from r7m, and r8m is read as pixel_max.
;
; State established by the setup code and relied on by the local routines:
;   m6..m9   wiener_shufA..wiener_shufD, broadcast to both 128-bit lanes
;   m12, m13 horizontal coefficient pairs (x0 x1 / x2 x3) from flt+0 / flt+4,
;            multiplied by the selected wiener_hshift entry
;   m14, m15 vertical coefficient pairs (y0 y1 / y2 y3) from flt+16 / flt+20
;   m10, m11 selected wiener_round / wiener_vshift entries
;   t1..t6   pointers to intermediate row buffers on the stack, spaced
;            384*2 bytes apart; t0 is the buffer the next .hv call writes
;   [rsp]    saved source pointer consumed by the .hv_bottom calls
;   wq       w is doubled (presumably pixels -> bytes for 16-bit samples) and
;            then negated; lpf, dst and t1 are first advanced by the doubled
;            width, so the row loops index with r10 running from wq up to 0.
73*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
74*c0909341SAndroid Build Coastguard Worker                                                     w, h, edge, flt
; "base" lets the constant tables be addressed relative to t4 (loaded with the
; address of wiener_hshift below), so that an index register can be combined
; with the table address.
75*c0909341SAndroid Build Coastguard Worker%define base t4-wiener_hshift
76*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
77*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
78*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
79*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
80*c0909341SAndroid Build Coastguard Worker    mov            t3d, r8m ; pixel_max
81*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m6, [wiener_shufA]
82*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
83*c0909341SAndroid Build Coastguard Worker    lea             t4, [wiener_hshift]
84*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m7, [wiener_shufB]
85*c0909341SAndroid Build Coastguard Worker    add             wd, wd
86*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    ; t3 = pixel_max >> 11 selects entry 0 or 1 of the two-entry tables
    ; (1023 -> 0, 4095 -> 1).
87*c0909341SAndroid Build Coastguard Worker    shr            t3d, 11
88*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [fltq+16] ; y0 y1
89*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
90*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [fltq+20] ; y2 y3
91*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
92*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m8, [wiener_shufC]
    ; First row buffer starts 16 bytes above rsp; the qword at [rsp] is used
    ; as a scratch slot for the "below" pointer.
93*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+16]
94*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [wiener_shufD]
95*c0909341SAndroid Build Coastguard Worker    neg             wq
96*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
97*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [base+wiener_round+t3*4]
98*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
99*c0909341SAndroid Build Coastguard Worker    pmullw         m12, m0 ; upshift filter coefs to make the
100*c0909341SAndroid Build Coastguard Worker    pmullw         m13, m0 ; horizontal downshift constant
101*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
102*c0909341SAndroid Build Coastguard Worker    jz .no_top
    ; Top rows available: filter two rows from lpf horizontally. The first
    ; result is referenced by both t6 and t5, the second by t4.
103*c0909341SAndroid Build Coastguard Worker    call .h_top
104*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
105*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
106*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
107*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
108*c0909341SAndroid Build Coastguard Worker    call .h_top
    ; lpfq was already advanced by one stride above, so r10 ends up at the
    ; original lpf + 6*stride (same value the .no_top path stores).
109*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
    ; From here on the source rows are read from dst.
110*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
111*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
112*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
113*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
114*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
    ; Prime the remaining row buffers with up to three horizontally filtered
    ; rows. If h runs out early, finish with 1, 2 or 3 vertical-only passes
    ; (.v1 / .v2 / .v3).
115*c0909341SAndroid Build Coastguard Worker    call .h
116*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
117*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
118*c0909341SAndroid Build Coastguard Worker    dec             hd
119*c0909341SAndroid Build Coastguard Worker    jz .v1
120*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
121*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
122*c0909341SAndroid Build Coastguard Worker    call .h
123*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
124*c0909341SAndroid Build Coastguard Worker    dec             hd
125*c0909341SAndroid Build Coastguard Worker    jz .v2
126*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
127*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
128*c0909341SAndroid Build Coastguard Worker    call .h
129*c0909341SAndroid Build Coastguard Worker    dec             hd
130*c0909341SAndroid Build Coastguard Worker    jz .v3
131*c0909341SAndroid Build Coastguard Worker.main:
    ; t0 = next free row buffer; .hv stores its horizontal result there and
    ; then rotates t0..t6.
132*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
133*c0909341SAndroid Build Coastguard Worker.main_loop:
134*c0909341SAndroid Build Coastguard Worker    call .hv
135*c0909341SAndroid Build Coastguard Worker    dec             hd
136*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
137*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
138*c0909341SAndroid Build Coastguard Worker    jz .v3
    ; Bottom rows available: run two more full passes on the rows at the
    ; pointer saved in [rsp], then one vertical-only pass at .v1.
139*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
140*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
141*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
142*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
143*c0909341SAndroid Build Coastguard Worker.v1:
144*c0909341SAndroid Build Coastguard Worker    call .v
145*c0909341SAndroid Build Coastguard Worker    RET
146*c0909341SAndroid Build Coastguard Worker.no_top:
    ; No top rows: the saved "below" pointer is lpf + 6*stride, and the first
    ; filtered row is referenced by all of t2..t6, so it stands in for the
    ; missing rows above.
147*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
148*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
149*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
150*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
151*c0909341SAndroid Build Coastguard Worker    call .h
152*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
153*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
154*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
155*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
156*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
157*c0909341SAndroid Build Coastguard Worker    dec             hd
158*c0909341SAndroid Build Coastguard Worker    jz .v1
159*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
160*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
161*c0909341SAndroid Build Coastguard Worker    call .h
162*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
163*c0909341SAndroid Build Coastguard Worker    dec             hd
164*c0909341SAndroid Build Coastguard Worker    jz .v2
165*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
166*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
167*c0909341SAndroid Build Coastguard Worker    call .h
168*c0909341SAndroid Build Coastguard Worker    dec             hd
169*c0909341SAndroid Build Coastguard Worker    jz .v3
170*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
171*c0909341SAndroid Build Coastguard Worker    call .hv
172*c0909341SAndroid Build Coastguard Worker    dec             hd
173*c0909341SAndroid Build Coastguard Worker    jz .v3
    ; After the rotation in .hv, t0 equals the first row buffer, which t4..t6
    ; still reference on this path; skip ahead four buffers (384*8 bytes) so
    ; the next .hv writes to an unused one.
174*c0909341SAndroid Build Coastguard Worker    add             t0, 384*8
175*c0909341SAndroid Build Coastguard Worker    call .hv
176*c0909341SAndroid Build Coastguard Worker    dec             hd
177*c0909341SAndroid Build Coastguard Worker    jnz .main
    ; .v3 falls through into .v2, which jumps to .v1: three, two or one
    ; vertical-only passes before returning.
178*c0909341SAndroid Build Coastguard Worker.v3:
179*c0909341SAndroid Build Coastguard Worker    call .v
180*c0909341SAndroid Build Coastguard Worker.v2:
181*c0909341SAndroid Build Coastguard Worker    call .v
182*c0909341SAndroid Build Coastguard Worker    jmp .v1
; .extend_right: re-shuffle the three source registers m3, m4, m5 (loaded by
; the callers from [lpfq+r10-8], [lpfq+r10+0], [lpfq+r10+8]) when the block has
; no right edge and r10 is within 36 bytes of the end of the row.
; In:    r10d = current (negative) byte offset; only its low byte is used
;        m3, m4, m5 = source data
; Out:   m3, m4, m5 permuted in place with pshufb
; Clobb: m0, m1, m2
; For every register the shuffle control is min(C - r10, pb_0to63), computed
; per unsigned byte, where C is pb_6_7 / pb_m2_m1 / pb_m10_m9. pb_0to63 is
; external to this file; presumably it holds the byte values 0..63, in which
; case bytes before the limit keep their position and later bytes repeat the
; word at the limit - TODO confirm against its definition.
183*c0909341SAndroid Build Coastguard Worker.extend_right:
184*c0909341SAndroid Build Coastguard Worker    movd           xm1, r10d
185*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [pb_6_7]
186*c0909341SAndroid Build Coastguard Worker    mova            m2, [pb_0to63]
    ; m1 = low byte of r10 replicated to all 32 bytes
187*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m1, xm1
188*c0909341SAndroid Build Coastguard Worker    psubb           m0, m1
189*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
190*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m0
    ; same for m4, whose load starts 8 bytes later, hence the base lowered by 8
191*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [pb_m2_m1]
192*c0909341SAndroid Build Coastguard Worker    psubb           m0, m1
193*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
194*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m0
    ; and for m5, another 8 bytes later
195*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [pb_m10_m9]
196*c0909341SAndroid Build Coastguard Worker    psubb           m0, m1
197*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2
198*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m0
199*c0909341SAndroid Build Coastguard Worker    ret
; .h / .h_top: horizontal 7-tap pass over one source row.
; In:    lpfq = source row (biased so that [lpfq+r10] with r10 = wq is the
;        start of the row), t1 = destination row buffer (same bias),
;        wq = negative byte width, edge flags in edgeb,
;        m6..m9 shuffle controls, m12/m13 horizontal coefficient pairs
; Out:   one row of 16-bit intermediates stored to [t1+r10], 32 bytes per
;        iteration
; Clobb: r10, m0..m5, flags; .h additionally advances leftq by 8
; .h takes the first 8 bytes of the row's left context from [leftq];
; .h_top reads them from the row itself ([lpfq+r10-8]).
200*c0909341SAndroid Build Coastguard Worker.h:
201*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
202*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
203*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
    ; Low 8 bytes of m3 come from the left buffer; blend mask 0xfc takes
    ; dwords 2..7 from the source row.
204*c0909341SAndroid Build Coastguard Worker    movq           xm3, [leftq]
205*c0909341SAndroid Build Coastguard Worker    vpblendd        m3, [lpfq+r10-8], 0xfc
206*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
207*c0909341SAndroid Build Coastguard Worker    jmp .h_main
208*c0909341SAndroid Build Coastguard Worker.h_extend_left:
    ; No left edge: build the equivalent of the "-8" load from data at and
    ; after the row start, then use wiener_lshuf7 to repeat the first pixel
    ; (bytes 8-9 of the low lane) over the leading words. m4 is loaded here,
    ; so the shared code is entered at .h_main2.
209*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
210*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+r10] ; before the start of the buffer
211*c0909341SAndroid Build Coastguard Worker    shufpd          m3, m4, 0x05
212*c0909341SAndroid Build Coastguard Worker    pshufb          m3, [wiener_lshuf7]
213*c0909341SAndroid Build Coastguard Worker    jmp .h_main2
214*c0909341SAndroid Build Coastguard Worker.h_top:
215*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
216*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
217*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
218*c0909341SAndroid Build Coastguard Worker.h_loop:
    ; Three overlapping 32-byte windows of the row: offsets -8, +0 and +8.
219*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-8]
220*c0909341SAndroid Build Coastguard Worker.h_main:
221*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+r10+0]
222*c0909341SAndroid Build Coastguard Worker.h_main2:
223*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+8]
224*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
225*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
    ; Without a right edge, only the iterations with r10 >= -36 are patched
    ; by .extend_right.
226*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36
227*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
228*c0909341SAndroid Build Coastguard Worker    call .extend_right
229*c0909341SAndroid Build Coastguard Worker.h_have_right:
    ; Shuffle the windows into word pairs, add the paired operands and
    ; multiply-accumulate with m12 (x0 x1) and m13 (x2 x3). m0/m3 form the
    ; first group of eight dword sums, m1/m4 the second.
230*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m6
231*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m7
232*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
233*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m8
234*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12
235*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m9
236*c0909341SAndroid Build Coastguard Worker    paddw           m3, m1
237*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m6
238*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m13
239*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5, m7
240*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
241*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
242*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m8
243*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
244*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m9
245*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
246*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m13
    ; Add the bias constant, combine the partial sums, shift right by 4, pack
    ; to signed words and shift right by one more bit.
247*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
248*c0909341SAndroid Build Coastguard Worker    paddd           m1, m2
249*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
250*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
251*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
252*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
253*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
254*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
255*c0909341SAndroid Build Coastguard Worker    mova      [t1+r10], m0
    ; r10 counts up from wq; loop while it is still negative.
256*c0909341SAndroid Build Coastguard Worker    add            r10, 32
257*c0909341SAndroid Build Coastguard Worker    jl .h_loop
258*c0909341SAndroid Build Coastguard Worker    ret
; .hv / .hv_bottom: horizontal pass on a new source row fused with the
; vertical 7-tap pass, producing one output row.
; In:    lpfq = source row (.hv first advances it by strideq; .hv_bottom uses
;        it as is), t1..t6 = previously filtered rows, t0 = buffer for the
;        new row, dstq = output row, wq = negative byte width,
;        m6..m9 shuffles, m12/m13 horizontal and m14/m15 vertical coefficient
;        pairs, m10 rounding term, m11 final multiplier
; Out:   new intermediate row at [t0+r10], output pixels at [dstq+r10];
;        afterwards the row pointers are rotated and dstq += strideq
; Clobb: r10, m0..m5, flags; .hv additionally advances leftq by 8 when
;        LR_HAVE_LEFT is set
259*c0909341SAndroid Build Coastguard WorkerALIGN function_align
260*c0909341SAndroid Build Coastguard Worker.hv:
261*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
262*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
263*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
264*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
    ; Low 8 bytes from the left buffer, dwords 2..7 from the source row.
265*c0909341SAndroid Build Coastguard Worker    movq           xm3, [leftq]
266*c0909341SAndroid Build Coastguard Worker    vpblendd        m3, [lpfq+r10-8], 0xfc
267*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
268*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
269*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
    ; No left edge: wiener_lshuf7 repeats the first pixel over the leading
    ; words. NOTE(review): unlike .h_extend_left, this path does load from
    ; [lpfq+r10-8], i.e. 8 bytes before the row start.
270*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-8]
271*c0909341SAndroid Build Coastguard Worker    pshufb          m3, [wiener_lshuf7]
272*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
273*c0909341SAndroid Build Coastguard Worker.hv_bottom:
    ; Entry without the lpfq advance and without the left buffer.
274*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
275*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
276*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
277*c0909341SAndroid Build Coastguard Worker.hv_loop:
278*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-8]
279*c0909341SAndroid Build Coastguard Worker.hv_main:
280*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+r10+0]
281*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+8]
282*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
283*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
284*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36
285*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
286*c0909341SAndroid Build Coastguard Worker    call .extend_right
287*c0909341SAndroid Build Coastguard Worker.hv_have_right:
    ; Horizontal part: same computation as in .h, interleaved with the loads
    ; for the vertical part.
288*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m6
289*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m7
290*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
291*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m8
292*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12
293*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m9
294*c0909341SAndroid Build Coastguard Worker    paddw           m3, m1
295*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m6
296*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m13
297*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5, m7
298*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
299*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m2, [pd_m262128]
300*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m8
301*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
302*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m9
303*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
304*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m13
305*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
306*c0909341SAndroid Build Coastguard Worker    paddd           m1, m2
    ; Vertical operands: m2 = t4 + t2 (paired with the y2 coefficient),
    ; m5 = t3 (paired with y3).
307*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4+r10]
308*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t2+r10]
309*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10]
310*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
311*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
312*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
313*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
314*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
    ; m4 = t5 + t1 (paired with y1)
315*c0909341SAndroid Build Coastguard Worker    mova            m4, [t5+r10]
316*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t1+r10]
317*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
    ; m3 = new row + t6 (paired with y0). t6 is read before the store to t0,
    ; which matters because t0 and t6 can refer to the same buffer after the
    ; rotation at the end of this routine.
318*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t6+r10]
319*c0909341SAndroid Build Coastguard Worker    mova      [t0+r10], m0
    ; Interleave the operand pairs and multiply-accumulate with m15 (y2 y3)
    ; and m14 (y0 y1).
320*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2, m5
321*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
322*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m5
323*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
324*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m3, m4
325*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m14
326*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
327*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
    ; Add the rounding term, shift right by 5, pack with unsigned saturation
    ; and apply the final multiplier with pmulhuw (high 16 bits of the
    ; unsigned product).
328*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10
329*c0909341SAndroid Build Coastguard Worker    paddd           m2, m10
330*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1
331*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
332*c0909341SAndroid Build Coastguard Worker    psrad           m0, 5
333*c0909341SAndroid Build Coastguard Worker    psrad           m2, 5
334*c0909341SAndroid Build Coastguard Worker    packusdw        m0, m2
335*c0909341SAndroid Build Coastguard Worker    pmulhuw         m0, m11
336*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
337*c0909341SAndroid Build Coastguard Worker    add            r10, 32
338*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
    ; Rotate the row pointers: every row moves one slot older, the row just
    ; written becomes t1, and the oldest buffer is reused as the next t0.
339*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
340*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
341*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
342*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
343*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
344*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
345*c0909341SAndroid Build Coastguard Worker    mov             t0, t6
346*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
347*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical-only 7-tap pass producing one output row from the rows already
; held in the row buffers; no new source row is read.
; In:    t1..t6 = filtered rows, dstq = output row, wq = negative byte width,
;        m14/m15 vertical coefficient pairs, m10 rounding term,
;        m11 final multiplier
; Out:   output pixels at [dstq+r10]; t2..t6 shifted by one row (t1 is kept),
;        dstq += strideq
; Clobb: r10, m0..m4, flags
; Compared with .hv, the row at t1 is used twice: once where .hv uses the
; newly filtered row (added to t6) and once in its own slot (added to t5).
348*c0909341SAndroid Build Coastguard Worker.v:
349*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
350*c0909341SAndroid Build Coastguard Worker.v_loop:
    ; m1 = t4 + t2, m2 = t3, m3 = t1 + t6, m4 = t1 + t5
351*c0909341SAndroid Build Coastguard Worker    mova            m1, [t4+r10]
352*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10]
353*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10]
354*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10]
355*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t6+r10]
356*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t5+r10]
    ; Interleave into word pairs and multiply-accumulate with m15 (y2 y3)
    ; and m14 (y0 y1).
357*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m2
358*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
359*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m2
360*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m15
361*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4
362*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
363*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
364*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
    ; Round, shift right by 5, pack with unsigned saturation, apply the
    ; final multiplier and store.
365*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10
366*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
367*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
368*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
369*c0909341SAndroid Build Coastguard Worker    psrad           m0, 5
370*c0909341SAndroid Build Coastguard Worker    psrad           m1, 5
371*c0909341SAndroid Build Coastguard Worker    packusdw        m0, m1
372*c0909341SAndroid Build Coastguard Worker    pmulhuw         m0, m11
373*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
374*c0909341SAndroid Build Coastguard Worker    add            r10, 32
375*c0909341SAndroid Build Coastguard Worker    jl .v_loop
    ; Shift the older rows by one slot; t1 stays, so the newest row is
    ; reused by the next .v call.
376*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
377*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
378*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
379*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
380*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
381*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
382*c0909341SAndroid Build Coastguard Worker    ret
383*c0909341SAndroid Build Coastguard Worker
383*c0909341SAndroid Build Coastguard Worker
384*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
385*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, flt
386*c0909341SAndroid Build Coastguard Worker%define base t4-wiener_hshift
387*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
388*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
389*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
390*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
391*c0909341SAndroid Build Coastguard Worker    mov            t3d, r8m ; pixel_max
392*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m5, [wiener_shufE]
393*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m11, [fltq+ 2] ; x1
394*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m6, [wiener_shufB]
395*c0909341SAndroid Build Coastguard Worker    lea             t4, [wiener_hshift]
396*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m7, [wiener_shufD]
397*c0909341SAndroid Build Coastguard Worker    add             wd, wd
398*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
399*c0909341SAndroid Build Coastguard Worker    shr            t3d, 11
400*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
401*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
402*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, [fltq+18] ; y1
403*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
404*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [fltq+20] ; y2 y3
405*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+16]
406*c0909341SAndroid Build Coastguard Worker    neg             wq
407*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
408*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+wiener_round+t3*4]
409*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
410*c0909341SAndroid Build Coastguard Worker    mova           m15, [wiener_lshuf5]
411*c0909341SAndroid Build Coastguard Worker    pmullw         m11, m0
412*c0909341SAndroid Build Coastguard Worker    pmullw         m12, m0
413*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
414*c0909341SAndroid Build Coastguard Worker    jz .no_top
415*c0909341SAndroid Build Coastguard Worker    call .h_top
416*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
417*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
418*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
419*c0909341SAndroid Build Coastguard Worker    call .h_top
420*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
421*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
422*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
423*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
424*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
425*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
426*c0909341SAndroid Build Coastguard Worker    call .h
427*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
428*c0909341SAndroid Build Coastguard Worker    dec             hd
429*c0909341SAndroid Build Coastguard Worker    jz .v1
430*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
431*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
432*c0909341SAndroid Build Coastguard Worker    call .h
433*c0909341SAndroid Build Coastguard Worker    dec             hd
434*c0909341SAndroid Build Coastguard Worker    jz .v2
435*c0909341SAndroid Build Coastguard Worker.main:
436*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
437*c0909341SAndroid Build Coastguard Worker.main_loop:
438*c0909341SAndroid Build Coastguard Worker    call .hv
439*c0909341SAndroid Build Coastguard Worker    dec             hd
440*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
441*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
442*c0909341SAndroid Build Coastguard Worker    jz .v2
443*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
444*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
445*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
446*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
447*c0909341SAndroid Build Coastguard Worker.end:
448*c0909341SAndroid Build Coastguard Worker    RET
449*c0909341SAndroid Build Coastguard Worker.no_top:
450*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
451*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
452*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
453*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
454*c0909341SAndroid Build Coastguard Worker    call .h
455*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
456*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
457*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
458*c0909341SAndroid Build Coastguard Worker    dec             hd
459*c0909341SAndroid Build Coastguard Worker    jz .v1
460*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
461*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
462*c0909341SAndroid Build Coastguard Worker    call .h
463*c0909341SAndroid Build Coastguard Worker    dec             hd
464*c0909341SAndroid Build Coastguard Worker    jz .v2
465*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
466*c0909341SAndroid Build Coastguard Worker    call .hv
467*c0909341SAndroid Build Coastguard Worker    dec             hd
468*c0909341SAndroid Build Coastguard Worker    jz .v2
469*c0909341SAndroid Build Coastguard Worker    add             t0, 384*6
470*c0909341SAndroid Build Coastguard Worker    call .hv
471*c0909341SAndroid Build Coastguard Worker    dec             hd
472*c0909341SAndroid Build Coastguard Worker    jnz .main
473*c0909341SAndroid Build Coastguard Worker.v2:
474*c0909341SAndroid Build Coastguard Worker    call .v
475*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
476*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
477*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
478*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
479*c0909341SAndroid Build Coastguard Worker.v1:
480*c0909341SAndroid Build Coastguard Worker    call .v
481*c0909341SAndroid Build Coastguard Worker    jmp .end
482*c0909341SAndroid Build Coastguard Worker.extend_right: ; right-edge fixup of m3/m4; reached from .h/.hv only when LR_HAVE_RIGHT is clear and r10d >= -34
483*c0909341SAndroid Build Coastguard Worker    movd           xm2, r10d ; r10 = current (negative) column byte offset
484*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [pb_2_3] ; base constants for m3 (pb_* tables are defined outside this chunk)
485*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m1, [pb_m6_m5] ; base constants for m4
486*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m2, xm2 ; low byte of r10 replicated into every byte
487*c0909341SAndroid Build Coastguard Worker    psubb           m0, m2 ; constant - offset: per-byte limit for m3
488*c0909341SAndroid Build Coastguard Worker    psubb           m1, m2 ; constant - offset: per-byte limit for m4
489*c0909341SAndroid Build Coastguard Worker    mova            m2, [pb_0to63] ; NOTE(review): presumably an ascending byte ramp - confirm against the table
490*c0909341SAndroid Build Coastguard Worker    pminub          m0, m2 ; unsigned per-byte minimum of limit and table entry
491*c0909341SAndroid Build Coastguard Worker    pminub          m1, m2
492*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m0 ; rearrange the first source vector with the resulting byte indices
493*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m1 ; and the second one
494*c0909341SAndroid Build Coastguard Worker    ret
495*c0909341SAndroid Build Coastguard Worker.h: ; horizontal pass over the row at lpfq, 16-bit results go to [t1+r10]; left pixels are taken from leftq
496*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = column byte offset, stepped by 32 for as long as it is negative
497*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
498*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
499*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4] ; 4 bytes from the left buffer become dword 0
500*c0909341SAndroid Build Coastguard Worker    vpblendd        m3, [lpfq+r10-4], 0xfe ; dwords 1-7 come from the row itself
501*c0909341SAndroid Build Coastguard Worker    add          leftq, 8 ; step leftq by 8 bytes
502*c0909341SAndroid Build Coastguard Worker    jmp .h_main
503*c0909341SAndroid Build Coastguard Worker.h_extend_left: ; LR_HAVE_LEFT clear: build the first vector without reading in front of the row
504*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
505*c0909341SAndroid Build Coastguard Worker    mova            m3, [lpfq+r10] ; before the start of the buffer
506*c0909341SAndroid Build Coastguard Worker    palignr         m3, m4, 12 ; per lane: m4 bytes 12-15 followed by m3 bytes 0-11 (layout of a load from offset -4)
507*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m15 ; m15 = wiener_lshuf5; NOTE(review): presumably rebuilds the left padding from the first pixel - confirm
508*c0909341SAndroid Build Coastguard Worker    jmp .h_main
509*c0909341SAndroid Build Coastguard Worker.h_top: ; alternate entry that never consumes leftq: with LR_HAVE_LEFT the first vector is read from [lpfq+r10-4]
510*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
511*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
512*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
513*c0909341SAndroid Build Coastguard Worker.h_loop: ; one iteration per 32 output bytes
514*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-4] ; first source vector, starts 4 bytes before the output position
515*c0909341SAndroid Build Coastguard Worker.h_main:
516*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+4] ; second source vector, 8 bytes after the first
517*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
518*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
519*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34 ; fixup is only applied once r10d >= -34
520*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
521*c0909341SAndroid Build Coastguard Worker    call .extend_right
522*c0909341SAndroid Build Coastguard Worker.h_have_right: ; shuffle controls m5-m7 and constant m8 are set up before this chunk
523*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m5
524*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11 ; m11 and m12 were multiplied by the wiener_hshift entry during setup
525*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m5
526*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
527*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m3, m6
528*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m7
529*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3 ; add the m6- and m7-shuffled copies before the multiply
530*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m4, m6
531*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m12
532*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m7
533*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
534*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m12
535*c0909341SAndroid Build Coastguard Worker    paddd           m0, m8 ; add the dword constant in m8
536*c0909341SAndroid Build Coastguard Worker    paddd           m1, m8
537*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2 ; combine the m11 and m12 partial products
538*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
539*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
540*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
541*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1 ; narrow to signed words with saturation
542*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
543*c0909341SAndroid Build Coastguard Worker    mova      [t1+r10], m0 ; store the intermediate row
544*c0909341SAndroid Build Coastguard Worker    add            r10, 32
545*c0909341SAndroid Build Coastguard Worker    jl .h_loop
546*c0909341SAndroid Build Coastguard Worker    ret
547*c0909341SAndroid Build Coastguard WorkerALIGN function_align
548*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal pass on the next row (stored at [t0]) fused with a vertical pass over five rows (stored at [dstq])
549*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq ; advance the source pointer by one row
550*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = column byte offset, stepped by 32 for as long as it is negative
551*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
552*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
553*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4] ; 4 bytes from the left buffer become dword 0
554*c0909341SAndroid Build Coastguard Worker    vpblendd        m3, [lpfq+r10-4], 0xfe ; dwords 1-7 come from the row itself
555*c0909341SAndroid Build Coastguard Worker    add          leftq, 8 ; step leftq by 8 bytes
556*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
557*c0909341SAndroid Build Coastguard Worker.hv_extend_left: ; LR_HAVE_LEFT clear
558*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-4] ; unlike .h_extend_left, this reads the 4 bytes in front of the row directly
559*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m15 ; m15 = wiener_lshuf5; NOTE(review): presumably rebuilds the left padding from the first pixel - confirm
560*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
561*c0909341SAndroid Build Coastguard Worker.hv_bottom: ; alternate entry: lpfq is not advanced here and leftq is never consumed
562*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
563*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
564*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
565*c0909341SAndroid Build Coastguard Worker.hv_loop: ; one iteration per 32 output bytes
566*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-4] ; first source vector
567*c0909341SAndroid Build Coastguard Worker.hv_main:
568*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+4] ; second source vector, 8 bytes after the first
569*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
570*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
571*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34 ; fixup is only applied once r10d >= -34
572*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
573*c0909341SAndroid Build Coastguard Worker    call .extend_right
574*c0909341SAndroid Build Coastguard Worker.hv_have_right: ; horizontal part: same arithmetic as .h_have_right, interleaved with the vertical loads
575*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m3, m5
576*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11
577*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m4, m5
578*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
579*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m3, m6
580*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m7
581*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3
582*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m4, m6
583*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m12
584*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m7
585*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
586*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m12
587*c0909341SAndroid Build Coastguard Worker    paddd           m0, m8
588*c0909341SAndroid Build Coastguard Worker    paddd           m1, m8
589*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
590*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10] ; vertical part starts: row at t3
591*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t1+r10] ; + row at t1 (these two share one coefficient)
592*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
593*c0909341SAndroid Build Coastguard Worker    mova            m4, [t2+r10] ; row at t2
594*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m2, m4 ; (t1+t3, t2) word pairs, upper halves
595*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14 ; m14 = y2 y3 pair
596*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4 ; (t1+t3, t2) word pairs, lower halves
597*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+r10] ; row at t4, read before the store to [t0+r10] below
598*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
599*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
600*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1 ; horizontal result narrowed to signed words
601*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
602*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
603*c0909341SAndroid Build Coastguard Worker    mova      [t0+r10], m0 ; keep the new intermediate row for later calls
604*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m4 ; (new row, t4) word pairs, upper halves
605*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13 ; m13 is loaded before this chunk
606*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4 ; (new row, t4) word pairs, lower halves
607*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13
608*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9 ; m9 = wiener_round entry
609*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
610*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
611*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
612*c0909341SAndroid Build Coastguard Worker    psrad           m1, 5
613*c0909341SAndroid Build Coastguard Worker    psrad           m0, 5
614*c0909341SAndroid Build Coastguard Worker    packusdw        m0, m1 ; narrow to unsigned words with saturation
615*c0909341SAndroid Build Coastguard Worker    pmulhuw         m0, m10 ; m10 = wiener_vshift entry
616*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0 ; final output pixels
617*c0909341SAndroid Build Coastguard Worker    add            r10, 32
618*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
619*c0909341SAndroid Build Coastguard Worker    mov             t4, t3 ; rotate the row pointers by one row
620*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
621*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
622*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
623*c0909341SAndroid Build Coastguard Worker    mov             t0, t4 ; t0 now aliases t4: the next call overwrites that row after reading it
624*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq ; next output row
625*c0909341SAndroid Build Coastguard Worker    ret
626*c0909341SAndroid Build Coastguard Worker.v: ; vertical-only pass: same arithmetic as the vertical half of .hv, with the row at t1 standing in for a new row
627*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = column byte offset, stepped by 32 for as long as it is negative
628*c0909341SAndroid Build Coastguard Worker.v_loop:
629*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10] ; row at t1 (used twice below)
630*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0, [t3+r10] ; t1 + t3
631*c0909341SAndroid Build Coastguard Worker    mova            m1, [t2+r10] ; row at t2
632*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+r10] ; row at t4
633*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m2, m1 ; (t1+t3, t2) word pairs, upper halves
634*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14 ; m14 = y2 y3 pair
635*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m1 ; (t1+t3, t2) word pairs, lower halves
636*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
637*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m4 ; (t1, t4) word pairs, upper halves
638*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13 ; m13 is loaded before this chunk
639*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4 ; (t1, t4) word pairs, lower halves
640*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13
641*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9 ; m9 = wiener_round entry
642*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
643*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
644*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
645*c0909341SAndroid Build Coastguard Worker    psrad           m1, 5
646*c0909341SAndroid Build Coastguard Worker    psrad           m0, 5
647*c0909341SAndroid Build Coastguard Worker    packusdw        m0, m1 ; narrow to unsigned words with saturation
648*c0909341SAndroid Build Coastguard Worker    pmulhuw         m0, m10 ; m10 = wiener_vshift entry
649*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0 ; final output pixels; dstq and t1-t4 are left unchanged for the caller
650*c0909341SAndroid Build Coastguard Worker    add            r10, 32
651*c0909341SAndroid Build Coastguard Worker    jl .v_loop
652*c0909341SAndroid Build Coastguard Worker    ret
653*c0909341SAndroid Build Coastguard Worker
654*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_16bpc, 4, 14, 16, 400*24+16, dst, stride, left, lpf, \
655*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, params
656*c0909341SAndroid Build Coastguard Worker%define base r13-pb_m10_m9
657*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
658*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
659*c0909341SAndroid Build Coastguard Worker    lea            r13, [pb_m10_m9] ; r13 anchors the constant tables; also indexed directly by .extend_right
660*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
661*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
662*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m7, [paramsq+8] ; w0
663*c0909341SAndroid Build Coastguard Worker    add             wd, wd ; w *= 2: row width in bytes for 16-bit pixels
664*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [base+pd_8]
665*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq ; lpfq/dstq point at the end of the row, columns use negative offsets
666*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+pd_25]
667*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
668*c0909341SAndroid Build Coastguard Worker    mova          xm10, [base+sgr_lshuf5] ; shuffle used by the *_extend_left paths
669*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+400*12+16] ; t1/t3/t4: stack buffers, biased by the row width
670*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [paramsq+0] ; s0
671*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+400*20+16]
672*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [base+pw_164_24]
673*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+20]
674*c0909341SAndroid Build Coastguard Worker    vbroadcastss   m13, [base+pf_256]
675*c0909341SAndroid Build Coastguard Worker    neg             wq ; wq = -(row width in bytes), the starting column offset
676*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
677*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6 ; m6 = 0
678*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [base+pw_1023]
679*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4 ; w0 << 4
680*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
681*c0909341SAndroid Build Coastguard Worker    jz .no_top
682*c0909341SAndroid Build Coastguard Worker    call .h_top ; box sums of the first row at lpfq -> [t1]
683*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
684*c0909341SAndroid Build Coastguard Worker    mov             t2, t1 ; .top_fixup reads [t1] and writes [t2]: double in place
685*c0909341SAndroid Build Coastguard Worker    call .top_fixup
686*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6 ; switch to the next sum buffer
687*c0909341SAndroid Build Coastguard Worker    call .h_top ; box sums of the second row at lpfq
688*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
689*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; continue with the rows at dst
690*c0909341SAndroid Build Coastguard Worker    add            r10, strideq ; r10 = old lpfq + 5*stride
691*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
692*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
693*c0909341SAndroid Build Coastguard Worker    dec             hd
694*c0909341SAndroid Build Coastguard Worker    jz .height1
695*c0909341SAndroid Build Coastguard Worker    or           edged, 16 ; from now on .h adds to the sums already stored at [t1]
696*c0909341SAndroid Build Coastguard Worker    call .h
697*c0909341SAndroid Build Coastguard Worker.main:
698*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
699*c0909341SAndroid Build Coastguard Worker    call .hv
700*c0909341SAndroid Build Coastguard Worker    call .prep_n ; .prep_n, .n0, .n1 and .v are defined outside this chunk
701*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
702*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
703*c0909341SAndroid Build Coastguard Worker.main_loop: ; consumes two rows per iteration (.h on one, .hv on the next)
704*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
705*c0909341SAndroid Build Coastguard Worker    test            hd, hd
706*c0909341SAndroid Build Coastguard Worker    jz .odd_height ; hd == 0 here: one row left over
707*c0909341SAndroid Build Coastguard Worker    call .h
708*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
709*c0909341SAndroid Build Coastguard Worker    call .hv
710*c0909341SAndroid Build Coastguard Worker    call .n0
711*c0909341SAndroid Build Coastguard Worker    call .n1
712*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
713*c0909341SAndroid Build Coastguard Worker    jge .main_loop
714*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
715*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
716*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp] ; pointer saved during setup ("below")
717*c0909341SAndroid Build Coastguard Worker    call .h_top
718*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
719*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
720*c0909341SAndroid Build Coastguard Worker.end:
721*c0909341SAndroid Build Coastguard Worker    call .n0
722*c0909341SAndroid Build Coastguard Worker    call .n1
723*c0909341SAndroid Build Coastguard Worker.end2:
724*c0909341SAndroid Build Coastguard Worker    RET
725*c0909341SAndroid Build Coastguard Worker.height1: ; LR_HAVE_TOP path when hd reaches 0 after the first dec
726*c0909341SAndroid Build Coastguard Worker    call .hv
727*c0909341SAndroid Build Coastguard Worker    call .prep_n
728*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
729*c0909341SAndroid Build Coastguard Worker.odd_height:
730*c0909341SAndroid Build Coastguard Worker    call .hv
731*c0909341SAndroid Build Coastguard Worker    call .n0
732*c0909341SAndroid Build Coastguard Worker    call .n1
733*c0909341SAndroid Build Coastguard Worker.odd_height_end:
734*c0909341SAndroid Build Coastguard Worker    call .v
735*c0909341SAndroid Build Coastguard Worker    call .n0
736*c0909341SAndroid Build Coastguard Worker    jmp .end2
737*c0909341SAndroid Build Coastguard Worker.extend_bottom: ; reached when the row count runs out early or LR_HAVE_BOTTOM is clear
738*c0909341SAndroid Build Coastguard Worker    call .v
739*c0909341SAndroid Build Coastguard Worker    jmp .end
740*c0909341SAndroid Build Coastguard Worker.no_top: ; LR_HAVE_TOP clear
741*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
742*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
743*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2] ; r10 = old lpfq + 6*stride
744*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; saved for the LR_HAVE_BOTTOM rows, as in the path above
745*c0909341SAndroid Build Coastguard Worker    call .h ; box sums of the first dst row -> [t1]
746*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
747*c0909341SAndroid Build Coastguard Worker    call .top_fixup ; doubled copy of those sums -> [t2]
748*c0909341SAndroid Build Coastguard Worker    dec             hd
749*c0909341SAndroid Build Coastguard Worker    jz .no_top_height1
750*c0909341SAndroid Build Coastguard Worker    or           edged, 16 ; from now on .h adds to the sums already stored at [t1]
751*c0909341SAndroid Build Coastguard Worker    mov             t0, t1
752*c0909341SAndroid Build Coastguard Worker    mov             t1, t2
753*c0909341SAndroid Build Coastguard Worker    jmp .main
754*c0909341SAndroid Build Coastguard Worker.no_top_height1:
755*c0909341SAndroid Build Coastguard Worker    call .v
756*c0909341SAndroid Build Coastguard Worker    call .prep_n
757*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
758*c0909341SAndroid Build Coastguard Worker.extend_right: ; right-edge fixup of m4/m5; reached from .h/.hv only when LR_HAVE_RIGHT is clear and r10d >= -36
759*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2] ; the word just before lpfq (lpfq was offset by the row width in bytes during setup)
760*c0909341SAndroid Build Coastguard Worker    movu            m1, [r13+r10+ 0] ; byte masks read from the table at r13 (pb_m10_m9), selected by the column offset
761*c0909341SAndroid Build Coastguard Worker    movu            m2, [r13+r10+16]
762*c0909341SAndroid Build Coastguard Worker    vpblendvb       m4, m0, m1 ; bytes whose mask has the top bit set are replaced by the broadcast word
763*c0909341SAndroid Build Coastguard Worker    vpblendvb       m5, m0, m2
764*c0909341SAndroid Build Coastguard Worker    ret
765*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum: 5-pixel sums and sums of squares for one row, stored at [t1]; left pixels come from leftq
766*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4] ; r10 = column byte offset, stepped by 32 for as long as it is negative
767*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
768*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
769*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq] ; 8 bytes of the left buffer in both qwords
770*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1 ; upper lane = first 16 bytes of the row
771*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq] ; first 32 bytes of the row
772*c0909341SAndroid Build Coastguard Worker    add          leftq, 8 ; step leftq by 8 bytes
773*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10 ; per lane: last 6 bytes of m5, then first 10 bytes of m4 (layout of a load from [lpfq+r10-2])
774*c0909341SAndroid Build Coastguard Worker    jmp .h_main
775*c0909341SAndroid Build Coastguard Worker.h_extend_left: ; LR_HAVE_LEFT clear
776*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq] ; first 16 bytes of the row
777*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, xm10 ; xm10 = sgr_lshuf5; NOTE(review): presumably pads the left side with the first pixel - confirm
778*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+10], 1 ; upper lane = row bytes 10-25
779*c0909341SAndroid Build Coastguard Worker    jmp .h_main
780*c0909341SAndroid Build Coastguard Worker.h_top: ; alternate entry that never consumes leftq: with LR_HAVE_LEFT the first vector is read from [lpfq+r10-2]
781*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
782*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
783*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
784*c0909341SAndroid Build Coastguard Worker.h_loop: ; one iteration per 16 pixels
785*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10- 2] ; first source vector (pixel p0 of each window)
786*c0909341SAndroid Build Coastguard Worker.h_main:
787*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+14] ; second source vector, 16 bytes after the first
788*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
789*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
790*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36 ; fixup is only applied once r10d >= -36
791*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
792*c0909341SAndroid Build Coastguard Worker    call .extend_right
793*c0909341SAndroid Build Coastguard Worker.h_have_right:
794*c0909341SAndroid Build Coastguard Worker    palignr         m2, m5, m4, 2 ; p1: source shifted by one pixel
795*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m2 ; p0 + p1
796*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 6 ; p3: source shifted by three pixels
797*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
798*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
799*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1 ; p1^2 + p3^2 (punpcklwd halves)
800*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
801*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; p1^2 + p3^2 (punpckhwd halves)
802*c0909341SAndroid Build Coastguard Worker    shufpd          m5, m4, m5, 0x05 ; p4: source shifted by four pixels
803*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5
804*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
805*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3 ; p0^2 + p4^2
806*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
807*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m5
808*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
809*c0909341SAndroid Build Coastguard Worker    shufps          m4, m5, q2121 ; p2: source shifted by two pixels
810*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4             ; sum
811*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6 ; interleave p2 with zero (m6)
812*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5 ; p2^2
813*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
814*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
815*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
816*c0909341SAndroid Build Coastguard Worker    test         edgeb, 16             ; y > 0
817*c0909341SAndroid Build Coastguard Worker    jz .h_loop_end
818*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t1+r10+400*0] ; flag set: add to the sums already stored for this buffer
819*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t1+r10+400*2]
820*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+400*4]
821*c0909341SAndroid Build Coastguard Worker.h_loop_end:
822*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5             ; sumsq
823*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
824*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*0], m0 ; word sums
825*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*2], m1 ; dword sums of squares from the punpcklwd halves
826*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*4], m2 ; dword sums of squares from the punpckhwd halves
827*c0909341SAndroid Build Coastguard Worker    add            r10, 32
828*c0909341SAndroid Build Coastguard Worker    jl .h_loop
829*c0909341SAndroid Build Coastguard Worker    ret
830*c0909341SAndroid Build Coastguard Worker.top_fixup: ; reads the three sum arrays at [t1], doubles them and stores them at [t2] (t2 may equal t1)
831*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4] ; same starting column offset as .h
832*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row need to be doubled
833*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400*0] ; word sums
834*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10+400*2] ; dword sums of squares
835*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+400*4]
836*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0 ; x + x = 2x
837*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
838*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
839*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m0
840*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m1
841*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m2
842*c0909341SAndroid Build Coastguard Worker    add            r10, 32
843*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
844*c0909341SAndroid Build Coastguard Worker    ret
845*c0909341SAndroid Build Coastguard WorkerALIGN function_align
846*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
847*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
848*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
849*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
850*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq]
851*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1
852*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
853*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
854*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10
855*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
856*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
857*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
858*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, xm10
859*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+10], 1
860*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
861*c0909341SAndroid Build Coastguard Worker.hv_bottom:
862*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
863*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
864*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
865*c0909341SAndroid Build Coastguard Worker.hv_loop:
866*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10- 2]
867*c0909341SAndroid Build Coastguard Worker.hv_main:
868*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+14]
869*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
870*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
871*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36
872*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
873*c0909341SAndroid Build Coastguard Worker    call .extend_right
874*c0909341SAndroid Build Coastguard Worker.hv_have_right:
875*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2
876*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m3
877*c0909341SAndroid Build Coastguard Worker    palignr         m1, m5, m4, 6
878*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
879*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m1
880*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
881*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1
882*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
883*c0909341SAndroid Build Coastguard Worker    shufpd          m5, m4, m5, 0x05
884*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5
885*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m4, m5
886*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
887*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1
888*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m5
889*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
890*c0909341SAndroid Build Coastguard Worker    shufps          m4, m5, q2121
891*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4            ; h sum
892*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m4, m6
893*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
894*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
895*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
896*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1
897*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5            ; h sumsq
898*c0909341SAndroid Build Coastguard Worker    paddd           m3, m4
899*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t1+r10+400*0]
900*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+r10+400*2]
901*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+r10+400*4]
902*c0909341SAndroid Build Coastguard Worker    test            hd, hd
903*c0909341SAndroid Build Coastguard Worker    jz .hv_last_row
904*c0909341SAndroid Build Coastguard Worker.hv_main2:
905*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10+400*0] ; hv sum
906*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t2+r10+400*2] ; hv sumsq
907*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t2+r10+400*4]
908*c0909341SAndroid Build Coastguard Worker    mova [t0+r10+400*0], m0
909*c0909341SAndroid Build Coastguard Worker    mova [t0+r10+400*2], m2
910*c0909341SAndroid Build Coastguard Worker    mova [t0+r10+400*4], m3
911*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
912*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
913*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
914*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
915*c0909341SAndroid Build Coastguard Worker    psrld           m4, 4              ; (a + 8) >> 4
916*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
917*c0909341SAndroid Build Coastguard Worker    psrld           m5, 4
918*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
919*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; a * 25
920*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
921*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; b * b
922*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
923*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
924*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
925*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2
926*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
927*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
928*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
929*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m11            ; p * s
930*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m11
931*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b * 164
932*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
933*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12
934*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
935*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
936*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
937*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
938*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
939*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
940*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
941*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m13, m4
942*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m13, m5
943*c0909341SAndroid Build Coastguard Worker    mulps           m2, m13            ; 256 / (z + 1)
944*c0909341SAndroid Build Coastguard Worker    mulps           m3, m13
945*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
946*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
947*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
948*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
949*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
950*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
951*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
952*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
953*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
954*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14            ; x * b * 164 + (1 << 11) + (1 << 15)
955*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
956*c0909341SAndroid Build Coastguard Worker    mova    [t4+r10+4], m2
957*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12             ; b
958*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
959*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+ 8], xm0
960*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+40], m0, 1
961*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+24], xm1
962*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+56], m1, 1
963*c0909341SAndroid Build Coastguard Worker    add            r10, 32
964*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
965*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
966*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
967*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
968*c0909341SAndroid Build Coastguard Worker    ret
969*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights
970*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*0], m1
971*c0909341SAndroid Build Coastguard Worker    paddw            m1, m0
972*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*2], m4
973*c0909341SAndroid Build Coastguard Worker    paddd            m4, m2
974*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*4], m5
975*c0909341SAndroid Build Coastguard Worker    paddd            m5, m3
976*c0909341SAndroid Build Coastguard Worker    jmp .hv_main2
977*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
; Vertical-only pass: no source row is filtered horizontally here. Each
; iteration handles 32 bytes of the row-sum buffers: word sums at +400*0 and
; dword sums of squares at +400*2 / +400*4. The row at t1 is weighted three
; times and the row at t2 once; the totals are then turned into the a/b
; coefficients exactly as described by the inline comments.
; r10 starts at wq-4 and is advanced by 32 while it stays negative
; (wq presumably holds the negated byte width; set up outside this view).
; Outputs: packed 16-bit x at [t4+r10+4], 32-bit b at [t3+r10*2+8 ...].
; m6, m8, m9 and m11-m14 are constants loaded by the function prologue, which
; is not visible here; m6 is presumably zero (it is used as the zero operand
; of the unpacks and of pavgw) - confirm against the prologue.
978*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
979*c0909341SAndroid Build Coastguard Worker.v_loop:
980*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400*0]
981*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+400*2]
982*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+400*4]
983*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400*0]
984*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+r10+400*2]
985*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+r10+400*4]
    ; double the t1 row so that it ends up counted 3x in the totals below
986*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
987*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
988*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
989*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; hv sum
990*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; hv sumsq
991*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
992*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
993*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
994*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
995*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
996*c0909341SAndroid Build Coastguard Worker    psrld           m4, 4              ; (a + 8) >> 4
997*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
998*c0909341SAndroid Build Coastguard Worker    psrld           m5, 4
999*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1000*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; a * 25
1001*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
1002*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; b * b
1003*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1004*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1005*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
    ; unsigned max followed by subtract == max(a * 25 - b * b, 0)
1006*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2
1007*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
1008*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1009*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1010*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m11            ; p * s
1011*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m11
1012*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b * 164
1013*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
1014*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12
1015*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
1016*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
1017*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
    ; the division is done in single precision using the rcpps approximation
1018*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1019*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1020*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
1021*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1022*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m13, m4
1023*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m13, m5
1024*c0909341SAndroid Build Coastguard Worker    mulps           m2, m13            ; 256 / (z + 1)
1025*c0909341SAndroid Build Coastguard Worker    mulps           m3, m13
1026*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
1027*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1028*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1029*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1030*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
1031*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1032*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1033*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1034*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1035*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14            ; x * b * 164 + (1 << 11) + (1 << 15)
1036*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
1037*c0909341SAndroid Build Coastguard Worker    mova    [t4+r10+4], m2
1038*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12             ; b
1039*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
    ; m0/m1 hold elements 0-3|8-11 and 4-7|12-15 (per-lane unpack order); the
    ; four 16-byte stores below write them back in linear element order
1040*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+ 8], xm0
1041*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+40], m0, 1
1042*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+24], xm1
1043*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+56], m1, 1
1044*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1045*c0909341SAndroid Build Coastguard Worker    jl .v_loop
1046*c0909341SAndroid Build Coastguard Worker    ret
1047*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
; Builds one row of horizontally weighted neighbor sums and saves it for
; later use. For every position the three adjacent entries l, c, r are read
; from t4 (16-bit values, byte offsets 0/2/4) and from t3 (32-bit values,
; byte offsets 0/4/8) and combined as
;     c + (l + c + r) + 4 * (l + c + r) = 5*l + 6*c + 5*r   ("565").
; The t4 result is stored 400*2 bytes past the input row and the t3 result
; 400*4 bytes past it.
; r10 starts at wq and is advanced by 32 while it stays negative; t3 is
; indexed with r10*2 because its entries are twice as wide as those of t4.
1048*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1049*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
    ; m0/m1/m2 = center entries, m3/m4/m5 = l + c + r
1050*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+r10*1+ 2]
1051*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+ 4]
1052*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+r10*2+36]
1053*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+r10*1+ 0]
1054*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+r10*2+ 0]
1055*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+r10*2+32]
1056*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+ 4]
1057*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+ 8]
1058*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*2+40]
    ; center + sum3, then add sum3 << 2 to reach the 5/6/5 weighting
1059*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1060*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
1061*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1062*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
1063*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1064*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
1065*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1066*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4             ; b 565
1067*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1068*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*2+ 0], m0
1069*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*4+ 0], m1
1070*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*4+32], m2
1071*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1072*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1073*c0909341SAndroid Build Coastguard Worker    ret
1074*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1075*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
; Computes the 565-weighted neighbor sums for the current t4/t3 row (same
; arithmetic as .prep_n), adds the row saved earlier at t4+400*2 / t3+400*4,
; replaces that saved row with the freshly computed one, and then filters
; 16 pixels per iteration in place at dstq:
;     dst = clamp(src + pmulhrsw((b - a * src) >> 9, m7), m6, m15)
; where a/b are the two-row sums. m6, m7 and m15 are set up by the prologue,
; which is outside this view; presumably zero, the filter weight and the
; maximum pixel value respectively - confirm there.
; r10 starts at wq and is advanced by 32 while it stays negative; afterwards
; dstq is moved forward by strideq.
1076*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1077*c0909341SAndroid Build Coastguard Worker.n0_loop:
1078*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+r10*1+ 2]
1079*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+ 4]
1080*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+r10*2+36]
1081*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+r10*1+ 0]
1082*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+r10*2+ 0]
1083*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+r10*2+32]
1084*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+ 4]
1085*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+ 8]
1086*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*2+40]
1087*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1088*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
1089*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1090*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
1091*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
1092*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2
1093*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1094*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4             ; b 565
1095*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5
    ; m3/m4/m5 = new row + previously saved row; the new row then takes the
    ; saved row's place
1096*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+r10*1+400*2+ 0]
1097*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+r10*2+400*4+ 0]
1098*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+r10*2+400*4+32]
1099*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*2+ 0], m0
1100*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*4+ 0], m1
1101*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*4+32], m2
1102*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10]
1103*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1104*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1105*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1              ; a * src
1106*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1107*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1108*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
    ; regroup the linear b values (m4 = 0-7, m5 = 8-15) into the per-lane
    ; order of the unpacked products: m1 = 0-3|8-11, m4 = 4-7|12-15
1109*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, m4, xm5, 1
1110*c0909341SAndroid Build Coastguard Worker    vperm2i128      m4, m5, 0x31
1111*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2              ; b - a * src + (1 << 8)
1112*c0909341SAndroid Build Coastguard Worker    psubd           m4, m3
1113*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1114*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
1115*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m4
1116*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7
1117*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
    ; clamp the result to [m6, m15]
1118*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6
1119*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m15
1120*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1121*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1122*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1123*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1124*c0909341SAndroid Build Coastguard Worker    ret
1125*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1126*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
; Filters 16 pixels per iteration in place at dstq using only the saved
; 565-weighted row at t4+400*2 (a) and t3+400*4 (b); nothing is recomputed
; or written back to the t3/t4 buffers here. Because a single row is used
; instead of the sum of two, the shift is 8 rather than the 9 used by .n0:
;     dst = clamp(src + pmulhrsw((b - a * src) >> 8, m7), m6, m15)
; m6, m7 and m15 are set up by the prologue, which is outside this view.
; r10 starts at wq and is advanced by 32 while it stays negative; afterwards
; dstq is moved forward by strideq.
1127*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1128*c0909341SAndroid Build Coastguard Worker.n1_loop:
1129*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10]
1130*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*1+400*2+ 0]
1131*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*2+400*4+ 0]
1132*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*2+400*4+32]
1133*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1134*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1135*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1
1136*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1137*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1138*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
    ; regroup the linear b values (m4 = 0-7, m5 = 8-15) into the per-lane
    ; order of the unpacked products: m1 = 0-3|8-11, m4 = 4-7|12-15
1139*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, m4, xm5, 1
1140*c0909341SAndroid Build Coastguard Worker    vperm2i128      m4, m5, 0x31
1141*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2              ; b - a * src + (1 << 7)
1142*c0909341SAndroid Build Coastguard Worker    psubd           m4, m3
1143*c0909341SAndroid Build Coastguard Worker    psrad           m1, 8
1144*c0909341SAndroid Build Coastguard Worker    psrad           m4, 8
1145*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m4
1146*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7
1147*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
    ; clamp the result to [m6, m15]
1148*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6
1149*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m15
1150*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1151*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1152*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1153*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1154*c0909341SAndroid Build Coastguard Worker    ret
1155*c0909341SAndroid Build Coastguard Worker
1156*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_16bpc, 4, 14, 15, 400*42+8, dst, stride, left, lpf, \
1157*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
; Entry point and row driver of the 3x3 self-guided filter for 16 bpc.
; cglobal (x86inc) arguments: 4 arguments pre-loaded into registers, 14 GPRs,
; 15 vector registers, 400*42+8 bytes of stack; the remaining arguments
; (w, h, edge, params) are fetched explicitly below.
; Setup performed here:
;   - w is doubled (bytes per row for 16-bit pixels), lpfq and dstq are moved
;     to the end of their rows and wq is negated, so the loops index with a
;     negative offset that counts up towards zero.
;   - t1, t3 and t4 point into the stack scratch area, each biased by the
;     (still positive) byte width.
;   - [rsp] holds a row pointer 6 strides past the incoming lpf; it is only
;     reloaded when LR_HAVE_BOTTOM is set.
; The edge bits tested are 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
; 4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM (per the inline comments in this file).
; The helpers .hv1, .hv1_bottom, .v0, .v1, .prep_n, .n0 and .n1 called from
; here are this function's own local labels, defined further down outside the
; visible part of the file; their behavior is not described here.
1158*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
1159*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1160*c0909341SAndroid Build Coastguard Worker    lea            r13, [pb_m10_m9]
1161*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1162*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1163*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1164*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m7, [paramsq+10] ; w1
1165*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1166*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [base+pd_8]
1167*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1168*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [paramsq+ 4] ; s1
1169*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+400*12+8]
1170*c0909341SAndroid Build Coastguard Worker    mova          xm10, [base+sgr_lshuf3]
1171*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+400*32+8]
1172*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [base+pw_455_24]
1173*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+12]
1174*c0909341SAndroid Build Coastguard Worker    vbroadcastss   m12, [base+pf_256]
1175*c0909341SAndroid Build Coastguard Worker    neg             wq
1176*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [base+pd_34816]
1177*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1178*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [base+pw_1023]
1179*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1180*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1181*c0909341SAndroid Build Coastguard Worker    jz .no_top
    ; top rows available: horizontal sums of two consecutive lpf rows, the
    ; first kept at t2 and the second written 400*6 bytes further on
1182*c0909341SAndroid Build Coastguard Worker    call .h_top
1183*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1184*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1185*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6
1186*c0909341SAndroid Build Coastguard Worker    call .h_top
    ; continue reading from the picture itself; remember lpf + 6 strides
1187*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1188*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1189*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1190*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
1191*c0909341SAndroid Build Coastguard Worker    call .hv0
1192*c0909341SAndroid Build Coastguard Worker.main:
    ; hd counts the rows still to be read; a single-row input is special-cased
1193*c0909341SAndroid Build Coastguard Worker    dec             hd
1194*c0909341SAndroid Build Coastguard Worker    jz .height1
1195*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1196*c0909341SAndroid Build Coastguard Worker    call .hv1
1197*c0909341SAndroid Build Coastguard Worker    call .prep_n
1198*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1199*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
1200*c0909341SAndroid Build Coastguard Worker.main_loop:
    ; two input rows are consumed and two output rows produced per iteration
1201*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1202*c0909341SAndroid Build Coastguard Worker    call .hv0
1203*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1204*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1205*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1206*c0909341SAndroid Build Coastguard Worker    call .hv1
1207*c0909341SAndroid Build Coastguard Worker    call .n0
1208*c0909341SAndroid Build Coastguard Worker    call .n1
1209*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1210*c0909341SAndroid Build Coastguard Worker    jge .main_loop
1211*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1212*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
    ; bottom rows available: switch to the pointer saved at [rsp]
1213*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
1214*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1215*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1216*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1217*c0909341SAndroid Build Coastguard Worker.end:
1218*c0909341SAndroid Build Coastguard Worker    call .n0
1219*c0909341SAndroid Build Coastguard Worker    call .n1
1220*c0909341SAndroid Build Coastguard Worker.end2:
1221*c0909341SAndroid Build Coastguard Worker    RET
1222*c0909341SAndroid Build Coastguard Worker.height1:
    ; reached when hd became zero right after the first row was processed
1223*c0909341SAndroid Build Coastguard Worker    call .v1
1224*c0909341SAndroid Build Coastguard Worker    call .prep_n
1225*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
1226*c0909341SAndroid Build Coastguard Worker.odd_height:
    ; reached from .main_loop when no row is left after an .hv0 call
1227*c0909341SAndroid Build Coastguard Worker    call .v1
1228*c0909341SAndroid Build Coastguard Worker    call .n0
1229*c0909341SAndroid Build Coastguard Worker    call .n1
1230*c0909341SAndroid Build Coastguard Worker.odd_height_end:
    ; only .n0 is called here, so a single final output row is written
1231*c0909341SAndroid Build Coastguard Worker    call .v0
1232*c0909341SAndroid Build Coastguard Worker    call .v1
1233*c0909341SAndroid Build Coastguard Worker    call .n0
1234*c0909341SAndroid Build Coastguard Worker    jmp .end2
1235*c0909341SAndroid Build Coastguard Worker.extend_bottom:
    ; no rows below (or none left): vertical-only passes replace .hv0/.hv1
1236*c0909341SAndroid Build Coastguard Worker    call .v0
1237*c0909341SAndroid Build Coastguard Worker    call .v1
1238*c0909341SAndroid Build Coastguard Worker    jmp .end
1239*c0909341SAndroid Build Coastguard Worker.no_top:
    ; no rows above: save lpf + 6 strides as in the LR_HAVE_TOP path, run the
    ; horizontal pass on the first picture row and duplicate its sums
    ; 400*6 bytes further on (t2) before the first vertical pass
1240*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1241*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1242*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1243*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
1244*c0909341SAndroid Build Coastguard Worker    call .h
1245*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1246*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
1247*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
1248*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400*0]
1249*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10+400*2]
1250*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+400*4]
1251*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m0
1252*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m1
1253*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m2
1254*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1255*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1256*c0909341SAndroid Build Coastguard Worker    call .v0
1257*c0909341SAndroid Build Coastguard Worker    jmp .main
1258*c0909341SAndroid Build Coastguard Worker.extend_right:
; Pads the pixel vectors m4/m5 when LR_HAVE_RIGHT is not set.
; lpfq points at the end of the row, so [lpfq-2] is its last 16-bit pixel;
; that pixel is broadcast into m0. Two byte masks are loaded from the table
; at r13 (pb_m10_m9), offset by the current column r10, and vpblendvb
; replaces with m0 every byte of m4/m5 whose mask byte has its top bit set.
; NOTE(review): the table contents are outside this view; presumably the
; top bit is set exactly for lanes past the right edge - confirm there.
; Clobbers m0, m1 and m2.
1259*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
1260*c0909341SAndroid Build Coastguard Worker    movu            m1, [r13+r10+ 2]
1261*c0909341SAndroid Build Coastguard Worker    movu            m2, [r13+r10+18]
1262*c0909341SAndroid Build Coastguard Worker    vpblendvb       m4, m0, m1
1263*c0909341SAndroid Build Coastguard Worker    vpblendvb       m5, m0, m2
1264*c0909341SAndroid Build Coastguard Worker    ret
1265*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
; For the source row at lpfq, computes per pixel the sum of 3 horizontally
; adjacent pixels (16-bit) and the sum of their squares (32-bit) and stores
; them at [t1+r10+400*0] and [t1+r10+400*2] / [t1+r10+400*4].
; Entry points:
;   .h     - with LR_HAVE_LEFT the pixels left of the row start are taken
;            from the 8 bytes at leftq, which is then advanced by 8.
;   .h_top - with LR_HAVE_LEFT they are read directly from memory in front
;            of the row (falls straight into .h_loop).
; Without LR_HAVE_LEFT both entries go through .h_extend_left.
; r10 starts at wq-4, i.e. 4 bytes (2 pixels) before the row start, and is
; advanced by 32 while it stays negative. Clobbers m0-m5 and r10.
1266*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1267*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1268*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
    ; the per-lane palignr by 12 yields the last 4 bytes of the left group
    ; followed by row bytes 0..27 - the same layout as a load from [lpfq+r10]
1269*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq]
1270*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1
1271*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
1272*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
1273*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12
1274*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1275*c0909341SAndroid Build Coastguard Worker.h_extend_left:
    ; the low lane is built from the first 16 row bytes through the xm10
    ; shuffle (sgr_lshuf3, contents outside this view - presumably replicating
    ; the first pixel to the left); the high lane continues at row byte 12
1276*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
1277*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, xm10
1278*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+12], 1
1279*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1280*c0909341SAndroid Build Coastguard Worker.h_top:
1281*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1282*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1283*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1284*c0909341SAndroid Build Coastguard Worker.h_loop:
1285*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+ 0]
1286*c0909341SAndroid Build Coastguard Worker.h_main:
    ; m5 = the 32 bytes starting 16 bytes after m4, used to shift across lanes
1287*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+16]
1288*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1289*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
    ; padding is only applied once r10 has reached -34 or above
1290*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
1291*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
1292*c0909341SAndroid Build Coastguard Worker    call .extend_right
1293*c0909341SAndroid Build Coastguard Worker.h_have_right:
    ; m0 = pixels shifted by one, m5 (after the second palignr) = shifted by
    ; two; squares are formed with pmaddwd on interleaved word pairs
1294*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 2
1295*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, m0
1296*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m0
1297*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1298*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m0
1299*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1300*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4
1301*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5             ; sum
1302*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
1303*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1304*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
1305*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1306*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; sumsq
1307*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
1308*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*0], m1
1309*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*2], m2
1310*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*4], m3
1311*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1312*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1313*c0909341SAndroid Build Coastguard Worker    ret
1314*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv0 / .hv0_bottom
;   Processes one row per call, 32 bytes of input per loop iteration:
;     1. horizontal 3-tap sum and sum of squares of the 16-bit pixels at lpfq,
;     2. vertical accumulation with the row sums kept at t1 and t2,
;     3. the a/b ("ab") computation, written to the 400*0 slots of t4 and t3.
;   Entry points:
;     .hv0        - left pixels come from leftq (when LR_HAVE_LEFT is set)
;     .hv0_bottom - left pixels are read in place from [lpfq+wq-4]
;   Registers read but not set in this routine (values established outside
;   this view; roles below are taken from how they are used here and from the
;   inline comments):
;     r10      loop offset; counts up by 32 and the loop runs while it is
;              negative, so wq is presumably a negative byte offset
;     m6       used as the zero operand for unpack/pavgw -- presumably all-zero
;     m8       rounding term for the ">> 4" of the sum of squares
;     m9       s (multiplier applied to p)
;     m10      pshufb mask used for left-edge extension
;     m11      multiplier for "b * 455"; also added word-wise to p * s
;     m12      256 (used with mulps and as the pcmpgtd threshold)
;     m13      rounding term (1 << 11) + (1 << 15)
;   Clobbers m0-m5, r10 and flags; advances leftq by 8 on the .hv0 have-left path.
1315*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
1316*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]         ; first load starts 4 bytes (2 pixels) before wq
1317*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1318*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1319*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq]        ; 8 bytes from leftq in both halves of xm5
1320*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1   ; upper lane = first 16 bytes of the row
1321*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
1322*c0909341SAndroid Build Coastguard Worker    add          leftq, 8              ; step past the 8 bytes just consumed
1323*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12         ; per lane: last 4 bytes of m5, then first 12 of m4
1324*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1325*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
1326*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
1327*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, xm10           ; NOTE(review): mask is defined outside this view; presumably replicates the first pixel into the two left slots
1328*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+12], 1 ; upper lane = row bytes 12..27
1329*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1330*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
1331*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1332*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1333*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1334*c0909341SAndroid Build Coastguard Worker.hv0_loop:
1335*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+ 0]
1336*c0909341SAndroid Build Coastguard Worker.hv0_main:
1337*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+16]  ; each lane of m5 holds the 16 bytes following the same lane of m4
1338*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1339*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
1340*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
1341*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right                 ; r10 < -34: skip the right-edge helper
1342*c0909341SAndroid Build Coastguard Worker    call .extend_right                 ; helper defined outside this view
1343*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
1344*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 2      ; pixels at x+1
1345*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, m0
1346*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m0         ; interleave (p[x], p[x+1]) so pmaddwd gives p[x]^2 + p[x+1]^2
1347*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1348*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m0
1349*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1350*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4          ; pixels at x+2
1351*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5             ; sum
1352*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6         ; (p[x+2], m6) pairs -> p[x+2]^2 (+ m6^2)
1353*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1354*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
1355*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1356*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4             ; sumsq
1357*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
1358*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+r10+400*0] ; this row + the row sums held at t1
1359*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+r10+400*2]
1360*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+r10+400*4]
1361*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*0], m1            ; t1 now holds this row's horizontal sums
1362*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*2], m2
1363*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*4], m3
1364*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400*0] ; add the sums held at t2 -> values fed to the ab math
1365*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+400*2]
1366*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+400*4]
1367*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m0            ; t2 now holds the two-row partial sums
1368*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m4
1369*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m5
1370*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1371*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1372*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
1373*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1374*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
1375*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
1376*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1377*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1378*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1379*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1380*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1381*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; square of the scaled b
1382*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1383*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1384*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1385*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1386*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2             ; max first so the subtraction cannot go below zero
1387*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1388*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
1389*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1390*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; p * s
1391*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
1392*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 455
1393*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1394*c0909341SAndroid Build Coastguard Worker    paddw           m4, m11            ; NOTE(review): word-wise bias from m11 before the >> 20; constant is defined outside this view
1395*c0909341SAndroid Build Coastguard Worker    paddw           m5, m11
1396*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
1397*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1398*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1399*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1400*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
1401*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1402*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m12, m4        ; integer compare of m12 against the converted value -> all-ones / zero mask
1403*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m12, m5
1404*c0909341SAndroid Build Coastguard Worker    mulps           m2, m12            ; 256 / (z + 1)
1405*c0909341SAndroid Build Coastguard Worker    mulps           m3, m12
1406*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
1407*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1408*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1409*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1410*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
1411*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1412*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1413*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1414*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3             ; x back to 16-bit, in original element order per lane
1415*c0909341SAndroid Build Coastguard Worker    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1416*c0909341SAndroid Build Coastguard Worker    paddd           m1, m13
1417*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
1418*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
; m0 holds elements 0-3 / 8-11 and m1 holds 4-7 / 12-15 (unpack is per 128-bit
; lane), hence the interleaved 16-byte stores to t3 below.
1419*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*0+ 4], m2
1420*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+ 8], xm0
1421*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+40], m0, 1
1422*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+24], xm1
1423*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+56], m1, 1
1424*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1425*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop                       ; continue while the offset is still negative
1426*c0909341SAndroid Build Coastguard Worker    ret
1427*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv1 / .hv1_bottom
;   Same pipeline as the even-row routine, with these visible differences:
;     - the new row's horizontal sums are added only to the values at t2 and
;       then stored to t2 (t1 is not read or written inside the loop),
;     - results go to the t4+400*2 and t3+400*4 slots,
;     - t1 and t2 are exchanged before returning.
;   .hv1_bottom reads the left pixels in place from [lpfq+wq-4] instead of
;   from leftq. Register roles (m6, m8-m13) are established outside this view.
;   Clobbers m0-m5, r10 and flags; advances leftq by 8 on the .hv1 have-left path.
1428*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
1429*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]         ; first load starts 4 bytes (2 pixels) before wq
1430*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1431*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
1432*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq]        ; 8 bytes from leftq in both halves of xm5
1433*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1   ; upper lane = first 16 bytes of the row
1434*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
1435*c0909341SAndroid Build Coastguard Worker    add          leftq, 8              ; step past the 8 bytes just consumed
1436*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12         ; per lane: last 4 bytes of m5, then first 12 of m4
1437*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1438*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
1439*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
1440*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, xm10           ; NOTE(review): mask is defined outside this view; presumably replicates the first pixel into the two left slots
1441*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+12], 1 ; upper lane = row bytes 12..27
1442*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1443*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
1444*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1445*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1446*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
1447*c0909341SAndroid Build Coastguard Worker.hv1_loop:
1448*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+ 0]
1449*c0909341SAndroid Build Coastguard Worker.hv1_main:
1450*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+16]  ; each lane of m5 holds the 16 bytes following the same lane of m4
1451*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1452*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
1453*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
1454*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right                 ; r10 < -34: skip the right-edge helper
1455*c0909341SAndroid Build Coastguard Worker    call .extend_right                 ; helper defined outside this view
1456*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
1457*c0909341SAndroid Build Coastguard Worker    palignr         m1, m5, m4, 2      ; pixels at x+1
1458*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4, m1
1459*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m4, m1         ; interleave (p[x], p[x+1]) so pmaddwd gives p[x]^2 + p[x+1]^2
1460*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1461*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4, m1
1462*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1463*c0909341SAndroid Build Coastguard Worker    palignr         m5, m4, 4          ; pixels at x+2
1464*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5             ; h sum
1465*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m5, m6         ; (p[x+2], m6) pairs -> p[x+2]^2 (+ m6^2)
1466*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1467*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
1468*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1469*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1             ; h sumsq
1470*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
1471*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400*0] ; this row + the partial sums held at t2 -> values fed to the ab math
1472*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+r10+400*2]
1473*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+r10+400*4]
1474*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m0            ; t2 now holds this row's horizontal sums
1475*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m2
1476*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m3
1477*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8
1478*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
1479*c0909341SAndroid Build Coastguard Worker    psrld           m4, 4              ; (a + 8) >> 4
1480*c0909341SAndroid Build Coastguard Worker    psrld           m5, 4
1481*c0909341SAndroid Build Coastguard Worker    pslld           m2, m4, 3
1482*c0909341SAndroid Build Coastguard Worker    pslld           m3, m5, 3
1483*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1484*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1485*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1486*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1487*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1488*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; square of the scaled b
1489*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1490*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1491*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1492*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1493*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2             ; max first so the subtraction cannot go below zero
1494*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1495*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
1496*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1497*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; p * s
1498*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
1499*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 455
1500*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1501*c0909341SAndroid Build Coastguard Worker    paddw           m4, m11            ; NOTE(review): word-wise bias from m11 before the >> 20; constant is defined outside this view
1502*c0909341SAndroid Build Coastguard Worker    paddw           m5, m11
1503*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
1504*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1505*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1506*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1507*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
1508*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1509*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m12, m4        ; integer compare of m12 against the converted value -> all-ones / zero mask
1510*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m12, m5
1511*c0909341SAndroid Build Coastguard Worker    mulps           m2, m12            ; 256 / (z + 1)
1512*c0909341SAndroid Build Coastguard Worker    mulps           m3, m12
1513*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
1514*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1515*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1516*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1517*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
1518*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1519*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1520*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1521*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3             ; x back to 16-bit, in original element order per lane
1522*c0909341SAndroid Build Coastguard Worker    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1523*c0909341SAndroid Build Coastguard Worker    paddd           m1, m13
1524*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
1525*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
; m0 holds elements 0-3 / 8-11 and m1 holds 4-7 / 12-15 (unpack is per 128-bit
; lane), hence the interleaved 16-byte stores to t3 below.
1526*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*2 +4], m2
1527*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+ 8], xm0
1528*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+40], m0, 1
1529*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+24], xm1
1530*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+56], m1, 1
1531*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1532*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop                       ; continue while the offset is still negative
1533*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; exchange t1 and t2 (r10 is free after the loop)
1534*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1535*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
1536*c0909341SAndroid Build Coastguard Worker    ret
; .v0
;   Vertical-only variant of the even-row routine: no pixels are read.
;   The row sums stored at t1 are doubled, added to the values at t2 to form
;   the inputs of the ab math, and the doubled sums are written to t2. This is
;   what the even-row hv routine computes when the new row's horizontal sums
;   equal those already at t1 (presumably used when no further input row is
;   available -- confirm against the caller). t1 itself is left untouched.
;   Results go to the 400*0 slots of t4 and t3. Clobbers m0-m5, r10 and flags.
1537*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab (even rows)
1538*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1539*c0909341SAndroid Build Coastguard Worker.v0_loop:
1540*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400*0] ; sum
1541*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+400*2] ; sumsq, first half
1542*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+400*4] ; sumsq, second half
1543*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0             ; count the t1 row twice
1544*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
1545*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
1546*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400*0] ; add the sums held at t2 -> values fed to the ab math
1547*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+400*2]
1548*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+400*4]
1549*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m0            ; t2 now holds the doubled t1 sums
1550*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m4
1551*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m5
1552*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1553*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1554*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
1555*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1556*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
1557*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
1558*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1559*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1560*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1561*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1562*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1563*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; square of the scaled b
1564*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1565*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1566*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1567*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1568*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2             ; max first so the subtraction cannot go below zero
1569*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1570*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
1571*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1572*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; p * s
1573*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
1574*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 455
1575*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1576*c0909341SAndroid Build Coastguard Worker    paddw           m4, m11            ; NOTE(review): word-wise bias from m11 before the >> 20; constant is defined outside this view
1577*c0909341SAndroid Build Coastguard Worker    paddw           m5, m11
1578*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
1579*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1580*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1581*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1582*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
1583*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1584*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m12, m4        ; integer compare of m12 against the converted value -> all-ones / zero mask
1585*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m12, m5
1586*c0909341SAndroid Build Coastguard Worker    mulps           m2, m12            ; 256 / (z + 1)
1587*c0909341SAndroid Build Coastguard Worker    mulps           m3, m12
1588*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
1589*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1590*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1591*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1592*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
1593*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1594*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1595*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1596*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3             ; x back to 16-bit, in original element order per lane
1597*c0909341SAndroid Build Coastguard Worker    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1598*c0909341SAndroid Build Coastguard Worker    paddd           m1, m13
1599*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
1600*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
; m0 holds elements 0-3 / 8-11 and m1 holds 4-7 / 12-15 (unpack is per 128-bit
; lane), hence the interleaved 16-byte stores to t3 below.
1601*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*0+ 4], m2
1602*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+ 8], xm0
1603*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+40], m0, 1
1604*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+24], xm1
1605*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+56], m1, 1
1606*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1607*c0909341SAndroid Build Coastguard Worker    jl .v0_loop                        ; continue while the offset is still negative
1608*c0909341SAndroid Build Coastguard Worker    ret
; .v1
;   Vertical-only variant of the odd-row routine: no pixels are read.
;   The row sums stored at t1 are added to the values at t2 to form the inputs
;   of the ab math, and the t1 sums are copied to t2. This is what the odd-row
;   hv routine computes when the new row's horizontal sums equal those already
;   at t1 (presumably used when no further input row is available -- confirm
;   against the caller). Results go to the t4+400*2 and t3+400*4 slots, and
;   t1/t2 are exchanged before returning. Clobbers m0-m5, r10 and flags.
1609*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
1610*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1611*c0909341SAndroid Build Coastguard Worker.v1_loop:
1612*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400*0] ; sum
1613*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+400*2] ; sumsq, first half
1614*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+400*4] ; sumsq, second half
1615*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400*0] ; add the partial sums held at t2 -> values fed to the ab math
1616*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+400*2]
1617*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+400*4]
1618*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m0            ; t2 now holds a copy of the t1 sums
1619*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m4
1620*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m5
1621*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1622*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1623*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
1624*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1625*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
1626*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
1627*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1628*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1629*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1630*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1631*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1632*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2             ; square of the scaled b
1633*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1634*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1635*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
1636*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1637*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2             ; max first so the subtraction cannot go below zero
1638*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p
1639*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
1640*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1641*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; p * s
1642*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
1643*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 455
1644*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1645*c0909341SAndroid Build Coastguard Worker    paddw           m4, m11            ; NOTE(review): word-wise bias from m11 before the >> 20; constant is defined outside this view
1646*c0909341SAndroid Build Coastguard Worker    paddw           m5, m11
1647*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
1648*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1649*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1650*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1651*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
1652*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1653*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m12, m4        ; integer compare of m12 against the converted value -> all-ones / zero mask
1654*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m12, m5
1655*c0909341SAndroid Build Coastguard Worker    mulps           m2, m12            ; 256 / (z + 1)
1656*c0909341SAndroid Build Coastguard Worker    mulps           m3, m12
1657*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
1658*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1659*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1660*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1661*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
1662*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1663*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1664*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1665*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3             ; x back to 16-bit, in original element order per lane
1666*c0909341SAndroid Build Coastguard Worker    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1667*c0909341SAndroid Build Coastguard Worker    paddd           m1, m13
1668*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
1669*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
; m0 holds elements 0-3 / 8-11 and m1 holds 4-7 / 12-15 (unpack is per 128-bit
; lane), hence the interleaved 16-byte stores to t3 below.
1670*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*2+ 4], m2
1671*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+ 8], xm0
1672*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+40], m0, 1
1673*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+24], xm1
1674*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+56], m1, 1
1675*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1676*c0909341SAndroid Build Coastguard Worker    jl .v1_loop                        ; continue while the offset is still negative
1677*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; exchange t1 and t2 (r10 is free after the loop)
1678*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1679*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
1680*c0909341SAndroid Build Coastguard Worker    ret
; .prep_n
;   Builds the initial horizontally weighted neighbour rows from the two rows
;   of a (16-bit, at t4) and b (32-bit, at t3) that are already stored.
;   With l/c/r the elements at x, x+1 and x+2:
;     "444" = 4 * (l + c + r)
;     "343" = 444 - (l + r) = 3*l + 4*c + 3*r
;   Outputs:
;     343 of the row at t4+400*0 / t3+400*0  -> t4+400*4 / t3+400* 8
;     444 of the row at t4+400*2 / t3+400*4  -> t4+400*6 / t3+400*12
;     343 of the row at t4+400*2 / t3+400*4  -> t4+400*8 / t3+400*16
;   Handles 8 elements per iteration (16 bytes of a, 32 bytes of b); r10 starts
;   at wq and the loop runs while it is negative. Clobbers m0-m3, r10 and flags.
1681*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
1682*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1683*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
1684*c0909341SAndroid Build Coastguard Worker    mova           xm0, [t4+r10*1+400*0+0]
1685*c0909341SAndroid Build Coastguard Worker    paddw          xm0, [t4+r10*1+400*0+4]      ; l + r
1686*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0, [t4+r10*1+400*0+2] ; l + c + r
1687*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+400*0+0]
1688*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*0+8]      ; l + r
1689*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+400*0+4]  ; l + c + r
1690*c0909341SAndroid Build Coastguard Worker    psllw          xm2, 2                ; a[-1] 444
1691*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2                ; b[-1] 444
1692*c0909341SAndroid Build Coastguard Worker    psubw          xm2, xm0              ; a[-1] 343
1693*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1               ; b[-1] 343
1694*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 4], xm2
1695*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400* 8], m3
1696*c0909341SAndroid Build Coastguard Worker    mova           xm0, [t4+r10*1+400*2+0]
1697*c0909341SAndroid Build Coastguard Worker    paddw          xm0, [t4+r10*1+400*2+4]      ; l + r
1698*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0, [t4+r10*1+400*2+2] ; l + c + r
1699*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+400*4+0]
1700*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*4+8]      ; l + r
1701*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+400*4+4]  ; l + c + r
1702*c0909341SAndroid Build Coastguard Worker    psllw          xm2, 2                 ; a[ 0] 444
1703*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2                 ; b[ 0] 444
1704*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 6], xm2
1705*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12], m3
1706*c0909341SAndroid Build Coastguard Worker    psubw          xm2, xm0               ; a[ 0] 343
1707*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1                ; b[ 0] 343
1708*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 8], xm2
1709*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*16], m3
1710*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1711*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop                       ; continue while the offset is still negative
1712*c0909341SAndroid Build Coastguard Worker    ret
1713*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n0: neighbor weighting + output for one row, 16 pixels per iteration.
; Builds the horizontal "444" and "343" weightings of the a plane (words at
; t4+400*0) and b plane (dwords at t3+400*0), adds the values already stored
; in planes t4+400*4/400*6 and t3+400*8/400*12, refreshes those planes with
; this row's weightings, then blends the result into the row at dstq.
; NOTE(review): m6, m7 and m14 are set outside this view; the code uses m6 as
; unpack filler / lower clamp (presumably zero), m7 as a pmulhrsw multiplier
; (presumably the filter weight) and m14 as the upper clamp -- confirm.
1714*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
1715*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = byte offset in the row; the loop runs while it is negative
1716*c0909341SAndroid Build Coastguard Worker.n0_loop:
1717*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*1+400*0+0] ; a: word at byte offset +0
1718*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+400*0+4] ; + word at +4 (the two outer taps)
1719*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, [t4+r10*1+400*0+2] ; + word at +2 (center tap)
1720*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444 = 4 * (outer + center)
1721*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343 = 444 - outer taps
1722*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+r10*1+400*4] ; add the two stored a weightings
1723*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+400*6] ; m3 = combined a for this output row
1724*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*4], m2 ; keep this row's 343 for a later call
1725*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*6], m1 ; keep this row's 444 for a later call
1726*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*2+400*0+0] ; b: same pattern on dwords, first 8 columns
1727*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+400*0+8]
1728*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*2+400*0+4]
1729*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
1730*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m4           ; b[ 1] 343
1731*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
1732*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+400*12+ 0] ; m4 = combined b, columns 0-7
1733*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400* 8+ 0], m2
1734*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12+ 0], m1
1735*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*2+400*0+32] ; b: second 8 columns (next 32 bytes)
1736*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*2+400*0+40]
1737*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5, [t3+r10*2+400*0+36]
1738*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; 444
1739*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m5           ; 343
1740*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+r10*2+400* 8+32]
1741*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*2+400*12+32] ; m5 = combined b, columns 8-15
1742*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400* 8+32], m2
1743*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12+32], m1
1744*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10] ; 16 pixels of the row being filtered
1745*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6 ; widen words to dwords by interleaving with m6
1746*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1747*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
1748*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1749*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1750*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1751*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, m4, xm5, 1 ; m1 = {m4.lo, m5.lo}: matches the in-lane order of punpcklwd
1752*c0909341SAndroid Build Coastguard Worker    vperm2i128      m4, m5, 0x31 ; m4 = {m4.hi, m5.hi}: matches the in-lane order of punpckhwd
1753*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2               ; b - a * src + (1 << 8)
1754*c0909341SAndroid Build Coastguard Worker    psubd           m4, m3
1755*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1756*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
1757*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m4 ; back to 16 words in pixel order
1758*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7 ; scale the correction by m7
1759*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1 ; add the correction to the source pixels
1760*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6 ; clamp below by m6
1761*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14 ; clamp above by m14
1762*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1763*c0909341SAndroid Build Coastguard Worker    add            r10, 32 ; next 16 pixels
1764*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1765*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq ; advance dst to the next row
1766*c0909341SAndroid Build Coastguard Worker    ret
1767*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n1: counterpart of .n0 for the other row parity. Reads the a plane at
; t4+400*2 and the b plane at t3+400*4, combines them with the values stored
; in t4+400*6/400*8 and t3+400*12/400*16, and writes the filtered row to dst.
; Unlike .n0 it stores the new 444 weighting in the lower plane (400*6 /
; 400*12) and the new 343 weighting in the upper one (400*8 / 400*16).
; NOTE(review): m6, m7 and m14 are set outside this view (presumably zero,
; the filter weight and the pixel maximum) -- confirm.
1768*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
1769*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = byte offset in the row; the loop runs while it is negative
1770*c0909341SAndroid Build Coastguard Worker.n1_loop:
1771*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*1+400*2+0] ; a: word at byte offset +0
1772*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+400*2+4] ; + word at +4 (the two outer taps)
1773*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, [t4+r10*1+400*2+2] ; + word at +2 (center tap)
1774*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444 = 4 * (outer + center)
1775*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343 = 444 - outer taps
1776*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+r10*1+400*6] ; add the two stored a weightings
1777*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+400*8] ; m3 = combined a for this output row
1778*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*6], m1 ; keep this row's 444 for a later call
1779*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*8], m2 ; keep this row's 343 for a later call
1780*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*2+400*4+0] ; b: same pattern on dwords, first 8 columns
1781*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+400*4+8]
1782*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*2+400*4+4]
1783*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
1784*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m4           ; b[ 1] 343
1785*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t3+r10*2+400*12+ 0]
1786*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+400*16+ 0] ; m4 = combined b, columns 0-7
1787*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12+ 0], m1
1788*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*16+ 0], m2
1789*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*2+400*4+32] ; b: second 8 columns (next 32 bytes)
1790*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*2+400*4+40]
1791*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5, [t3+r10*2+400*4+36]
1792*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; 444
1793*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m5           ; 343
1794*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t3+r10*2+400*12+32]
1795*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*2+400*16+32] ; m5 = combined b, columns 8-15
1796*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12+32], m1
1797*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*16+32], m2
1798*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10] ; 16 pixels of the row being filtered
1799*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6 ; widen words to dwords by interleaving with m6
1800*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1801*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
1802*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1803*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1804*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1805*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, m4, xm5, 1 ; m1 = {m4.lo, m5.lo}: matches the in-lane order of punpcklwd
1806*c0909341SAndroid Build Coastguard Worker    vperm2i128      m4, m5, 0x31 ; m4 = {m4.hi, m5.hi}: matches the in-lane order of punpckhwd
1807*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2               ; b - a * src + (1 << 8)
1808*c0909341SAndroid Build Coastguard Worker    psubd           m4, m3
1809*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1810*c0909341SAndroid Build Coastguard Worker    psrad           m4, 9
1811*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m4 ; back to 16 words in pixel order
1812*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7 ; scale the correction by m7
1813*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1 ; add the correction to the source pixels
1814*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6 ; clamp below by m6
1815*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14 ; clamp above by m14
1816*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1817*c0909341SAndroid Build Coastguard Worker    add            r10, 32 ; next 16 pixels
1818*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1819*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq ; advance dst to the next row
1820*c0909341SAndroid Build Coastguard Worker    ret
1821*c0909341SAndroid Build Coastguard Worker
; sgr_filter_mix_16bpc: entry point and row scheduling.
; This part loads the arguments and constants, points t1/t3/t4 at scratch
; buffers on the stack, primes the row sums from the rows above (or from the
; first row when LR_HAVE_TOP is clear), then drives the per-row helpers
; (.h/.hv0/.hv1/.v0/.v1/.prep_n/.n0/.n1) two rows per main-loop iteration.
; Several helpers (.v0, .v1, .prep_n, and the 5x5 function's .top_fixup and
; .extend_right) are defined outside this view.
; cglobal arguments follow the x86inc convention: 4 args loaded into
; registers, 14 GPRs, 16 vector registers, 400*66+8 bytes of stack.
1822*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
1823*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
1824*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
1825*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp ; params is read from argument slot 6 ...
1826*c0909341SAndroid Build Coastguard Worker    lea            r13, [pb_m10_m9] ; presumably the `base` used by the [base+...] loads below -- confirm
1827*c0909341SAndroid Build Coastguard Worker    add             wd, wd ; w *= 2 (width in bytes for 16-bit pixels)
1828*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1829*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m ; ... and edge from slot 7, the reverse of the register-name order above
1830*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq ; point lpf at the end of its row; indexed with negative offsets
1831*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [paramsq+8] ; w0 w1
1832*c0909341SAndroid Build Coastguard Worker    add           dstq, wq ; same end-of-row addressing for dst
1833*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [paramsq+0] ; s0
1834*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+400*24+8] ; t3: dword scratch planes (indexed as t3+r10*2)
1835*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [paramsq+4] ; s1
1836*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+400*52+8] ; t4: word scratch planes (indexed as t4+r10*1)
1837*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+pd_8]
1838*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+12] ; t1: row-sum planes written by .h/.hv0
1839*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [base+pd_34816]
1840*c0909341SAndroid Build Coastguard Worker    neg             wq ; wq = -(width in bytes); helper loops count up to zero
1841*c0909341SAndroid Build Coastguard Worker    vbroadcastss   m11, [base+pf_256]
1842*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7 ; m7 = 0, used as unpack filler by the helpers
1843*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [base+pw_455_24]
1844*c0909341SAndroid Build Coastguard Worker    psllw          m15, 2 ; both weights scaled by 4
1845*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1846*c0909341SAndroid Build Coastguard Worker    jz .no_top
1847*c0909341SAndroid Build Coastguard Worker    call .h_top ; horizontal sums of the first row at lpf
1848*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1849*c0909341SAndroid Build Coastguard Worker    mov             t2, t1 ; t2 = the planes just written
1850*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
1851*c0909341SAndroid Build Coastguard Worker    add             t1, 400*12 ; move t1 to the next set of planes
1852*c0909341SAndroid Build Coastguard Worker    call .h_top ; horizontal sums of the second row at lpf
1853*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1854*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; from here on the source rows are read from dst
1855*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1856*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
1857*c0909341SAndroid Build Coastguard Worker    call .hv0
1858*c0909341SAndroid Build Coastguard Worker.main:
1859*c0909341SAndroid Build Coastguard Worker    dec             hd
1860*c0909341SAndroid Build Coastguard Worker    jz .height1 ; only a single row in total
1861*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1862*c0909341SAndroid Build Coastguard Worker    call .hv1
1863*c0909341SAndroid Build Coastguard Worker    call .prep_n
1864*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1865*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom ; fewer than two rows left: skip the main loop
1866*c0909341SAndroid Build Coastguard Worker.main_loop:
1867*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1868*c0909341SAndroid Build Coastguard Worker    call .hv0
1869*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1870*c0909341SAndroid Build Coastguard Worker    jz .odd_height ; that was the last row
1871*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1872*c0909341SAndroid Build Coastguard Worker    call .hv1
1873*c0909341SAndroid Build Coastguard Worker    call .n0 ; each .n0/.n1 call writes one dst row and advances dstq
1874*c0909341SAndroid Build Coastguard Worker    call .n1
1875*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1876*c0909341SAndroid Build Coastguard Worker    jge .main_loop
1877*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1878*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1879*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp] ; reload the "below" pointer saved during setup
1880*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1881*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1882*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1883*c0909341SAndroid Build Coastguard Worker.end:
1884*c0909341SAndroid Build Coastguard Worker    call .n0 ; output the final two rows
1885*c0909341SAndroid Build Coastguard Worker    call .n1
1886*c0909341SAndroid Build Coastguard Worker.end2:
1887*c0909341SAndroid Build Coastguard Worker    RET
1888*c0909341SAndroid Build Coastguard Worker.height1:
1889*c0909341SAndroid Build Coastguard Worker    call .v1 ; vertical-only steps stand in for the missing source rows
1890*c0909341SAndroid Build Coastguard Worker    call .prep_n
1891*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
1892*c0909341SAndroid Build Coastguard Worker.odd_height:
1893*c0909341SAndroid Build Coastguard Worker    call .v1
1894*c0909341SAndroid Build Coastguard Worker    call .n0
1895*c0909341SAndroid Build Coastguard Worker    call .n1
1896*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1897*c0909341SAndroid Build Coastguard Worker    call .v0
1898*c0909341SAndroid Build Coastguard Worker    call .v1
1899*c0909341SAndroid Build Coastguard Worker    call .n0 ; a single remaining output row
1900*c0909341SAndroid Build Coastguard Worker    jmp .end2
1901*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1902*c0909341SAndroid Build Coastguard Worker    call .v0 ; no rows read from below: use the vertical-only steps
1903*c0909341SAndroid Build Coastguard Worker    call .v1
1904*c0909341SAndroid Build Coastguard Worker    jmp .end
1905*c0909341SAndroid Build Coastguard Worker.no_top:
1906*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1907*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; no rows above: start directly on dst
1908*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2] ; r10 = lpf + 6 * stride
1909*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; saved for the LR_HAVE_BOTTOM path
1910*c0909341SAndroid Build Coastguard Worker    call .h ; horizontal sums of the first dst row into the t1 planes
1911*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1912*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*12]
1913*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; copy the six t1 planes to t2, doubling planes 0/2/4
1914*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400* 0] ; plane that .h labels sum5
1915*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10+400* 2] ; sumsq5 halves (planes 2 and 4)
1916*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+400* 4]
1917*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0 ; doubled
1918*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+400* 6] ; sum3 / sumsq3 planes are copied unchanged
1919*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1 ; doubled
1920*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+400* 8]
1921*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2 ; doubled
1922*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+400*10]
1923*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 0], m0
1924*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 2], m1
1925*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 4], m2
1926*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 6], m3
1927*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 8], m4
1928*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*10], m5
1929*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1930*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1931*c0909341SAndroid Build Coastguard Worker    call .v0
1932*c0909341SAndroid Build Coastguard Worker    jmp .main
; .h / .h_top: horizontal box sums for one source row at lpfq.
; For every column it stores, into the planes at t1:
;   400*0          sum5   (5-tap sum, words)
;   400*2, 400*4   sumsq5 (5-tap sum of squares, dwords, two halves)
;   400*6          sum3   (3-tap sum, words)
;   400*8, 400*10  sumsq3 (3-tap sum of squares, dwords, two halves)
; .h takes the pixels left of the row from leftq when LR_HAVE_LEFT is set;
; .h_top reads them straight from memory before the row. Both fall back to
; .h_extend_left when LR_HAVE_LEFT is clear. m7 must be zero on entry.
1933*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
1934*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4] ; start 4 bytes (2 pixels) before the row start
1935*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1936*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1937*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq] ; 8 bytes (4 pixels) of left-edge data
1938*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1 ; upper lane = first 16 bytes of the row
1939*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
1940*c0909341SAndroid Build Coastguard Worker    add          leftq, 8 ; next row's left-edge entry
1941*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10 ; m4 = 16 pixels starting 3 pixels left of the row, the first 3 from leftq
1942*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1943*c0909341SAndroid Build Coastguard Worker.h_extend_left:
1944*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
1945*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, [base+sgr_lshuf5] ; presumably replicates the first pixel into the missing left positions -- confirm against the table
1946*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+10], 1
1947*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1948*c0909341SAndroid Build Coastguard Worker.h_top:
1949*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1950*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1951*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1952*c0909341SAndroid Build Coastguard Worker.h_loop:
1953*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10- 2] ; 16 pixels, first tap p0
1954*c0909341SAndroid Build Coastguard Worker.h_main:
1955*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+14] ; the same window shifted right by 8 pixels
1956*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1957*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
1958*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36
1959*c0909341SAndroid Build Coastguard Worker    jl .h_have_right ; still far enough from the row end: no padding needed
1960*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
1961*c0909341SAndroid Build Coastguard Worker.h_have_right:
1962*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2 ; p1 (window shifted by 1 pixel)
1963*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 4 ; p2
1964*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0 ; p1 + p2
1965*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
1966*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; p1^2 + p2^2 (low half of each lane)
1967*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
1968*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3 ; p1^2 + p2^2 (high half of each lane)
1969*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 6 ; p3
1970*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; sum3
1971*c0909341SAndroid Build Coastguard Worker    punpcklwd       m6, m0, m7
1972*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6 ; p3^2
1973*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m7
1974*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1975*c0909341SAndroid Build Coastguard Worker    paddd           m2, m6             ; sumsq3
1976*c0909341SAndroid Build Coastguard Worker    shufpd          m6, m4, m5, 0x05 ; p4: per lane, high qword of m4 followed by low qword of m5
1977*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m6, m4
1978*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m6 ; p0 + p4
1979*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5 ; p0^2 + p4^2
1980*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m4
1981*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
1982*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0 ; sumsq3, high half
1983*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 6], m1
1984*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 8], m2
1985*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*10], m3
1986*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1             ; sum5
1987*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2             ; sumsq5
1988*c0909341SAndroid Build Coastguard Worker    paddd           m6, m3
1989*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 0], m8
1990*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 2], m5
1991*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 4], m6
1992*c0909341SAndroid Build Coastguard Worker    add            r10, 32 ; next 16 pixels
1993*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1994*c0909341SAndroid Build Coastguard Worker    ret
1995*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv0 / .hv0_bottom: one source row at lpfq.
;  1. Horizontal 3-tap and 5-tap sums / sums of squares, as in .h.
;  2. The row's own 5-tap sums are saved in t3 and also accumulated into
;     t1 planes 400*0/2/4.
;  3. The 3-tap sums are combined with the sums kept in t1 and t2 planes
;     400*6/8/10 to form three-row totals; t1 then holds this row's sums
;     and t2 the two-row partial sums.
;  4. From the three-row totals it derives the 3x3 coefficients: x3 is
;     stored as words in t4 plane 400*2, the b term as dwords in t3 plane
;     400*4.
; .hv0 takes left-edge pixels from leftq; .hv0_bottom reads them from memory
; before the row. Uses m7 = 0, m9 = pd_8, m10 = pd_34816, m11 = pf_256,
; m12 = pw_455_24 and m14 = s1, all loaded in the function prologue.
1996*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
1997*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4] ; start 4 bytes (2 pixels) before the row start
1998*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1999*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2000*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq] ; 8 bytes (4 pixels) of left-edge data
2001*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1
2002*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
2003*c0909341SAndroid Build Coastguard Worker    add          leftq, 8 ; next row's left-edge entry
2004*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10 ; m4 = 16 pixels starting 3 pixels left of the row, the first 3 from leftq
2005*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2006*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
2007*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
2008*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, [base+sgr_lshuf5] ; presumably replicates the first pixel into the missing left positions -- confirm against the table
2009*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+10], 1
2010*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
2011*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
2012*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
2013*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2014*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
2015*c0909341SAndroid Build Coastguard Worker.hv0_loop:
2016*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10- 2] ; 16 pixels, first tap p0
2017*c0909341SAndroid Build Coastguard Worker.hv0_main:
2018*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+14] ; the same window shifted right by 8 pixels
2019*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2020*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
2021*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36
2022*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right ; still far enough from the row end: no padding needed
2023*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
2024*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
2025*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 2 ; p1
2026*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 4 ; p2
2027*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
2028*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
2029*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; p1^2 + p2^2
2030*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
2031*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2032*c0909341SAndroid Build Coastguard Worker    palignr         m0, m5, m4, 6 ; p3
2033*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; h sum3
2034*c0909341SAndroid Build Coastguard Worker    punpcklwd       m6, m0, m7
2035*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6 ; p3^2
2036*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m7
2037*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
2038*c0909341SAndroid Build Coastguard Worker    paddd           m2, m6             ; h sumsq3
2039*c0909341SAndroid Build Coastguard Worker    shufpd          m6, m4, m5, 0x05 ; p4: per lane, high qword of m4 followed by low qword of m5
2040*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m6, m4
2041*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m6 ; p0 + p4
2042*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5 ; p0^2 + p4^2
2043*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m4
2044*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
2045*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0 ; h sumsq3, high half
2046*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1             ; h sum5
2047*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2             ; h sumsq5
2048*c0909341SAndroid Build Coastguard Worker    paddd           m6, m3
2049*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
2050*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
2051*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*0+40], m6
2052*c0909341SAndroid Build Coastguard Worker    paddw           m8, [t1+r10+400* 0] ; accumulate the 5-tap sums into the t1 planes
2053*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t1+r10+400* 2]
2054*c0909341SAndroid Build Coastguard Worker    paddd           m6, [t1+r10+400* 4]
2055*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 0], m8
2056*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 2], m5
2057*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 4], m6
2058*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+r10+400* 6] ; 3-tap sums: this row + the sums kept in t1
2059*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+r10+400* 8]
2060*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+r10+400*10]
2061*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 6], m1 ; t1 now holds this row's 3-tap sums only
2062*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400* 8], m2
2063*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*10], m3
2064*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400* 6] ; + the sums kept in t2 = three-row totals
2065*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+400* 8]
2066*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+400*10]
2067*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 6], m0 ; t2 now holds the two-row partial sums
2068*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 8], m4
2069*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*10], m5
2070*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9 ; + 8 (rounding bias for the shift below)
2071*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2072*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
2073*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2074*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3 ; x * 8 ...
2075*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2076*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
2077*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2078*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
2079*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m7             ; (b3 + 2) >> 2
2080*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m7
2081*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; square of the scaled b3
2082*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
2083*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2084*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
2085*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2086*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2 ; max first so the subtraction cannot go below zero
2087*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
2088*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
2089*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2090*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m14            ; p3 * s1
2091*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m14
2092*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b3 * 455
2093*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
2094*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12 ; word-wise add of the pw_455_24 constant before the shift
2095*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
2096*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z3 + 1
2097*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2098*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
2099*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
2100*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z3 + 1)
2101*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
2102*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m11, m4 ; integer compare of the float bit patterns: 256.0 > z3 + 1
2103*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m11, m5
2104*c0909341SAndroid Build Coastguard Worker    mulps           m2, m11            ; 256 / (z3 + 1)
2105*c0909341SAndroid Build Coastguard Worker    mulps           m3, m11
2106*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z3 < 255 ? 255 : 0
2107*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
2108*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
2109*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
2110*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x3
2111*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
2112*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2 ; x3 * b3 * 455
2113*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
2114*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3 ; x3 as 16 words in pixel order
2115*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2116*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
2117*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2118*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2119*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*2+ 4], m2 ; x3 words into the t4 plane
2120*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+ 8], xm0 ; the four 128-bit stores undo the in-lane unpack order
2121*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+40], m0, 1
2122*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+24], xm1
2123*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+56], m1, 1
2124*c0909341SAndroid Build Coastguard Worker    add            r10, 32 ; next 16 pixels
2125*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
2126*c0909341SAndroid Build Coastguard Worker    ret
; .hv1 -- odd-row pass of the combined filter.
; For the input row at lpfq this computes the horizontal 3-tap and 5-tap box
; sums and sums of squares, combines them with the row sums kept in the t1/t2
; buffers, and derives the 3x3 coefficients (stored at t4+400*4 / t3+400*8) and
; the 5x5 coefficients (stored at t4+400*0 / t3+400*0). r10 advances by 32 per
; iteration until it is no longer negative; t1 and t2 are swapped on exit.
; .hv1_bottom is a second entry point that never reads from leftq.
; NOTE(review): relies on constants set up outside this view -- m7 is used as
; an all-zero operand on entry, and per the inline comments m9/m10 are rounding
; terms, m11 is 256.0, m12 holds 455, m13 = s0, m14 = s1; confirm at the caller.
2127*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2128*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
2129*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]  ; r10 = column offset, loop runs while it is negative
2130*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2131*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2132*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm5, [leftq]  ; 8 bytes of left-edge data in the low lane
2133*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+wq], 1
2134*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+wq]
2135*c0909341SAndroid Build Coastguard Worker    add          leftq, 8  ; advance to the next row's left-edge entry
2136*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 10  ; splice the left-edge data in front of the row start
2137*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2138*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
2139*c0909341SAndroid Build Coastguard Worker    mova           xm4, [lpfq+wq]
2140*c0909341SAndroid Build Coastguard Worker    pshufb         xm4, [base+sgr_lshuf5]  ; presumably replicates the first pixel leftwards -- table not visible here
2141*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [lpfq+wq+10], 1
2142*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2143*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
2144*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
2145*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2146*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2147*c0909341SAndroid Build Coastguard Worker.hv1_loop:
2148*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10- 2]  ; left context is read straight from the row
2149*c0909341SAndroid Build Coastguard Worker.hv1_main:
2150*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+14]  ; same row, 16 bytes further than the m4 load
2151*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2152*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
2153*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -36  ; padding is only applied once r10 >= -36
2154*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
2155*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right  ; helper shared with the 5x5 filter; presumably pads m4/m5 past the right edge
2156*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
2157*c0909341SAndroid Build Coastguard Worker    palignr         m6, m5, m4, 2  ; input shifted by 1 pixel
2158*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 4  ; input shifted by 2 pixels
2159*c0909341SAndroid Build Coastguard Worker    paddw           m2, m6, m3
2160*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m6, m3
2161*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0  ; squares of both taps summed per dword
2162*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m3
2163*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
2164*c0909341SAndroid Build Coastguard Worker    palignr         m3, m5, m4, 6  ; input shifted by 3 pixels
2165*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3             ; h sum3
2166*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m3, m7
2167*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
2168*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
2169*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2170*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1             ; h sumsq3
2171*c0909341SAndroid Build Coastguard Worker    shufpd          m1, m4, m5, 0x05  ; input shifted by 4 pixels (same result as palignr by 8)
2172*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4, m1
2173*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4, m1  ; outer two taps of the 5-tap sum
2174*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2175*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m1
2176*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
2177*c0909341SAndroid Build Coastguard Worker    paddd           m6, m3  ; h sumsq3, high half
2178*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2, [t2+r10+400* 6]  ; b3 = h sum3 + sums kept in t2
2179*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 6], m2  ; keep this row's h sum3 in t2
2180*c0909341SAndroid Build Coastguard Worker    paddw           m8, m2             ; h sum5
2181*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t2+r10+400* 8]  ; a3 (low / high halves in m2 / m3)
2182*c0909341SAndroid Build Coastguard Worker    paddd           m3, m6, [t2+r10+400*10]
2183*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 8], m0
2184*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*10], m6
2185*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0             ; h sumsq5
2186*c0909341SAndroid Build Coastguard Worker    paddd           m5, m6
2187*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
2188*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2189*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
2190*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2191*c0909341SAndroid Build Coastguard Worker    pslld           m0, m2, 3
2192*c0909341SAndroid Build Coastguard Worker    pslld           m6, m3, 3
2193*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
2194*c0909341SAndroid Build Coastguard Worker    paddd           m3, m6
2195*c0909341SAndroid Build Coastguard Worker    psrlw           m6, m1, 1
2196*c0909341SAndroid Build Coastguard Worker    pavgw           m6, m7             ; (b3 + 2) >> 2
2197*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m6, m7
2198*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0  ; square of the scaled b3
2199*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m7
2200*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
2201*c0909341SAndroid Build Coastguard Worker    pmaxud          m2, m0  ; max-then-subtract clamps the difference at zero
2202*c0909341SAndroid Build Coastguard Worker    psubd           m2, m0             ; p3
2203*c0909341SAndroid Build Coastguard Worker    pmaxud          m3, m6
2204*c0909341SAndroid Build Coastguard Worker    psubd           m3, m6
2205*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
2206*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2207*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m14            ; p3 * s1
2208*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m14
2209*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b3 * 455
2210*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
2211*c0909341SAndroid Build Coastguard Worker    paddw           m2, m12
2212*c0909341SAndroid Build Coastguard Worker    paddw           m3, m12
2213*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; z + 1
2214*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
2215*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m2, m2
2216*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m3, m3
2217*c0909341SAndroid Build Coastguard Worker    rcpps           m6, m2             ; 1 / (z + 1)
2218*c0909341SAndroid Build Coastguard Worker    rcpps           m7, m3  ; m7 is borrowed as a temporary from here on
2219*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m2, m11, m2
2220*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m11, m3
2221*c0909341SAndroid Build Coastguard Worker    mulps           m6, m11            ; 256 / (z + 1)
2222*c0909341SAndroid Build Coastguard Worker    mulps           m7, m11
2223*c0909341SAndroid Build Coastguard Worker    psrld           m2, 24             ; z < 255 ? 255 : 0
2224*c0909341SAndroid Build Coastguard Worker    psrld           m3, 24
2225*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m6, m6
2226*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m7, m7
2227*c0909341SAndroid Build Coastguard Worker    pminsw          m6, m2             ; x
2228*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
2229*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m6
2230*c0909341SAndroid Build Coastguard Worker    packssdw        m6, m7  ; pack x to words before m7 is overwritten below
2231*c0909341SAndroid Build Coastguard Worker    pmulld          m7, m1
2232*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2233*c0909341SAndroid Build Coastguard Worker    paddd           m7, m10
2234*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2235*c0909341SAndroid Build Coastguard Worker    psrld           m7, 12
2236*c0909341SAndroid Build Coastguard Worker    paddw           m1, m8, [t2+r10+400*0]  ; 5-tap sums: this row + t2 + t1
2237*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+400*2]
2238*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+400*4]
2239*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10+400*0]
2240*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+400*2]
2241*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10+400*4]
2242*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m8  ; keep this row's h sum5 / h sumsq5 in t2
2243*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m4
2244*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m5
2245*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*4 +4], m6  ; packed x3
2246*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*8+ 8], xm0  ; 3x3 products, stored one 128-bit lane at a time
2247*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*8+40], m0, 1
2248*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*8+24], xm7
2249*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*8+56], m7, 1
2250*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [base+pd_25]
2251*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [base+pw_164_24]
2252*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7  ; restore m7 to zero after its use as a temporary
2253*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
2254*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2255*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a5 + 8) >> 4
2256*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2257*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
2258*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m4
2259*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m1, 1
2260*c0909341SAndroid Build Coastguard Worker    pavgw           m5, m7             ; (b5 + 2) >> 2
2261*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m7
2262*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
2263*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m7
2264*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2265*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b5
2266*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2267*c0909341SAndroid Build Coastguard Worker    pmaxud          m2, m4
2268*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4             ; p5
2269*c0909341SAndroid Build Coastguard Worker    pmaxud          m3, m5
2270*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5
2271*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m13            ; p5 * s0
2272*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m13
2273*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m6             ; b5 * 164
2274*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m6
2275*c0909341SAndroid Build Coastguard Worker    paddw           m2, m6
2276*c0909341SAndroid Build Coastguard Worker    paddw           m3, m6
2277*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; z5 + 1
2278*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
2279*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m2, m2
2280*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m3, m3
2281*c0909341SAndroid Build Coastguard Worker    rcpps           m4, m2             ; 1 / (z5 + 1)
2282*c0909341SAndroid Build Coastguard Worker    rcpps           m5, m3
2283*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m2, m11, m2
2284*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m11, m3
2285*c0909341SAndroid Build Coastguard Worker    mulps           m4, m11            ; 256 / (z5 + 1)
2286*c0909341SAndroid Build Coastguard Worker    mulps           m5, m11
2287*c0909341SAndroid Build Coastguard Worker    psrld           m2, 24             ; z5 < 255 ? 255 : 0
2288*c0909341SAndroid Build Coastguard Worker    psrld           m3, 24
2289*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m4, m4
2290*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m5, m5
2291*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2             ; x5
2292*c0909341SAndroid Build Coastguard Worker    pminsw          m5, m3
2293*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m4
2294*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m5
2295*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5
2296*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
2297*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
2298*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2299*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2300*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*0+ 4], m4  ; packed x5
2301*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+ 8], xm0  ; 5x5 products, stored one 128-bit lane at a time
2302*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+40], m0, 1
2303*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+24], xm1
2304*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+56], m1, 1
2305*c0909341SAndroid Build Coastguard Worker    add            r10, 32
2306*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
2307*c0909341SAndroid Build Coastguard Worker    mov            r10, t2  ; swap t1 and t2, with r10 as scratch
2308*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2309*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2310*c0909341SAndroid Build Coastguard Worker    ret
; .v0 -- even-row pass that reads no new input row.
; The 3-tap row sums stored at t1 (+400*6/8/10) are doubled, added to the ones
; in t2 to form the 3x3 box sums, and written to t2; the 3x3 coefficients are
; then computed and stored at t4+400*2 / t3+400*4. The 5-tap row sums at t1
; (+400*0/2/4) are copied into t3 scratch slots and doubled in place.
; NOTE(review): doubling presumably stands in for a repeated row at the frame
; edge -- confirm against the caller. The constant registers (m7 zero, m9..m12,
; m14) are set up outside this view.
2311*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows)
2312*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]  ; r10 = column offset, loop runs while it is negative
2313*c0909341SAndroid Build Coastguard Worker.v0_loop:
2314*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+400* 6]  ; stored 3-tap sum and sumsq halves
2315*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+400* 8]
2316*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+400*10]
2317*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0  ; count the stored row twice
2318*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2319*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
2320*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+400* 6]  ; b3
2321*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+400* 8]  ; a3 (low / high halves in m2 / m3)
2322*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+400*10]
2323*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 6], m0  ; t2 now holds the doubled sums
2324*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 8], m4
2325*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*10], m5
2326*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
2327*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2328*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
2329*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2330*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2331*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2332*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
2333*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2334*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
2335*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m7             ; (b3 + 2) >> 2
2336*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m7
2337*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2  ; square of the scaled b3
2338*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
2339*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2340*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
2341*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2342*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2  ; max-then-subtract clamps the difference at zero
2343*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
2344*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
2345*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2346*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m14            ; p3 * s1
2347*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m14
2348*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b3 * 455
2349*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
2350*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12
2351*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
2352*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
2353*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2354*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
2355*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
2356*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
2357*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
2358*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m11, m4
2359*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m11, m5
2360*c0909341SAndroid Build Coastguard Worker    mulps           m2, m11            ; 256 / (z + 1)
2361*c0909341SAndroid Build Coastguard Worker    mulps           m3, m11
2362*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
2363*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
2364*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
2365*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
2366*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
2367*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
2368*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
2369*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
2370*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
2371*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2372*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
2373*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2374*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2375*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+400*0]  ; stored 5-tap sum and sumsq halves
2376*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+400*2]
2377*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+400*4]
2378*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*8+ 8], m3  ; park undoubled copies in t3 scratch slots (.v1 reads these offsets back)
2379*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*0+ 8], m4
2380*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*0+40], m5
2381*c0909341SAndroid Build Coastguard Worker    paddw           m3, m3 ; cc5
2382*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2383*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
2384*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*0], m3  ; t1 now holds the doubled 5-tap sums
2385*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*2], m4
2386*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+400*4], m5
2387*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*2+ 4], m2  ; packed x3
2388*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+ 8], xm0  ; 3x3 products, stored one 128-bit lane at a time
2389*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+40], m0, 1
2390*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*4+24], xm1
2391*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*4+56], m1, 1
2392*c0909341SAndroid Build Coastguard Worker    add            r10, 32
2393*c0909341SAndroid Build Coastguard Worker    jl .v0_loop
2394*c0909341SAndroid Build Coastguard Worker    ret
; .v1 -- odd-row pass that reads no new input row.
; The 3-tap row sums at t1 (+400*6/8/10) are added to those in t2 and copied
; into t2, giving the 3x3 coefficients stored at t4+400*4 / t3+400*8. The 5-tap
; sums are rebuilt from the values held in the t3 scratch slots plus t2 and t1,
; giving the 5x5 coefficients stored at t4+400*0 / t3+400*0. t1 and t2 are
; swapped on exit.
; NOTE(review): the scratch slots read here are the ones .v0 writes; whether
; other paths outside this view also fill them is not visible. The constant
; registers (m7 zero, m9..m14) are set up outside this view.
2395*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
2396*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]  ; r10 = column offset, loop runs while it is negative
2397*c0909341SAndroid Build Coastguard Worker.v1_loop:
2398*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+400* 6]  ; stored 3-tap sum and sumsq halves
2399*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+400* 8]
2400*c0909341SAndroid Build Coastguard Worker    mova            m6, [t1+r10+400*10]
2401*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+r10+400* 6]  ; b3
2402*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+r10+400* 8]  ; a3 (low / high halves in m2 / m3)
2403*c0909341SAndroid Build Coastguard Worker    paddd           m3, m6, [t2+r10+400*10]
2404*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 6], m4  ; copy the t1 sums into t2
2405*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400* 8], m5
2406*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*10], m6
2407*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
2408*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2409*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
2410*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2411*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2412*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2413*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
2414*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2415*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
2416*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m7             ; (b3 + 2) >> 2
2417*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m7
2418*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2  ; square of the scaled b3
2419*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
2420*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
2421*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b3
2422*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2423*c0909341SAndroid Build Coastguard Worker    pmaxud          m4, m2  ; max-then-subtract clamps the difference at zero
2424*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2             ; p3
2425*c0909341SAndroid Build Coastguard Worker    pmaxud          m5, m3
2426*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2427*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m14            ; p3 * s1
2428*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m14
2429*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12            ; b3 * 455
2430*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
2431*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12
2432*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
2433*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20             ; z + 1
2434*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2435*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
2436*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
2437*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4             ; 1 / (z + 1)
2438*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
2439*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m11, m4
2440*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m11, m5
2441*c0909341SAndroid Build Coastguard Worker    mulps           m2, m11            ; 256 / (z + 1)
2442*c0909341SAndroid Build Coastguard Worker    mulps           m3, m11
2443*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24             ; z < 255 ? 255 : 0
2444*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
2445*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
2446*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
2447*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4             ; x
2448*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
2449*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
2450*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
2451*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
2452*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2453*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
2454*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2455*c0909341SAndroid Build Coastguard Worker    psrld           m8, m1, 12  ; result kept in m8 because m1 is reused for the 5-tap sum below
2456*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*4+4], m2  ; packed x3
2457*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*2+400*8+ 8]  ; fetch values from the t3 scratch slots before they are overwritten below
2458*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*2+400*0+ 8]
2459*c0909341SAndroid Build Coastguard Worker    mova            m6, [t3+r10*2+400*0+40]
2460*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+r10+400*0]  ; 5-tap sums: scratch + t2 + t1
2461*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+r10+400*2]
2462*c0909341SAndroid Build Coastguard Worker    paddd           m3, m6, [t2+r10+400*4]
2463*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10+400*0]
2464*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+400*2]
2465*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10+400*4]
2466*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*0], m4  ; move the scratch values into t2
2467*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*2], m5
2468*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+400*4], m6
2469*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*8+ 8], xm0  ; 3x3 products, stored one 128-bit lane at a time
2470*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*8+40], m0, 1
2471*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*8+24], xm8
2472*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*8+56], m8, 1
2473*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [base+pd_25]
2474*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [base+pw_164_24]
2475*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
2476*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
2477*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a5 + 8) >> 4
2478*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2479*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
2480*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m4
2481*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m1, 1
2482*c0909341SAndroid Build Coastguard Worker    pavgw           m5, m7             ; (b5 + 2) >> 2
2483*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m7
2484*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
2485*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m7
2486*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
2487*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7         ; b5
2488*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2489*c0909341SAndroid Build Coastguard Worker    pmaxud          m2, m4
2490*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4             ; p5
2491*c0909341SAndroid Build Coastguard Worker    pmaxud          m3, m5
2492*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5
2493*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m13            ; p5 * s0
2494*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m13
2495*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m6             ; b5 * 164
2496*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m6
2497*c0909341SAndroid Build Coastguard Worker    paddw           m2, m6
2498*c0909341SAndroid Build Coastguard Worker    paddw           m3, m6
2499*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20             ; z5 + 1
2500*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
2501*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m2, m2
2502*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m3, m3
2503*c0909341SAndroid Build Coastguard Worker    rcpps           m4, m2             ; 1 / (z5 + 1)
2504*c0909341SAndroid Build Coastguard Worker    rcpps           m5, m3
2505*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m2, m11, m2
2506*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m11, m3
2507*c0909341SAndroid Build Coastguard Worker    mulps           m4, m11            ; 256 / (z5 + 1)
2508*c0909341SAndroid Build Coastguard Worker    mulps           m5, m11
2509*c0909341SAndroid Build Coastguard Worker    psrld           m2, 24             ; z5 < 255 ? 255 : 0
2510*c0909341SAndroid Build Coastguard Worker    psrld           m3, 24
2511*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m4, m4
2512*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m5, m5
2513*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2             ; x5
2514*c0909341SAndroid Build Coastguard Worker    pminsw          m5, m3
2515*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m4
2516*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m5
2517*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5
2518*c0909341SAndroid Build Coastguard Worker    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
2519*c0909341SAndroid Build Coastguard Worker    paddd           m1, m10
2520*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
2521*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
2522*c0909341SAndroid Build Coastguard Worker    mova         [t4+r10*1+400*0+ 4], m4  ; packed x5
2523*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+ 8], xm0  ; 5x5 products, stored one 128-bit lane at a time
2524*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+40], m0, 1
2525*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*2+400*0+24], xm1
2526*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*2+400*0+56], m1, 1
2527*c0909341SAndroid Build Coastguard Worker    add            r10, 32
2528*c0909341SAndroid Build Coastguard Worker    jl .v1_loop
2529*c0909341SAndroid Build Coastguard Worker    mov            r10, t2  ; swap t1 and t2, with r10 as scratch
2530*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2531*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2532*c0909341SAndroid Build Coastguard Worker    ret
2533*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
    ; Combines every element with its left and right neighbor (element offsets
    ; +0, +1, +2) for three buffered rows and stores the weighted sums:
    ;   t4+400*0 / t3+400*0 -> 5:6:5 sums at t4+400*6  / t3+400*12
    ;   t4+400*2 / t3+400*4 -> 3:4:3 sums at t4+400*8  / t3+400*16
    ;   t4+400*4 / t3+400*8 -> 4:4:4 sums at t4+400*10 / t3+400*20
    ;                          3:4:3 sums at t4+400*12 / t3+400*24
    ; These are the same offsets that .n0 and .n1 read back and update.
    ; t4 data is processed as 16-bit words (paddw/psllw) and t3 data as 32-bit
    ; dwords (paddd/pslld); t3 is indexed with r10*2, so one pass of 16 bytes of
    ; t4 and 32 bytes of t3 covers the same 8 elements.
    ; r10 starts at wq and is stepped by 16 while the result is negative (jl),
    ; so wq is expected to be a negative byte offset here.
    ; NOTE(review): confirm the sign/units of wq against the caller.
    ; Clobbers r10, m0-m3 and flags.
2534*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
2535*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
    ; 5:6:5 sums: with l/c/r the left/center/right elements,
    ; (l+c+r)*4 + (l+c+r) + c = 5*l + 6*c + 5*r
2536*c0909341SAndroid Build Coastguard Worker    movu           xm0, [t4+r10*1+400*0+2]
2537*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0, [t4+r10*1+400*0+0]
2538*c0909341SAndroid Build Coastguard Worker    paddw          xm2, [t4+r10*1+400*0+4]
2539*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+400*0+4]
2540*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+400*0+0]
2541*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*2+400*0+8]
2542*c0909341SAndroid Build Coastguard Worker    paddw          xm0, xm2
2543*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
2544*c0909341SAndroid Build Coastguard Worker    psllw          xm2, 2
2545*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2
2546*c0909341SAndroid Build Coastguard Worker    paddw          xm0, xm2              ; a5 565
2547*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3               ; b5 565
2548*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 6], xm0
2549*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12], m1
    ; 3:4:3 sums of the second row: xm0/m1 hold l+r, so
    ; (l+c+r)*4 - (l+r) = 3*l + 4*c + 3*r
2550*c0909341SAndroid Build Coastguard Worker    mova           xm0, [t4+r10*1+400*2+0]
2551*c0909341SAndroid Build Coastguard Worker    paddw          xm0, [t4+r10*1+400*2+4]
2552*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0, [t4+r10*1+400*2+2]
2553*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+400*4+0]
2554*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*4+8]
2555*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+400*4+4]
2556*c0909341SAndroid Build Coastguard Worker    psllw          xm2, 2                ; a3[-1] 444
2557*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2                ; b3[-1] 444
2558*c0909341SAndroid Build Coastguard Worker    psubw          xm2, xm0              ; a3[-1] 343
2559*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1               ; b3[-1] 343
2560*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 8], xm2
2561*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*16], m3
    ; Third row: the 4:4:4 sums are stored first, then turned into 3:4:3 sums
    ; and stored to a separate slot, so both variants are kept.
2562*c0909341SAndroid Build Coastguard Worker    mova           xm0, [t4+r10*1+400*4+0]
2563*c0909341SAndroid Build Coastguard Worker    paddw          xm0, [t4+r10*1+400*4+4]
2564*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0, [t4+r10*1+400*4+2]
2565*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+400*8+0]
2566*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*8+8]
2567*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+400*8+4]
2568*c0909341SAndroid Build Coastguard Worker    psllw          xm2, 2                 ; a3[ 0] 444
2569*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2                 ; b3[ 0] 444
2570*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*10], xm2
2571*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*20], m3
2572*c0909341SAndroid Build Coastguard Worker    psubw          xm2, xm0               ; a3[ 0] 343
2573*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1                ; b3[ 0] 343
2574*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*12], xm2
2575*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*24], m3
2576*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2577*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
2578*c0909341SAndroid Build Coastguard Worker    ret
2579*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2580*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
    ; Per pass of the loop (8 output words at [dstq+r10]):
    ;   1. Builds 5:6:5 neighbor sums (a5 words / b5 dwords) from the rows at
    ;      t4+0 / t3+0, adds the sums currently stored at t4+400*6 / t3+400*12,
    ;      and overwrites those slots with the new sums.
    ;   2. Builds 4:4:4 and 3:4:3 sums (a3 / b3) from the rows at t4+400*2 /
    ;      t3+400*4, adds the new 3:4:3 sums to the values stored at t4+400*8
    ;      and t4+400*10 (t3+400*16 and t3+400*20), then overwrites those slots
    ;      with the new 3:4:3 and 4:4:4 sums respectively.
    ;   3. Forms b - a*src for both sum sets, packs the two results into one
    ;      word pair per dword, multiplies the pairs by m15 (pmaddwd), adds
    ;      src << 13 and m6, shifts, saturates and stores to [dstq+r10].
    ; m15 is set up outside this routine.
    ; NOTE(review): m15 presumably holds the two filter weights as a word pair
    ; per dword - confirm where it is loaded.
    ; r10 runs from wq upward in steps of 16 while negative (jl).
    ; On exit dstq has been advanced by strideq.
    ; Clobbers r10, m0-m6 and flags.
2581*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
2582*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [base+pd_4096]
2583*c0909341SAndroid Build Coastguard Worker.n0_loop:
    ; a5: (l+c+r)*4 + (l+c+r) + c = 5*l + 6*c + 5*r on 16-bit words
2584*c0909341SAndroid Build Coastguard Worker    movu           xm2, [t4+r10*1+2]
2585*c0909341SAndroid Build Coastguard Worker    paddw          xm0, xm2, [t4+r10*1+0]
2586*c0909341SAndroid Build Coastguard Worker    paddw          xm0, [t4+r10*1+4]
2587*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0
2588*c0909341SAndroid Build Coastguard Worker    psllw          xm0, 2
2589*c0909341SAndroid Build Coastguard Worker    paddw          xm0, xm2              ; a5
    ; b5: same 5:6:5 weighting on 32-bit dwords
2590*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+4]
2591*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+r10*2+0]
2592*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+8]
2593*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2594*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
2595*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1               ; b5
    ; xm2 / m0 = new sums + stored sums; the stored slots are read before
    ; being overwritten with the new sums.
2596*c0909341SAndroid Build Coastguard Worker    paddw          xm2, xm0, [t4+r10*1+400* 6]
2597*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 6], xm0
2598*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4, [t3+r10*2+400*12]
2599*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*12], m4
    ; a3: xm3 = l+r, xm5 = 4*(l+c+r), xm4 = xm5 - (l+r) = 3*l + 4*c + 3*r
2600*c0909341SAndroid Build Coastguard Worker    mova           xm3, [t4+r10*1+400*2+0]
2601*c0909341SAndroid Build Coastguard Worker    paddw          xm3, [t4+r10*1+400*2+4]
2602*c0909341SAndroid Build Coastguard Worker    paddw          xm5, xm3, [t4+r10*1+400*2+2]
2603*c0909341SAndroid Build Coastguard Worker    psllw          xm5, 2                ; a3[ 1] 444
2604*c0909341SAndroid Build Coastguard Worker    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    ; xm3 = new 343 + stored values at 400*8 and 400*10, then both slots are
    ; replaced (400*8 <- new 343, 400*10 <- new 444)
2605*c0909341SAndroid Build Coastguard Worker    paddw          xm3, xm4, [t4+r10*1+400* 8]
2606*c0909341SAndroid Build Coastguard Worker    paddw          xm3, [t4+r10*1+400*10]
2607*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400* 8], xm4
2608*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*10], xm5
    ; b3: same sequence on dwords, using slots 400*16 and 400*20 of t3
2609*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+400*4+0]
2610*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*4+8]
2611*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1, [t3+r10*2+400*4+4]
2612*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
2613*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
2614*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*2+400*16]
2615*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*20]
2616*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*16], m4
2617*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*20], m5
    ; m4 = 8 words from [dstq+r10] zero-extended to dwords; a5/a3 are
    ; zero-extended the same way, so each pmaddwd below yields one
    ; word-by-word product per dword (the upper words are zero).
2618*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m4, [dstq+r10]
2619*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m2, xm2              ; a5
2620*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m3, xm3              ; a3
2621*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4               ; a5 * src
2622*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4               ; a3 * src
2623*c0909341SAndroid Build Coastguard Worker    pslld           m4, 13
2624*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
2625*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
    ; m0 >> 9 leaves the 5-tap term in the low word; m1 << 7 moves bits 9 and
    ; up of the 3-tap term into the high word. pblendw 0xaa then takes the odd
    ; (high) words from m1 and keeps the even (low) words of m0, so each dword
    ; holds one word from each term for the pmaddwd with m15.
    ; NOTE(review): the shift is 9 here versus 8 in .n1, presumably because two
    ; 5:6:5 sums were added above - confirm against the reference filter.
2626*c0909341SAndroid Build Coastguard Worker    psrld           m0, 9
2627*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
2628*c0909341SAndroid Build Coastguard Worker    pblendw         m0, m1, 0xaa
2629*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
    ; add (src << 13) and the broadcast pd_4096 constant, arithmetic >> 7,
    ; pack dwords to words with unsigned saturation, then logical >> 6
2630*c0909341SAndroid Build Coastguard Worker    paddd           m4, m6
2631*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4
2632*c0909341SAndroid Build Coastguard Worker    psrad           m0, 7
2633*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1
2634*c0909341SAndroid Build Coastguard Worker    packusdw       xm0, xm1              ; clip
2635*c0909341SAndroid Build Coastguard Worker    psrlw          xm0, 6
2636*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm0
2637*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2638*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
2639*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
2640*c0909341SAndroid Build Coastguard Worker    ret
2641*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2642*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
    ; Per pass of the loop (8 output words at [dstq+r10]):
    ;   1. Builds 4:4:4 and 3:4:3 neighbor sums (a3 words / b3 dwords) from the
    ;      rows at t4+400*4 / t3+400*8, adds the new 3:4:3 sums to the values
    ;      stored at t4+400*12 and t4+400*10 (t3+400*24 and t3+400*20), then
    ;      overwrites 400*10 (400*20) with the new 4:4:4 sums and 400*12
    ;      (400*24) with the new 3:4:3 sums.
    ;   2. Reads the a5 / b5 terms unchanged from t4+400*6 / t3+400*12; no new
    ;      5:6:5 sums are computed here.
    ;   3. Forms b - a*src for both sum sets, packs the two results into one
    ;      word pair per dword, multiplies the pairs by m15 (pmaddwd), adds
    ;      src << 13 and m6, shifts, saturates and stores to [dstq+r10].
    ; m15 is set up outside this routine.
    ; NOTE(review): m15 presumably holds the two filter weights as a word pair
    ; per dword - confirm where it is loaded.
    ; r10 runs from wq upward in steps of 16 while negative (jl).
    ; On exit dstq has been advanced by strideq.
    ; Clobbers r10, m0-m6 and flags.
2643*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
2644*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [base+pd_4096]
2645*c0909341SAndroid Build Coastguard Worker.n1_loop:
    ; a3: xm3 = l+r, xm5 = 4*(l+c+r), xm4 = xm5 - (l+r) = 3*l + 4*c + 3*r
2646*c0909341SAndroid Build Coastguard Worker    mova           xm3, [t4+r10*1+400*4+0]
2647*c0909341SAndroid Build Coastguard Worker    paddw          xm3, [t4+r10*1+400*4+4]
2648*c0909341SAndroid Build Coastguard Worker    paddw          xm5, xm3, [t4+r10*1+400*4+2]
2649*c0909341SAndroid Build Coastguard Worker    psllw          xm5, 2                ; a3[ 1] 444
2650*c0909341SAndroid Build Coastguard Worker    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    ; stored slots are read before being overwritten
2651*c0909341SAndroid Build Coastguard Worker    paddw          xm3, xm4, [t4+r10*1+400*12]
2652*c0909341SAndroid Build Coastguard Worker    paddw          xm3, [t4+r10*1+400*10]
2653*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*10], xm5
2654*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+400*12], xm4
    ; b3: same sequence on dwords, using slots 400*24 and 400*20 of t3
2655*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+400*8+0]
2656*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*8+8]
2657*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1, [t3+r10*2+400*8+4]
2658*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
2659*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
2660*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*2+400*24]
2661*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+400*20]
2662*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*20], m5
2663*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+400*24], m4
    ; m4 = 8 words from [dstq+r10], m2 = 8 stored a5 words, both zero-extended
    ; to dwords; m0 = stored b5 dwords. With zero upper words each pmaddwd
    ; below yields one word-by-word product per dword.
2664*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m4, [dstq+r10]
2665*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m2, [t4+r10*1+400* 6]
2666*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m3, xm3
2667*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*2+400*12]
2668*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4               ; a5 * src
2669*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4               ; a3 * src
2670*c0909341SAndroid Build Coastguard Worker    pslld           m4, 13
2671*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
2672*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
    ; m0 >> 8 leaves the 5-tap term in the low word; m1 << 7 moves bits 9 and
    ; up of the 3-tap term into the high word. pblendw 0xaa then takes the odd
    ; (high) words from m1 and keeps the even (low) words of m0, so each dword
    ; holds one word from each term for the pmaddwd with m15.
    ; NOTE(review): the shift is 8 here versus 9 in .n0, presumably because only
    ; one stored 5:6:5 sum is used - confirm against the reference filter.
2673*c0909341SAndroid Build Coastguard Worker    psrld           m0, 8
2674*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
2675*c0909341SAndroid Build Coastguard Worker    pblendw         m0, m1, 0xaa
2676*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15
    ; add (src << 13) and the broadcast pd_4096 constant, arithmetic >> 7,
    ; pack dwords to words with unsigned saturation, then logical >> 6
2677*c0909341SAndroid Build Coastguard Worker    paddd           m4, m6
2678*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4
2679*c0909341SAndroid Build Coastguard Worker    psrad           m0, 7
2680*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1
2681*c0909341SAndroid Build Coastguard Worker    packusdw       xm0, xm1              ; clip
2682*c0909341SAndroid Build Coastguard Worker    psrlw          xm0, 6
2683*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm0
2684*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2685*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
2686*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
2687*c0909341SAndroid Build Coastguard Worker    ret
2688*c0909341SAndroid Build Coastguard Worker
2689*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
2690