; xref: /aosp_15_r20/external/libdav1d/src/x86/looprestoration_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; pshufb control for rows without LR_HAVE_LEFT, applied to a vector whose
; byte 4 is the first pixel of the row: the low 128-bit lane copies byte 4
; into bytes 0-3 (the left padding), the high lane is an identity shuffle.
wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
; Pixel pairs fed to pmaddubsw by the horizontal Wiener pass. Byte 4+k of the
; source vector is the pixel under output k, so the pairs sit at offsets
; (-3,+3) for shufA, (-2,-1) for shufB and (+2,+1) for shufC around it.
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
; sgr_* tables: presumably for the self-guided (sgr) filter elsewhere in this
; file - confirm against the code that references them.
sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_r_ext:     times 16 db 1
               times 16 db 9
; The Wiener filters also load 16 bytes from sgr_shuf+6, i.e.
; 4,-1,5,-1,...,11,-1: bytes 4-11 zero-extended to words (a pshufb index
; with the top bit set yields 0).
sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
               db  9, -1, 10, -1, 11, -1, 12, -1

; Small constants, sized for vpbroadcastd. pb_3 and pb_m5 are the index limits
; used by .extend_right; pw_m16380 and pw_2056 are the biases added in the
; horizontal Wiener pass.
pb_m5:         times 4 db -5
pb_3:          times 4 db 3
pw_5_6:        dw 5, 6
pw_164_24:     dw 164, 24
pw_455_24:     dw 455, 24
pw_256:        times 2 dw 256
pw_2056:       times 2 dw 2056
pw_m16380:     times 2 dw -16380
pd_25:         dd 25
pd_34816:      dd 34816
pd_m4096:      dd -4096
pf_256:        dd 256.0

; Defined in another object file; presumably the byte ramp 0, 1, ..., 63
; (.extend_right uses it as the identity index vector) - confirm at its
; definition.
cextern pb_0to63

SECTION .text

; t0-t6 are aliases for r8, r7, r9, r11, r12, r13, r14.
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

INIT_YMM avx2
;-----------------------------------------------------------------------
; wiener_filter7_8bpc - separable 7-tap filter, 8-bit pixels, AVX2.
;
; Call-order arguments (0-7): dst, stride, left, lpf, w, h, flt, edge.
; NOTE: the cglobal line names r6 "edge" and r7 "flt", but argument 6 is the
; coefficient pointer and argument 7 the edge mask; the first two
; instructions load each argument into the *other* register.
;
;   dst     in/out pixels; the rows of the block are read from and written
;           back to dst
;   stride  byte distance between rows (used for both dst and lpf)
;   left    4 bytes per block row with the pixels left of the block; only
;           read when LR_HAVE_LEFT is set
;   lpf     the two rows above the block (read when LR_HAVE_TOP is set);
;           the two rows below are read from lpf+stride*6 when
;           LR_HAVE_BOTTOM is set
;   w, h    block size in pixels. Rows are processed 32 pixels at a time
;           with whole-vector loads/stores, so accesses can extend past w.
;           Ring buffer rows are 384 words apart, so w is presumably
;           limited to 384 - confirm with the caller.
;   flt     16-bit coefficients: x0..x3 at flt+0 (horizontal), y0..y3 at
;           flt+16 (vertical); both filters are applied symmetrically
;   edge    bit mask: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
;           4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM
;
; Stack: six rows of 384 words starting at rsp+16; [rsp] holds the pointer
; to the first row below the block.
;
; Register roles after setup:
;   m6-m8   wiener_shufA/B/C          m11  x0 as bytes
;   m9      center pixel -> word      m12  x1 x2 as bytes
;   m10     pw_m16380                 m13  x3 as words
;   m14     y0 y1 << 5                m15  y2 y3 << 5
;   wq      -w; dstq, lpfq and t1 are biased by +w so that the column index
;           r10 runs from -w up to 0
;   t1-t6   intermediate rows, newest (t1) to oldest (t6); t0 is the row
;           .hv fills next
; .v reuses m6-m9 as temporaries, so it is only called once all horizontal
; passes are done.
;-----------------------------------------------------------------------
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt
    mov           fltq, r6mp
    movifnidn       hd, hm
    mov          edged, r7m
    mov             wd, wm
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
    vbroadcasti128  m7, [wiener_shufB]
    vpbroadcastd   m12, [fltq+ 2]
    vbroadcasti128  m8, [wiener_shufC]
    packsswb       m12, m12       ; x1 x2
    vpbroadcastw   m13, [fltq+ 6] ; x3
    vbroadcasti128  m9, [sgr_shuf+6]
    add           lpfq, wq
    vpbroadcastd   m10, [pw_m16380]
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           dstq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq*2+16]
    ; y coefficients are pre-shifted so that the vertical sums end up in the
    ; high word of each dword (see the pack sequence in .hv/.v)
    psllw          m14, 5
    neg             wq
    psllw          m15, 5
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; prime the ring buffer with the two rows above the block; the first of
    ; them also stands in for the missing third row above (t6 = t5)
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    ; horizontal pass over up to three block rows before the first output
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    ; one input row in, one output row out
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    ; two rows below the block, then one vertical-only pass
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    ; no rows above: the first block row stands in for all of them
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    ; .hv left t0 pointing at the first row, which t4-t6 still reference;
    ; skip ahead to a row that is not in use
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
    ; no rows below: finish with vertical-only passes, which reuse the
    ; newest row in place of the missing ones
.v3:
    call .v
.v2:
    call .v
    jmp .v1
; Pad m4/m5 past the right edge. r10 = -(pixels left in the row). The last
; valid pixel is byte 3-r10 of m4 (loaded 4 bytes before the current column)
; and byte -5-r10 of m5 (loaded 4 bytes after it); the shuffle indices are
; the pb_0to63 vector clamped to those limits. Clobbers m0-m3.
.extend_right:
    movd           xm2, r10d
    vpbroadcastd    m0, [pb_3]
    vpbroadcastd    m1, [pb_m5]
    vpbroadcastb    m2, xm2
    mova            m3, [pb_0to63]
    psubb           m0, m2
    psubb           m1, m2
    pminub          m0, m3
    pminub          m1, m3
    pshufb          m4, m0
    pshufb          m5, m1
    ret
; Horizontal pass: filter the row at lpfq into the intermediate row at t1.
; .h is used for block rows (left pixels come from leftq), .h_top for rows
; read through lpf (left pixels are read from the row itself).
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    ; dword 0 from left[], dwords 1-7 from the row
    movd           xm4, [leftq]
    vpblendd        m4, [lpfq+r10-4], 0xfe
    add          leftq, 4
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    palignr         m4, m5, 12
    pshufb          m4, [wiener_l_shuf]
    jmp .h_main
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10-4]
.h_main:
    movu            m5, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    ; with more than 34 pixels left, none of the taps reach past the edge
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    ; m0/m1 accumulate the x0, x1 and x2 tap pairs for the outputs taken
    ; from m4/m5
    pshufb          m0, m4, m6
    pmaddubsw       m0, m11
    pshufb          m1, m5, m6
    pmaddubsw       m1, m11
    pshufb          m2, m4, m7
    pmaddubsw       m2, m12
    pshufb          m3, m5, m7
    pmaddubsw       m3, m12
    paddw           m0, m2
    pshufb          m2, m4, m8
    pmaddubsw       m2, m12
    paddw           m1, m3
    pshufb          m3, m5, m8
    pmaddubsw       m3, m12
    pshufb          m4, m9          ; center pixels as words
    paddw           m0, m2
    pmullw          m2, m4, m13     ; center * x3
    pshufb          m5, m9
    paddw           m1, m3
    pmullw          m3, m5, m13
    ; (center << 7) + pw_m16380 is added with signed saturation, then the
    ; sum is shifted right by 3 and offset by pw_2056
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m10
    paddw           m5, m10
    paddw           m0, m2
    vpbroadcastd    m2, [pw_2056]
    paddw           m1, m3
    paddsw          m0, m4
    paddsw          m1, m5
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m2
    paddw           m1, m2
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+32], m1
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
; Combined pass: horizontally filter one new row (stored at t0), combine it
; with rows t1-t6 vertically, write one output row to dstq, then rotate the
; ring buffer pointers. .hv advances lpfq to the next block row first;
; .hv_bottom filters the row lpfq already points at.
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd           xm4, [leftq]
    vpblendd        m4, [lpfq+r10-4], 0xfe
    add          leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu            m4, [lpfq+r10-4]
    pshufb          m4, [wiener_l_shuf]
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10-4]
.hv_main:
    movu            m5, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    ; horizontal part: same arithmetic as .h_have_right, interleaved with
    ; the loads for the vertical part
    pshufb          m0, m4, m6
    pmaddubsw       m0, m11
    pshufb          m1, m5, m6
    pmaddubsw       m1, m11
    pshufb          m2, m4, m7
    pmaddubsw       m2, m12
    pshufb          m3, m5, m7
    pmaddubsw       m3, m12
    paddw           m0, m2
    pshufb          m2, m4, m8
    pmaddubsw       m2, m12
    paddw           m1, m3
    pshufb          m3, m5, m8
    pmaddubsw       m3, m12
    pshufb          m4, m9
    paddw           m0, m2
    pmullw          m2, m4, m13
    pshufb          m5, m9
    paddw           m1, m3
    pmullw          m3, m5, m13
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m10
    paddw           m5, m10
    paddw           m0, m2
    paddw           m1, m3
    mova            m2, [t4+r10*2]
    paddw           m2, [t2+r10*2]  ; t4+t2 (y2 pair)
    mova            m3, [t3+r10*2]  ; t3    (y3, center row)
    paddsw          m0, m4
    vpbroadcastd    m4, [pw_2056]
    paddsw          m1, m5
    mova            m5, [t5+r10*2]
    paddw           m5, [t1+r10*2]  ; t5+t1 (y1 pair)
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m4
    paddw           m1, m4
    ; t6 is read before the store to t0: in steady state they are the same row
    paddw           m4, m0, [t6+r10*2] ; new+t6 (y0 pair)
    mova    [t0+r10*2], m0
    punpcklwd       m0, m2, m3
    pmaddwd         m0, m15
    punpckhwd       m2, m3
    pmaddwd         m2, m15
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m14
    punpckhwd       m4, m5
    pmaddwd         m4, m14
    paddd           m0, m3
    paddd           m4, m2
    mova            m2, [t4+r10*2+32]
    paddw           m2, [t2+r10*2+32]
    mova            m3, [t3+r10*2+32]
    mova            m5, [t5+r10*2+32]
    paddw           m5, [t1+r10*2+32]
    ; packuswb clamps both 16-bit halves of every dword sum to 0..255; the
    ; psrlw 8 below keeps the clamped high half, i.e. sum >> 16
    packuswb        m0, m4
    paddw           m4, m1, [t6+r10*2+32]
    mova [t0+r10*2+32], m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m15
    punpckhwd       m2, m3
    pmaddwd         m2, m15
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m14
    punpckhwd       m4, m5
    pmaddwd         m4, m14
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    ; rotate: every row gets one step older, the oldest row is recycled as
    ; the next t0
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
; Vertical-only pass for the last output rows: the newest row (t1) takes the
; place of the missing new row, so it is paired with both t6 (y0) and t5 (y1).
; Uses m6-m9 as temporaries. t1 is left unchanged by the rotation.
.v:
    mov            r10, wq
.v_loop:
    mova            m2, [t4+r10*2+ 0]
    paddw           m2, [t2+r10*2+ 0]
    mova            m4, [t3+r10*2+ 0]
    mova            m6, [t1+r10*2+ 0]
    paddw           m8, m6, [t6+r10*2+ 0]
    paddw           m6, [t5+r10*2+ 0]
    mova            m3, [t4+r10*2+32]
    paddw           m3, [t2+r10*2+32]
    mova            m5, [t3+r10*2+32]
    mova            m7, [t1+r10*2+32]
    paddw           m9, m7, [t6+r10*2+32]
    paddw           m7, [t5+r10*2+32]
    punpcklwd       m0, m2, m4
    pmaddwd         m0, m15
    punpckhwd       m2, m4
    pmaddwd         m2, m15
    punpcklwd       m4, m8, m6
    pmaddwd         m4, m14
    punpckhwd       m6, m8, m6
    pmaddwd         m6, m14
    punpcklwd       m1, m3, m5
    pmaddwd         m1, m15
    punpckhwd       m3, m5
    pmaddwd         m3, m15
    punpcklwd       m5, m9, m7
    pmaddwd         m5, m14
    punpckhwd       m7, m9, m7
    pmaddwd         m7, m14
    paddd           m0, m4
    paddd           m2, m6
    paddd           m1, m5
    paddd           m3, m7
    ; same clamp-and-take-high-word sequence as in .hv
    packuswb        m0, m2
    packuswb        m1, m3
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret
410*c0909341SAndroid Build Coastguard Worker
; wiener_filter5_8bpc
;
; Separable Wiener filter on 8-bit pixels: a horizontal pass using the taps
; x1..x3 and a vertical pass using the taps y1..y3, all read from flt.
; (The "5" in the name presumably refers to the 5-tap kernel; the vertical
; pass below does combine exactly five rows.)
;
; cglobal operands: 4 arguments preloaded into registers, 13 GPRs, 16 vector
; registers, 384*8+16 bytes of stack. w, h, edge and flt are fetched
; explicitly at the top of the function.
;
; Edge flags tested in this function: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
; 4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM.
;
; Stack layout (derived from the addressing below):
;   [rsp]         saved source pointer for the two rows below the last row
;   [rsp+16 ...]  four rows of 16-bit horizontally filtered samples, spaced
;                 384*2 bytes apart and addressed through t0-t4
; t0-t4 are defined outside this function; they are used here as
; general-purpose registers holding row-buffer pointers (t4 = oldest row,
; t1 = newest row, t0 = destination for the row being filtered).
;
; lpfq, dstq and t1 are advanced by w and wq is then negated, so the inner
; loops can index with r10 running from -w up to 0.
;
; Vector register roles set up here:
;   m6, m7, m8  pshufb masks (wiener_shufB, wiener_shufC, sgr_shuf+6)
;   m9, m10     constants pw_m16380 and pw_2056
;   m11         wiener_l_shuf, used when the left edge is missing
;   m12, m13    horizontal taps (x1 x2 as bytes, x3 as words)
;   m14, m15    vertical taps (__ y1 and y2 y3), pre-shifted left by 5
411*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
412*c0909341SAndroid Build Coastguard Worker                                                  w, h, edge, flt
413*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
414*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
415*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
416*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
417*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m6, [wiener_shufB]
418*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [fltq+ 2]
419*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m7, [wiener_shufC]
420*c0909341SAndroid Build Coastguard Worker    packsswb       m12, m12       ; x1 x2 (packed to signed bytes for pmaddubsw)
421*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, [fltq+ 6] ; x3
422*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m8, [sgr_shuf+6]
423*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
424*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [pw_m16380]
425*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pw_2056]
426*c0909341SAndroid Build Coastguard Worker    mova           m11, [wiener_l_shuf]
427*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [fltq+16] ; __ y1
428*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
429*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [fltq+20] ; y2 y3
430*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+16] ; end of the first row buffer
431*c0909341SAndroid Build Coastguard Worker    psllw          m14, 5 ; vertical taps are pre-scaled by 1<<5
432*c0909341SAndroid Build Coastguard Worker    neg             wq
433*c0909341SAndroid Build Coastguard Worker    psllw          m15, 5
434*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
435*c0909341SAndroid Build Coastguard Worker    jz .no_top
; Top edge present: horizontally filter two rows read from lpf into the
; first two row buffers (t4, t3).
436*c0909341SAndroid Build Coastguard Worker    call .h_top
437*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
438*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
439*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
440*c0909341SAndroid Build Coastguard Worker    call .h_top
; r10 ends up as the incoming (lpf + w) + stride*6, the same value that the
; .no_top path stores; it is reloaded into lpfq before .hv_bottom.
441*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
442*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; from here on source rows are read through dst
443*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
444*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
445*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
446*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
447*c0909341SAndroid Build Coastguard Worker    call .h
448*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
449*c0909341SAndroid Build Coastguard Worker    dec             hd
450*c0909341SAndroid Build Coastguard Worker    jz .v1 ; h == 1: a single vertical-only pass finishes the job
451*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
452*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
453*c0909341SAndroid Build Coastguard Worker    call .h
454*c0909341SAndroid Build Coastguard Worker    dec             hd
455*c0909341SAndroid Build Coastguard Worker    jz .v2 ; h == 2: two vertical-only passes finish the job
; Steady state: all four row buffers are filled. The new row is written to
; t0, which aliases the oldest row (t4); .hv reads t4 before storing to t0
; at each offset.
456*c0909341SAndroid Build Coastguard Worker.main:
457*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
; Each .hv call consumes one source row and emits one output row.
458*c0909341SAndroid Build Coastguard Worker.main_loop:
459*c0909341SAndroid Build Coastguard Worker    call .hv
460*c0909341SAndroid Build Coastguard Worker    dec             hd
461*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
462*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
463*c0909341SAndroid Build Coastguard Worker    jz .v2
; Bottom edge present: the last two output rows take their extra input rows
; from the pointer saved at [rsp].
464*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
465*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
466*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
467*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
468*c0909341SAndroid Build Coastguard Worker.end:
469*c0909341SAndroid Build Coastguard Worker    RET
; No top edge: the first filtered row stands in for the two missing rows
; above it (t4 = t3 = t2 = the same buffer).
470*c0909341SAndroid Build Coastguard Worker.no_top:
471*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
472*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
473*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
474*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below (same value as in the LR_HAVE_TOP path)
475*c0909341SAndroid Build Coastguard Worker    call .h
476*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
477*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
478*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
479*c0909341SAndroid Build Coastguard Worker    dec             hd
480*c0909341SAndroid Build Coastguard Worker    jz .v1
481*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
482*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
483*c0909341SAndroid Build Coastguard Worker    call .h
484*c0909341SAndroid Build Coastguard Worker    dec             hd
485*c0909341SAndroid Build Coastguard Worker    jz .v2
; The next two .hv calls are pointed at the two row buffers that have not
; been used yet, because t4 still aliases t3 and must not be overwritten.
486*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
487*c0909341SAndroid Build Coastguard Worker    call .hv
488*c0909341SAndroid Build Coastguard Worker    dec             hd
489*c0909341SAndroid Build Coastguard Worker    jz .v2
490*c0909341SAndroid Build Coastguard Worker    add             t0, 384*6 ; .hv left t0 = first buffer; +3 rows = fourth buffer
491*c0909341SAndroid Build Coastguard Worker    call .hv
492*c0909341SAndroid Build Coastguard Worker    dec             hd
493*c0909341SAndroid Build Coastguard Worker    jnz .main
; No more source rows: produce the remaining output rows with the
; vertical-only pass. .v does not advance dstq or rotate the row pointers,
; so that is done here between the two calls.
494*c0909341SAndroid Build Coastguard Worker.v2:
495*c0909341SAndroid Build Coastguard Worker    call .v
496*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
497*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
498*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
499*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
500*c0909341SAndroid Build Coastguard Worker.v1:
501*c0909341SAndroid Build Coastguard Worker    call .v
502*c0909341SAndroid Build Coastguard Worker    jmp .end
; .h: horizontally filter one row at lpfq and store it as 16-bit samples at
; t1. With LR_HAVE_LEFT the first 4 bytes come from [leftq], which is then
; advanced by 4.
503*c0909341SAndroid Build Coastguard Worker.h:
504*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
505*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
506*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
507*c0909341SAndroid Build Coastguard Worker    movd           xm4, [leftq]
508*c0909341SAndroid Build Coastguard Worker    vpblendd        m4, [lpfq+r10-4], 0xfe ; dword 0 from left, dwords 1-7 from the row
509*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
510*c0909341SAndroid Build Coastguard Worker    jmp .h_main
; Build the equivalent of a load from [lpfq+r10-4] using only bytes at or
; after the row start; the 4 leading bytes are then synthesized by the
; wiener_l_shuf shuffle in m11 (table defined outside this chunk).
511*c0909341SAndroid Build Coastguard Worker.h_extend_left:
512*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
513*c0909341SAndroid Build Coastguard Worker    mova            m4, [lpfq+r10] ; before the start of the buffer
514*c0909341SAndroid Build Coastguard Worker    palignr         m4, m5, 12
515*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m11
516*c0909341SAndroid Build Coastguard Worker    jmp .h_main
; .h_top: same as .h, but never reads from leftq; with LR_HAVE_LEFT the
; pixels left of the row are loaded directly from memory.
517*c0909341SAndroid Build Coastguard Worker.h_top:
518*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
519*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
520*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Each iteration produces 32 filtered samples (two 32-byte stores).
521*c0909341SAndroid Build Coastguard Worker.h_loop:
522*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10-4]
523*c0909341SAndroid Build Coastguard Worker.h_main:
524*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+4]
525*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
526*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
527*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -33
528*c0909341SAndroid Build Coastguard Worker    jl .h_have_right ; r10 < -33: right-edge helper is not needed yet
; Shared helper that lives in wiener_filter7_8bpc (not visible here);
; presumably it pads m4/m5 past the right edge -- confirm against that code.
529*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
530*c0909341SAndroid Build Coastguard Worker.h_have_right:
; Outer taps: shuffle bytes with m6/m7 and multiply-accumulate adjacent byte
; pairs against the x1 x2 byte taps in m12.
531*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m4, m6
532*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m0, m12
533*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m5, m6
534*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m1, m12
535*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m4, m7
536*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m2, m12
537*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m7
538*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m3, m12
; Remaining term: m8-shuffled input times x3 (m13), plus the same input
; shifted left by 7 and biased with pw_m16380, added with signed saturation.
539*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m8
540*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
541*c0909341SAndroid Build Coastguard Worker    pmullw          m2, m4, m13
542*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m8
543*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
544*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m5, m13
545*c0909341SAndroid Build Coastguard Worker    psllw           m4, 7
546*c0909341SAndroid Build Coastguard Worker    psllw           m5, 7
547*c0909341SAndroid Build Coastguard Worker    paddw           m4, m9
548*c0909341SAndroid Build Coastguard Worker    paddw           m5, m9
549*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
550*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
551*c0909341SAndroid Build Coastguard Worker    paddsw          m0, m4
552*c0909341SAndroid Build Coastguard Worker    paddsw          m1, m5
; Arithmetic shift right by 3, then add the pw_2056 constant.
553*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
554*c0909341SAndroid Build Coastguard Worker    psraw           m1, 3
555*c0909341SAndroid Build Coastguard Worker    paddw           m0, m10
556*c0909341SAndroid Build Coastguard Worker    paddw           m1, m10
557*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+ 0], m0
558*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+32], m1
559*c0909341SAndroid Build Coastguard Worker    add            r10, 32
560*c0909341SAndroid Build Coastguard Worker    jl .h_loop ; continue while r10 < 0
561*c0909341SAndroid Build Coastguard Worker    ret
562*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv: advance lpfq by one row, horizontally filter that row into t0,
; combine it vertically with the rows at t4..t1, write one row of output
; pixels to dst, then rotate the row pointers and advance dstq.
563*c0909341SAndroid Build Coastguard Worker.hv:
564*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
565*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
566*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
567*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
568*c0909341SAndroid Build Coastguard Worker    movd           xm4, [leftq]
569*c0909341SAndroid Build Coastguard Worker    vpblendd        m4, [lpfq+r10-4], 0xfe ; dword 0 from left, dwords 1-7 from the row
570*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
571*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
; Unlike .h_extend_left, this path loads directly from [lpfq+r10-4] before
; applying the wiener_l_shuf shuffle.
572*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
573*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10-4]
574*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m11
575*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
; .hv_bottom: same as .hv, but lpfq is used as-is (the caller positions it)
; and leftq is never read.
576*c0909341SAndroid Build Coastguard Worker.hv_bottom:
577*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
578*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
579*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
580*c0909341SAndroid Build Coastguard Worker.hv_loop:
581*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10-4]
582*c0909341SAndroid Build Coastguard Worker.hv_main:
583*c0909341SAndroid Build Coastguard Worker    movu            m5, [lpfq+r10+4]
584*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
585*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
586*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -33
587*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right ; r10 < -33: right-edge helper is not needed yet
588*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
589*c0909341SAndroid Build Coastguard Worker.hv_have_right:
; Horizontal part: same instruction sequence as in .h_have_right, with the
; loads for the vertical part interleaved.
590*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m4, m6
591*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m0, m12
592*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m5, m6
593*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m1, m12
594*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m4, m7
595*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m2, m12
596*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m7
597*c0909341SAndroid Build Coastguard Worker    pmaddubsw       m3, m12
598*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m8
599*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
600*c0909341SAndroid Build Coastguard Worker    pmullw          m2, m4, m13
601*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m8
602*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
603*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m5, m13
604*c0909341SAndroid Build Coastguard Worker    psllw           m4, 7
605*c0909341SAndroid Build Coastguard Worker    psllw           m5, 7
606*c0909341SAndroid Build Coastguard Worker    paddw           m4, m9
607*c0909341SAndroid Build Coastguard Worker    paddw           m5, m9
608*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
609*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
; Vertical part, first 16 samples: m2 = t3 + t1 (paired with y2), m3 = t2
; (paired with y3).
610*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10*2]
611*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t1+r10*2]
612*c0909341SAndroid Build Coastguard Worker    mova            m3, [t2+r10*2]
613*c0909341SAndroid Build Coastguard Worker    paddsw          m0, m4
614*c0909341SAndroid Build Coastguard Worker    paddsw          m1, m5
615*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
616*c0909341SAndroid Build Coastguard Worker    psraw           m1, 3
617*c0909341SAndroid Build Coastguard Worker    paddw           m0, m10
618*c0909341SAndroid Build Coastguard Worker    paddw           m1, m10
; m4 = new row + oldest row (t4). t4 is read before the store to t0 because
; the two pointers may refer to the same buffer.
619*c0909341SAndroid Build Coastguard Worker    paddw           m4, m0, [t4+r10*2]
620*c0909341SAndroid Build Coastguard Worker    mova    [t0+r10*2], m0
621*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2, m3
622*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15 ; (t3+t1)*y2 + t2*y3
623*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
624*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
625*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m4
626*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14 ; (new+t4) times both words of m14 (__ y1)
627*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m4
628*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m14
629*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
630*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2
; Same computation for the second 16 samples (byte offset +32).
631*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10*2+32]
632*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t1+r10*2+32]
633*c0909341SAndroid Build Coastguard Worker    mova            m3, [t2+r10*2+32]
; packuswb on the dword sums saturates each 16-bit half to an unsigned byte;
; the psrlw 8 further down keeps only the byte that came from the upper
; half, i.e. the dword sum >> 16 clamped to 0..255.
634*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m4
635*c0909341SAndroid Build Coastguard Worker    paddw           m4, m1, [t4+r10*2+32]
636*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+32], m1
637*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
638*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m15
639*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
640*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
641*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m4
642*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
643*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m4
644*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m14
645*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
646*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
647*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m2
648*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 8
649*c0909341SAndroid Build Coastguard Worker    psrlw           m1, 8
650*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m1 ; 32 output pixels
651*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
652*c0909341SAndroid Build Coastguard Worker    add            r10, 32
653*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
; Rotate the row pointers: every row becomes one step older and the buffer
; of the former oldest row becomes the next write target.
654*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
655*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
656*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
657*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
658*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
659*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
660*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical-only pass for rows with no new source row. The newest row
; (t1) is used twice: once in its normal position (t3 + t1) and once in
; place of the missing row (t1 + t4).
; Overwrites m6, m7 and m13 (shuffle masks and x3), so .h/.hv must not be
; called afterwards; all callers above proceed to .end.
; Does not advance dstq or rotate t1-t4.
661*c0909341SAndroid Build Coastguard Worker.v:
662*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
663*c0909341SAndroid Build Coastguard Worker    psrld          m13, m14, 16 ; y1 __
664*c0909341SAndroid Build Coastguard Worker.v_loop:
665*c0909341SAndroid Build Coastguard Worker    mova            m6, [t1+r10*2+ 0]
666*c0909341SAndroid Build Coastguard Worker    paddw           m2, m6, [t3+r10*2+ 0]
667*c0909341SAndroid Build Coastguard Worker    mova            m4, [t2+r10*2+ 0]
668*c0909341SAndroid Build Coastguard Worker    mova            m7, [t1+r10*2+32]
669*c0909341SAndroid Build Coastguard Worker    paddw           m3, m7, [t3+r10*2+32]
670*c0909341SAndroid Build Coastguard Worker    mova            m5, [t2+r10*2+32]
671*c0909341SAndroid Build Coastguard Worker    paddw           m6, [t4+r10*2+ 0]
672*c0909341SAndroid Build Coastguard Worker    paddw           m7, [t4+r10*2+32]
673*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2, m4
674*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15 ; (t3+t1)*y2 + t2*y3
675*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m4
676*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m15
677*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m3, m5
678*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m15
679*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m5
680*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m15
; The y1 terms of both halves share one interleave: each word pair holds
; (second-half sample, first-half sample). Multiplying by m14 (__ y1)
; weights the first-half sample with y1, multiplying by m13 (y1 __) weights
; the second-half sample with y1.
681*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m7, m6
682*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m5, m14
683*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m6
684*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m7, m14
685*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m13
686*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m13
687*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4
688*c0909341SAndroid Build Coastguard Worker    paddd           m2, m6
689*c0909341SAndroid Build Coastguard Worker    paddd           m1, m5
690*c0909341SAndroid Build Coastguard Worker    paddd           m3, m7
; Same pack / shift / pack sequence as at the end of .hv.
691*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m2
692*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m3
693*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 8
694*c0909341SAndroid Build Coastguard Worker    psrlw           m1, 8
695*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m1
696*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
697*c0909341SAndroid Build Coastguard Worker    add            r10, 32
698*c0909341SAndroid Build Coastguard Worker    jl .v_loop
699*c0909341SAndroid Build Coastguard Worker    ret
700*c0909341SAndroid Build Coastguard Worker
701*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 4, 12, 16, 400*24+16, dst, stride, left, lpf, \
702*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
703*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
704*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
705*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
706*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m8, [sgr_shuf+0]
707*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
708*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [sgr_shuf+8]
709*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
710*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m10, [sgr_shuf+2]
711*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
712*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m11, [sgr_shuf+6]
713*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+16+400*12]
714*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m7, [paramsq+8] ; w0
715*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
716*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [paramsq+0] ; s0
717*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+20]
718*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [pw_164_24]
719*c0909341SAndroid Build Coastguard Worker    neg             wq
720*c0909341SAndroid Build Coastguard Worker    vbroadcastss   m14, [pf_256]
721*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
722*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [pd_m4096]
723*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
724*c0909341SAndroid Build Coastguard Worker    jz .no_top
725*c0909341SAndroid Build Coastguard Worker    call .h_top
726*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
727*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
728*c0909341SAndroid Build Coastguard Worker    call .top_fixup
729*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6
730*c0909341SAndroid Build Coastguard Worker    call .h_top
731*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
732*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
733*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
734*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
735*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
736*c0909341SAndroid Build Coastguard Worker    dec             hd
737*c0909341SAndroid Build Coastguard Worker    jz .height1
738*c0909341SAndroid Build Coastguard Worker    or           edged, 16
739*c0909341SAndroid Build Coastguard Worker    call .h
740*c0909341SAndroid Build Coastguard Worker.main:
741*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
742*c0909341SAndroid Build Coastguard Worker    call .hv
743*c0909341SAndroid Build Coastguard Worker    call .prep_n
744*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
745*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
746*c0909341SAndroid Build Coastguard Worker.main_loop:
747*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
748*c0909341SAndroid Build Coastguard Worker    test            hd, hd
749*c0909341SAndroid Build Coastguard Worker    jz .odd_height
750*c0909341SAndroid Build Coastguard Worker    call .h
751*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
752*c0909341SAndroid Build Coastguard Worker    call .hv
753*c0909341SAndroid Build Coastguard Worker    call .n0
754*c0909341SAndroid Build Coastguard Worker    call .n1
755*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
756*c0909341SAndroid Build Coastguard Worker    jge .main_loop
757*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
758*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
759*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
760*c0909341SAndroid Build Coastguard Worker    call .h_top
761*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
762*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
763*c0909341SAndroid Build Coastguard Worker.end:
764*c0909341SAndroid Build Coastguard Worker    call .n0
765*c0909341SAndroid Build Coastguard Worker    call .n1
766*c0909341SAndroid Build Coastguard Worker.end2:
767*c0909341SAndroid Build Coastguard Worker    RET
768*c0909341SAndroid Build Coastguard Worker.height1:
769*c0909341SAndroid Build Coastguard Worker    call .hv
770*c0909341SAndroid Build Coastguard Worker    call .prep_n
771*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
772*c0909341SAndroid Build Coastguard Worker.odd_height:
773*c0909341SAndroid Build Coastguard Worker    call .hv
774*c0909341SAndroid Build Coastguard Worker    call .n0
775*c0909341SAndroid Build Coastguard Worker    call .n1
776*c0909341SAndroid Build Coastguard Worker.odd_height_end:
777*c0909341SAndroid Build Coastguard Worker    call .v
778*c0909341SAndroid Build Coastguard Worker    call .n0
779*c0909341SAndroid Build Coastguard Worker    jmp .end2
780*c0909341SAndroid Build Coastguard Worker.extend_bottom:
781*c0909341SAndroid Build Coastguard Worker    call .v
782*c0909341SAndroid Build Coastguard Worker    jmp .end
783*c0909341SAndroid Build Coastguard Worker.no_top:
784*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
785*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
786*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
787*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
788*c0909341SAndroid Build Coastguard Worker    call .h
789*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*6]
790*c0909341SAndroid Build Coastguard Worker    call .top_fixup
791*c0909341SAndroid Build Coastguard Worker    dec             hd
792*c0909341SAndroid Build Coastguard Worker    jz .no_top_height1
793*c0909341SAndroid Build Coastguard Worker    or           edged, 16
794*c0909341SAndroid Build Coastguard Worker    mov             t0, t1
795*c0909341SAndroid Build Coastguard Worker    mov             t1, t2
796*c0909341SAndroid Build Coastguard Worker    jmp .main
797*c0909341SAndroid Build Coastguard Worker.no_top_height1:
798*c0909341SAndroid Build Coastguard Worker    call .v
799*c0909341SAndroid Build Coastguard Worker    call .prep_n
800*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
801*c0909341SAndroid Build Coastguard Worker.extend_right:
802*c0909341SAndroid Build Coastguard Worker    movd           xm2, r10d
803*c0909341SAndroid Build Coastguard Worker    mova            m0, [sgr_r_ext]
804*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m2, xm2
805*c0909341SAndroid Build Coastguard Worker    psubb           m0, m2
806*c0909341SAndroid Build Coastguard Worker    pminub          m0, [pb_0to63]
807*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m0
808*c0909341SAndroid Build Coastguard Worker    ret
809*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
810*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
811*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
812*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
813*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
814*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
815*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12
816*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
817*c0909341SAndroid Build Coastguard Worker    jmp .h_main
818*c0909341SAndroid Build Coastguard Worker.h_extend_left:
819*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
820*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf]
821*c0909341SAndroid Build Coastguard Worker    jmp .h_main
822*c0909341SAndroid Build Coastguard Worker.h_top:
823*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
824*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
825*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
826*c0909341SAndroid Build Coastguard Worker.h_loop:
827*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
828*c0909341SAndroid Build Coastguard Worker.h_main:
829*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1
830*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
831*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
832*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -18
833*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
834*c0909341SAndroid Build Coastguard Worker    call .extend_right
835*c0909341SAndroid Build Coastguard Worker.h_have_right:
836*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m8
837*c0909341SAndroid Build Coastguard Worker    pmullw          m4, m3, m3
838*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5, m9
839*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3, m2
840*c0909341SAndroid Build Coastguard Worker    shufps          m3, m2, q2121
841*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
842*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
843*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
844*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
845*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
846*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m6
847*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
848*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
849*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
850*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m5, m10
851*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
852*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m11
853*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5 ; sum
854*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
855*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
856*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5
857*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
858*c0909341SAndroid Build Coastguard Worker    test         edgeb, 16 ; y > 0
859*c0909341SAndroid Build Coastguard Worker    jz .h_loop_end
860*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t1+r10*2+400*0]
861*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t1+r10*2+400*2]
862*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10*2+400*4]
863*c0909341SAndroid Build Coastguard Worker.h_loop_end:
864*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3 ; sumsq
865*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
866*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*0], m0
867*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*2], m1
868*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*4], m2
869*c0909341SAndroid Build Coastguard Worker    add            r10, 16
870*c0909341SAndroid Build Coastguard Worker    jl .h_loop
871*c0909341SAndroid Build Coastguard Worker    ret
872*c0909341SAndroid Build Coastguard Worker.top_fixup:
873*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
874*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row needs to be doubled
875*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+400*0]
876*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+400*2]
877*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+400*4]
878*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
879*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
880*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
881*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*0], m0
882*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*2], m1
883*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*4], m2
884*c0909341SAndroid Build Coastguard Worker    add            r10, 16
885*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
886*c0909341SAndroid Build Coastguard Worker    ret
887*c0909341SAndroid Build Coastguard WorkerALIGN function_align
888*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
; Processes one input row at lpfq, 16 output columns per iteration:
;   1. computes the horizontal sum (words) and sum of squares (dwords) of the
;      current row,
;   2. adds the sums stored at t1 and t2 to form the vertical totals,
;   3. stores the current row's horizontal sums at t0,
;   4. derives the packed a/b coefficients and stores them at t3.
; Entry points: .hv consumes 4 bytes from leftq when LR_HAVE_LEFT is set;
; .hv_bottom never reads leftq.
; NOTE(review): m6 and m8-m15 are loaded outside this view; the comments below
; rely on the existing annotations for their meaning - confirm against the
; function prologue.
889*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
890*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
891*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
; Build the first 16 source bytes as: 4 bytes from leftq, then the first
; 12 bytes of the row (palignr by 12 keeps the top dword of the broadcast xm0).
892*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
893*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
894*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12
895*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
896*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
897*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
; No left edge: shuffle the start of the row with sgr_l_shuf.
; NOTE(review): sgr_l_shuf is defined outside this view; presumably it
; replicates the first pixel into the left padding - confirm.
898*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
899*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf]
900*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
901*c0909341SAndroid Build Coastguard Worker.hv_bottom:
902*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
903*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
904*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
905*c0909341SAndroid Build Coastguard Worker.hv_loop:
906*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
907*c0909341SAndroid Build Coastguard Worker.hv_main:
; The upper lane is loaded 8 bytes further along than the lower lane.
908*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1
909*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
910*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
; Without a right edge, .extend_right is only called once r10d >= -18.
911*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -18
912*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
913*c0909341SAndroid Build Coastguard Worker    call .extend_right
914*c0909341SAndroid Build Coastguard Worker.hv_have_right:
; Five shuffled copies of the source (m8, m9, the shufps blend, m10, m11) are
; summed into m0. The squares are accumulated into m2/m3: one copy via pmullw
; and zero-extension with m6, the other four as pairs via pmaddwd.
915*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m5, m8
916*c0909341SAndroid Build Coastguard Worker    pmullw          m4, m1, m1
917*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m9
918*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, m3
919*c0909341SAndroid Build Coastguard Worker    shufps          m1, m3, q2121
920*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
921*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m1
922*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
923*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1
924*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
925*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m4, m6
926*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1
927*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m6
928*c0909341SAndroid Build Coastguard Worker    paddd           m3, m4
929*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m5, m10
930*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
931*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m11
932*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5               ; h sum
933*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m1
934*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
935*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m1
936*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
; m1/m4/m5 = current row + row stored at t1; m0/m2/m3 keep the current row.
937*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t1+r10*2+400*0]
938*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4               ; h sumsq
939*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
940*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+r10*2+400*2]
941*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+r10*2+400*4]
942*c0909341SAndroid Build Coastguard Worker    test            hd, hd
943*c0909341SAndroid Build Coastguard Worker    jz .hv_last_row
944*c0909341SAndroid Build Coastguard Worker.hv_main2:
945*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10*2+400*0] ; hv sum
946*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t2+r10*2+400*2] ; hv sumsq
947*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t2+r10*2+400*4]
; Save the current row's horizontal sums at t0.
948*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+400*0], m0
949*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+400*2], m2
950*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+400*4], m3
951*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m2, [pd_25]
952*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6           ; b
953*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
954*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m2               ; a * 25
955*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m2
956*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b * b
957*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
958*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p
959*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
960*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m12              ; p * s
961*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m12
962*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13              ; b * 164
963*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
; NOTE(review): m13 is reused here as a word-wise addend before the shift;
; presumably a rounding/bias term packed next to the 164 multiplier - confirm
; against the constant loaded into m13.
964*c0909341SAndroid Build Coastguard Worker    paddw           m4, m13
965*c0909341SAndroid Build Coastguard Worker    paddw           m5, m13
966*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z + 1
967*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
968*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
969*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
970*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z + 1)
971*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
; Integer compare on the float bit patterns of m14 and (z + 1); the mask is
; then shifted down to 255 / 0 and used to clamp x via pminsw.
972*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m14, m4
973*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m14, m5
974*c0909341SAndroid Build Coastguard Worker    mulps           m2, m14              ; 256 / (z + 1)
975*c0909341SAndroid Build Coastguard Worker    mulps           m3, m14
976*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z < 255 ? 255 : 0
977*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
978*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
979*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
980*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x
981*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
982*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [pd_34816]
983*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
984*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
985*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4               ; x * b * 164 + (1 << 11) + (1 << 15)
986*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
987*c0909341SAndroid Build Coastguard Worker    pand            m0, m15
988*c0909341SAndroid Build Coastguard Worker    pand            m1, m15
989*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a | (b << 12)
990*c0909341SAndroid Build Coastguard Worker    por             m1, m3
; punpcklwd/punpckhwd work per 128-bit lane, so m0 holds columns 0-3 and 8-11
; and m1 holds columns 4-7 and 12-15; the four stores below put them back in
; linear order.
991*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+ 8], xm0      ; The neighbor calculations require
992*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+40], m0, 1    ; 13 bits for a and 21 bits for b.
993*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+24], xm1      ; Packing them allows for 12+20, but
994*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+56], m1, 1    ; that gets us most of the way.
995*c0909341SAndroid Build Coastguard Worker    add            r10, 16
996*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
; Rotate the row buffers: t2 <- old t1, t1 <- old t0, t0 <- old t1.
997*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
998*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
999*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1000*c0909341SAndroid Build Coastguard Worker    ret
1001*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights
; Taken when hd == 0: writes (current row + t1) back to t1, then adds the
; current row a second time to the running totals before rejoining .hv_main2.
1002*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*0], m1
1003*c0909341SAndroid Build Coastguard Worker    paddw              m1, m0
1004*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*2], m4
1005*c0909341SAndroid Build Coastguard Worker    paddd              m4, m2
1006*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*4], m5
1007*c0909341SAndroid Build Coastguard Worker    paddd              m5, m3
1008*c0909341SAndroid Build Coastguard Worker    jmp .hv_main2
1009*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
; Same a/b computation as the .hv path, but without reading a new source row:
; the vertical totals are formed purely from the stored sums as
; 3 * [t1] + [t2], and the packed result is written to t3.
; NOTE(review): m6 and m12-m15 are loaded outside this view - confirm their
; contents against the function prologue.
1010*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1011*c0909341SAndroid Build Coastguard Worker.v_loop:
1012*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+400*0]
1013*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+400*2]
1014*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+400*4]
1015*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+400*0]
1016*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+r10*2+400*2]
1017*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+r10*2+400*4]
; Double the t1 row and add it again: totals = t2 + t1 + 2 * t1.
1018*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1019*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1020*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
1021*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0               ; hv sum
1022*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; hv sumsq
1023*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1024*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m2, [pd_25]
1025*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6           ; b
1026*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1027*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m2               ; a * 25
1028*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m2
1029*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b * b
1030*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1031*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p
1032*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1033*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m12              ; p * s
1034*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m12
1035*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13              ; b * 164
1036*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
; NOTE(review): m13 doubles as a word-wise addend before the shift; presumably
; a rounding/bias term - confirm against the constant loaded into m13.
1037*c0909341SAndroid Build Coastguard Worker    paddw           m4, m13
1038*c0909341SAndroid Build Coastguard Worker    paddw           m5, m13
1039*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z + 1
1040*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1041*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1042*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1043*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z + 1)
1044*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
; Integer compare on the float bit patterns; the resulting mask becomes the
; 255 / 0 clamp applied by pminsw below.
1045*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m14, m4
1046*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m14, m5
1047*c0909341SAndroid Build Coastguard Worker    mulps           m2, m14              ; 256 / (z + 1)
1048*c0909341SAndroid Build Coastguard Worker    mulps           m3, m14
1049*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z < 255 ? 255 : 0
1050*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1051*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1052*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1053*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x
1054*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1055*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [pd_34816]
1056*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1057*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1058*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4               ; x * b * 164 + (1 << 11) + (1 << 15)
1059*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
1060*c0909341SAndroid Build Coastguard Worker    pand            m0, m15
1061*c0909341SAndroid Build Coastguard Worker    pand            m1, m15
1062*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a | (b << 12)
1063*c0909341SAndroid Build Coastguard Worker    por             m1, m3
; m0 holds columns 0-3 / 8-11 and m1 holds columns 4-7 / 12-15 (per-lane
; unpack), so the stores interleave the halves back into linear order.
1064*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+ 8], xm0
1065*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+40], m0, 1
1066*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+24], xm1
1067*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+56], m1, 1
1068*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1069*c0909341SAndroid Build Coastguard Worker    jl .v_loop
1070*c0909341SAndroid Build Coastguard Worker    ret
1071*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
; For each packed dword at t3, combines it with its two horizontal neighbors
; using weights 5, 6, 5 (left, center, right), then splits the result into two
; separate arrays: the bits not covered by the m15 mask go to t3+400*4 ("a")
; and the value shifted right by 12 goes to t3+400*8 ("b").
; 16 dwords (two ymm registers) are processed per iteration.
; NOTE(review): m15 is loaded outside this view; the 12-bit shift suggests it
; masks off the low 12 bits - confirm against the function prologue.
1072*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1073*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
1074*c0909341SAndroid Build Coastguard Worker    movu            m0, [t3+r10*4+ 4]     ; center
1075*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*4+36]
1076*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t3+r10*4+ 0] ; center + left
1077*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*4+32]
1078*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+r10*4+ 8]     ; left + center + right
1079*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*4+40]
1080*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                ; left + 2 * center + right
1081*c0909341SAndroid Build Coastguard Worker    pslld           m2, 2                 ; 4 * (left + center + right)
1082*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1083*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2
1084*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0                ; ab 565
1085*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1
1086*c0909341SAndroid Build Coastguard Worker    pandn           m0, m15, m2           ; a
1087*c0909341SAndroid Build Coastguard Worker    psrld           m2, 12                ; b
1088*c0909341SAndroid Build Coastguard Worker    pandn           m1, m15, m3
1089*c0909341SAndroid Build Coastguard Worker    psrld           m3, 12
1090*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4+ 0], m0
1091*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*8+ 0], m2
1092*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4+32], m1
1093*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*8+32], m3
1094*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1095*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1096*c0909341SAndroid Build Coastguard Worker    ret
1097*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1098*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
; Computes the 5/6/5-weighted a and b values for the row currently at t3, adds
; the values stored at t3+400*4 / t3+400*8 by the previous call, and replaces
; those stored values with the new row's. The combined a/b are then applied to
; 16 pixels read from dstq, and the filtered pixels are written back in place.
; dstq advances by strideq on return.
; NOTE(review): m7 and m15 are loaded outside this view; m7 is presumably the
; filter weight applied via pmulhrsw - confirm against the function prologue.
1099*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1100*c0909341SAndroid Build Coastguard Worker.n0_loop:
1101*c0909341SAndroid Build Coastguard Worker    movu            m0, [t3+r10*4+ 4]
1102*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*4+36]
1103*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t3+r10*4+ 0]
1104*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*4+32]
1105*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+r10*4+ 8]
1106*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*4+40]
1107*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
1108*c0909341SAndroid Build Coastguard Worker    pslld           m2, 2
1109*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1110*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2
1111*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0                ; 5 * left + 6 * center + 5 * right
1112*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1
1113*c0909341SAndroid Build Coastguard Worker    pandn           m0, m15, m2
1114*c0909341SAndroid Build Coastguard Worker    psrld           m2, 12
1115*c0909341SAndroid Build Coastguard Worker    pandn           m1, m15, m3
1116*c0909341SAndroid Build Coastguard Worker    psrld           m3, 12
; Add the previously stored row, then overwrite the stored row with this one.
1117*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0, [t3+r10*4+400*4+ 0] ; a
1118*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1, [t3+r10*4+400*4+32]
1119*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4+ 0], m0
1120*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4+32], m1
1121*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2, [t3+r10*4+400*8+ 0] ; b
1122*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3, [t3+r10*4+400*8+32]
1123*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*8+ 0], m2
1124*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*8+32], m3
; Load 2 x 8 pixels, zero-extended from bytes to dwords.
1125*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m2, [dstq+r10+0]
1126*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [dstq+r10+8]
1127*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m2 ; a * src
1128*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m3
1129*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1130*c0909341SAndroid Build Coastguard Worker    psubd           m0, m4 ; b - a * src + (1 << 8)
1131*c0909341SAndroid Build Coastguard Worker    psubd           m1, m5
1132*c0909341SAndroid Build Coastguard Worker    psrad           m0, 9
1133*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1134*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
1135*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m7
1136*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
; The per-lane packs leave the dwords ordered as pixels 0-3, 8-11, 4-7, 12-15;
; pshufd q3120 swaps the middle two dwords to restore linear order.
1137*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1
1138*c0909341SAndroid Build Coastguard Worker    packuswb       xm0, xm1
1139*c0909341SAndroid Build Coastguard Worker    pshufd         xm0, xm0, q3120
1140*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm0
1141*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1142*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1143*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1144*c0909341SAndroid Build Coastguard Worker    ret
1145*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1146*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
; Applies the a/b values already stored at t3+400*4 / t3+400*8 to 16 pixels
; read from dstq and writes the filtered pixels back in place. Unlike the
; even-row path, no new row is summed in and the stored values are left
; untouched, which is why the shift is 8 instead of 9.
; dstq advances by strideq on return.
; NOTE(review): m7 is loaded outside this view; presumably the filter weight
; applied via pmulhrsw - confirm against the function prologue.
1147*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1148*c0909341SAndroid Build Coastguard Worker.n1_loop:
; Load 2 x 8 pixels, zero-extended from bytes to dwords.
1149*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m2, [dstq+r10+0]
1150*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [dstq+r10+8]
1151*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m2, [t3+r10*4+400*4+ 0] ; a * src
1152*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m3, [t3+r10*4+400*4+32]
1153*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*4+400*8+ 0]     ; b
1154*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*4+400*8+32]
1155*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1156*c0909341SAndroid Build Coastguard Worker    psubd           m0, m4                      ; b - a * src + (1 << 7)
1157*c0909341SAndroid Build Coastguard Worker    psubd           m1, m5
1158*c0909341SAndroid Build Coastguard Worker    psrad           m0, 8
1159*c0909341SAndroid Build Coastguard Worker    psrad           m1, 8
1160*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
1161*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m7
1162*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
; The per-lane packs leave the dwords ordered as pixels 0-3, 8-11, 4-7, 12-15;
; pshufd q3120 swaps the middle two dwords to restore linear order.
1163*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1
1164*c0909341SAndroid Build Coastguard Worker    packuswb       xm0, xm1
1165*c0909341SAndroid Build Coastguard Worker    pshufd         xm0, xm0, q3120
1166*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm0
1167*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1168*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1169*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1170*c0909341SAndroid Build Coastguard Worker    ret
1171*c0909341SAndroid Build Coastguard Worker
1172*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 4, 14, 16, -400*28-16, dst, stride, left, lpf, \
1173*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, params
; Entry point and row driver of the 3x3 filter.
; Arguments: dst, stride, left, lpf, w, h, edge (LR_HAVE_* flag bits), params.
; Setup performed here:
;   - lpfq and dstq are advanced by w and wq is negated, so rows are indexed
;     with a negative offset that counts up towards zero,
;   - t1 and t3 point into the stack scratch area, [rsp] holds the pointer to
;     the rows below the block,
;   - m6 = 0, m7 = w1 << 4, m8-m10 = shuffle masks, m11 = s1,
;     m12-m15 = constants.
; The local subroutines .v, .prep_n and .n called below belong to this
; function and are defined past the end of this view.
1174*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1175*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
1176*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1177*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m8, [sgr_shuf+2]
1178*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1179*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [sgr_shuf+4]
1180*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1181*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m10, [sgr_shuf+6]
1182*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1183*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m7, [paramsq+10] ; w1
1184*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+16+400*12]
1185*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [paramsq+ 4] ; s1
1186*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1187*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [pw_455_24]
1188*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+20]
1189*c0909341SAndroid Build Coastguard Worker    vbroadcastss   m13, [pf_256]
1190*c0909341SAndroid Build Coastguard Worker    neg             wq
1191*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [pd_34816] ; (1 << 11) + (1 << 15)
1192*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1193*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [pd_m4096]
1194*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1195*c0909341SAndroid Build Coastguard Worker    jz .no_top
; Top edge available: box-sum the two rows at lpfq into two separate row
; buffers (t2 and t1 = t2 + 400*6), then start on the first row of dst.
1196*c0909341SAndroid Build Coastguard Worker    call .h_top
1197*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1198*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1199*c0909341SAndroid Build Coastguard Worker    add             t1, 400*6
1200*c0909341SAndroid Build Coastguard Worker    call .h_top
1201*c0909341SAndroid Build Coastguard Worker    lea             t4, [lpfq+strideq*4]
1202*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1203*c0909341SAndroid Build Coastguard Worker    add             t4, strideq
1204*c0909341SAndroid Build Coastguard Worker    mov          [rsp], t4 ; below
1205*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1206*c0909341SAndroid Build Coastguard Worker    call .hv
1207*c0909341SAndroid Build Coastguard Worker.main:
; t5 keeps the current t3 while t3 moves on by 400*4 bytes.
; hd is decremented once per source row consumed.
1208*c0909341SAndroid Build Coastguard Worker    mov             t5, t3
1209*c0909341SAndroid Build Coastguard Worker    add             t3, 400*4
1210*c0909341SAndroid Build Coastguard Worker    dec             hd
1211*c0909341SAndroid Build Coastguard Worker    jz .height1
1212*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1213*c0909341SAndroid Build Coastguard Worker    call .hv
1214*c0909341SAndroid Build Coastguard Worker    call .prep_n
1215*c0909341SAndroid Build Coastguard Worker    dec             hd
1216*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1217*c0909341SAndroid Build Coastguard Worker.main_loop:
1218*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1219*c0909341SAndroid Build Coastguard Worker    call .hv
1220*c0909341SAndroid Build Coastguard Worker    call .n
1221*c0909341SAndroid Build Coastguard Worker    dec             hd
1222*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
1223*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1224*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
; Bottom edge available: continue with the two rows at the saved "below"
; pointer, using the entry point that does not read leftq.
1225*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
1226*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
1227*c0909341SAndroid Build Coastguard Worker    call .n
1228*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1229*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
1230*c0909341SAndroid Build Coastguard Worker.end:
1231*c0909341SAndroid Build Coastguard Worker    call .n
1232*c0909341SAndroid Build Coastguard Worker    RET
1233*c0909341SAndroid Build Coastguard Worker.height1:
; Only one source row: the remaining rows come from .v (no new row is read).
1234*c0909341SAndroid Build Coastguard Worker    call .v
1235*c0909341SAndroid Build Coastguard Worker    call .prep_n
1236*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1237*c0909341SAndroid Build Coastguard Worker    call .v
1238*c0909341SAndroid Build Coastguard Worker    jmp .end
1239*c0909341SAndroid Build Coastguard Worker.extend_bottom:
; No rows below (or the source rows ran out before the main loop): finish
; with .v instead of reading further rows.
1240*c0909341SAndroid Build Coastguard Worker    call .v
1241*c0909341SAndroid Build Coastguard Worker    call .n
1242*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1243*c0909341SAndroid Build Coastguard Worker    call .v
1244*c0909341SAndroid Build Coastguard Worker    jmp .end
1245*c0909341SAndroid Build Coastguard Worker.no_top:
; No top edge: save the "below" pointer (lpfq + 6 * stride), box-sum the first
; dst row with .h, and use that same row buffer as both t1 and t2.
1246*c0909341SAndroid Build Coastguard Worker    lea             t4, [lpfq+strideq*4]
1247*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1248*c0909341SAndroid Build Coastguard Worker    lea             t4, [t4+strideq*2]
1249*c0909341SAndroid Build Coastguard Worker    mov          [rsp], t4
1250*c0909341SAndroid Build Coastguard Worker    call .h
1251*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+400*6]
1252*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1253*c0909341SAndroid Build Coastguard Worker    call .v
1254*c0909341SAndroid Build Coastguard Worker    jmp .main
1255*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
; Computes, for the row at lpfq, the 3-tap horizontal sum (words, stored at
; t1+400*0) and sum of squares (dwords, stored at t1+400*2 and t1+400*4),
; 16 columns per iteration.
; Entry points: .h consumes 4 bytes from leftq when LR_HAVE_LEFT is set;
; .h_top never reads leftq.
; Expects m6 = 0 and m8-m10 = the three shuffle masks loaded from sgr_shuf.
1256*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1257*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1258*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Build the first 16 source bytes as: 4 bytes from leftq, then the first
; 12 bytes of the row (palignr by 12 keeps the top dword of the broadcast xm0).
1259*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
1260*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1261*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12
1262*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1263*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1264*c0909341SAndroid Build Coastguard Worker.h_extend_left:
; No left edge: shuffle the start of the row with sgr_l_shuf.
; NOTE(review): sgr_l_shuf is defined outside this view; presumably it
; replicates the first pixel into the left padding - confirm.
1265*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1266*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf]
1267*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1268*c0909341SAndroid Build Coastguard Worker.h_top:
1269*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1270*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1271*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1272*c0909341SAndroid Build Coastguard Worker.h_loop:
1273*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
1274*c0909341SAndroid Build Coastguard Worker.h_main:
; The upper lane is loaded 8 bytes further along than the lower lane.
1275*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1
1276*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1277*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
; Without a right edge, the right-extension helper is only called once
; r10d >= -17. The helper is the 5x5 function's local label, hence the
; explicit mangled name.
1278*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -17
1279*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
1280*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1281*c0909341SAndroid Build Coastguard Worker.h_have_right:
; Three shuffled copies of the source (m8, m9, m10) are summed into m0.
; Squares: the first copy via pmullw, widened with the zero register m6; the
; other two as interleaved pairs via pmaddwd.
1282*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m5, m8
1283*c0909341SAndroid Build Coastguard Worker    pmullw          m2, m0, m0
1284*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m5, m9
1285*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
1286*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m10
1287*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5 ; sum
1288*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m4, m5
1289*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1290*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5
1291*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1292*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m6
1293*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m6
1294*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*0], m0
1295*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3 ; sumsq
1296*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4
1297*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*2], m1
1298*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*4], m2
1299*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1300*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1301*c0909341SAndroid Build Coastguard Worker    ret
1302*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1303*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
; Processes the pixel row at lpfq: the horizontal sum and sum of squares are
; stored to the t0 row, added to the rows already held at t1 and t2, and the
; combined totals are converted into packed a|b dwords stored to t3.
; r10 is the column offset; the loop runs for as long as it stays negative.
; In the code shown here m6 and m8-m15 are only read; they are initialised
; outside this excerpt.
1304*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1305*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1306*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1307*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
1308*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1309*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12 ; xm5 = 4 bytes from leftq, then the first 12 row bytes
1310*c0909341SAndroid Build Coastguard Worker    add          leftq, 4 ; step leftq past the 4 bytes just used
1311*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1312*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
1313*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1314*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf] ; mask defined elsewhere; presumably replicates the edge pixel - confirm
1315*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1316*c0909341SAndroid Build Coastguard Worker.hv_bottom: ; alternative entry that never reads or advances leftq
1317*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1318*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1319*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1320*c0909341SAndroid Build Coastguard Worker.hv_loop:
1321*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
1322*c0909341SAndroid Build Coastguard Worker.hv_main:
1323*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1 ; upper lane starts 8 bytes after the lower lane
1324*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1325*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
1326*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -17
1327*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right ; right padding is only applied once r10 >= -17
1328*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1329*c0909341SAndroid Build Coastguard Worker.hv_have_right:
1330*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m5, m8
1331*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m0, m0 ; square of the first tap
1332*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m5, m9
1333*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1334*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m10
1335*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5               ; h sum
1336*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m1
1337*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4 ; squares of the other two taps, added per dword
1338*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m1
1339*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1340*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+400*0]
1341*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10*2+400*0] ; hv sum
1342*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6 ; widen first-tap squares to dwords (m6 presumably zero - confirm)
1343*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1344*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; h sumsq
1345*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1346*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10*2+400*2]
1347*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10*2+400*4]
1348*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10*2+400*2] ; hv sumsq
1349*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10*2+400*4]
; The t0 stores below come after every t2 load for this column; that ordering
; matters because t0 and t2 hold the same address after the rotation at the end.
1350*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+400*0], m0
1351*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6           ; b
1352*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1353*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+400*2], m4
1354*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
1355*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+400*4], m5
1356*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
1357*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; a * 9
1358*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b * b
1359*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1360*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1361*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p
1362*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1363*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m11              ; p * s
1364*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m11
1365*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12              ; b * 455
1366*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
1367*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12 ; NOTE(review): m12 is reused as a per-word addend before the shift; its value is loaded outside this excerpt
1368*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
1369*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z + 1
1370*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1371*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1372*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1373*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z + 1)
1374*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1375*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m13, m4
1376*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m13, m5
1377*c0909341SAndroid Build Coastguard Worker    mulps           m2, m13              ; 256 / (z + 1)
1378*c0909341SAndroid Build Coastguard Worker    mulps           m3, m13
1379*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z < 255 ? 255 : 0
1380*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1381*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1382*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1383*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x
1384*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1385*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1386*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1387*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1388*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
1389*c0909341SAndroid Build Coastguard Worker    pand            m0, m15
1390*c0909341SAndroid Build Coastguard Worker    pand            m1, m15
1391*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a | (b << 12)
1392*c0909341SAndroid Build Coastguard Worker    por             m1, m3
; The four 16-byte stores below write m0/m1 lane by lane to consecutive
; offsets 8, 24, 40 and 56 (m0 low, m1 low, m0 high, m1 high).
1393*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+ 8], xm0
1394*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+40], m0, 1
1395*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+24], xm1
1396*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+56], m1, 1
1397*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1398*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
1399*c0909341SAndroid Build Coastguard Worker    mov             t2, t1 ; t2 = previous t1
1400*c0909341SAndroid Build Coastguard Worker    mov             t1, t0 ; t1 = row that was just written
1401*c0909341SAndroid Build Coastguard Worker    mov             t0, t2 ; t0 now equals t2 (the previous t1)
1402*c0909341SAndroid Build Coastguard Worker    ret
1403*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
; Same a|b computation as the row routine above it, but no pixels are read:
; the t1 row is counted twice and added to the t2 row. Only t3 is written;
; t0, t1 and t2 are left unchanged.
1404*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1405*c0909341SAndroid Build Coastguard Worker.v_loop:
1406*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+400*0]
1407*c0909341SAndroid Build Coastguard Worker    paddw           m1, m1 ; t1 sum counted twice
1408*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10*2+400*0] ; hv sum
1409*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+400*2]
1410*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+400*4]
1411*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2 ; t1 sumsq counted twice
1412*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
1413*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t2+r10*2+400*2] ; hv sumsq
1414*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t2+r10*2+400*4]
1415*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6           ; b
1416*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
1417*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
1418*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
1419*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; a * 9
1420*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b * b
1421*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1422*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1423*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p
1424*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1425*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m11              ; p * s
1426*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m11
1427*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m12              ; b * 455
1428*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m12
1429*c0909341SAndroid Build Coastguard Worker    paddw           m4, m12 ; NOTE(review): m12 is reused as a per-word addend before the shift; its value is loaded outside this excerpt
1430*c0909341SAndroid Build Coastguard Worker    paddw           m5, m12
1431*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z + 1
1432*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1433*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1434*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1435*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z + 1)
1436*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1437*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m13, m4
1438*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m13, m5
1439*c0909341SAndroid Build Coastguard Worker    mulps           m2, m13              ; 256 / (z + 1)
1440*c0909341SAndroid Build Coastguard Worker    mulps           m3, m13
1441*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z < 255 ? 255 : 0
1442*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1443*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1444*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1445*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x
1446*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1447*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1448*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1449*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1450*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
1451*c0909341SAndroid Build Coastguard Worker    pand            m0, m15
1452*c0909341SAndroid Build Coastguard Worker    pand            m1, m15
1453*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a | (b << 12)
1454*c0909341SAndroid Build Coastguard Worker    por             m1, m3
; The four 16-byte stores below write m0/m1 lane by lane to consecutive
; offsets 8, 24, 40 and 56 (m0 low, m1 low, m0 high, m1 high).
1455*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+ 8], xm0
1456*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+40], m0, 1
1457*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+24], xm1
1458*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+56], m1, 1
1459*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1460*c0909341SAndroid Build Coastguard Worker    jl .v_loop
1461*c0909341SAndroid Build Coastguard Worker    ret
1462*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
; t4 takes over the current t3 and t3 is advanced by 400*4 bytes. Then, eight
; dwords per iteration, the rows at t5 and t4 are overwritten in place with
; their horizontal 3-4-3 weighted sums (over the dwords at byte offsets 0, 4
; and 8), and the 2-2-2 weighted sum of the t4 row is stored at
; [t3+400*4] relative to the advanced t3.
1463*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1464*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
1465*c0909341SAndroid Build Coastguard Worker    add             t3, 400*4
1466*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
1467*c0909341SAndroid Build Coastguard Worker    mova            m2, [t5+r10*4+0]
1468*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*4+0]
1469*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t5+r10*4+8] ; outer pair of the t5 row
1470*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t4+r10*4+8] ; outer pair of the t4 row
1471*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2, [t5+r10*4+4] ; all three neighbours, t5 row
1472*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3, [t4+r10*4+4] ; all three neighbours, t4 row
1473*c0909341SAndroid Build Coastguard Worker    pslld           m0, 2 ; 4 * (three-neighbour sum)
1474*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1                ; ab[ 0] 222
1475*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2                ; ab[-1] 343
1476*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4], m1
1477*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1 ; 4 * (three-neighbour sum)
1478*c0909341SAndroid Build Coastguard Worker    mova    [t5+r10*4], m0
1479*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3                ; ab[ 0] 343
1480*c0909341SAndroid Build Coastguard Worker    mova    [t4+r10*4], m1
1481*c0909341SAndroid Build Coastguard Worker    add            r10, 8
1482*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1483*c0909341SAndroid Build Coastguard Worker    ret
1484*c0909341SAndroid Build Coastguard Worker; a+b are packed together in a single dword, but we can't do the
1485*c0909341SAndroid Build Coastguard Worker; full neighbor calculations before splitting them since we don't
1486*c0909341SAndroid Build Coastguard Worker; have sufficient precision. The solution is to do the calculations
1487*c0909341SAndroid Build Coastguard Worker; in two equal halves and split a and b before doing the final sum.
1488*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1489*c0909341SAndroid Build Coastguard Worker.n: ; neighbor + output
; Produces 16 output pixels per iteration at dstq. The row at t3 is weighted
; horizontally (2-2-2 and 3-4-3) and combined with the 2-2-2 sums kept at
; [t3+400*4] and the 3-4-3 sums kept at [t5]; both stored rows are refreshed
; with the new values on the way. The packed dwords are then split into their
; a and b parts, applied to the source pixels read from dstq, and the result
; is written back to dstq. On exit t4 and t5 are swapped and dstq has advanced
; by strideq. m7 and m15 are set up outside this excerpt.
1490*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1491*c0909341SAndroid Build Coastguard Worker.n_loop:
1492*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*4+ 0]
1493*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*4+ 8]
1494*c0909341SAndroid Build Coastguard Worker    paddd           m5, m4, [t3+r10*4+ 4]
1495*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5                ; ab[+1] 222
1496*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10*4+400*4+ 0]
1497*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
1498*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*4+400*4+32]
1499*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3, [t5+r10*4+32]
1500*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4+ 0], m5
1501*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
1502*c0909341SAndroid Build Coastguard Worker    psubd           m5, m4                ; ab[+1] 343
1503*c0909341SAndroid Build Coastguard Worker    mova [t5+r10*4+ 0], m5
1504*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5                ; ab[ 0] 222 + ab[+1] 343
1505*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*4+32] ; same steps for the second group of 8 columns
1506*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*4+40]
1507*c0909341SAndroid Build Coastguard Worker    paddd           m5, m4, [t3+r10*4+36]
1508*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
1509*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*4+32], m5
1510*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
1511*c0909341SAndroid Build Coastguard Worker    psubd           m5, m4
1512*c0909341SAndroid Build Coastguard Worker    mova [t5+r10*4+32], m5
1513*c0909341SAndroid Build Coastguard Worker    pandn           m4, m15, m0 ; keep the bits not covered by mask m15
1514*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12 ; keep the bits above bit 11
1515*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5
1516*c0909341SAndroid Build Coastguard Worker    pandn           m5, m15, m2
1517*c0909341SAndroid Build Coastguard Worker    psrld           m2, 12
1518*c0909341SAndroid Build Coastguard Worker    paddd           m4, m5                ; a
1519*c0909341SAndroid Build Coastguard Worker    pandn           m5, m15, m1
1520*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
1521*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                ; b + (1 << 8)
1522*c0909341SAndroid Build Coastguard Worker    pandn           m2, m15, m3
1523*c0909341SAndroid Build Coastguard Worker    psrld           m3, 12
1524*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2
1525*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m2, [dstq+r10+0] ; 8 source pixels widened to dwords
1526*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1527*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [dstq+r10+8] ; next 8 source pixels
1528*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m2                ; a * src
1529*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m3
1530*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3 ; source pixels as words for the final add
1531*c0909341SAndroid Build Coastguard Worker    psubd           m0, m4                ; b - a * src + (1 << 8)
1532*c0909341SAndroid Build Coastguard Worker    psubd           m1, m5
1533*c0909341SAndroid Build Coastguard Worker    psrad           m0, 9
1534*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1535*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
1536*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m7 ; scale by m7 (value loaded outside this excerpt)
1537*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1538*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1
1539*c0909341SAndroid Build Coastguard Worker    packuswb       xm0, xm1
1540*c0909341SAndroid Build Coastguard Worker    pshufd         xm0, xm0, q3120 ; swap the middle dwords to undo the per-lane pack interleave
1541*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm0
1542*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1543*c0909341SAndroid Build Coastguard Worker    jl .n_loop
1544*c0909341SAndroid Build Coastguard Worker    mov            r10, t5 ; swap t4 and t5, using r10 as the temporary
1545*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
1546*c0909341SAndroid Build Coastguard Worker    mov             t4, r10
1547*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1548*c0909341SAndroid Build Coastguard Worker    ret
1549*c0909341SAndroid Build Coastguard Worker
1550*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 4, 12, 16, 400*56+8, dst, stride, left, lpf, \
1551*c0909341SAndroid Build Coastguard Worker                                                  w, h, edge, params
; Entry point and row scheduler. It loads the shuffle masks and the three
; parameter dwords, points t1/t3 into the stack scratch area, primes the row
; buffers (with or without the top edge rows) and then walks the image two
; rows per main-loop iteration. Apart from .h/.h_top, the local helpers called
; here (.hv0, .hv1, .v0, .v1, .n0, .n1, .prep_n and the _bottom entries) are
; defined partly or wholly beyond this excerpt.
1552*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1553*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
1554*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1555*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1556*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [sgr_shuf+0]
1557*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m10, [sgr_shuf+8]
1558*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq ; lpfq/dstq are biased by +w; wq is negated below for use in negative offsets
1559*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m11, [sgr_shuf+2]
1560*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m12, [sgr_shuf+6]
1561*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1562*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [paramsq+8] ; w0 w1
1563*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+400*24+8]
1564*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [paramsq+0] ; s0
1565*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7 ; m7 = 0
1566*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [paramsq+4] ; s1
1567*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+12]
1568*c0909341SAndroid Build Coastguard Worker    neg             wq
1569*c0909341SAndroid Build Coastguard Worker    psllw          m15, 2 ; to reuse existing pd_m4096 register for rounding
1570*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1571*c0909341SAndroid Build Coastguard Worker    jz .no_top
1572*c0909341SAndroid Build Coastguard Worker    call .h_top
1573*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1574*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1575*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
1576*c0909341SAndroid Build Coastguard Worker    add             t1, 400*12
1577*c0909341SAndroid Build Coastguard Worker    call .h_top
1578*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1579*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1580*c0909341SAndroid Build Coastguard Worker    add            r10, strideq ; r10 = lpfq + 5*stride (lpfq was already advanced by one stride above)
1581*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
1582*c0909341SAndroid Build Coastguard Worker    call .hv0
1583*c0909341SAndroid Build Coastguard Worker.main:
1584*c0909341SAndroid Build Coastguard Worker    dec             hd
1585*c0909341SAndroid Build Coastguard Worker    jz .height1 ; h was 1
1586*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1587*c0909341SAndroid Build Coastguard Worker    call .hv1
1588*c0909341SAndroid Build Coastguard Worker    call .prep_n
1589*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1590*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom ; h was 2
1591*c0909341SAndroid Build Coastguard Worker.main_loop: ; two rows per iteration
1592*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1593*c0909341SAndroid Build Coastguard Worker    call .hv0
1594*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1595*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1596*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1597*c0909341SAndroid Build Coastguard Worker    call .hv1
1598*c0909341SAndroid Build Coastguard Worker    call .n0
1599*c0909341SAndroid Build Coastguard Worker    call .n1
1600*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1601*c0909341SAndroid Build Coastguard Worker    jge .main_loop
1602*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1603*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1604*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp] ; reload the pointer saved at [rsp] during setup
1605*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1606*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1607*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1608*c0909341SAndroid Build Coastguard Worker.end:
1609*c0909341SAndroid Build Coastguard Worker    call .n0
1610*c0909341SAndroid Build Coastguard Worker    call .n1
1611*c0909341SAndroid Build Coastguard Worker.end2:
1612*c0909341SAndroid Build Coastguard Worker    RET
1613*c0909341SAndroid Build Coastguard Worker.height1:
1614*c0909341SAndroid Build Coastguard Worker    call .v1
1615*c0909341SAndroid Build Coastguard Worker    call .prep_n
1616*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
1617*c0909341SAndroid Build Coastguard Worker.odd_height:
1618*c0909341SAndroid Build Coastguard Worker    call .v1
1619*c0909341SAndroid Build Coastguard Worker    call .n0
1620*c0909341SAndroid Build Coastguard Worker    call .n1
1621*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1622*c0909341SAndroid Build Coastguard Worker    call .v0
1623*c0909341SAndroid Build Coastguard Worker    call .v1
1624*c0909341SAndroid Build Coastguard Worker    call .n0
1625*c0909341SAndroid Build Coastguard Worker    jmp .end2
1626*c0909341SAndroid Build Coastguard Worker.extend_bottom: ; no rows are read here: only the .v0/.v1 helpers run before output
1627*c0909341SAndroid Build Coastguard Worker    call .v0
1628*c0909341SAndroid Build Coastguard Worker    call .v1
1629*c0909341SAndroid Build Coastguard Worker    jmp .end
1630*c0909341SAndroid Build Coastguard Worker.no_top:
1631*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1632*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1633*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2] ; r10 = lpfq + 6*stride (lpfq was not advanced on this path)
1634*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
1635*c0909341SAndroid Build Coastguard Worker    call .h
1636*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+400*12]
1637*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
; Copy the row that .h just produced from t1 to t2: the three planes at
; 400*0/2/4 are doubled on the way, the three at 400*6/8/10 are copied as is.
1638*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
1639*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+400* 0]
1640*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+400* 2]
1641*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+400* 4]
1642*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1643*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+400* 6]
1644*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
1645*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10*2+400* 8]
1646*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1647*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10*2+400*10]
1648*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 0], m0
1649*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 2], m1
1650*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 4], m2
1651*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 6], m3
1652*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 8], m4
1653*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*10], m5
1654*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1655*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1656*c0909341SAndroid Build Coastguard Worker    call .v0
1657*c0909341SAndroid Build Coastguard Worker    jmp .main
1658*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsums
; Computes the horizontal sums and sums of squares for the row at lpfq and
; stores them in the t1 row: the wider "5" totals go to planes 400*0/2/4 and
; the narrower "3" totals to planes 400*6/8/10. No vertical accumulation is
; done and t1 is not modified. m9-m12 hold the shuffle masks loaded from
; sgr_shuf by the entry code, and m7 is zero.
1659*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1660*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1661*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1662*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
1663*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1664*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12 ; xm5 = 4 bytes from leftq, then the first 12 row bytes
1665*c0909341SAndroid Build Coastguard Worker    add          leftq, 4 ; step leftq past the 4 bytes just used
1666*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1667*c0909341SAndroid Build Coastguard Worker.h_extend_left:
1668*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1669*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf] ; mask defined elsewhere; presumably replicates the edge pixel - confirm
1670*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1671*c0909341SAndroid Build Coastguard Worker.h_top: ; alternative entry that never reads or advances leftq
1672*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1673*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1674*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1675*c0909341SAndroid Build Coastguard Worker.h_loop:
1676*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
1677*c0909341SAndroid Build Coastguard Worker.h_main:
1678*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1 ; upper lane starts 8 bytes after the lower lane
1679*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1680*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
1681*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -18
1682*c0909341SAndroid Build Coastguard Worker    jl .h_have_right ; right padding is only applied once r10 >= -18
1683*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1684*c0909341SAndroid Build Coastguard Worker.h_have_right:
1685*c0909341SAndroid Build Coastguard Worker    pshufb          m6, m5, m9
1686*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m5, m10
1687*c0909341SAndroid Build Coastguard Worker    paddw           m8, m6, m4 ; the two taps that belong only to the wider sum
1688*c0909341SAndroid Build Coastguard Worker    shufps          m0, m6, m4, q2121 ; per lane: dwords 1,2 of m6 followed by dwords 1,2 of m4
1689*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m0, m0
1690*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m5, m11
1691*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1692*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m12
1693*c0909341SAndroid Build Coastguard Worker    paddw           m0, m5 ; sum3
1694*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m5
1695*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1696*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m5
1697*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1698*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m6, m4
1699*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5 ; squares of the two extra taps, added per dword
1700*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m4
1701*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
1702*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m7 ; zero-extend the remaining squares to dwords
1703*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4 ; sumsq3
1704*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m7
1705*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
1706*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 6], m0
1707*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 8], m1
1708*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*10], m2
1709*c0909341SAndroid Build Coastguard Worker    paddw           m8, m0 ; sum5
1710*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1 ; sumsq5
1711*c0909341SAndroid Build Coastguard Worker    paddd           m6, m2
1712*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 0], m8
1713*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 2], m5
1714*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 4], m6
1715*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1716*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1717*c0909341SAndroid Build Coastguard Worker    ret
1718*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv0 -- for one source row: horizontal box sums, the vertical 3-row sum, and
; the packed "ab3" values (see the comment on the label).
;   Entry points: .hv0 (left pixels may come from leftq) and .hv0_bottom
;                 (never reads leftq).
;   Reads:  lpfq (row pointer), leftq, edgeb (bit 0 = LR_HAVE_LEFT,
;           bit 1 = LR_HAVE_RIGHT), wq, t1/t2 (row-sum buffers),
;           m9-m12 (pshufb masks), m14 (s1, per the "p3 * s1" comment).
;           m7 is used as the zero operand of the word->dword unpacks, so it is
;           expected to be all-zero on entry (set up outside this chunk).
;   Writes: t1, t2 and t3 buffers; leftq += 4 on the LR_HAVE_LEFT path of .hv0.
;   Clobbers: r10, m0-m6, m8, flags.
;   The loop advances r10 by 16 and runs while it is negative, so wq is
;   expected to hold a negative offset -- NOTE(review): confirm with the caller.
1719*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
1720*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1721*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1722*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1723*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
1724*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1725*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12 ; 4 bytes from leftq followed by the first 12 row bytes
1726*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1727*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1728*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
1729*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1730*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf] ; presumably replicates the first pixel to the left; table is defined elsewhere
1731*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
; Alternate entry: with LR_HAVE_LEFT set it falls straight into the loop, which
; fetches the left pixels from lpfq itself instead of from leftq.
1732*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
1733*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1734*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1735*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1736*c0909341SAndroid Build Coastguard Worker.hv0_loop:
1737*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
1738*c0909341SAndroid Build Coastguard Worker.hv0_main:
1739*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1 ; high lane starts 8 bytes after the low lane
1740*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1741*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
1742*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -18
1743*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right ; far enough from the end: no right-edge fixup needed
1744*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1745*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
; Horizontal pass: m1 = sum3, m2/m3 = sumsq3, m8 = sum5, m5/m6 = sumsq5.
1746*c0909341SAndroid Build Coastguard Worker    pshufb          m6, m5, m9
1747*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m5, m10
1748*c0909341SAndroid Build Coastguard Worker    paddw           m8, m6, m4
1749*c0909341SAndroid Build Coastguard Worker    shufps          m1, m6, m4, q2121
1750*c0909341SAndroid Build Coastguard Worker    pmullw          m0, m1, m1
1751*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m11
1752*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1753*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m12
1754*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5 ; sum3
1755*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m5
1756*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2 ; squares of both interleaved words, summed per dword
1757*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m5
1758*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1759*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m6, m4
1760*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1761*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m4
1762*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
1763*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m0, m7 ; widen the pmullw squares to dwords (m7 as zero)
1764*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4 ; sumsq3
1765*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m7
1766*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0
1767*c0909341SAndroid Build Coastguard Worker    paddw           m8, m1 ; sum5
1768*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2 ; sumsq5
1769*c0909341SAndroid Build Coastguard Worker    paddd           m6, m3
1770*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
1771*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
1772*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*0+40], m6
; Accumulate this row's sum5/sumsq5 into the running totals kept in t1.
1773*c0909341SAndroid Build Coastguard Worker    paddw           m8, [t1+r10*2+400* 0]
1774*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t1+r10*2+400* 2]
1775*c0909341SAndroid Build Coastguard Worker    paddd           m6, [t1+r10*2+400* 4]
1776*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 0], m8
1777*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 2], m5
1778*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 4], m6
; Vertical 3-row sum: m0/m4/m5 = this row + t1 contents, then
; m1/m2/m3 = that + t2 contents. t1 is replaced by this row's sums and
; t2 by the two-row partial sum.
1779*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+r10*2+400* 6]
1780*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t1+r10*2+400* 8]
1781*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t1+r10*2+400*10]
1782*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 6], m1
1783*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400* 8], m2
1784*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*10], m3
1785*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+400* 6]
1786*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10*2+400* 8]
1787*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10*2+400*10]
1788*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 6], m0
1789*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 8], m4
1790*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*10], m5
; ab3 computation; m8 and m6 are reloaded with constants as each stage needs them.
1791*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pw_455_24]
1792*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7           ; b3
1793*c0909341SAndroid Build Coastguard Worker    vbroadcastss    m6, [pf_256]
1794*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
1795*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
1796*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
1797*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; a3 * 9
1798*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b3 * b3
1799*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
1800*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1801*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p3
1802*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
1803*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m14              ; p3 * s1
1804*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m14
1805*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m8               ; b3 * 455
1806*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m8
1807*c0909341SAndroid Build Coastguard Worker    paddw           m4, m8               ; word-wise add of the pw_455_24 pair (bias applied before the >> 20)
1808*c0909341SAndroid Build Coastguard Worker    paddw           m5, m8
1809*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pd_34816]
1810*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z3 + 1
1811*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
1812*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
1813*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
1814*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z3 + 1)
1815*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
1816*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m6, m4           ; integer compare of float bit patterns: all-ones where pf_256 > (z3 + 1)
1817*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m6, m5
1818*c0909341SAndroid Build Coastguard Worker    mulps           m2, m6               ; 256 / (z3 + 1)
1819*c0909341SAndroid Build Coastguard Worker    mulps           m3, m6
1820*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [pd_m4096]
1821*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z3 < 255 ? 255 : 0
1822*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
1823*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1824*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1825*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x3
1826*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
1827*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1828*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1829*c0909341SAndroid Build Coastguard Worker    paddd           m0, m8               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1830*c0909341SAndroid Build Coastguard Worker    paddd           m1, m8
1831*c0909341SAndroid Build Coastguard Worker    pand            m0, m6               ; pd_m4096 mask clears the low 12 bits
1832*c0909341SAndroid Build Coastguard Worker    pand            m1, m6
1833*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a3 | (b3 << 12)
1834*c0909341SAndroid Build Coastguard Worker    por             m1, m3
; punpckl/h operate per 128-bit lane, so the four 16-byte pieces are stored
; individually (low lane of m0, low lane of m1, high lane of m0, high lane of m1).
1835*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*4+ 8], xm0
1836*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*4+40], m0, 1
1837*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*4+24], xm1
1838*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*4+56], m1, 1
1839*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1840*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop ; keep going while r10 is still negative
1841*c0909341SAndroid Build Coastguard Worker    ret
1842*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv1 -- for one source row: horizontal box sums, the vertical sums, and both
; packed outputs (ab3 and ab5; see the comment on the label).
;   Entry points: .hv1 (left pixels may come from leftq) and .hv1_bottom
;                 (never reads leftq).
;   Reads:  lpfq, leftq, edgeb (bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT),
;           wq, t1/t2 (row-sum buffers), m9-m12 (pshufb masks),
;           m13 (s0) and m14 (s1) per the "p5 * s0" / "p3 * s1" comments,
;           m7 as the zero operand of the first unpacks.
;   Writes: t2 and t3 buffers; leftq += 4 on the LR_HAVE_LEFT path of .hv1;
;           t1 and t2 are exchanged just before returning.
;   Clobbers: r10, m0-m6, m8, flags. m7 and m9 are overwritten inside the loop
;   body and restored (m7 = 0, m9 = sgr_shuf) before the next iteration.
1843*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
1844*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1845*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1846*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
1847*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm0, [leftq]
1848*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1849*c0909341SAndroid Build Coastguard Worker    palignr        xm5, xm0, 12 ; 4 bytes from leftq followed by the first 12 row bytes
1850*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1851*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1852*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
1853*c0909341SAndroid Build Coastguard Worker    mova           xm5, [lpfq+wq]
1854*c0909341SAndroid Build Coastguard Worker    pshufb         xm5, [sgr_l_shuf] ; presumably replicates the first pixel to the left; table is defined elsewhere
1855*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
; Alternate entry: with LR_HAVE_LEFT set it falls straight into the loop, which
; fetches the left pixels from lpfq itself instead of from leftq.
1856*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
1857*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1858*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1859*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
1860*c0909341SAndroid Build Coastguard Worker.hv1_loop:
1861*c0909341SAndroid Build Coastguard Worker    movu           xm5, [lpfq+r10-2]
1862*c0909341SAndroid Build Coastguard Worker.hv1_main:
1863*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [lpfq+r10+6], 1 ; high lane starts 8 bytes after the low lane
1864*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1865*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
1866*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -18
1867*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right ; far enough from the end: no right-edge fixup needed
1868*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1869*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
; Horizontal pass: m2 = sum3, m4/m5 = sumsq3, m8 = sum5 (after L"paddw m8, m2"),
; m0/m6 = the extra squared terms that turn sumsq3 into sumsq5.
1870*c0909341SAndroid Build Coastguard Worker    pshufb          m6, m5, m9
1871*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m5, m10
1872*c0909341SAndroid Build Coastguard Worker    paddw           m8, m6, m3
1873*c0909341SAndroid Build Coastguard Worker    shufps          m2, m6, m3, q2121
1874*c0909341SAndroid Build Coastguard Worker    pmullw          m1, m2, m2
1875*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m5, m11
1876*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0
1877*c0909341SAndroid Build Coastguard Worker    pshufb          m5, m12
1878*c0909341SAndroid Build Coastguard Worker    paddw           m2, m5 ; sum3
1879*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m0
1880*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4 ; squares of both interleaved words, summed per dword
1881*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m0
1882*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1883*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m6, m3
1884*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1885*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m3
1886*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
1887*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m1, m7 ; widen the pmullw squares to dwords (m7 as zero)
1888*c0909341SAndroid Build Coastguard Worker    paddd           m4, m3 ; sumsq3
1889*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
1890*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1
; Vertical 3-row sum: m1/m2/m3 = this row + t2 contents; t2 is then replaced
; by this row's sum3/sumsq3.
1891*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2, [t2+r10*2+400* 6]
1892*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 6], m2
1893*c0909341SAndroid Build Coastguard Worker    paddw           m8, m2 ; sum5
1894*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10*2+400* 8]
1895*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10*2+400*10]
1896*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 8], m4
1897*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*10], m5
1898*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [pw_455_24] ; m9 (shuffle mask) is borrowed for constants until it is reloaded below
1899*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0 ; sumsq5
1900*c0909341SAndroid Build Coastguard Worker    paddd           m5, m6
1901*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7           ; b3
1902*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
1903*c0909341SAndroid Build Coastguard Worker    pslld           m6, m2, 3
1904*c0909341SAndroid Build Coastguard Worker    pslld           m7, m3, 3            ; m7 stops being zero here; it is re-zeroed before the b5 unpack
1905*c0909341SAndroid Build Coastguard Worker    paddd           m6, m2               ; a3 * 9
1906*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b3 * b3
1907*c0909341SAndroid Build Coastguard Worker    paddd           m7, m3
1908*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
1909*c0909341SAndroid Build Coastguard Worker    psubd           m6, m2               ; p3
1910*c0909341SAndroid Build Coastguard Worker    psubd           m7, m3
1911*c0909341SAndroid Build Coastguard Worker    pmulld          m6, m14              ; p3 * s1
1912*c0909341SAndroid Build Coastguard Worker    pmulld          m7, m14
1913*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m9               ; b3 * 455
1914*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m9
1915*c0909341SAndroid Build Coastguard Worker    paddw           m6, m9               ; word-wise add of the pw_455_24 pair (bias applied before the >> 20)
1916*c0909341SAndroid Build Coastguard Worker    paddw           m7, m9
1917*c0909341SAndroid Build Coastguard Worker    vbroadcastss    m9, [pf_256]
1918*c0909341SAndroid Build Coastguard Worker    psrld           m6, 20               ; z3 + 1
1919*c0909341SAndroid Build Coastguard Worker    psrld           m7, 20
1920*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m6, m6
1921*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m7, m7
1922*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m6               ; 1 / (z3 + 1)
1923*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m7
1924*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m6, m9, m6           ; integer compare of float bit patterns: all-ones where pf_256 > (z3 + 1)
1925*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m7, m9, m7
1926*c0909341SAndroid Build Coastguard Worker    mulps           m2, m9               ; 256 / (z3 + 1)
1927*c0909341SAndroid Build Coastguard Worker    mulps           m3, m9
1928*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [pd_34816]
1929*c0909341SAndroid Build Coastguard Worker    psrld           m6, 24               ; z3 < 255 ? 255 : 0
1930*c0909341SAndroid Build Coastguard Worker    psrld           m7, 24
1931*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
1932*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
1933*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m6               ; x3
1934*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [pd_m4096]       ; stays in m6 for the ab5 masking further down
1935*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m7
1936*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
1937*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
1938*c0909341SAndroid Build Coastguard Worker    paddd           m0, m9               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1939*c0909341SAndroid Build Coastguard Worker    paddd           m1, m9
1940*c0909341SAndroid Build Coastguard Worker    pand            m0, m6               ; pd_m4096 mask clears the low 12 bits
1941*c0909341SAndroid Build Coastguard Worker    pand            m7, m6, m1           ; second half of the result goes to m7 so m1 can be reused right away
1942*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a3 | (b3 << 12)
1943*c0909341SAndroid Build Coastguard Worker    por             m7, m3
; Vertical 5x5 sums: m1/m2/m3 = this row + t2 contents + t1 contents;
; t2 is then replaced by this row's sum5/sumsq5.
1944*c0909341SAndroid Build Coastguard Worker    paddw           m1, m8, [t2+r10*2+400*0]
1945*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10*2+400*2]
1946*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10*2+400*4]
1947*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10*2+400*0]
1948*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10*2+400*2]
1949*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10*2+400*4]
1950*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*0], m8
1951*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*2], m4
1952*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*4], m5
; Store ab3. punpckl/h operate per 128-bit lane, so the four 16-byte pieces
; are stored individually.
1953*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*8+ 8], xm0
1954*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*8+40], m0, 1
1955*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*8+24], xm7
1956*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*8+56], m7, 1
1957*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [pd_25]
1958*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7               ; restore m7 = 0 for the unpacks
1959*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pw_164_24]
1960*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7           ; b5
1961*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
1962*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m4               ; a5 * 25
1963*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m4
1964*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m0, m0           ; b5 * b5
1965*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m1, m1
1966*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4               ; p5
1967*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5
1968*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m13              ; p5 * s0
1969*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m13
1970*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m8               ; b5 * 164
1971*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m8
1972*c0909341SAndroid Build Coastguard Worker    paddw           m2, m8               ; word-wise add of the pw_164_24 pair (bias applied before the >> 20)
1973*c0909341SAndroid Build Coastguard Worker    paddw           m3, m8
1974*c0909341SAndroid Build Coastguard Worker    vbroadcastss    m8, [pf_256]
1975*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20               ; z5 + 1
1976*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
1977*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m2, m2
1978*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m3, m3
1979*c0909341SAndroid Build Coastguard Worker    rcpps           m4, m2               ; 1 / (z5 + 1)
1980*c0909341SAndroid Build Coastguard Worker    rcpps           m5, m3
1981*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m2, m8, m2           ; integer compare of float bit patterns: all-ones where pf_256 > (z5 + 1)
1982*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m8, m3
1983*c0909341SAndroid Build Coastguard Worker    mulps           m4, m8               ; 256 / (z5 + 1)
1984*c0909341SAndroid Build Coastguard Worker    mulps           m5, m8
1985*c0909341SAndroid Build Coastguard Worker    psrld           m2, 24               ; z5 < 255 ? 255 : 0
1986*c0909341SAndroid Build Coastguard Worker    psrld           m3, 24
1987*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m4, m4
1988*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m5, m5
1989*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2               ; x5
1990*c0909341SAndroid Build Coastguard Worker    pminsw          m5, m3
1991*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m4
1992*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m5
1993*c0909341SAndroid Build Coastguard Worker    paddd           m0, m9               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
1994*c0909341SAndroid Build Coastguard Worker    paddd           m1, m9
1995*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [sgr_shuf]       ; restore the shuffle mask in m9 for the next iteration
1996*c0909341SAndroid Build Coastguard Worker    pand            m0, m6               ; m6 still holds pd_m4096
1997*c0909341SAndroid Build Coastguard Worker    pand            m1, m6
1998*c0909341SAndroid Build Coastguard Worker    por             m0, m4               ; a5 | (b5 << 12)
1999*c0909341SAndroid Build Coastguard Worker    por             m1, m5
2000*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*0+ 8], xm0
2001*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*0+40], m0, 1
2002*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*0+24], xm1
2003*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*0+56], m1, 1
2004*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2005*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop ; keep going while r10 is still negative
; Exchange the t1 and t2 buffer pointers (r10 is free as scratch here).
2006*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
2007*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2008*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2009*c0909341SAndroid Build Coastguard Worker    ret
; .v0 -- vertical-only counterpart of the ab3 computation: no source pixels are
; read; the sums already stored in t1 are doubled and combined with t2.
; NOTE(review): doubling presumably stands in for a repeated row at the bottom
; edge -- confirm against the caller.
;   Reads:  wq, t1/t2 (row-sum buffers), m14 (s1, per the "p3 * s1" comment),
;           m7 as the zero operand of the unpacks.
;   Writes: t1, t2 and t3 buffers.
;   Clobbers: r10, m0-m6, m8, flags.
2010*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows)
2011*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
2012*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [pd_34816] ; loop-invariant rounding constant, kept in m6
2013*c0909341SAndroid Build Coastguard Worker.v0_loop:
2014*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+400* 6]
2015*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10*2+400* 8]
2016*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10*2+400*10]
2017*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0               ; count the t1 row twice
2018*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2019*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
2020*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+400* 6]
2021*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10*2+400* 8]
2022*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10*2+400*10]
2023*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 6], m0           ; t2 now holds the doubled t1 sums
2024*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 8], m4
2025*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*10], m5
2026*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pw_455_24]
2027*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7           ; b3
2028*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2029*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2030*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2031*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; a3 * 9
2032*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b3 * b3
2033*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2034*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
2035*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p3
2036*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2037*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m14              ; p3 * s1
2038*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m14
2039*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m8               ; b3 * 455
2040*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m8
2041*c0909341SAndroid Build Coastguard Worker    paddw           m4, m8               ; word-wise add of the pw_455_24 pair (bias applied before the >> 20)
2042*c0909341SAndroid Build Coastguard Worker    paddw           m5, m8
2043*c0909341SAndroid Build Coastguard Worker    vbroadcastss    m8, [pf_256]
2044*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z3 + 1
2045*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2046*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
2047*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
2048*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z3 + 1)
2049*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
2050*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m8, m4           ; integer compare of float bit patterns: all-ones where pf_256 > (z3 + 1)
2051*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m8, m5
2052*c0909341SAndroid Build Coastguard Worker    mulps           m2, m8               ; 256 / (z3 + 1)
2053*c0909341SAndroid Build Coastguard Worker    mulps           m3, m8
2054*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pd_m4096]
2055*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z3 < 255 ? 255 : 0
2056*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
2057*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
2058*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
2059*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x3
2060*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
2061*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
2062*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
2063*c0909341SAndroid Build Coastguard Worker    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2064*c0909341SAndroid Build Coastguard Worker    paddd           m1, m6
2065*c0909341SAndroid Build Coastguard Worker    pand            m0, m8               ; pd_m4096 mask clears the low 12 bits
2066*c0909341SAndroid Build Coastguard Worker    pand            m1, m8
2067*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a3 | (b3 << 12)
2068*c0909341SAndroid Build Coastguard Worker    por             m1, m3
; Copy the t1 entries at 400*0/2/4 to t3 unmodified, then double them in place in t1.
2069*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+400*0]
2070*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+400*2]
2071*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10*2+400*4]
2072*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*8+ 8], m2
2073*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*0+ 8], m3
2074*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*0+40], m4
2075*c0909341SAndroid Build Coastguard Worker    paddw           m2, m2               ; cc5
2076*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
2077*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2078*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*0], m2
2079*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*2], m3
2080*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+400*4], m4
; Store ab3. punpckl/h operate per 128-bit lane, so the four 16-byte pieces
; are stored individually.
2081*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*4+ 8], xm0
2082*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*4+40], m0, 1
2083*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*4+24], xm1
2084*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*4+56], m1, 1
2085*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2086*c0909341SAndroid Build Coastguard Worker    jl .v0_loop ; keep going while r10 is still negative
2087*c0909341SAndroid Build Coastguard Worker    ret
2088*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
2089*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
2090*c0909341SAndroid Build Coastguard Worker.v1_loop:
2091*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10*2+400* 6]
2092*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10*2+400* 8]
2093*c0909341SAndroid Build Coastguard Worker    mova            m6, [t1+r10*2+400*10]
2094*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+r10*2+400* 6]
2095*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+r10*2+400* 8]
2096*c0909341SAndroid Build Coastguard Worker    paddd           m3, m6, [t2+r10*2+400*10]
2097*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 6], m4
2098*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400* 8], m5
2099*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*10], m6
2100*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pw_455_24]
2101*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7           ; b3
2102*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2103*c0909341SAndroid Build Coastguard Worker    pslld           m4, m2, 3
2104*c0909341SAndroid Build Coastguard Worker    pslld           m5, m3, 3
2105*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2               ; a3 * 9
2106*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, m0           ; b3 * b3
2107*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3
2108*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, m1
2109*c0909341SAndroid Build Coastguard Worker    psubd           m4, m2               ; p3
2110*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3
2111*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m14              ; p3 * s1
2112*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m14
2113*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m8               ; b3 * 455
2114*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m8
2115*c0909341SAndroid Build Coastguard Worker    paddw           m4, m8
2116*c0909341SAndroid Build Coastguard Worker    paddw           m5, m8
2117*c0909341SAndroid Build Coastguard Worker    vbroadcastss    m8, [pf_256]
2118*c0909341SAndroid Build Coastguard Worker    psrld           m4, 20               ; z3 + 1
2119*c0909341SAndroid Build Coastguard Worker    psrld           m5, 20
2120*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m4, m4
2121*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m5, m5
2122*c0909341SAndroid Build Coastguard Worker    rcpps           m2, m4               ; 1 / (z3 + 1)
2123*c0909341SAndroid Build Coastguard Worker    rcpps           m3, m5
2124*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m4, m8, m4
2125*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m5, m8, m5
2126*c0909341SAndroid Build Coastguard Worker    mulps           m2, m8               ; 256 / (z3 + 1)
2127*c0909341SAndroid Build Coastguard Worker    mulps           m3, m8
2128*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pd_m4096]
2129*c0909341SAndroid Build Coastguard Worker    psrld           m4, 24               ; z3 < 255 ? 255 : 0
2130*c0909341SAndroid Build Coastguard Worker    psrld           m5, 24
2131*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m2, m2
2132*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m3, m3
2133*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m4               ; x3
2134*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [pd_34816]
2135*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m5
2136*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m2
2137*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m3
2138*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2139*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2140*c0909341SAndroid Build Coastguard Worker    pand            m0, m8
2141*c0909341SAndroid Build Coastguard Worker    pand            m8, m1
2142*c0909341SAndroid Build Coastguard Worker    por             m0, m2               ; a3 | (b3 << 12)
2143*c0909341SAndroid Build Coastguard Worker    por             m8, m3
2144*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*4+400*8+ 8]
2145*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*4+400*0+ 8]
2146*c0909341SAndroid Build Coastguard Worker    mova            m6, [t3+r10*4+400*0+40]
2147*c0909341SAndroid Build Coastguard Worker    paddw           m1, m4, [t2+r10*2+400*0]
2148*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+r10*2+400*2]
2149*c0909341SAndroid Build Coastguard Worker    paddd           m3, m6, [t2+r10*2+400*4]
2150*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10*2+400*0]
2151*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10*2+400*2]
2152*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10*2+400*4]
2153*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*0], m4
2154*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*2], m5
2155*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+400*4], m6
2156*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [pd_25]
2157*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*8+ 8], xm0
2158*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*8+40], m0, 1
2159*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*8+24], xm8
2160*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*8+56], m8, 1
2161*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pw_164_24]
2162*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m7           ; b5
2163*c0909341SAndroid Build Coastguard Worker    vbroadcastss    m6, [pf_256]
2164*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m7
2165*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m4               ; a5 * 25
2166*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m4
2167*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m0, m0           ; b5 * b5
2168*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m1, m1
2169*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4               ; p5
2170*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5
2171*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m13              ; p5 * s0
2172*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m13
2173*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m8               ; b5 * 164
2174*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m8
2175*c0909341SAndroid Build Coastguard Worker    paddw           m2, m8
2176*c0909341SAndroid Build Coastguard Worker    paddw           m3, m8
2177*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pd_34816]
2178*c0909341SAndroid Build Coastguard Worker    psrld           m2, 20               ; z5 + 1
2179*c0909341SAndroid Build Coastguard Worker    psrld           m3, 20
2180*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m2, m2
2181*c0909341SAndroid Build Coastguard Worker    cvtdq2ps        m3, m3
2182*c0909341SAndroid Build Coastguard Worker    rcpps           m4, m2               ; 1 / (z5 + 1)
2183*c0909341SAndroid Build Coastguard Worker    rcpps           m5, m3
2184*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m2, m6, m2
2185*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m6, m3
2186*c0909341SAndroid Build Coastguard Worker    mulps           m4, m6               ; 256 / (z5 + 1)
2187*c0909341SAndroid Build Coastguard Worker    mulps           m5, m6
2188*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [pd_m4096]
2189*c0909341SAndroid Build Coastguard Worker    psrld           m2, 24               ; z5 < 255 ? 255 : 0
2190*c0909341SAndroid Build Coastguard Worker    psrld           m3, 24
2191*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m4, m4
2192*c0909341SAndroid Build Coastguard Worker    cvtps2dq        m5, m5
2193*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2               ; x5
2194*c0909341SAndroid Build Coastguard Worker    pminsw          m5, m3
2195*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m4
2196*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m5
2197*c0909341SAndroid Build Coastguard Worker    paddd           m0, m8               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
2198*c0909341SAndroid Build Coastguard Worker    paddd           m1, m8
2199*c0909341SAndroid Build Coastguard Worker    pand            m0, m6
2200*c0909341SAndroid Build Coastguard Worker    pand            m1, m6
2201*c0909341SAndroid Build Coastguard Worker    por             m0, m4               ; a5 | (b5 << 12)
2202*c0909341SAndroid Build Coastguard Worker    por             m1, m5
2203*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*0+ 8], xm0
2204*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*0+40], m0, 1
2205*c0909341SAndroid Build Coastguard Worker    mova         [t3+r10*4+400*0+24], xm1
2206*c0909341SAndroid Build Coastguard Worker    vextracti128 [t3+r10*4+400*0+56], m1, 1
2207*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2208*c0909341SAndroid Build Coastguard Worker    jl .v1_loop
2209*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
2210*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2211*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2212*c0909341SAndroid Build Coastguard Worker    ret
2213*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
    ; Primes the horizontally weighted neighbor sums that .n0/.n1 later read
    ; back from t3. Each iteration handles 8 dwords; every entry is combined
    ; with its left/right neighbors (byte offsets +0 = l, +4 = c, +8 = r):
    ;   row at t3+400*0 -> 5*l + 6*c + 5*r ("565"), split into
    ;                      t3+400*12 (bits not set in m6) and t3+400*16 (>> 12)
    ;   row at t3+400*4 -> 3*l + 4*c + 3*r ("343")  -> t3+400*24
    ;   row at t3+400*8 -> 2*(l + c + r)   ("222")  -> t3+400*20
    ;                      3*l + 4*c + 3*r ("343")  -> t3+400*28
    ; Writes r10, m0-m5 and the flags; reads m6 and wq without modifying them.
    ; The loop adds 8 to r10 and repeats while the result is negative, so wq
    ; is expected to be negative on entry (presumably the negated width --
    ; confirm against the caller, which is outside this view).
    ; NOTE(review): m6 is not set here; the code preceding this routine loads
    ; it from pd_m4096, in which case ~m6 selects the low 12 bits -- confirm
    ; that value is still live at the call site.
2214*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = column index, counts up towards zero
2215*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
2216*c0909341SAndroid Build Coastguard Worker    movu            m0, [t3+r10*4+400*0+4] ; ab5: c (unaligned load, offset +4)
2217*c0909341SAndroid Build Coastguard Worker    paddd           m1, m0, [t3+r10*4+400*0+0] ; ab5: l + c
2218*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*4+400*4+0] ; ab3[-1]: l
2219*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*4+400*0+8] ; ab5: l + c + r
2220*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*4+400*8+0] ; ab3[ 0]: l
2221*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*4+400*4+8] ; ab3[-1]: l + r
2222*c0909341SAndroid Build Coastguard Worker    paddd           m5, [t3+r10*4+400*8+8] ; ab3[ 0]: l + r
2223*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t3+r10*4+400*4+4] ; ab3[-1]: l + c + r
2224*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t3+r10*4+400*8+4] ; ab3[ 0]: l + c + r
2225*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1                ; ab5: l + 2*c + r
2226*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                 ; ab5: 4*(l + c + r)
2227*c0909341SAndroid Build Coastguard Worker    pslld           m2, 2                 ; ab3[-1]: 4*(l + c + r)
2228*c0909341SAndroid Build Coastguard Worker    paddd           m1, m0                ; ab5 565: 5*l + 6*c + 5*r
2229*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3                ; ab3[ 0] 222: 2*(l + c + r)
2230*c0909341SAndroid Build Coastguard Worker    psubd           m2, m4                ; ab3[-1] 343: 3*l + 4*c + 3*r
2231*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*20], m3 ; save ab3[ 0] 222
2232*c0909341SAndroid Build Coastguard Worker    pandn           m0, m6, m1            ; a5 565 (= ~m6 & ab5 565)
2233*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*24], m2 ; save ab3[-1] 343
2234*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12                ; b5 565 (upper field of ab5 565)
2235*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*12], m0 ; save a5 565
2236*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3                ; ab3[ 0]: 4*(l + c + r)
2237*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*16], m1 ; save b5 565
2238*c0909341SAndroid Build Coastguard Worker    psubd           m3, m5                ; ab3[ 0] 343: 3*l + 4*c + 3*r
2239*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*28], m3 ; save ab3[ 0] 343
2240*c0909341SAndroid Build Coastguard Worker    add            r10, 8 ; next 8 dwords; sets the flags tested by jl
2241*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
2242*c0909341SAndroid Build Coastguard Worker    ret
2243*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2244*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
    ; Produces 8 output pixels per iteration, in place in the dst row:
    ;   1. Builds the 565-weighted sum of the packed row at t3+0, splits it
    ;      with the m6 mask and adds the halves saved at t3+400*12/400*16
    ;      (which are then replaced by the new values).
    ;   2. Builds the 222/343-weighted sums of the packed row at t3+400*4 and
    ;      combines them with the sums saved at t3+400*20/400*24 (also
    ;      replaced by the new values).
    ;   3. Multiplies the "a" terms by the source pixels, subtracts from the
    ;      "b" terms, blends both results into word pairs, applies the m15
    ;      word-pair weights, rounds/shifts, adds the source pixel and stores
    ;      the result saturated to bytes.
    ; On exit dstq has been advanced by strideq.
    ; Writes r10, m0-m5, dstq and the flags; reads m6, m15, wq and strideq.
    ; The loop repeats while r10 is negative after adding 8, so wq is expected
    ; to be negative on entry (presumably the negated width -- confirm against
    ; the caller, which is outside this view).
    ; NOTE(review): m6 and m15 are set outside this routine. The code earlier
    ; in the file loads m6 from pd_m4096 (so ~m6 would be the low 12 bits);
    ; m15 presumably holds the two filter weights -- confirm both.
2245*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = column index, counts up towards zero
2246*c0909341SAndroid Build Coastguard Worker.n0_loop:
2247*c0909341SAndroid Build Coastguard Worker    movu            m0, [t3+r10*4+4] ; ab5: c (unaligned load, offset +4)
2248*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0, [t3+r10*4+0] ; ab5: l + c
2249*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*4+8] ; ab5: l + c + r
2250*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4 ; ab5: l + 2*c + r
2251*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2 ; ab5: 4*(l + c + r)
2252*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0 ; ab5 565: 5*l + 6*c + 5*r
2253*c0909341SAndroid Build Coastguard Worker    pandn           m0, m6, m4 ; new a5 565 (= ~m6 & ab5 565)
2254*c0909341SAndroid Build Coastguard Worker    psrld           m4, 12 ; new b5 565 (upper field of ab5 565)
2255*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t3+r10*4+400*12] ; a5 (new + saved a5 565)
2256*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*12], m0 ; saved a5 565 <- new
2257*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
2258*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*16], m4 ; saved b5 565 <- new
2259*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*4+400*4+0] ; ab3[ 1]: l
2260*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*4+400*4+8] ; ab3[ 1]: l + r
2261*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t3+r10*4+400*4+4] ; ab3[ 1]: l + c + r
2262*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5                    ; ab3[ 1] 222
2263*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*4+400*20] ; saved ab3[ 0] 222
2264*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
2265*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*20], m5 ; saved 222 <- ab3[ 1] 222
2266*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5 ; ab3[ 1]: 4*(l + c + r)
2267*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3                    ; ab3[ 1] 343
2268*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*24], m5 ; saved 343 <- ab3[ 1] 343
2269*c0909341SAndroid Build Coastguard Worker    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
2270*c0909341SAndroid Build Coastguard Worker    pandn           m3, m6, m1 ; ~m6 field of the first partial sum
2271*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12 ; upper field of the first partial sum
2272*c0909341SAndroid Build Coastguard Worker    pandn           m5, m6, m4 ; ~m6 field of the second partial sum
2273*c0909341SAndroid Build Coastguard Worker    psrld           m4, 12 ; upper field of the second partial sum
2274*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5                    ; a3
2275*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4                    ; b3 + (1 << 8)
2276*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m4, [dstq+r10] ; src: 8 bytes of the dst row, zero-extended to dwords
2277*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4                    ; a5 * src
2278*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4                    ; a3 * src
2279*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2                    ; b5 - a5 * src + (1 << 8)
2280*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3                    ; b3 - a3 * src + (1 << 8)
2281*c0909341SAndroid Build Coastguard Worker    psrld           m0, 9 ; 5x5 term: bits 9+ moved down into the low word
2282*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7 ; 3x3 term: bits 9+ moved up into the high word
2283*c0909341SAndroid Build Coastguard Worker    pblendw         m0, m1, 0xaa ; even words from m0 (5x5), odd words from m1 (3x3)
2284*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15 ; word-pair dot product with the m15 weights
2285*c0909341SAndroid Build Coastguard Worker    psubd           m0, m6 ; NOTE(review): adds 4096 if m6 = -4096 (rounding for >> 13) -- confirm
2286*c0909341SAndroid Build Coastguard Worker    psrad           m0, 13
2287*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4 ; + src
2288*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1 ; upper four dwords
2289*c0909341SAndroid Build Coastguard Worker    packssdw       xm0, xm1 ; 8 dwords -> 8 words, signed saturation
2290*c0909341SAndroid Build Coastguard Worker    packuswb       xm0, xm0 ; 8 words -> 8 bytes, unsigned saturation
2291*c0909341SAndroid Build Coastguard Worker    movq    [dstq+r10], xm0 ; overwrite the 8 source pixels with the result
2292*c0909341SAndroid Build Coastguard Worker    add            r10, 8 ; next 8 pixels; sets the flags tested by jl
2293*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
2294*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq ; advance dst to the next row
2295*c0909341SAndroid Build Coastguard Worker    ret
2296*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2297*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
    ; Produces 8 output pixels per iteration, in place in the dst row.
    ; Differences from the even-row routine (.n0) visible in this code:
    ;   - the 5x5 "a"/"b" terms are read straight from t3+400*12/400*16 and
    ;     are not recomputed or overwritten; the 5x5 result is shifted right
    ;     by 8 instead of 9;
    ;   - the 3x3 sums come from the packed row at t3+400*8 and the saved
    ;     343-weighted sum lives at t3+400*28 (the 222 slot is still
    ;     t3+400*20); both slots are replaced by the new values.
    ; On exit dstq has been advanced by strideq.
    ; Writes r10, m0-m5, dstq and the flags; reads m6, m15, wq and strideq.
    ; The loop repeats while r10 is negative after adding 8, so wq is expected
    ; to be negative on entry (presumably the negated width -- confirm against
    ; the caller, which is outside this view).
    ; NOTE(review): m6 and m15 are set outside this routine. The code earlier
    ; in the file loads m6 from pd_m4096 (so ~m6 would be the low 12 bits);
    ; m15 presumably holds the two filter weights -- confirm both.
2298*c0909341SAndroid Build Coastguard Worker    mov            r10, wq ; r10 = column index, counts up towards zero
2299*c0909341SAndroid Build Coastguard Worker.n1_loop:
2300*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*4+400*8+0] ; ab3[ 1]: l
2301*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*4+400*8+8] ; ab3[ 1]: l + r
2302*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t3+r10*4+400*8+4] ; ab3[ 1]: l + c + r
2303*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5                    ; ab3[ 1] 222
2304*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*4+400*20] ; saved ab3[ 0] 222
2305*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
2306*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*20], m5 ; saved 222 <- ab3[ 1] 222
2307*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5 ; ab3[ 1]: 4*(l + c + r)
2308*c0909341SAndroid Build Coastguard Worker    psubd           m5, m3                    ; ab3[ 1] 343
2309*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+400*28], m5 ; saved 343 <- ab3[ 1] 343
2310*c0909341SAndroid Build Coastguard Worker    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
2311*c0909341SAndroid Build Coastguard Worker    pandn           m3, m6, m1 ; ~m6 field of the first partial sum
2312*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12 ; upper field of the first partial sum
2313*c0909341SAndroid Build Coastguard Worker    pandn           m5, m6, m4 ; ~m6 field of the second partial sum
2314*c0909341SAndroid Build Coastguard Worker    psrld           m4, 12 ; upper field of the second partial sum
2315*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5                    ; -a3
2316*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4                    ;  b3 + (1 << 8)
2317*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m4, [dstq+r10] ; src: 8 bytes of the dst row, zero-extended to dwords
2318*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4, [t3+r10*4+400*12] ; -a5 * src
2319*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*4+400*16]     ;  b5 + (1 << 7)
2320*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4                    ; -a3 * src
2321*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2                    ; a5 * src + b5 + (1 << 7)
2322*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3                    ; a3 * src + b3 + (1 << 8)
2323*c0909341SAndroid Build Coastguard Worker    psrld           m0, 8 ; 5x5 term: bits 8+ moved down into the low word
2324*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7 ; 3x3 term: bits 9+ moved up into the high word
2325*c0909341SAndroid Build Coastguard Worker    pblendw         m0, m1, 0xaa ; even words from m0 (5x5), odd words from m1 (3x3)
2326*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m15 ; word-pair dot product with the m15 weights
2327*c0909341SAndroid Build Coastguard Worker    psubd           m0, m6 ; NOTE(review): adds 4096 if m6 = -4096 (rounding for >> 13) -- confirm
2328*c0909341SAndroid Build Coastguard Worker    psrad           m0, 13
2329*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4 ; + src
2330*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1 ; upper four dwords
2331*c0909341SAndroid Build Coastguard Worker    packssdw       xm0, xm1 ; 8 dwords -> 8 words, signed saturation
2332*c0909341SAndroid Build Coastguard Worker    packuswb       xm0, xm0 ; 8 words -> 8 bytes, unsigned saturation
2333*c0909341SAndroid Build Coastguard Worker    movq    [dstq+r10], xm0 ; overwrite the 8 source pixels with the result
2334*c0909341SAndroid Build Coastguard Worker    add            r10, 8 ; next 8 pixels; sets the flags tested by jl
2335*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
2336*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq ; advance dst to the next row
2337*c0909341SAndroid Build Coastguard Worker    ret
2338*c0909341SAndroid Build Coastguard Worker
2339*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
2340