; Source: /aosp_15_r20/external/libdav1d/src/x86/looprestoration16_avx512.asm
;         (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
;
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 16

; pshufb byte-index tables (16 bytes each, broadcast to every 128-bit lane with
; vbroadcasti128 by the Wiener functions). Each pair of bytes selects one
; 16-bit pixel; an index of -1 makes pshufb write a zero byte.
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
; Right-edge select mask: 72 bytes of all-ones followed by 8 bytes of zero.
; wiener_filter7_16bpc reads 64-byte windows at r_ext_mask+66+r10 (r10 is a
; negative byte offset from the end of the row) and uses them with vpternlogd
; to keep loaded pixels where the mask is set and substitute the replicated
; last pixel where it is clear.
r_ext_mask:    times 72 db -1
               times  8 db  0
; The next three tables each hold two entries that the Wiener functions select
; with the index (pixel_max >> 11), scaled by 4 bytes.
wiener_hshift: dw 4, 4, 1, 1                ; multiplier applied to the x coefs
wiener_vshift: dw 1024, 1024, 4096, 4096    ; pmulhuw factor after the vertical pass
wiener_round:  dd 1049600, 1048832          ; initial accumulator of the vertical pass

; Packed word (pw_) and dword (pd_) constants. Of these, only pd_m262128 is
; referenced by the Wiener code shown in this part of the file; the rest are
; presumably used by code further down -- confirm against the full file.
pw_164_455:    dw 164, 455
pw_1023:       times 2 dw 1023
pw_61448:      times 2 dw 61448
pd_m262128:    dd -262128                   ; (1 << 4) - (1 << 18)
pd_m34816:     dd -34816
pd_m25:        dd -25
pd_m9:         dd -9
pd_8:          dd 8
pd_2147483648: dd 2147483648

; External table declared via x86inc's cextern; not referenced in the Wiener
; code shown in this part of the file.
cextern sgr_x_by_x

SECTION .text

; x86inc temp-register aliases: t0..t6 map, in order, to the register numbers
; listed here. The Wiener functions use them as row pointers into their
; on-stack ring buffer of horizontally filtered rows.
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

INIT_ZMM avx512icl
;------------------------------------------------------------------------------
; wiener_filter7_16bpc -- entry point and row scheduling for the 7-tap Wiener
; filter on 16-bit pixels.
;
; Named register arguments: dst, stride, left, lpf (loaded by cglobal), plus
; w and h (fetched with movifnidn). The filter pointer is read from argument
; slot 6 (r6mp), the edge flags from slot 7 (r7m) and pixel_max from slot 8
; (r8m).
;
; Edge flag bits tested in this function:
;   1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT, 4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM
;
; Stack (384*12+16 bytes requested; the negative sign is an x86inc option, see
; x86inc.asm):
;   [rsp]      saved pointer to the rows below the block ("below")
;   [rsp+16..] horizontally filtered rows, spaced 384*2 bytes apart
;
; Register roles after setup:
;   wq        negated row width in bytes; r10 counts from wq up towards 0
;   lpfq/dstq point at the END of the current source/destination row
;   t0        row that the next .hv call writes
;   t1..t6    most recent to oldest filtered rows consumed by the vertical pass
;   m6-m9     wiener_shufA/B/C/D      m12/m13  x0x1 / x2x3 (pre-multiplied)
;   m14/m15   y0y1 / y2y3             m10      wiener_round entry
;   m11       wiener_vshift entry     m16      pd_m262128
;   k1        0xfe: merge-mask that skips qword 0 in vmovdqu64
;------------------------------------------------------------------------------
cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
; t4 temporarily holds &wiener_hshift so the tables can be indexed with t3;
; t4 is repurposed as a ring buffer pointer once the constants are loaded.
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd          ; width in pixels -> width in bytes
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11          ; table index: pixel_max >> 11
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           lpfq, wq          ; point at the end of the row
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    add           dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16] ; end of the first ring buffer row
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq              ; loops run r10 = -w .. 0 in steps of 64
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    mov           r10d, 0xfe
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    kmovb           k1, r10d
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
    pmullw         m12, m0 ; upshift filter coefs to make the
    vpbroadcastd   m16, [pd_m262128]
    pmullw         m13, m0 ; horizontal downshift constant
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Top rows available: filter two rows from lpf first. The first one also
    ; stands in for the oldest ring slot (t6 = t5).
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq        ; from here on, source rows come from dst
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]  ; next free ring slot
.main_loop:
    call .hv                        ; one source row in, one output row out
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    ; Two extra rows come from the saved "below" pointer.
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    ; No top rows: the first row of the block fills every ring slot t2..t6.
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    ; .hv left t0 = old t6, which aliases the first row here; move t0 to a
    ; slot that has not been written yet before continuing.
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
    ; Tail: flush the remaining output rows with vertical-only passes.
.v3:
    call .v
.v2:
    call .v
    jmp .v1
;------------------------------------------------------------------------------
; .h / .h_top -- horizontal pass over one row, no output to dst.
; In:    lpfq = end of the source row, wq = -(row width in bytes),
;        t1 = end of the ring buffer row to fill, edge flags in edgeb,
;        leftq = left-edge pixels (used by .h only).
; Out:   [t1+wq .. t1) holds the filtered row; .h advances leftq by 8 when
;        LR_HAVE_LEFT is set.
; Clobb: r10, m0-m5, flags. r0 is saved and restored around its use as a
;        scratch pointer.
; .h takes the pixels left of the row from [leftq]; .h_top reads them
; directly from memory at [lpfq+r10-8]. Without LR_HAVE_LEFT both replicate
; the first pixel of the row instead.
;------------------------------------------------------------------------------
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]             ; low 8 bytes come from the left buffer
    vmovdqu64   m3{k1}, [lpfq+r10-8]        ; k1 = 0xfe: load qwords 1..7, keep qword 0
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m4, [lpfq+r10+0]
    vpbroadcastw   xm3, xm4                 ; replicate the first pixel of the row
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    ; Three overlapping 64-byte windows at byte offsets -8, 0 and +8.
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -68                 ; more than 68 bytes left in the row:
    jl .h_have_right                        ; windows do not reach past its end
    ; Right-edge padding: wherever the r_ext_mask window is zero, replace the
    ; loaded data with the last pixel of the row (the word at lpfq-2).
    push            r0
    lea             r0, [r_ext_mask+66]
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
    vpternlogd      m4, m0, [r0+r10+ 8], 0xe4
    vpternlogd      m5, m0, [r0+r10+16], 0xe4
    pop             r0
.h_have_right:
    ; Shuffle the windows into word pairs, add them pairwise and accumulate
    ; dot products with x0x1 (m12) and x2x3 (m13) on top of the bias in m16.
    ; m0 and m1 each hold one half of the 32-bit results.
    pshufb          m2, m3, m6
    pshufb          m1, m4, m7
    paddw           m2, m1
    pshufb          m3, m8
    mova            m0, m16
    vpdpwssd        m0, m2, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    vpdpwssd        m0, m3, m13
    pshufb          m2, m5, m7
    paddw           m2, m1
    mova            m1, m16
    pshufb          m4, m8
    vpdpwssd        m1, m2, m12
    pshufb          m5, m9
    paddw           m4, m5
    vpdpwssd        m1, m4, m13
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1                  ; back to signed 16-bit
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 64
    jl .h_loop                              ; until r10 reaches the row end
    ret
ALIGN function_align
;------------------------------------------------------------------------------
; .hv / .hv_bottom -- horizontal pass over one new row fused with the vertical
; pass that produces one output row.
; In:    lpfq = end of the previous source row (.hv adds strideq first;
;        .hv_bottom uses lpfq as given and does not read leftq),
;        t0 = ring slot for the new row, t1..t6 = previous filtered rows,
;        dstq = end of the output row.
; Out:   new filtered row stored at [t0+r10], output row stored at [dstq+r10],
;        ring pointers rotated by one row, dstq += strideq; .hv advances leftq
;        by 8 when LR_HAVE_LEFT is set.
; Clobb: r10, m0-m5, flags. r0 is saved and restored around its use as a
;        scratch pointer.
;------------------------------------------------------------------------------
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]             ; low 8 bytes come from the left buffer
    vmovdqu64   m3{k1}, [lpfq+r10-8]        ; k1 = 0xfe: load qwords 1..7, keep qword 0
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m4, [lpfq+r10+0]
    vpbroadcastw   xm3, xm4                 ; replicate the first pixel of the row
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    jmp .hv_main2
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    ; Three overlapping 64-byte windows at byte offsets -8, 0 and +8.
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
.hv_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -68                 ; more than 68 bytes left in the row:
    jl .hv_have_right                       ; windows do not reach past its end
    ; Right-edge padding: wherever the r_ext_mask window is zero, replace the
    ; loaded data with the last pixel of the row (the word at lpfq-2).
    ; imm 0xe4 selects per bit: mask ? loaded : broadcast.
    push            r0
    lea             r0, [r_ext_mask+66]
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r0+r10+ 0], 0xe4
    vpternlogd      m4, m0, [r0+r10+ 8], 0xe4
    vpternlogd      m5, m0, [r0+r10+16], 0xe4
    pop             r0
.hv_have_right:
    ; Horizontal part: same arithmetic as the .h subroutine, with the loads
    ; of the older rows interleaved between the final steps.
    pshufb          m2, m3, m6
    pshufb          m1, m4, m7
    paddw           m2, m1
    pshufb          m3, m8
    mova            m0, m16
    vpdpwssd        m0, m2, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    vpdpwssd        m0, m3, m13
    pshufb          m2, m5, m7
    paddw           m2, m1
    pshufb          m4, m8
    mova            m1, m16
    vpdpwssd        m1, m2, m12
    pshufb          m5, m9
    paddw           m4, m5
    vpdpwssd        m1, m4, m13
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]            ; m2 = row t4 + row t2
    mova            m5, [t3+r10]            ; m5 = row t3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]            ; m4 = row t5 + row t1
    psraw           m0, 1                   ; m0 = new horizontally filtered row
    paddw           m3, m0, [t6+r10]        ; m3 = new row + row t6
    mova      [t0+r10], m0                  ; keep the new row in the ring
    ; Vertical part: (m2, m5) pairs are weighted by y2y3 (m15) and (m3, m4)
    ; pairs by y0y1 (m14); accumulators start at the wiener_round entry (m10).
    punpcklwd       m1, m2, m5
    mova            m0, m10
    vpdpwssd        m0, m1, m15
    punpckhwd       m2, m5
    mova            m1, m10
    vpdpwssd        m1, m2, m15
    punpcklwd       m2, m3, m4
    vpdpwssd        m0, m2, m14
    punpckhwd       m3, m4
    vpdpwssd        m1, m3, m14
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1                  ; unsigned saturate to 16-bit
    pmulhuw         m0, m11                 ; scale by the wiener_vshift entry
    mova    [dstq+r10], m0
    add            r10, 64
    jl .hv_loop                             ; until r10 reaches the row end
    ; Rotate the ring: every row ages by one slot and the oldest slot (old
    ; t6) becomes the next write target t0.
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
;------------------------------------------------------------------------------
; .v -- vertical-only pass producing one output row from rows already in the
; ring buffer (no new source row is read).
; In:    t1..t6 = filtered rows, wq = -(row width in bytes),
;        dstq = end of the output row.
; Out:   output row stored at [dstq+r10]; t2..t6 rotated by one row while t1
;        stays in place; dstq += strideq.
; Clobb: r10, m0-m4, flags.
; Row t1 is added twice (once with t6, once with t5), i.e. it also takes the
; position where .hv would use a freshly filtered row.
;------------------------------------------------------------------------------
.v:
    mov            r10, wq
.v_loop:
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]            ; m2 = row t4 + row t2
    mova            m3, [t3+r10]            ; m3 = row t3
    punpcklwd       m1, m2, m3
    mova            m0, m10                 ; accumulators start at wiener_round entry
    vpdpwssd        m0, m1, m15             ; weight (m2, m3) pairs by y2y3
    punpckhwd       m2, m3
    mova            m1, m10
    vpdpwssd        m1, m2, m15
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]        ; m3 = row t1 + row t6
    paddw           m4, [t5+r10]            ; m4 = row t1 + row t5
    punpcklwd       m2, m3, m4
    vpdpwssd        m0, m2, m14             ; weight (m3, m4) pairs by y0y1
    punpckhwd       m3, m4
    vpdpwssd        m1, m3, m14
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1                  ; unsigned saturate to 16-bit
    pmulhuw         m0, m11                 ; scale by the wiener_vshift entry
    mova    [dstq+r10], m0
    add            r10, 64
    jl .v_loop                              ; until r10 reaches the row end
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret

;------------------------------------------------------------------------------
; wiener_filter5_16bpc -- entry point and row scheduling for the 5-tap Wiener
; filter on 16-bit pixels.
;
; Named register arguments: dst, stride, left, lpf (loaded by cglobal), plus
; w and h (fetched with movifnidn). The filter pointer is read from argument
; slot 6 (r6mp), the edge flags from slot 7 (r7m) and pixel_max from slot 8
; (r8m).
;
; Edge flag bits tested in this function:
;   4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM
;
; Stack (384*8+16 bytes):
;   [rsp]      saved pointer to the rows below the block ("below")
;   [rsp+16..] horizontally filtered rows, spaced 384*2 bytes apart
;
; Register roles after setup:
;   wq        negated row width in bytes
;   lpfq/dstq point at the END of the current source/destination row
;   r13       &r_ext_mask+70, also the base for the constant tables
;   t0..t4    ring buffer row pointers
;   m5-m7     wiener_shufE/B/D        m11/m12  x1 / x2x3 (pre-multiplied)
;   m13/m14   y1 / y2y3               m9       wiener_round entry
;   m10       wiener_vshift entry     m8       pd_m262128
;   k1        0xfffe, loaded with kmovw
;
; The local subroutines called here (.h, .h_top, .hv, .hv_bottom, .v) follow
; this driver in the file; presumably they mirror the 7-tap helpers of the
; same names -- confirm against their definitions.
;------------------------------------------------------------------------------
cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
%define base r13-r_ext_mask-70
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw   m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea            r13, [r_ext_mask+70]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd          ; width in pixels -> width in bytes
    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    shr            t3d, 11          ; table index: pixel_max >> 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq          ; point at the end of the row
    vpbroadcastw   m13, [fltq+18] ; y1
    add           dstq, wq
    vpbroadcastd   m14, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq+16] ; end of the first ring buffer row
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    neg             wq
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    mov           r10d, 0xfffe
    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    kmovw           k1, r10d
    pmullw         m11, m0          ; scale the x coefs by the wiener_hshift entry
    pmullw         m12, m0
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Top rows available: filter two rows from lpf first.
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq        ; from here on, source rows come from dst
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4          ; reuse the slot t4 points at for the next row
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    ; Two extra rows come from the saved "below" pointer.
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    ; No top rows: the first row of the block fills ring slots t2..t4.
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
    ; Tail: two vertical-only passes. The ring pointers and dst are advanced
    ; here, between the two calls.
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
.v1:
    call .v
    jmp .end
; .h / .h_top: horizontal filter pass over one source row; the 16-bit results
; are stored to the row buffer at [t1+r10].
;   .h     - when LR_HAVE_LEFT is set, takes the left-edge pixels from the
;            leftq buffer and advances leftq by 8 bytes.
;   .h_top - never touches leftq; with LR_HAVE_LEFT it reads the pixels to the
;            left of the row directly from [lpfq+r10-4].
; r10 is a byte offset that starts at wq and advances by 64 per iteration
; while it is negative, so wq is expected to be negative on entry.
; NOTE(review): m5-m8, m11, m12, k1 and r13 are initialised outside this
; excerpt - presumably shuffle masks (m5-m7), accumulator seed (m8), filter
; taps (m11/m12), left-merge mask (k1) and right-edge mask table (r13);
; confirm against the function prologue.
; Clobbers: m0-m4, r10, flags.
460*c0909341SAndroid Build Coastguard Worker.h:
461*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
462*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
463*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
    ; Low dword of m3 comes from the left buffer, the lanes selected by k1
    ; are then merged in from the row itself.
464*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4]
465*c0909341SAndroid Build Coastguard Worker    vmovdqu32   m3{k1}, [lpfq+r10-4]
466*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
467*c0909341SAndroid Build Coastguard Worker    jmp .h_main
468*c0909341SAndroid Build Coastguard Worker.h_extend_left:
    ; No left neighbours: replicate the first word of the row instead, then
    ; merge the real row data into the lanes selected by k1.
469*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   xm3, [lpfq+r10]
470*c0909341SAndroid Build Coastguard Worker    vmovdqu32   m3{k1}, [lpfq+r10-4]
471*c0909341SAndroid Build Coastguard Worker    jmp .h_main
472*c0909341SAndroid Build Coastguard Worker.h_top:
473*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
474*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
475*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
476*c0909341SAndroid Build Coastguard Worker.h_loop:
    ; m3 = 64 bytes starting 4 bytes before the current position,
    ; m4 = 64 bytes starting 4 bytes after it.
477*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-4]
478*c0909341SAndroid Build Coastguard Worker.h_main:
479*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+4]
480*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
481*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
    ; Right-edge padding is only applied once r10 >= -66; blocks further
    ; from the end of the row skip it.
482*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66
483*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
    ; m0 = the word just before lpfq, broadcast. Where the mask read from
    ; [r13+r10] is set the loaded pixels are kept, elsewhere m0 is used.
484*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
485*c0909341SAndroid Build Coastguard Worker    vpternlogd      m3, m0, [r13+r10+0], 0xe4 ; c ? a : b
486*c0909341SAndroid Build Coastguard Worker    vpternlogd      m4, m0, [r13+r10+8], 0xe4
487*c0909341SAndroid Build Coastguard Worker.h_have_right:
    ; Two dword accumulators (m0 from m3's pixels, m1 from m4's), both
    ; seeded with m8. Each receives a word-pair dot product of the m5
    ; shuffle against m11, then of the sum of the m6 and m7 shuffles
    ; against m12.
488*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m3, m5
489*c0909341SAndroid Build Coastguard Worker    mova            m0, m8
490*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m1, m11
491*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m4, m5
492*c0909341SAndroid Build Coastguard Worker    mova            m1, m8
493*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m2, m11
494*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m3, m6
495*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m7
496*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3
497*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m4, m6
498*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m2, m12
499*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m7
500*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4
501*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m3, m12
    ; Scale down: >>4 on dwords, signed-saturating pack to words, then >>1.
502*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
503*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
504*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
505*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
506*c0909341SAndroid Build Coastguard Worker    mova      [t1+r10], m0
    ; Next 64-byte block; loop until the offset is no longer negative.
507*c0909341SAndroid Build Coastguard Worker    add            r10, 64
508*c0909341SAndroid Build Coastguard Worker    jl .h_loop
509*c0909341SAndroid Build Coastguard Worker    ret
510*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv / .hv_bottom: horizontal pass over one new source row fused with the
; vertical pass, producing one output row at [dstq+r10].
;   .hv        - advances lpfq by strideq first; with LR_HAVE_LEFT it takes
;                the left-edge pixels from leftq and advances leftq by 8.
;   .hv_bottom - leaves lpfq as set by the caller and never touches leftq.
; The horizontal part matches .h, except that the filtered row is stored to
; [t0+r10]. The vertical part combines that row with the rows held at
; t1..t4.
; On return the row pointers t0..t4 have been rotated and dstq has been
; advanced by strideq.
; NOTE(review): m5-m14, k1 and r13 are initialised outside this excerpt.
; Clobbers: m0-m4, r10, flags.
511*c0909341SAndroid Build Coastguard Worker.hv:
512*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
513*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
514*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
515*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
    ; Low dword from the left buffer, remaining lanes (per k1) from the row.
516*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4]
517*c0909341SAndroid Build Coastguard Worker    vmovdqu32   m3{k1}, [lpfq+r10-4]
518*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
519*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
520*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
    ; No left neighbours: replicate the first word of the row.
521*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   xm3, [lpfq+r10]
522*c0909341SAndroid Build Coastguard Worker    vmovdqu32   m3{k1}, [lpfq+r10-4]
523*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
524*c0909341SAndroid Build Coastguard Worker.hv_bottom:
525*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
526*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
527*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
528*c0909341SAndroid Build Coastguard Worker.hv_loop:
529*c0909341SAndroid Build Coastguard Worker    movu            m3, [lpfq+r10-4]
530*c0909341SAndroid Build Coastguard Worker.hv_main:
531*c0909341SAndroid Build Coastguard Worker    movu            m4, [lpfq+r10+4]
532*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
533*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
    ; Right-edge padding is only applied once r10 >= -66.
534*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66
535*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
    ; Blend: keep loaded pixels where the mask at [r13+r10] is set, use the
    ; broadcast of the word just before lpfq elsewhere (c ? a : b).
536*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
537*c0909341SAndroid Build Coastguard Worker    vpternlogd      m3, m0, [r13+r10+0], 0xe4
538*c0909341SAndroid Build Coastguard Worker    vpternlogd      m4, m0, [r13+r10+8], 0xe4
539*c0909341SAndroid Build Coastguard Worker.hv_have_right:
    ; Horizontal part: dword accumulators m0/m1 seeded with m8, fed by
    ; word-pair dot products against m11 and m12 (same scheme as .h).
540*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m3, m5
541*c0909341SAndroid Build Coastguard Worker    mova            m0, m8
542*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m1, m11
543*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m4, m5
544*c0909341SAndroid Build Coastguard Worker    mova            m1, m8
545*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m2, m11
546*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m3, m6
547*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m7
548*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3
549*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m4, m6
550*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m2, m12
551*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m7
552*c0909341SAndroid Build Coastguard Worker    paddw           m4, m3
553*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m4, m12
    ; Vertical part, first half (interleaved with the horizontal tail):
    ; rows t3 and t1 are summed and paired with row t2, then dotted with
    ; m14 into accumulators m2 (low halves) / m3 (high halves) seeded
    ; with m9.
554*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10]
555*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t1+r10]
556*c0909341SAndroid Build Coastguard Worker    mova            m3, [t2+r10]
557*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m2, m3
558*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
559*c0909341SAndroid Build Coastguard Worker    mova            m3, m9
560*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m2, m14
561*c0909341SAndroid Build Coastguard Worker    mova            m2, m9
562*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m4, m14
    ; Row t4 must be read before the store to [t0+r10] below: the rotation
    ; at the end of this routine makes t0 alias t4.
563*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+r10]
    ; Finish the horizontal result (>>4, saturating pack, >>1) and save it
    ; as the newest intermediate row.
564*c0909341SAndroid Build Coastguard Worker    psrad           m0, 4
565*c0909341SAndroid Build Coastguard Worker    psrad           m1, 4
566*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
567*c0909341SAndroid Build Coastguard Worker    psraw           m0, 1
568*c0909341SAndroid Build Coastguard Worker    mova      [t0+r10], m0
    ; Vertical part, second half: pair the new row with row t4 and dot
    ; with m13.
569*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m4
570*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m1, m13
571*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m4
572*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m0, m13
    ; >>5, unsigned-saturating pack to words, then an unsigned high
    ; multiply by m10 before storing the output pixels.
573*c0909341SAndroid Build Coastguard Worker    psrad           m2, 5
574*c0909341SAndroid Build Coastguard Worker    psrad           m3, 5
575*c0909341SAndroid Build Coastguard Worker    packusdw        m2, m3
576*c0909341SAndroid Build Coastguard Worker    pmulhuw         m2, m10
577*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m2
578*c0909341SAndroid Build Coastguard Worker    add            r10, 64
579*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
    ; Rotate the row-buffer pointers: t4<-t3, t3<-t2, t2<-t1, t1<-t0 (the
    ; row just written), and t0 <- the new t4 so the next call overwrites
    ; the oldest row in place.
580*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
581*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
582*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
583*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
584*c0909341SAndroid Build Coastguard Worker    mov             t0, t4
585*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
586*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical pass only. Produces one output row at [dstq+r10] from the
; intermediate rows at t1..t4 without reading any new source row. It is the
; vertical half of .hv with the row at t1 standing in for the freshly
; filtered row: (t1 + t3, t2) is dotted with m14 and (t1, t4) with m13.
; Called from the .v1/.v2 tails of the driver. It does not rotate the row
; pointers or advance dstq; the caller does that where needed.
; NOTE(review): m9, m10, m13, m14 are initialised outside this excerpt.
; Clobbers: m0-m4, r10, flags.
587*c0909341SAndroid Build Coastguard Worker.v:
588*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
589*c0909341SAndroid Build Coastguard Worker.v_loop:
590*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10]
591*c0909341SAndroid Build Coastguard Worker    paddw           m2, m0, [t3+r10]
592*c0909341SAndroid Build Coastguard Worker    mova            m1, [t2+r10]
593*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+r10]
    ; Word pairs (t1+t3, t2) times m14 -> m3 (high halves) / m2 (low halves).
594*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m2, m1
595*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m14
596*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m1
597*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m14
    ; Word pairs (t1, t4) times m13 -> m1 (high halves) / m0 (low halves).
598*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m4
599*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
600*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4
601*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13
    ; Add the constant in m9 and combine the two partial sums.
602*c0909341SAndroid Build Coastguard Worker    paddd           m3, m9
603*c0909341SAndroid Build Coastguard Worker    paddd           m2, m9
604*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
605*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
    ; >>5, unsigned-saturating pack to words, unsigned high multiply by m10.
606*c0909341SAndroid Build Coastguard Worker    psrad           m1, 5
607*c0909341SAndroid Build Coastguard Worker    psrad           m0, 5
608*c0909341SAndroid Build Coastguard Worker    packusdw        m0, m1
609*c0909341SAndroid Build Coastguard Worker    pmulhuw         m0, m10
610*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
611*c0909341SAndroid Build Coastguard Worker    add            r10, 64
612*c0909341SAndroid Build Coastguard Worker    jl .v_loop
613*c0909341SAndroid Build Coastguard Worker    ret
614*c0909341SAndroid Build Coastguard Worker
; sgr_filter_5x5_16bpc(dst, stride, left, lpf, w, h, edge, params)
; Entry point and row driver. Declared through the x86inc cglobal macro:
; 4 arguments loaded into registers, 14 general-purpose registers,
; 22 vector registers and 416*24+8 bytes of stack.
;
; Setup performed here:
;   - w is doubled and added to lpfq and dstq, then negated, so both
;     pointers address the end of a row and wq is a negative byte offset.
;   - [rsp] holds the pointer used for the rows below the block
;     (original lpf + 6*stride on both the LR_HAVE_TOP and .no_top paths).
;   - t1, t3, t4 point into the stack work area (t1: box sums written by
;     .h; t3/t4: buffers written by .hv).
;   - m6 = 0, m7 = w0 << 4, m8..m14 = broadcast constants,
;     m18..m21 = the 256-byte sgr_x_by_x table,
;     k1 = 0xfffffff8, k2 = 0x3333333333333333.
;   - Bit 16 of edged is a private flag set once the first row has been
;     summed; .h tests it ("y > 0") to add onto the sums already at t1.
;
; Rows are consumed two at a time: .h sums the first and .hv handles the
; second, followed by .n0/.n1.
; NOTE(review): .prep_n, .n0, .n1 and .v are defined outside this excerpt;
; presumably they do the neighbour weighting and output - confirm there.
615*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
616*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
; Constants are addressed relative to r13, which is biased by
; r_ext_mask+72 so the right-edge mask loads in .h/.hv can use [r13+r10].
617*c0909341SAndroid Build Coastguard Worker%define base r13-r_ext_mask-72
618*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
619*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
620*c0909341SAndroid Build Coastguard Worker    lea            r13, [r_ext_mask+72]
621*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
622*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
623*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
624*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m7, [paramsq+8] ; w0
    ; w *= 2 (the name says 16 bpc, so presumably pixels -> bytes).
625*c0909341SAndroid Build Coastguard Worker    add             wd, wd
626*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [base+pd_8]
627*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
628*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+pd_m25]
629*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
630*c0909341SAndroid Build Coastguard Worker    vpsubd         m10, m6, [paramsq+0] {1to16} ; -s0
    ; The work-buffer pointers are computed while wq is still positive.
631*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+416*12+8]
632*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [base+pw_164_455]
633*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+416*20+8]
634*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [base+pw_61448]  ; (15 << 12) + (1 << 3)
635*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+12]
636*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15))
    ; From here on wq is the negative byte offset of the row start.
637*c0909341SAndroid Build Coastguard Worker    neg             wq
638*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [base+pw_1023]
639*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
640*c0909341SAndroid Build Coastguard Worker    mova           m18, [sgr_x_by_x+64*0]
    ; k1: word-lane mask used by the vmovdqu16 left-edge merges in .h/.hv.
641*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfffffff8
642*c0909341SAndroid Build Coastguard Worker    mova           m19, [sgr_x_by_x+64*1]
643*c0909341SAndroid Build Coastguard Worker    kmovd           k1, r10d
644*c0909341SAndroid Build Coastguard Worker    mova           m20, [sgr_x_by_x+64*2]
    ; k2: byte-lane mask used by the masked vpalignr in .hv.
645*c0909341SAndroid Build Coastguard Worker    mov            r10, 0x3333333333333333
646*c0909341SAndroid Build Coastguard Worker    mova           m21, [sgr_x_by_x+64*3]
647*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10
648*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
649*c0909341SAndroid Build Coastguard Worker    jz .no_top
    ; Top rows available: sum the first lpf row, double it in place
    ; (t2 == t1 for .top_fixup), then sum the second lpf row into the next
    ; buffer at t1 + 416*6.
650*c0909341SAndroid Build Coastguard Worker    call .h_top
651*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
652*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
653*c0909341SAndroid Build Coastguard Worker    call .top_fixup
654*c0909341SAndroid Build Coastguard Worker    add             t1, 416*6
655*c0909341SAndroid Build Coastguard Worker    call .h_top
    ; Remember where the rows below the block live, then switch lpfq to the
    ; block itself (dst).
656*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
657*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
658*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
659*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
660*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
661*c0909341SAndroid Build Coastguard Worker    dec             hd
662*c0909341SAndroid Build Coastguard Worker    jz .height1
    ; Set the private "y > 0" flag so .h adds onto the sums already at t1.
663*c0909341SAndroid Build Coastguard Worker    or           edged, 16
664*c0909341SAndroid Build Coastguard Worker    call .h
665*c0909341SAndroid Build Coastguard Worker.main:
666*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
667*c0909341SAndroid Build Coastguard Worker    call .hv
668*c0909341SAndroid Build Coastguard Worker    call .prep_n
    ; hd < 0 after the subtraction: no source rows left.
669*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
670*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
671*c0909341SAndroid Build Coastguard Worker.main_loop:
672*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
    ; hd == 0: exactly one source row left.
673*c0909341SAndroid Build Coastguard Worker    test            hd, hd
674*c0909341SAndroid Build Coastguard Worker    jz .odd_height
675*c0909341SAndroid Build Coastguard Worker    call .h
676*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
677*c0909341SAndroid Build Coastguard Worker    call .hv
678*c0909341SAndroid Build Coastguard Worker    call .n0
679*c0909341SAndroid Build Coastguard Worker    call .n1
680*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
681*c0909341SAndroid Build Coastguard Worker    jge .main_loop
682*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
683*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
    ; Bottom rows available: process the two rows at the pointer saved in
    ; [rsp] using the entry points that do not read leftq.
684*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
685*c0909341SAndroid Build Coastguard Worker    call .h_top
686*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
687*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
688*c0909341SAndroid Build Coastguard Worker.end:
689*c0909341SAndroid Build Coastguard Worker    call .n0
690*c0909341SAndroid Build Coastguard Worker    call .n1
691*c0909341SAndroid Build Coastguard Worker.end2:
692*c0909341SAndroid Build Coastguard Worker    RET
    ; Reached from the LR_HAVE_TOP path when h == 1.
693*c0909341SAndroid Build Coastguard Worker.height1:
694*c0909341SAndroid Build Coastguard Worker    call .hv
695*c0909341SAndroid Build Coastguard Worker    call .prep_n
696*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
    ; One source row left over after the two-row loop.
697*c0909341SAndroid Build Coastguard Worker.odd_height:
698*c0909341SAndroid Build Coastguard Worker    call .hv
699*c0909341SAndroid Build Coastguard Worker    call .n0
700*c0909341SAndroid Build Coastguard Worker    call .n1
701*c0909341SAndroid Build Coastguard Worker.odd_height_end:
702*c0909341SAndroid Build Coastguard Worker    call .v
703*c0909341SAndroid Build Coastguard Worker    call .n0
704*c0909341SAndroid Build Coastguard Worker    jmp .end2
    ; No bottom rows to read: finish with the vertical-only step.
705*c0909341SAndroid Build Coastguard Worker.extend_bottom:
706*c0909341SAndroid Build Coastguard Worker    call .v
707*c0909341SAndroid Build Coastguard Worker    jmp .end
    ; No top rows: save the below-rows pointer (lpf + 6*stride), sum the
    ; first block row into t1 and store a doubled copy at t2 = t1 + 416*6.
708*c0909341SAndroid Build Coastguard Worker.no_top:
709*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
710*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
711*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
712*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
713*c0909341SAndroid Build Coastguard Worker    call .h
714*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+416*6]
715*c0909341SAndroid Build Coastguard Worker    call .top_fixup
716*c0909341SAndroid Build Coastguard Worker    dec             hd
717*c0909341SAndroid Build Coastguard Worker    jz .no_top_height1
    ; Join the main path with t0 = the plain sums and t1 = the doubled copy.
718*c0909341SAndroid Build Coastguard Worker    or           edged, 16
719*c0909341SAndroid Build Coastguard Worker    mov             t0, t1
720*c0909341SAndroid Build Coastguard Worker    mov             t1, t2
721*c0909341SAndroid Build Coastguard Worker    jmp .main
722*c0909341SAndroid Build Coastguard Worker.no_top_height1:
723*c0909341SAndroid Build Coastguard Worker    call .v
724*c0909341SAndroid Build Coastguard Worker    call .prep_n
725*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; .h / .h_top: horizontal 5-pixel box sums for one source row.
; For every position it computes the sum of five horizontally adjacent
; 16-bit pixels and the sum of their squares, and stores them at t1:
;   [t1+r10+416*0]  word sums
;   [t1+r10+416*2]  dword sums of squares (punpcklwd halves of each lane)
;   [t1+r10+416*4]  dword sums of squares (punpckhwd halves of each lane)
; When bit 16 of edge is set ("y > 0") the values already stored at t1 are
; added in, so t1 then holds the totals of two rows.
;   .h     - with LR_HAVE_LEFT takes the left-edge pixels from leftq and
;            advances leftq by 8 bytes.
;   .h_top - never touches leftq.
; r10 starts at wq-4 and advances by 64 while negative (wq is negative, see
; the "neg wq" in the function setup).
; Clobbers: m0-m3, m16, m17, r10, flags.
726*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
727*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
728*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
729*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
    ; Low 8 bytes from the left buffer; word lanes 3 and up (k1 =
    ; 0xfffffff8) are then merged in from the row itself.
730*c0909341SAndroid Build Coastguard Worker    movq          xm16, [leftq+2]
731*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
732*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
733*c0909341SAndroid Build Coastguard Worker    jmp .h_main
734*c0909341SAndroid Build Coastguard Worker.h_extend_left:
    ; No left neighbours: replicate the first pixel of the row into the low
    ; lanes before merging the row data.
735*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
736*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
737*c0909341SAndroid Build Coastguard Worker    jmp .h_main
738*c0909341SAndroid Build Coastguard Worker.h_top:
739*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
740*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
741*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
742*c0909341SAndroid Build Coastguard Worker.h_loop:
743*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10- 2]
744*c0909341SAndroid Build Coastguard Worker.h_main:
    ; m17 = the same row 16 bytes further on, so each 128-bit lane of
    ; m17:m16 forms a contiguous 16-word window for the shifts below.
745*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+14]
746*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
747*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
    ; Right-edge padding is only applied once r10 >= -68.
748*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -68
749*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
    ; Keep loaded pixels where the mask at [r13+r10] is set; elsewhere use
    ; the broadcast of the last pixel of the row ([lpfq-2]).
750*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
751*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b
752*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
753*c0909341SAndroid Build Coastguard Worker.h_have_right:
    ; Build the five taps as word-shifted copies of the window:
    ;   m16 = +0, m2 = +1 (palignr 2), m3 = +3 (palignr 6),
    ;   m17 = +4 (shufpd), and later m16 = +2 (shufps).
    ; m0 accumulates the sum; m1/m2 accumulate squares via pmaddwd/vpdpwssd
    ; of interleaved tap pairs with themselves.
754*c0909341SAndroid Build Coastguard Worker    palignr         m2, m17, m16, 2
755*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16, m2
756*c0909341SAndroid Build Coastguard Worker    palignr         m3, m17, m16, 6
757*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
758*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m3
759*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
760*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m3
761*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
762*c0909341SAndroid Build Coastguard Worker    shufpd         m17, m16, m17, 0x55
763*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17
764*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m16, m17
765*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m3, m3
766*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m16, m17
767*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m3, m3
768*c0909341SAndroid Build Coastguard Worker    shufps         m16, m17, q2121
769*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16            ; sum
770*c0909341SAndroid Build Coastguard Worker    test         edgeb, 16             ; y > 0
771*c0909341SAndroid Build Coastguard Worker    jz .h_loop_end
    ; Not the first row: add the sums already stored at t1.
772*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t1+r10+416*0]
773*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t1+r10+416*2]
774*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+416*4]
775*c0909341SAndroid Build Coastguard Worker.h_loop_end:
    ; Add the square of the middle tap (zero-extended against m6 = 0).
776*c0909341SAndroid Build Coastguard Worker    punpcklwd      m17, m16, m6
777*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m17, m17       ; sumsq
778*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m6
779*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16
780*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*0], m0
781*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*2], m1
782*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*4], m2
783*c0909341SAndroid Build Coastguard Worker    add            r10, 64
784*c0909341SAndroid Build Coastguard Worker    jl .h_loop
785*c0909341SAndroid Build Coastguard Worker    ret
; .top_fixup: reads the box sums stored at t1 (word sums at +416*0, dword
; sums of squares at +416*2 and +416*4), doubles them, and writes the result
; to the same offsets at t2. t2 may equal t1, in which case the sums are
; doubled in place (the LR_HAVE_TOP path calls it that way).
; Clobbers: m0-m2, r10, flags.
786*c0909341SAndroid Build Coastguard Worker.top_fixup:
    ; Same starting offset and 64-byte stride as the .h loop that produced
    ; the sums.
787*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
788*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row needs to be doubled
789*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+416*0]
790*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10+416*2]
791*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+416*4]
    ; x + x == 2*x, on words for the sums and dwords for the squares.
792*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
793*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
794*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
795*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0
796*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m1
797*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m2
798*c0909341SAndroid Build Coastguard Worker    add            r10, 64
799*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
800*c0909341SAndroid Build Coastguard Worker    ret
801*c0909341SAndroid Build Coastguard WorkerALIGN function_align
802*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
803*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
804*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
805*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
806*c0909341SAndroid Build Coastguard Worker    movq          xm16, [leftq+2]
807*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
808*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
809*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
810*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
811*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
812*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
813*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
814*c0909341SAndroid Build Coastguard Worker.hv_bottom:
815*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
816*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
817*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
818*c0909341SAndroid Build Coastguard Worker.hv_loop:
819*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10- 2]
820*c0909341SAndroid Build Coastguard Worker.hv_main:
821*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+14]
822*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
823*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
824*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -68
825*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
826*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
827*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
828*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
829*c0909341SAndroid Build Coastguard Worker.hv_have_right:
830*c0909341SAndroid Build Coastguard Worker    palignr         m3, m17, m16, 2
831*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16, m3
832*c0909341SAndroid Build Coastguard Worker    palignr         m1, m17, m16, 6
833*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
834*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m1
835*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
836*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1
837*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
838*c0909341SAndroid Build Coastguard Worker    shufpd         m17, m16, m17, 0x55
839*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17
840*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m16, m17
841*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m1, m1
842*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m16, m17
843*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m1, m1
844*c0909341SAndroid Build Coastguard Worker    shufps         m16, m17, q2121
845*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16           ; h sum
846*c0909341SAndroid Build Coastguard Worker    punpcklwd      m17, m16, m6
847*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m17, m17      ; h sumsq
848*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m6
849*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m16, m16
850*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t1+r10+416*0]
851*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t1+r10+416*2]
852*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t1+r10+416*4]
853*c0909341SAndroid Build Coastguard Worker    test            hd, hd
854*c0909341SAndroid Build Coastguard Worker    jz .hv_last_row
855*c0909341SAndroid Build Coastguard Worker.hv_main2:
856*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10+416*0] ; hv sum
857*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t2+r10+416*2] ; hv sumsq
858*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t2+r10+416*4]
859*c0909341SAndroid Build Coastguard Worker    mova [t0+r10+416*0], m0
860*c0909341SAndroid Build Coastguard Worker    mova [t0+r10+416*2], m2
861*c0909341SAndroid Build Coastguard Worker    mova [t0+r10+416*4], m3
862*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
863*c0909341SAndroid Build Coastguard Worker    paddd          m16, m8
864*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
865*c0909341SAndroid Build Coastguard Worker    paddd          m17, m8
866*c0909341SAndroid Build Coastguard Worker    psrld          m16, 4              ; (a + 8) >> 4
867*c0909341SAndroid Build Coastguard Worker    psrld          m17, 4
868*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9             ; -a * 25
869*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
870*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
871*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m2, m2         ; -p
872*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
873*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m3, m3
874*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
875*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
876*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m10            ; p * s
877*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m10
878*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 164
879*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
880*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
881*c0909341SAndroid Build Coastguard Worker    mova           m16, m20
882*c0909341SAndroid Build Coastguard Worker    pmaxsw         m17, m6
883*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
884*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4              ; min(z, 255) - 256
885*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21       ; sgr_x_by_x[128..255]
886*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
887*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19       ; sgr_x_by_x[  0..127]
888*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16            ; x
889*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17
890*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
891*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
892*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
893*c0909341SAndroid Build Coastguard Worker    packssdw       m16, m17
894*c0909341SAndroid Build Coastguard Worker    psubd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
895*c0909341SAndroid Build Coastguard Worker    psubd           m1, m13
896*c0909341SAndroid Build Coastguard Worker    mova    [t4+r10+4], m16
897*c0909341SAndroid Build Coastguard Worker    psrld          m16, m0, 12         ; b
898*c0909341SAndroid Build Coastguard Worker    psrld          m17, m1, 12
899*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+  8], xm16
900*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+ 24], xm17
901*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+ 40], ym16, 1
902*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+ 56], ym17, 1
903*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+ 72], m16, 2
904*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+ 88], m17, 2
905*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+104], m16, 3
906*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+120], m17, 3
907*c0909341SAndroid Build Coastguard Worker    add            r10, 64
908*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
909*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
910*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
911*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
912*c0909341SAndroid Build Coastguard Worker    ret
; Tail path for odd heights. Saves the values currently held in m1 (words) and
; m16/m17 (dwords) to the t1 row at offsets 416*0, 416*2 and 416*4, adds
; m0/m2/m3 into those registers, then rejoins the common code at .hv_main2
; (defined outside this excerpt).
; NOTE(review): what the registers hold on entry is decided by the jump site,
; which is not visible here - presumably the row sum / sum of squares; confirm
; against the .hv routine.
913*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights
914*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*0], m1
915*c0909341SAndroid Build Coastguard Worker    paddw            m1, m0
916*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*2], m16
917*c0909341SAndroid Build Coastguard Worker    paddd           m16, m2
918*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*4], m17
919*c0909341SAndroid Build Coastguard Worker    paddd           m17, m3
920*c0909341SAndroid Build Coastguard Worker    jmp .hv_main2
; Vertical box sum plus a/b coefficient generation for one row. Works purely
; from the row sums already stored at t1 and t2; no pixels are read.
;   In:  t1, t2 = row-sum buffers: word sums at +416*0, dword sums of squares
;                 at +416*2 and +416*4
;        t3, t4 = destination rows for the dword "b" and word "x" results
;        wq     = loop start; r10 begins at wq-4 and advances by 64 bytes per
;                 iteration while it is negative
;   Clobbers: r10, m0-m3, m16, m17, k3, flags
; NOTE(review): m6, m8-m13, m18-m21 and k2 are constants prepared by the
; enclosing function's prologue, which lies outside this excerpt. The inline
; comments below (zero in m6, rounding in m8, -25 in m9, -s in m10, 164 in m11,
; sgr_x_by_x table in m18-m21) reflect the original annotations - confirm
; against that prologue.
921*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
922*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
923*c0909341SAndroid Build Coastguard Worker.v_loop:
924*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+416*2]
925*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+416*4]
926*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+416*0]
927*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t2+r10+416*2]
928*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t2+r10+416*4]
929*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+416*0]
; The t1 values are doubled and added again, so each total is t2 + 3 * t1.
930*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
931*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
932*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2             ; hv sumsq
933*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3
934*c0909341SAndroid Build Coastguard Worker    paddd          m16, m8
935*c0909341SAndroid Build Coastguard Worker    paddd          m17, m8
936*c0909341SAndroid Build Coastguard Worker    psrld          m16, 4              ; (a + 8) >> 4
937*c0909341SAndroid Build Coastguard Worker    psrld          m17, 4
938*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9             ; -a * 25
939*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
940*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
941*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; hv sum
; (b >> 1) averaged with m6 rounds up, giving the (b + 2) >> 2 noted below.
942*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
943*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
; Each scaled b is widened to a dword (paired with m6) and its square is
; accumulated on top of -a * 25.
944*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
945*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m2, m2         ; -p
946*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
947*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m3, m3
948*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m6         ; b
949*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6
950*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m10            ; p * s
951*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m10
952*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 164
953*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
; Merge under k2: the selected bytes of m17 are replaced by m16 rotated right
; by two bytes within each 128-bit lane, combining both halves into one
; register of words.
954*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
955*c0909341SAndroid Build Coastguard Worker    mova           m16, m20
956*c0909341SAndroid Build Coastguard Worker    pmaxsw         m17, m6
957*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
958*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4              ; min(z, 255) - 256
; 256-entry byte table lookup done as two 128-entry permutes; k3 (the sign bit
; of each index byte) selects the upper-half result.
959*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21       ; sgr_x_by_x[128..255]
960*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
961*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19       ; sgr_x_by_x[  0..127]
962*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16            ; x
; Split x into two dword registers: m16 = m17 & ~m13, m17 = m17 >> 16.
963*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17
964*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
965*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
966*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
967*c0909341SAndroid Build Coastguard Worker    packssdw       m16, m17
968*c0909341SAndroid Build Coastguard Worker    psubd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
969*c0909341SAndroid Build Coastguard Worker    psubd           m1, m13
; Packed x goes to the t4 row; b goes to the t3 row with the 128-bit lanes of
; m16 and m17 interleaved (m16.0, m17.0, m16.1, m17.1, ...).
970*c0909341SAndroid Build Coastguard Worker    mova    [t4+r10+4], m16
971*c0909341SAndroid Build Coastguard Worker    psrld          m16, m0, 12         ; b
972*c0909341SAndroid Build Coastguard Worker    psrld          m17, m1, 12
973*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+  8], xm16
974*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+ 24], xm17
975*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+ 40], ym16, 1
976*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+ 56], ym17, 1
977*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+ 72], m16, 2
978*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+ 88], m17, 2
979*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+104], m16, 3
980*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+120], m17, 3
981*c0909341SAndroid Build Coastguard Worker    add            r10, 64
982*c0909341SAndroid Build Coastguard Worker    jl .v_loop
983*c0909341SAndroid Build Coastguard Worker    ret
; Initial neighbor setup. Builds the horizontally weighted "565" row from the
; a row (words at t4) and the b row (dwords at t3) and stores it at +416*2
; (a) and +416*4 (b), the slots that .n0 and .n1 read back.
; Per element, sum3 = left + center + right is formed first, then
; center + sum3 + 4 * sum3 = 5 * left + 6 * center + 5 * right.
;   In:  t3, t4 = row bases; wq = start offset (r10 runs from wq in steps of
;        64 bytes while it is negative)
;   Clobbers: r10, m0-m3, m16, m17, flags
984*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
985*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
986*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
; Center elements: a at +2 (words), b at +4 and +68 (dwords, two registers).
987*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+r10*1+ 2]
988*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+ 4]
989*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+r10*2+68]
; Add the left and right neighbors to get sum3 in m3 / m16 / m17.
990*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+r10*1+ 0]
991*c0909341SAndroid Build Coastguard Worker    paddd          m16, m1, [t3+r10*2+ 0]
992*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2, [t3+r10*2+64]
993*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+ 4]
994*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*2+ 8]
995*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*2+72]
; center + sum3, then + (sum3 << 2).
996*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
997*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
998*c0909341SAndroid Build Coastguard Worker    paddd           m1, m16
999*c0909341SAndroid Build Coastguard Worker    pslld          m16, 2
1000*c0909341SAndroid Build Coastguard Worker    paddd           m2, m17
1001*c0909341SAndroid Build Coastguard Worker    pslld          m17, 2
1002*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1003*c0909341SAndroid Build Coastguard Worker    paddd           m1, m16            ; b 565
1004*c0909341SAndroid Build Coastguard Worker    paddd           m2, m17
1005*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*2+ 0], m0
1006*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*4+ 0], m1
1007*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*4+64], m2
1008*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1009*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1010*c0909341SAndroid Build Coastguard Worker    ret
1011*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Neighbor pass and output for even rows.
; Computes the 565-weighted a/b of the current row (same arithmetic as
; .prep_n), adds the 565 row saved earlier at +416*2 (a) / +416*4 (b), then
; overwrites the saved row with the current one. The combined a/b are applied
; to one row of pixels in place at dstq:
;   delta = pack((b - a * src) >> 9), scaled by m7 with pmulhrsw
;   dst   = src + delta, clamped to [m6, m14]
; dstq is advanced by strideq before returning.
;   In:  t3, t4 = row bases; wq = start offset (r10 runs from wq in steps of
;        64 bytes while it is negative); dstq = pixel row
;   Clobbers: r10, m0-m3, m16, m17, flags
; NOTE(review): m6 (zero), m7 (filter weight) and m14 (pixel maximum) come from
; the enclosing function's prologue, which is outside this excerpt - confirm.
1012*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
1013*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1014*c0909341SAndroid Build Coastguard Worker.n0_loop:
1015*c0909341SAndroid Build Coastguard Worker    movu            m0, [t4+r10*1+ 2]
1016*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+ 4]
1017*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+r10*2+68]
1018*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+r10*1+ 0]
1019*c0909341SAndroid Build Coastguard Worker    paddd          m16, m1, [t3+r10*2+ 0]
1020*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2, [t3+r10*2+64]
1021*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+ 4]
1022*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*2+ 8]
1023*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*2+72]
1024*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1025*c0909341SAndroid Build Coastguard Worker    psllw           m3, 2
1026*c0909341SAndroid Build Coastguard Worker    paddd           m1, m16
1027*c0909341SAndroid Build Coastguard Worker    pslld          m16, 2
1028*c0909341SAndroid Build Coastguard Worker    paddd           m2, m17
1029*c0909341SAndroid Build Coastguard Worker    pslld          m17, 2
1030*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3             ; a 565
1031*c0909341SAndroid Build Coastguard Worker    paddd           m1, m16            ; b 565
1032*c0909341SAndroid Build Coastguard Worker    paddd           m2, m17
; Current 565 row + saved 565 row -> m3 (a), m16/m17 (b); then the saved row
; is replaced by the current one for the next call.
1033*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t4+r10*1+416*2+ 0]
1034*c0909341SAndroid Build Coastguard Worker    paddd          m16, m1, [t3+r10*2+416*4+ 0]
1035*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2, [t3+r10*2+416*4+64]
1036*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*2+ 0], m0
1037*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*4+ 0], m1
1038*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*4+64], m2
1039*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10]
1040*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1041*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1042*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1              ; a * src
1043*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1044*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1045*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
; Regroup the 128-bit lanes of b (even lanes -> m1, odd lanes -> m16) so they
; line up with the punpcklwd / punpckhwd halves of the pixels.
1046*c0909341SAndroid Build Coastguard Worker    vshufi32x4      m1, m16, m17, q2020
1047*c0909341SAndroid Build Coastguard Worker    vshufi32x4     m16, m17, q3131
1048*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2              ; b - a * src + (1 << 8)
1049*c0909341SAndroid Build Coastguard Worker    psubd          m16, m3
1050*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1051*c0909341SAndroid Build Coastguard Worker    psrad          m16, 9
1052*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m16
1053*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7
1054*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
; Clamp the filtered pixels to [m6, m14] before writing them back.
1055*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6
1056*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14
1057*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1058*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1059*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1060*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1061*c0909341SAndroid Build Coastguard Worker    ret
1062*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Neighbor pass and output for odd rows.
; Unlike .n0, no new 565 row is computed: the saved row at +416*2 (a) and
; +416*4 (b) is used on its own, so the shift is 8 instead of 9. The pixel row
; at dstq is filtered in place:
;   delta = pack((b - a * src) >> 8), scaled by m7 with pmulhrsw
;   dst   = src + delta, clamped to [m6, m14]
; dstq is advanced by strideq before returning.
;   In:  t3, t4 = row bases; wq = start offset (r10 runs from wq in steps of
;        64 bytes while it is negative); dstq = pixel row
;   Clobbers: r10, m0-m3, m16, m17, flags
; NOTE(review): m6 (zero), m7 (filter weight) and m14 (pixel maximum) come from
; the enclosing function's prologue, which is outside this excerpt - confirm.
1063*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
1064*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1065*c0909341SAndroid Build Coastguard Worker.n1_loop:
1066*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10]
1067*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*1+416*2+ 0]
1068*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*2+416*4+ 0]
1069*c0909341SAndroid Build Coastguard Worker    mova           m17, [t3+r10*2+416*4+64]
1070*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6          ; src
1071*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6          ; a
1072*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1
1073*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1074*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1075*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
; Regroup the 128-bit lanes of b (even lanes -> m1, odd lanes -> m16) so they
; line up with the punpcklwd / punpckhwd halves of the pixels.
1076*c0909341SAndroid Build Coastguard Worker    vshufi32x4      m1, m16, m17, q2020
1077*c0909341SAndroid Build Coastguard Worker    vshufi32x4     m16, m17, q3131
1078*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2              ; b - a * src + (1 << 7)
1079*c0909341SAndroid Build Coastguard Worker    psubd          m16, m3
1080*c0909341SAndroid Build Coastguard Worker    psrad           m1, 8
1081*c0909341SAndroid Build Coastguard Worker    psrad          m16, 8
1082*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m16
1083*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7
1084*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
; Clamp the filtered pixels to [m6, m14] before writing them back.
1085*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6
1086*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14
1087*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1088*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1089*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1090*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1091*c0909341SAndroid Build Coastguard Worker    ret
1092*c0909341SAndroid Build Coastguard Worker
; sgr_filter_3x3_16bpc: entry point and row driver of the 3x3 self-guided
; restoration filter for 16-bit pixels.
; Arguments (named by cglobal): dst, stride, left, lpf, w, h, edge, params.
; The cglobal line also requests 14 GPRs, 22 vector registers and 416*42+8
; bytes of stack scratch (x86inc conventions).
; This part sets up constants and scratch pointers, then sequences the local
; subroutines (.h/.h_top, .hv0/.hv1, .v0/.v1, .prep_n, .n0/.n1) over the rows.
; Constants kept live for the subroutines:
;   m6  = 0                      m7  = w1 (params+10) << 4
;   m8  = pd_8                   m9  = pd_m9
;   m10 = 0 - s1 (params+4)      m11 = pw_164_455
;   m12 = pw_61448               m13 = pd_m34816
;   m14 = pw_1023                m18-m21 = sgr_x_by_x table (4 x 64 bytes)
;   k1  = 0xfffffffc             k2  = 0x3333333333333333
;   r13 = r_ext_mask+72
1093*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
1094*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
1095*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
1096*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1097*c0909341SAndroid Build Coastguard Worker    lea            r13, [r_ext_mask+72]
1098*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1099*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1100*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1101*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m7, [paramsq+10] ; w1
; w is doubled (pixels -> bytes). lpfq and dstq are advanced to the end of the
; row, and wq is negated below so row loops index upward from -w towards 0.
1102*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1103*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [base+pd_8]
1104*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1105*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+pd_m9]
1106*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1107*c0909341SAndroid Build Coastguard Worker    vpsubd         m10, m6, [paramsq+4] {1to16} ; -s1
; t1, t3 and t4 point into the stack scratch area, biased by the (still
; positive) byte width so that negative offsets address the row start.
1108*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+416*12+8]
1109*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [base+pw_164_455]
1110*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+416*32+8]
1111*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [base+pw_61448]
1112*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+12]
1113*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [base+pd_m34816]
1114*c0909341SAndroid Build Coastguard Worker    neg             wq
1115*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [base+pw_1023]
1116*c0909341SAndroid Build Coastguard Worker    psllw           m7, 4
1117*c0909341SAndroid Build Coastguard Worker    mova           m18, [sgr_x_by_x+64*0]
; k1: every word element except the first two (used for the left-edge merge).
1118*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfffffffc
1119*c0909341SAndroid Build Coastguard Worker    mova           m19, [sgr_x_by_x+64*1]
1120*c0909341SAndroid Build Coastguard Worker    kmovd           k1, r10d
1121*c0909341SAndroid Build Coastguard Worker    mova           m20, [sgr_x_by_x+64*2]
; k2: the low two bytes of every dword (used to merge two dword halves into
; words).
1122*c0909341SAndroid Build Coastguard Worker    mov            r10, 0x3333333333333333
1123*c0909341SAndroid Build Coastguard Worker    mova           m21, [sgr_x_by_x+64*3]
1124*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10
; With a top edge: box-sum the two lpf rows above the block into t2 and t1
; (t1 = t2 + 416*6), remember the address of the rows below the block at
; [rsp], then start on the first picture row.
1125*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1126*c0909341SAndroid Build Coastguard Worker    jz .no_top
1127*c0909341SAndroid Build Coastguard Worker    call .h_top
1128*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1129*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1130*c0909341SAndroid Build Coastguard Worker    add             t1, 416*6
1131*c0909341SAndroid Build Coastguard Worker    call .h_top
1132*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1133*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1134*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1135*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
1136*c0909341SAndroid Build Coastguard Worker    call .hv0
; Row driver. hd counts the remaining rows; rows are processed in pairs
; (.hv0 / .hv1) and output lags behind through .n0 / .n1.
1137*c0909341SAndroid Build Coastguard Worker.main:
1138*c0909341SAndroid Build Coastguard Worker    dec             hd
1139*c0909341SAndroid Build Coastguard Worker    jz .height1
1140*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1141*c0909341SAndroid Build Coastguard Worker    call .hv1
1142*c0909341SAndroid Build Coastguard Worker    call .prep_n
1143*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1144*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
1145*c0909341SAndroid Build Coastguard Worker.main_loop:
1146*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1147*c0909341SAndroid Build Coastguard Worker    call .hv0
1148*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1149*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1150*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1151*c0909341SAndroid Build Coastguard Worker    call .hv1
1152*c0909341SAndroid Build Coastguard Worker    call .n0
1153*c0909341SAndroid Build Coastguard Worker    call .n1
1154*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1155*c0909341SAndroid Build Coastguard Worker    jge .main_loop
; Picture rows exhausted: take the two rows below from the pointer saved at
; [rsp] if a bottom edge exists, otherwise reuse existing sums (.extend_bottom).
1156*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1157*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1158*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
1159*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1160*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1161*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1162*c0909341SAndroid Build Coastguard Worker.end:
1163*c0909341SAndroid Build Coastguard Worker    call .n0
1164*c0909341SAndroid Build Coastguard Worker    call .n1
1165*c0909341SAndroid Build Coastguard Worker.end2:
1166*c0909341SAndroid Build Coastguard Worker    RET
; Single-row input: only vertical passes are needed before the final output.
1167*c0909341SAndroid Build Coastguard Worker.height1:
1168*c0909341SAndroid Build Coastguard Worker    call .v1
1169*c0909341SAndroid Build Coastguard Worker    call .prep_n
1170*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; Odd number of rows: the last row has no partner, so the missing row is
; produced by vertical-only passes and a single .n0 writes the final row.
1171*c0909341SAndroid Build Coastguard Worker.odd_height:
1172*c0909341SAndroid Build Coastguard Worker    call .v1
1173*c0909341SAndroid Build Coastguard Worker    call .n0
1174*c0909341SAndroid Build Coastguard Worker    call .n1
1175*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1176*c0909341SAndroid Build Coastguard Worker    call .v0
1177*c0909341SAndroid Build Coastguard Worker    call .v1
1178*c0909341SAndroid Build Coastguard Worker    call .n0
1179*c0909341SAndroid Build Coastguard Worker    jmp .end2
; No rows available below the block: run the vertical-only passes instead of
; reading new pixels, then output the last two rows.
1180*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1181*c0909341SAndroid Build Coastguard Worker    call .v0
1182*c0909341SAndroid Build Coastguard Worker    call .v1
1183*c0909341SAndroid Build Coastguard Worker    jmp .end
; No top edge: save the "below" pointer (lpfq + 6 * stride, the same row the
; top path stores), box-sum the first picture row with .h, and copy those sums
; from t1 to t2 = t1 + 416*6 so both row buffers hold the first row.
1184*c0909341SAndroid Build Coastguard Worker.no_top:
1185*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1186*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1187*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1188*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
1189*c0909341SAndroid Build Coastguard Worker    call .h
1190*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1191*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+416*6]
1192*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
1193*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+416*0]
1194*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10+416*2]
1195*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+416*4]
1196*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0
1197*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m1
1198*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m2
1199*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1200*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1201*c0909341SAndroid Build Coastguard Worker    call .v0
1202*c0909341SAndroid Build Coastguard Worker    jmp .main
; Horizontal 3-tap box sum of one pixel row. For every position it stores the
; sum of three horizontally adjacent pixels (words, [t1+r10+416*0]) and the sum
; of their squares (dwords, [t1+r10+416*2] and [t1+r10+416*4]).
; Two entry points share the loop:
;   .h     - picture rows: with LR_HAVE_LEFT the two leftmost pixels come from
;            the left buffer ([leftq+4], 4 bytes) and leftq is advanced by 8
;   .h_top - rows read without the left buffer: with LR_HAVE_LEFT the pixels
;            are loaded directly from lpfq+r10
; Without LR_HAVE_LEFT both entries replicate the first pixel of the row.
;   In:  lpfq = end of the source row, wq = negative byte width, t1 = output
;        row, edgeb = LR_HAVE_* flags, k1 / m6 / r13 as set by the prologue
;   Clobbers: r10, m0-m3, m16, m17, flags (and leftq via .h)
1203*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
1204*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1205*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1206*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Words 0-1 come from the left buffer; the k1-masked load fills words 2 and up
; from the row itself.
1207*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq+4]
1208*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-4]
1209*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
1210*c0909341SAndroid Build Coastguard Worker    jmp .h_main
; Words 0-1 are copies of the first pixel of the row; the rest is loaded under
; k1 as above.
1211*c0909341SAndroid Build Coastguard Worker.h_extend_left:
1212*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
1213*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-4]
1214*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1215*c0909341SAndroid Build Coastguard Worker.h_top:
1216*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1217*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1218*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1219*c0909341SAndroid Build Coastguard Worker.h_loop:
1220*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10+ 0]
1221*c0909341SAndroid Build Coastguard Worker.h_main:
1222*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+16]
; Right-edge padding is applied only without LR_HAVE_RIGHT and once r10 has
; reached -66 or above. vpternlogd 0xe4 is a bitwise select: bits where the
; mask loaded from r13+r10 is 1 keep the pixel, the others take the last pixel
; of the row broadcast from [lpfq-2].
1223*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1224*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
1225*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66
1226*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
1227*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
1228*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
1229*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
1230*c0909341SAndroid Build Coastguard Worker.h_have_right:
; m0 = pixels shifted by one word, m17 (after the second palignr) = shifted by
; two words. The first two pixels are squared and summed pairwise by pmaddwd;
; the third is paired with m6 and accumulated by vpdpwssd.
1231*c0909341SAndroid Build Coastguard Worker    palignr         m0, m17, m16, 2
1232*c0909341SAndroid Build Coastguard Worker    paddw           m1, m16, m0
1233*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m16, m0
1234*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1235*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m16, m0
1236*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1237*c0909341SAndroid Build Coastguard Worker    palignr        m17, m16, 4
1238*c0909341SAndroid Build Coastguard Worker    paddw           m1, m17            ; sum
1239*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m6
1240*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16       ; sumsq
1241*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
1242*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1243*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*0], m1
1244*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*2], m2
1245*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*4], m3
1246*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1247*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1248*c0909341SAndroid Build Coastguard Worker    ret
1249*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1250*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
1251*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1252*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1253*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1254*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq+4]
1255*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-4]
1256*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
1257*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1258*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
1259*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
1260*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-4]
1261*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1262*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
1263*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1264*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1265*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1266*c0909341SAndroid Build Coastguard Worker.hv0_loop:
1267*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10+ 0]
1268*c0909341SAndroid Build Coastguard Worker.hv0_main:
1269*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+16]
1270*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1271*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
1272*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66
1273*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right
1274*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
1275*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
1276*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
1277*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
1278*c0909341SAndroid Build Coastguard Worker    palignr         m0, m17, m16, 2
1279*c0909341SAndroid Build Coastguard Worker    paddw           m1, m16, m0
1280*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m16, m0
1281*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1282*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m16, m0
1283*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1284*c0909341SAndroid Build Coastguard Worker    palignr        m17, m16, 4
1285*c0909341SAndroid Build Coastguard Worker    paddw           m1, m17            ; sum
1286*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m6
1287*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16       ; sumsq
1288*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
1289*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1290*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+r10+416*0]
1291*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t1+r10+416*2]
1292*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t1+r10+416*4]
1293*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*0], m1
1294*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*2], m2
1295*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*4], m3
1296*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+416*0]
1297*c0909341SAndroid Build Coastguard Worker    paddd           m2, m16, [t2+r10+416*2]
1298*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17, [t2+r10+416*4]
1299*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0
1300*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m16
1301*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m17
1302*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1303*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1304*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
1305*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1306*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m9             ; -((a + 8) >> 4) * 9
1307*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m9
1308*c0909341SAndroid Build Coastguard Worker    psrlw          m17, m1, 1
1309*c0909341SAndroid Build Coastguard Worker    pavgw          m17, m6             ; (b + 2) >> 2
1310*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m6
1311*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16       ; -p
1312*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
1313*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1314*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m1         ; b
1315*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m1
1316*c0909341SAndroid Build Coastguard Worker    pminsd          m2, m6
1317*c0909341SAndroid Build Coastguard Worker    pminsd          m3, m6
1318*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m10            ; p * s
1319*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m10
1320*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m11            ; b * 455
1321*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m11
1322*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2
1323*c0909341SAndroid Build Coastguard Worker    mova            m2, m20
1324*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
1325*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z, 255) - 256
1326*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
1327*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3
1328*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
1329*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x
1330*c0909341SAndroid Build Coastguard Worker    pandn           m2, m13, m3
1331*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16
1332*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
1333*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
1334*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1335*c0909341SAndroid Build Coastguard Worker    psubd          m16, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1336*c0909341SAndroid Build Coastguard Worker    psubd          m17, m13
1337*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*0+4], m2
1338*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
1339*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
1340*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+  8], xm16
1341*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+ 24], xm17
1342*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
1343*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
1344*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
1345*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
1346*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
1347*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
1348*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1349*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
1350*c0909341SAndroid Build Coastguard Worker    ret
; .hv1 / .hv1_bottom: process one odd row.
;   Builds 3-tap horizontal sums (words) and sums of squares (dwords) of the
;   row at lpfq, adds the sums currently stored at t2 to form the vertical
;   sums, overwrites t2 with this row's horizontal sums, and derives the a/b
;   values stored at t4+416*2 and t3+416*4.
;   .hv1 takes the left-edge pixels from leftq when LR_HAVE_LEFT is set and
;   advances leftq by 8; .hv1_bottom is an alternate entry that never reads
;   leftq. r10 is the running byte offset: it starts at wq-4 and the loop
;   repeats while it is still negative after each +64 step. t1 and t2 are
;   swapped before returning; r10, m0-m3, m16, m17 and k3 are overwritten.
;   NOTE(review): m6, m8-m13, m18-m21, k1, k2 and r13 are initialised outside
;   this view; m6 is presumably all-zero and wq a negative byte count - confirm
;   against the function prologue.
1351*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1352*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
1353*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1354*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1355*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
1356*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq+4]          ; 4 bytes (two words) from leftq+4
1357*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-4]        ; merge row data under word mask k1
1358*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
1359*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1360*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
1361*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]          ; replicate the word at lpfq+wq
1362*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-4]        ; merge row data under word mask k1
1363*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1364*c0909341SAndroid Build Coastguard Worker.hv1_bottom: ; alternate entry: left pixels come from the row buffer, not leftq
1365*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1366*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1367*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
1368*c0909341SAndroid Build Coastguard Worker.hv1_loop:
1369*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10+ 0]
1370*c0909341SAndroid Build Coastguard Worker.hv1_main:
1371*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+16]      ; same row, 16 bytes further on
1372*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1373*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
1374*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66                ; right-edge fixup only when r10 >= -66
1375*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
1376*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]           ; padding value: the word at lpfq-2
1377*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4 ; per bit: mask ? m16 : m0
1378*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4 ; per bit: mask ? m17 : m0
1379*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
1380*c0909341SAndroid Build Coastguard Worker    palignr         m1, m17, m16, 2        ; row shifted by one word (per 128-bit lane)
1381*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16, m1
1382*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m16, m1
1383*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2                 ; squares of the first two taps, summed
1384*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m16, m1
1385*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1386*c0909341SAndroid Build Coastguard Worker    palignr        m17, m16, 4             ; row shifted by two words
1387*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17            ; h sum
1388*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m17, m6
1389*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m1, m1         ; h sumsq
1390*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
1391*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1392*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+416*0] ; vertical sum = this row + stored sums
1393*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t2+r10+416*2]
1394*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t2+r10+416*4]
1395*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0                ; t2 now holds this row's h sums
1396*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m2
1397*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m3
1398*c0909341SAndroid Build Coastguard Worker    paddd          m16, m8
1399*c0909341SAndroid Build Coastguard Worker    paddd          m17, m8
1400*c0909341SAndroid Build Coastguard Worker    psrld          m16, 4              ; (a + 8) >> 4
1401*c0909341SAndroid Build Coastguard Worker    psrld          m17, 4
1402*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9             ; -((a + 8) >> 4) * 9
1403*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
1404*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m1, 1
1405*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b + 2) >> 2
1406*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1407*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m2, m2         ; -p
1408*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1409*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m3, m3
1410*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m6, m1         ; b
1411*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m6, m1
1412*c0909341SAndroid Build Coastguard Worker    pminsd         m16, m6             ; signed min against m6
1413*c0909341SAndroid Build Coastguard Worker    pminsd         m17, m6
1414*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m10            ; p * s
1415*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m10
1416*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m11            ; b * 455
1417*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m11
1418*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2    ; under k2: m16 rotated right 2 bytes per lane
1419*c0909341SAndroid Build Coastguard Worker    mova           m16, m20            ; table operand overwritten by vpermt2b below
1420*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
1421*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4              ; min(z, 255) - 256
1422*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21       ; sgr_x_by_x[128..255]
1423*c0909341SAndroid Build Coastguard Worker    vpmovb2m       k3, m17             ; k3 = sign bit of every index byte
1424*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19       ; sgr_x_by_x[  0..127]
1425*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16            ; x
1426*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17       ; m16 = ~m13 & x
1427*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16             ; m17 = upper 16 bits of each dword of x
1428*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1429*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1430*c0909341SAndroid Build Coastguard Worker    packssdw       m16, m17
1431*c0909341SAndroid Build Coastguard Worker    psubd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1432*c0909341SAndroid Build Coastguard Worker    psubd           m1, m13
1433*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*2+4], m16       ; packed x words
1434*c0909341SAndroid Build Coastguard Worker    psrld          m16, m0, 12
1435*c0909341SAndroid Build Coastguard Worker    psrld          m17, m1, 12
1436*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+  8], xm16 ; 128-bit lanes of m16/m17 are
1437*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+ 24], xm17 ; stored alternately, 16 bytes each
1438*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
1439*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
1440*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
1441*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
1442*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
1443*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
1444*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1445*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
1446*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; swap t1 and t2 (r10 is free after the loop)
1447*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1448*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
1449*c0909341SAndroid Build Coastguard Worker    ret
; .v0: vertical-only pass for an even row (no pixels are read from lpfq).
;   Loads the sums stored at t1, doubles them, adds the sums stored at t2 to
;   form the vertical sums, and overwrites t2 with the doubled t1 values.
;   The a/b values are then derived exactly as in the other passes and stored
;   at t4+416*0 and t3+416*0. t1/t2 are not swapped here.
;   r10 is the running byte offset (starts at wq-4, loop repeats while it is
;   negative after each +64 step). Overwrites r10, m0-m3, m16, m17 and k3.
;   NOTE(review): m6, m8-m13, m18-m21 and k2 are initialised outside this
;   view; m6 is presumably all-zero - confirm against the function prologue.
1450*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab (even rows)
1451*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1452*c0909341SAndroid Build Coastguard Worker.v0_loop:
1453*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+416*0]     ; sum (words)
1454*c0909341SAndroid Build Coastguard Worker    mova           m16, [t1+r10+416*2]     ; sumsq (dwords)
1455*c0909341SAndroid Build Coastguard Worker    mova           m17, [t1+r10+416*4]
1456*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0                 ; double the t1 values
1457*c0909341SAndroid Build Coastguard Worker    paddd          m16, m16
1458*c0909341SAndroid Build Coastguard Worker    paddd          m17, m17
1459*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+416*0] ; vertical sum = 2*t1 + t2
1460*c0909341SAndroid Build Coastguard Worker    paddd           m2, m16, [t2+r10+416*2]
1461*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17, [t2+r10+416*4]
1462*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0                ; t2 now holds the doubled t1 values
1463*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m16
1464*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m17
1465*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1466*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1467*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
1468*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1469*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m9             ; -((a + 8) >> 4) * 9
1470*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m9
1471*c0909341SAndroid Build Coastguard Worker    psrlw          m17, m1, 1
1472*c0909341SAndroid Build Coastguard Worker    pavgw          m17, m6             ; (b + 2) >> 2
1473*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m6
1474*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16       ; -p
1475*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
1476*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1477*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m1         ; b
1478*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m1
1479*c0909341SAndroid Build Coastguard Worker    pminsd          m2, m6             ; signed min against m6
1480*c0909341SAndroid Build Coastguard Worker    pminsd          m3, m6
1481*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m10            ; p * s
1482*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m10
1483*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m11            ; b * 455
1484*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m11
1485*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2      ; under k2: m2 rotated right 2 bytes per lane
1486*c0909341SAndroid Build Coastguard Worker    mova            m2, m20            ; table operand overwritten by vpermt2b below
1487*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
1488*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z, 255) - 256
1489*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
1490*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3             ; k3 = sign bit of every index byte
1491*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
1492*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x
1493*c0909341SAndroid Build Coastguard Worker    pandn           m2, m13, m3        ; m2 = ~m13 & x
1494*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16             ; m3 = upper 16 bits of each dword of x
1495*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
1496*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
1497*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1498*c0909341SAndroid Build Coastguard Worker    psubd          m16, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1499*c0909341SAndroid Build Coastguard Worker    psubd          m17, m13
1500*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*0+4], m2        ; packed x words
1501*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
1502*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
1503*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+  8], xm16 ; 128-bit lanes of m16/m17 are
1504*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+ 24], xm17 ; stored alternately, 16 bytes each
1505*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
1506*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
1507*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
1508*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
1509*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
1510*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
1511*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1512*c0909341SAndroid Build Coastguard Worker    jl .v0_loop
1513*c0909341SAndroid Build Coastguard Worker    ret
; .v1: vertical-only pass for an odd row (no pixels are read from lpfq).
;   Loads the sums stored at t1, adds the sums stored at t2 to form the
;   vertical sums, and copies the (undoubled) t1 values into t2. The a/b
;   values are stored at t4+416*2 and t3+416*4, and t1/t2 are swapped before
;   returning.
;   r10 is the running byte offset (starts at wq-4, loop repeats while it is
;   negative after each +64 step). Overwrites r10, m0-m3, m16, m17 and k3.
;   NOTE(review): m6, m8-m13, m18-m21 and k2 are initialised outside this
;   view; m6 is presumably all-zero - confirm against the function prologue.
1514*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
1515*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1516*c0909341SAndroid Build Coastguard Worker.v1_loop:
1517*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+416*0]     ; sum (words)
1518*c0909341SAndroid Build Coastguard Worker    mova           m16, [t1+r10+416*2]     ; sumsq (dwords)
1519*c0909341SAndroid Build Coastguard Worker    mova           m17, [t1+r10+416*4]
1520*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+416*0] ; vertical sum = t1 + t2
1521*c0909341SAndroid Build Coastguard Worker    paddd           m2, m16, [t2+r10+416*2]
1522*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17, [t2+r10+416*4]
1523*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0                ; t2 now holds a copy of the t1 values
1524*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m16
1525*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m17
1526*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1527*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1528*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a + 8) >> 4
1529*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1530*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m9             ; -((a + 8) >> 4) * 9
1531*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m9
1532*c0909341SAndroid Build Coastguard Worker    psrlw          m17, m1, 1
1533*c0909341SAndroid Build Coastguard Worker    pavgw          m17, m6             ; (b + 2) >> 2
1534*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m6
1535*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16       ; -p
1536*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
1537*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1538*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m1         ; b
1539*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m1
1540*c0909341SAndroid Build Coastguard Worker    pminsd          m2, m6             ; signed min against m6
1541*c0909341SAndroid Build Coastguard Worker    pminsd          m3, m6
1542*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m10            ; p * s
1543*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m10
1544*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m11            ; b * 455
1545*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m11
1546*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2      ; under k2: m2 rotated right 2 bytes per lane
1547*c0909341SAndroid Build Coastguard Worker    mova            m2, m20            ; table operand overwritten by vpermt2b below
1548*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m12
1549*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z, 255) - 256
1550*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
1551*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3             ; k3 = sign bit of every index byte
1552*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
1553*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x
1554*c0909341SAndroid Build Coastguard Worker    pandn           m2, m13, m3        ; m2 = ~m13 & x
1555*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16             ; m3 = upper 16 bits of each dword of x
1556*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
1557*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
1558*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1559*c0909341SAndroid Build Coastguard Worker    psubd          m16, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
1560*c0909341SAndroid Build Coastguard Worker    psubd          m17, m13
1561*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*2+4], m2        ; packed x words
1562*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
1563*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
1564*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+  8], xm16 ; 128-bit lanes of m16/m17 are
1565*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+ 24], xm17 ; stored alternately, 16 bytes each
1566*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
1567*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
1568*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
1569*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
1570*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
1571*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
1572*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1573*c0909341SAndroid Build Coastguard Worker    jl .v1_loop
1574*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; swap t1 and t2 (r10 is free after the loop)
1575*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1576*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
1577*c0909341SAndroid Build Coastguard Worker    ret
; .prep_n: build the initial horizontally weighted neighbour rows.
;   For each position the three adjacent entries x, x+1, x+2 are combined
;   into a "444" value, 4*(e0+e1+e2), and a "343" value,
;   4*(e0+e1+e2) - (e0+e2) = 3*e0 + 4*e1 + 3*e2.
;   Word data lives in t4 (indexed by r10) and dword data in t3 (indexed by
;   r10*2). Outputs:
;     source t4+416*0 / t3+416*0 : 343 -> t4+416*4 / t3+416*8
;     source t4+416*2 / t3+416*4 : 444 -> t4+416*6 / t3+416*12
;                                  343 -> t4+416*8 / t3+416*16
;   r10 starts at wq and the loop repeats while it is negative after each
;   +32 step (16 entries per iteration). Overwrites r10, m0, m1, m16, m17.
1578*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
1579*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1580*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
1581*c0909341SAndroid Build Coastguard Worker    mova          ym16, [t4+r10*1+416*0+0]
1582*c0909341SAndroid Build Coastguard Worker    paddw         ym16, [t4+r10*1+416*0+4]   ; e0 + e2
1583*c0909341SAndroid Build Coastguard Worker    paddw         ym17, ym16, [t4+r10*1+416*0+2] ; e0 + e1 + e2
1584*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*2+416*0+0]
1585*c0909341SAndroid Build Coastguard Worker    paddd           m0, [t3+r10*2+416*0+8]   ; e0 + e2
1586*c0909341SAndroid Build Coastguard Worker    paddd           m1, m0, [t3+r10*2+416*0+4] ; e0 + e1 + e2
1587*c0909341SAndroid Build Coastguard Worker    psllw         ym17, 2                ; a[-1] 444
1588*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[-1] 444
1589*c0909341SAndroid Build Coastguard Worker    psubw         ym17, ym16             ; a[-1] 343
1590*c0909341SAndroid Build Coastguard Worker    psubd           m1, m0               ; b[-1] 343
1591*c0909341SAndroid Build Coastguard Worker    vmovdqa32 [t4+r10*1+416* 4], ym17
1592*c0909341SAndroid Build Coastguard Worker    vmovdqa32 [t3+r10*2+416* 8], m1
1593*c0909341SAndroid Build Coastguard Worker    mova          ym16, [t4+r10*1+416*2+0]
1594*c0909341SAndroid Build Coastguard Worker    paddw         ym16, [t4+r10*1+416*2+4]   ; e0 + e2
1595*c0909341SAndroid Build Coastguard Worker    paddw         ym17, ym16, [t4+r10*1+416*2+2] ; e0 + e1 + e2
1596*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*2+416*4+0]
1597*c0909341SAndroid Build Coastguard Worker    paddd           m0, [t3+r10*2+416*4+8]   ; e0 + e2
1598*c0909341SAndroid Build Coastguard Worker    paddd           m1, m0, [t3+r10*2+416*4+4] ; e0 + e1 + e2
1599*c0909341SAndroid Build Coastguard Worker    psllw         ym17, 2                 ; a[ 0] 444
1600*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                 ; b[ 0] 444
1601*c0909341SAndroid Build Coastguard Worker    vmovdqa32 [t4+r10*1+416* 6], ym17
1602*c0909341SAndroid Build Coastguard Worker    vmovdqa32 [t3+r10*2+416*12], m1
1603*c0909341SAndroid Build Coastguard Worker    psubw         ym17, ym16              ; a[ 0] 343
1604*c0909341SAndroid Build Coastguard Worker    psubd           m1, m0                ; b[ 0] 343
1605*c0909341SAndroid Build Coastguard Worker    vmovdqa32 [t4+r10*1+416* 8], ym17
1606*c0909341SAndroid Build Coastguard Worker    vmovdqa32 [t3+r10*2+416*16], m1
1607*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1608*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1609*c0909341SAndroid Build Coastguard Worker    ret
; .n0: neighbour weighting and pixel output for an even row.
;   Computes the 444 and 343 combinations of the row at t4+416*0 / t3+416*0,
;   forms the final a and b as
;     new 343 + [t4+416*4 / t3+416*8] + [t4+416*6 / t3+416*12],
;   then stores the new 343 into the 416*4 / 416*8 slot and the new 444 into
;   the 416*6 / 416*12 slot for later rows. Each load of a slot precedes the
;   store that overwrites it, so the instruction order must be kept.
;   The 32 pixels at dstq+r10 are then updated in place:
;     dst = clamp(dst + pmulhrsw((b - a*dst) >> 9, m7), m6, m14)
;   and dstq is advanced by strideq on exit.
;   r10 starts at wq and the loop repeats while it is negative after each
;   +64 step. Overwrites r10, m0-m3, m16, m17.
;   NOTE(review): m6, m7 and m14 are initialised outside this view;
;   presumably zero, the filter weight and the maximum pixel value - confirm.
1610*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1611*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
1612*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1613*c0909341SAndroid Build Coastguard Worker.n0_loop:
1614*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*1+416*0+0]
1615*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+416*0+4]
1616*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, [t4+r10*1+416*0+2]
1617*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444
1618*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343
1619*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+r10*1+416*4]
1620*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+416*6] ; m3 = final a
1621*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*4], m2            ; keep new 343
1622*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*6], m1            ; keep new 444
1623*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*2+416*0+0]
1624*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*2+416*0+8]
1625*c0909341SAndroid Build Coastguard Worker    paddd           m1, m16, [t3+r10*2+416*0+4]
1626*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
1627*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m16          ; b[ 1] 343
1628*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t3+r10*2+416* 8+ 0]
1629*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*2+416*12+ 0] ; m16 = final b, first 16 dwords
1630*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416* 8+ 0], m2
1631*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*12+ 0], m1
1632*c0909341SAndroid Build Coastguard Worker    mova           m17, [t3+r10*2+416*0+64]
1633*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*2+416*0+72]
1634*c0909341SAndroid Build Coastguard Worker    paddd           m1, m17, [t3+r10*2+416*0+68]
1635*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444, next 16 dwords
1636*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m17          ; b[ 1] 343, next 16 dwords
1637*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2, [t3+r10*2+416* 8+64]
1638*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*2+416*12+64] ; m17 = final b, next 16 dwords
1639*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416* 8+64], m2
1640*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*12+64], m1
1641*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10]       ; 32 pixels, filtered in place
1642*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6
1643*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1644*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
1645*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1646*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1647*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1648*c0909341SAndroid Build Coastguard Worker    vshufi32x4      m1, m16, m17, q2020  ; lanes 0,2 of m16 and m17 (matches punpckl order)
1649*c0909341SAndroid Build Coastguard Worker    vshufi32x4     m16, m17, q3131       ; lanes 1,3 of m16 and m17 (matches punpckh order)
1650*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2               ; b - a * src + (1 << 8)
1651*c0909341SAndroid Build Coastguard Worker    psubd          m16, m3
1652*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1653*c0909341SAndroid Build Coastguard Worker    psrad          m16, 9
1654*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m16
1655*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7               ; scale by m7 with rounding
1656*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1657*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6               ; clamp below by m6
1658*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14              ; clamp above by m14
1659*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1660*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1661*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1662*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq          ; advance to the next output row
1663*c0909341SAndroid Build Coastguard Worker    ret
; .n1: neighbour weighting and pixel output for an odd row.
;   Computes the 444 and 343 combinations of the row at t4+416*2 / t3+416*4,
;   forms the final a and b as
;     new 343 + [t4+416*6 / t3+416*12] + [t4+416*8 / t3+416*16],
;   then stores the new 444 into the 416*6 / 416*12 slot and the new 343 into
;   the 416*8 / 416*16 slot for later rows. Each load of a slot precedes the
;   store that overwrites it, so the instruction order must be kept.
;   The 32 pixels at dstq+r10 are then updated in place:
;     dst = clamp(dst + pmulhrsw((b - a*dst) >> 9, m7), m6, m14)
;   and dstq is advanced by strideq on exit.
;   r10 starts at wq and the loop repeats while it is negative after each
;   +64 step. Overwrites r10, m0-m3, m16, m17.
;   NOTE(review): m6, m7 and m14 are initialised outside this view;
;   presumably zero, the filter weight and the maximum pixel value - confirm.
1664*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1665*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
1666*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1667*c0909341SAndroid Build Coastguard Worker.n1_loop:
1668*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*1+416*2+0]
1669*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+416*2+4]
1670*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, [t4+r10*1+416*2+2]
1671*c0909341SAndroid Build Coastguard Worker    psllw           m1, 2                ; a[ 1] 444
1672*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1, m3           ; a[ 1] 343
1673*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t4+r10*1+416*6]
1674*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t4+r10*1+416*8] ; m3 = final a
1675*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*6], m1            ; keep new 444
1676*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*8], m2            ; keep new 343
1677*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*2+416*4+0]
1678*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*2+416*4+8]
1679*c0909341SAndroid Build Coastguard Worker    paddd           m1, m16, [t3+r10*2+416*4+4]
1680*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444
1681*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m16          ; b[ 1] 343
1682*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t3+r10*2+416*12+ 0]
1683*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*2+416*16+ 0] ; m16 = final b, first 16 dwords
1684*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*12+ 0], m1
1685*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*16+ 0], m2
1686*c0909341SAndroid Build Coastguard Worker    mova           m17, [t3+r10*2+416*4+64]
1687*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*2+416*4+72]
1688*c0909341SAndroid Build Coastguard Worker    paddd           m1, m17, [t3+r10*2+416*4+68]
1689*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2                ; b[ 1] 444, next 16 dwords
1690*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1, m17          ; b[ 1] 343, next 16 dwords
1691*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2, [t3+r10*2+416*12+64]
1692*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*2+416*16+64] ; m17 = final b, next 16 dwords
1693*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*12+64], m1
1694*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*16+64], m2
1695*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+r10]       ; 32 pixels, filtered in place
1696*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m0, m6
1697*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6
1698*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m1               ; a * src
1699*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m0, m6
1700*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
1701*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
1702*c0909341SAndroid Build Coastguard Worker    vshufi32x4      m1, m16, m17, q2020  ; lanes 0,2 of m16 and m17 (matches punpckl order)
1703*c0909341SAndroid Build Coastguard Worker    vshufi32x4     m16, m17, q3131       ; lanes 1,3 of m16 and m17 (matches punpckh order)
1704*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2               ; b - a * src + (1 << 8)
1705*c0909341SAndroid Build Coastguard Worker    psubd          m16, m3
1706*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1707*c0909341SAndroid Build Coastguard Worker    psrad          m16, 9
1708*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m16
1709*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m1, m7               ; scale by m7 with rounding
1710*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1711*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6               ; clamp below by m6
1712*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m14              ; clamp above by m14
1713*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], m0
1714*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1715*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1716*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq          ; advance to the next output row
1717*c0909341SAndroid Build Coastguard Worker    ret
1718*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; sgr_filter_mix_16bpc -- entry point and row scheduler.
;
; Named arguments: dst, stride, left, lpf, w, h, edge, params.
; The prologue loads constants, builds two opmasks, derives the scratch
; pointers t1/t3/t4 from rsp, and then drives the per-row helpers
; (.h/.h_top, .hv0, .hv1 and -- defined further down in this function,
; outside this excerpt -- .v0, .v1, .prep_n, .n0, .n1).
;
; State established here and relied on by the helpers:
;   m6       = 0
;   m7       = dword at [params+8] broadcast ("w0 w1"), each word << 2
;   m8..m10  = pd_8, pd_m9, pd_m25
;   m11, m12 = 0 - [params+0], 0 - [params+4]   (-s0, -s1)
;   m13..m15 = pw_164_455, pw_61448, pd_m34816
;   m18..m21 = the four 64-byte quarters of sgr_x_by_x
;   m22      = pd_2147483648
;   k1       = 0xfffffff8 (with vmovdqu16: word lanes 0-2 are preserved)
;   k2       = 0x3333... (with byte masking: low 2 bytes of every dword)
;   r13      = r_ext_mask+72
;   wq       = -(2*w); lpfq and dstq have 2*w added, so the helpers
;              index rows with a negative offset that counts up to 0
;   [rsp]    = pointer to the rows used when LR_HAVE_BOTTOM is set
;-----------------------------------------------------------------------
1719*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \
1720*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
1721*c0909341SAndroid Build Coastguard Worker    movifnidn       wd, wm
1722*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1723*c0909341SAndroid Build Coastguard Worker    lea            r13, [r_ext_mask+72]
1724*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1725*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1726*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m7, [paramsq+8] ; w0 w1
1727*c0909341SAndroid Build Coastguard Worker    pxor            m6, m6
1728*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [base+pd_8]
    ; w is doubled here (presumably pixels -> bytes for 16-bit samples).
1729*c0909341SAndroid Build Coastguard Worker    add             wd, wd
1730*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+pd_m9]
1731*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1732*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [base+pd_m25]
1733*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1734*c0909341SAndroid Build Coastguard Worker    vpsubd         m11, m6, [paramsq+0] {1to16} ; -s0
    ; t3, t4 and t1 all point into the stack scratch area, each biased by
    ; the (still positive) doubled width so that negative offsets work.
1735*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*2+416*24+8]
1736*c0909341SAndroid Build Coastguard Worker    vpsubd         m12, m6, [paramsq+4] {1to16} ; -s1
1737*c0909341SAndroid Build Coastguard Worker    lea             t4, [rsp+wq+416*52+8]
1738*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [base+pw_164_455]
1739*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq+12]
1740*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [base+pw_61448]
    ; From here on wq is negative; loops run an offset from wq-4 up to 0.
1741*c0909341SAndroid Build Coastguard Worker    neg             wq
1742*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [base+pd_m34816]
1743*c0909341SAndroid Build Coastguard Worker    psllw           m7, 2
1744*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m22, [base+pd_2147483648]
    ; r10 is only a temporary for building the two opmasks k1 and k2.
1745*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfffffff8
1746*c0909341SAndroid Build Coastguard Worker    mova           m18, [sgr_x_by_x+64*0]
1747*c0909341SAndroid Build Coastguard Worker    kmovd           k1, r10d
1748*c0909341SAndroid Build Coastguard Worker    mova           m19, [sgr_x_by_x+64*1]
1749*c0909341SAndroid Build Coastguard Worker    mov            r10, 0x3333333333333333
1750*c0909341SAndroid Build Coastguard Worker    mova           m20, [sgr_x_by_x+64*2]
1751*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10
1752*c0909341SAndroid Build Coastguard Worker    mova           m21, [sgr_x_by_x+64*3]
1753*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1754*c0909341SAndroid Build Coastguard Worker    jz .no_top
    ; LR_HAVE_TOP: run the horizontal pass on two rows starting at lpf.
    ; Between them t2 is set to the first row's buffer and the 5x5
    ; function's .top_fixup is called (defined elsewhere; not visible
    ; here), then t1 moves on by 416*12 bytes for the second row.
1755*c0909341SAndroid Build Coastguard Worker    call .h_top
1756*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1757*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1758*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup
1759*c0909341SAndroid Build Coastguard Worker    add             t1, 416*12
1760*c0909341SAndroid Build Coastguard Worker    call .h_top
    ; r10 = lpf + 5*stride is saved at [rsp]; lpfq is then redirected to
    ; dst so the following passes read the rows being filtered.
1761*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1762*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1763*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1764*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
1765*c0909341SAndroid Build Coastguard Worker    call .hv0
; Common path once the first row has been through .hv0 (or .v0 for no_top).
1766*c0909341SAndroid Build Coastguard Worker.main:
    ; h == 1: there is no second input row, finish via .height1.
1767*c0909341SAndroid Build Coastguard Worker    dec             hd
1768*c0909341SAndroid Build Coastguard Worker    jz .height1
1769*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1770*c0909341SAndroid Build Coastguard Worker    call .hv1
1771*c0909341SAndroid Build Coastguard Worker    call .prep_n
    ; hd goes negative when no further input row pair remains.
1772*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1773*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
; Steady state: each iteration consumes two input rows (.hv0 then .hv1)
; and calls the two output helpers (.n0 then .n1).
1774*c0909341SAndroid Build Coastguard Worker.main_loop:
1775*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1776*c0909341SAndroid Build Coastguard Worker    call .hv0
    ; hd == 0 here means only one input row was left: odd-height tail.
1777*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1778*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1779*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1780*c0909341SAndroid Build Coastguard Worker    call .hv1
1781*c0909341SAndroid Build Coastguard Worker    call .n0
1782*c0909341SAndroid Build Coastguard Worker    call .n1
1783*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1784*c0909341SAndroid Build Coastguard Worker    jge .main_loop
    ; Input rows exhausted. With LR_HAVE_BOTTOM the two rows at the
    ; pointer saved in [rsp] are processed; otherwise .extend_bottom
    ; substitutes the vertical-only helpers.
1785*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1786*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1787*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
1788*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1789*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1790*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
; Final pair of output calls, then return to the caller.
1791*c0909341SAndroid Build Coastguard Worker.end:
1792*c0909341SAndroid Build Coastguard Worker    call .n0
1793*c0909341SAndroid Build Coastguard Worker    call .n1
1794*c0909341SAndroid Build Coastguard Worker.end2:
1795*c0909341SAndroid Build Coastguard Worker    RET
; h == 1: only .hv0/.v0 has run so far; do the odd-row vertical step and
; prepare the neighbor buffers before joining the odd-height tail.
1796*c0909341SAndroid Build Coastguard Worker.height1:
1797*c0909341SAndroid Build Coastguard Worker    call .v1
1798*c0909341SAndroid Build Coastguard Worker    call .prep_n
1799*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; Odd remaining height: the missing odd row is handled by .v1 (no new
; horizontal input), followed by a normal .n0/.n1 pair.
1800*c0909341SAndroid Build Coastguard Worker.odd_height:
1801*c0909341SAndroid Build Coastguard Worker    call .v1
1802*c0909341SAndroid Build Coastguard Worker    call .n0
1803*c0909341SAndroid Build Coastguard Worker    call .n1
; One more vertical-only step pair, but only .n0 is called before returning.
1804*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1805*c0909341SAndroid Build Coastguard Worker    call .v0
1806*c0909341SAndroid Build Coastguard Worker    call .v1
1807*c0909341SAndroid Build Coastguard Worker    call .n0
1808*c0909341SAndroid Build Coastguard Worker    jmp .end2
; No usable bottom rows: run the vertical-only helpers in place of
; .hv0_bottom/.hv1_bottom, then finish through .end.
1809*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1810*c0909341SAndroid Build Coastguard Worker    call .v0
1811*c0909341SAndroid Build Coastguard Worker    call .v1
1812*c0909341SAndroid Build Coastguard Worker    jmp .end
; LR_HAVE_TOP clear: [rsp] = lpf + 6*stride, lpfq = dst, and the first dst
; row goes through the plain horizontal pass (.h).
1813*c0909341SAndroid Build Coastguard Worker.no_top:
1814*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1815*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1816*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1817*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
1818*c0909341SAndroid Build Coastguard Worker    call .h
1819*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1820*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+416*12]
; Fill the t2 row from the t1 row just produced by .h: the slots at
; 416*0/2/4 (written by .h as sum5/sumsq5) are stored doubled, the slots
; at 416*6/8/10 (sum3/sumsq3) are copied unchanged. t1 is not modified.
1821*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
1822*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10+416* 0]
1823*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10+416* 2]
1824*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+416* 4]
1825*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1826*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+416* 6]
1827*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
1828*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+416* 8]
1829*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1830*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+416*10]
1831*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 0], m0
1832*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 2], m1
1833*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 4], m2
1834*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 6], m3
1835*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 8], m4
1836*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*10], m5
    ; 64 bytes per iteration until the negative offset reaches 0.
1837*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1838*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1839*c0909341SAndroid Build Coastguard Worker    call .v0
1840*c0909341SAndroid Build Coastguard Worker    jmp .main
;-----------------------------------------------------------------------
; .h / .h_top -- horizontal box sums for one row, no vertical step.
;
; In:    lpfq = row pointer, biased so that [lpfq+wq] is the row start
;        leftq = left-edge pixels (read by .h only, then advanced by 8)
;        t1   = destination row buffer; edgeb = LR_HAVE_* flags
;        m6 = 0, k1 = word mask that preserves lanes 0-2, r13 = mask table
; Out:   [t1+r10+416*0]        sum5   (words)
;        [t1+r10+416*2/4]      sumsq5 (dwords, low/high unpack halves)
;        [t1+r10+416*6]        sum3   (words)
;        [t1+r10+416*8/10]     sumsq3 (dwords, low/high unpack halves)
; Clobb: r10, m0-m4, m16, m17, flags
;
; .h takes the first pixels from leftq when LR_HAVE_LEFT is set; .h_top
; reads them straight from the row instead. Both share .h_extend_left.
;-----------------------------------------------------------------------
1841*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
1842*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1843*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1844*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
    ; Low 8 bytes of m16 come from [leftq+2]; the k1-masked load then
    ; fills word lanes 3..31 from the row and leaves lanes 0-2 alone.
1845*c0909341SAndroid Build Coastguard Worker    movq          xm16, [leftq+2]
1846*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
1847*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
1848*c0909341SAndroid Build Coastguard Worker    jmp .h_main
; No left neighbors: replicate the first pixel of the row into the low
; lanes, then load the rest of the row under the same k1 mask.
1849*c0909341SAndroid Build Coastguard Worker.h_extend_left:
1850*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
1851*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
1852*c0909341SAndroid Build Coastguard Worker    jmp .h_main
; Alternate entry that never touches leftq.
1853*c0909341SAndroid Build Coastguard Worker.h_top:
1854*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1855*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1856*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Per-64-byte loop; r10 is the negative byte offset into the row.
1857*c0909341SAndroid Build Coastguard Worker.h_loop:
1858*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10- 2]
1859*c0909341SAndroid Build Coastguard Worker.h_main:
    ; m17 is the same row 16 bytes further on, so a per-128-bit-lane
    ; palignr of (m17:m16) yields contiguous shifted windows.
1860*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+14]
1861*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1862*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
    ; Without LR_HAVE_RIGHT, only blocks with r10d >= -68 are patched.
1863*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -68
1864*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
    ; m0 = the word at [lpfq-2] replicated (presumably the last valid
    ; pixel of the row). vpternlogd 0xe4 is a bitwise select: where the
    ; mask loaded from r13+r10 has a 1 bit the loaded pixel is kept,
    ; elsewhere it is replaced by m0.
1865*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
1866*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
1867*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
1868*c0909341SAndroid Build Coastguard Worker.h_have_right:
    ; Windows shifted by 2, 4 and 6 bytes are the three inner taps.
    ; pmaddwd of an interleaved pair with itself gives a^2 + b^2 per
    ; dword; vpdpwssd adds the third tap's square (paired with zero).
1869*c0909341SAndroid Build Coastguard Worker    palignr         m3, m17, m16, 2
1870*c0909341SAndroid Build Coastguard Worker    palignr         m0, m17, m16, 4
1871*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
1872*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
1873*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1874*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
1875*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1876*c0909341SAndroid Build Coastguard Worker    palignr         m0, m17, m16, 6
1877*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; sum3
1878*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m0, m6
1879*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m4, m4         ; sumsq3
1880*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m6
1881*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m0, m0
    ; shufpd 0x55 takes the high qword of m16 and the low qword of m17
    ; in every 128-bit lane, i.e. the window shifted by 8 bytes. With
    ; m16 itself (shift 0) that supplies the two outer taps for sum5.
1882*c0909341SAndroid Build Coastguard Worker    shufpd          m4, m16, m17, 0x55
1883*c0909341SAndroid Build Coastguard Worker    punpcklwd      m17, m4, m16
1884*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16, m4
1885*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m16
    ; Store the 3-tap results before m1-m3 are extended to 5 taps.
1886*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 6], m1
1887*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 8], m2
1888*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*10], m3
1889*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; sum5
1890*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m17, m17       ; sumsq5
1891*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m4, m4
1892*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 0], m1
1893*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 2], m2
1894*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 4], m3
    ; Next 64 bytes; the loop ends once the negative offset reaches 0.
1895*c0909341SAndroid Build Coastguard Worker    add            r10, 64
1896*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1897*c0909341SAndroid Build Coastguard Worker    ret
1898*c0909341SAndroid Build Coastguard WorkerALIGN function_align
;-----------------------------------------------------------------------
; .hv0 / .hv0_bottom -- even rows: horizontal box sums, vertical
; accumulation, and the 3x3 a/b computation.
;
; In:    lpfq, leftq, edgeb, r13, k1: as for .h (.hv0_bottom skips leftq)
;        t1, t2 = row buffers holding sums of earlier rows
;        t3, t4 = output buffers
;        m6 = 0, m8 = pd_8, m9 = pd_m9, m12 = -s1, m13 = pw_164_455,
;        m14 = pw_61448, m15 = pd_m34816, m18-m21 = sgr_x_by_x, k2
; Out:   t1 slots 416*6/8/10 = this row's sum3/sumsq3
;        t1 slots 416*0/2/4 += this row's sum5/sumsq5
;        t2 slots 416*6/8/10 = this row + old t1 row (sum3/sumsq3)
;        [t3+r10*2+416*8+8], [t3+r10*2+416*0+8/72] = this row's
;            sum5/sumsq5 alone (the "clean copy")
;        [t4+r10+416*2+4]      = packed x3 values
;        [t3+r10*2+416*4+8..]  = the >>12 products, 128-bit lanes of
;            m16 and m17 interleaved
; Clobb: r10, m0-m5, m16, m17, k3, flags
;-----------------------------------------------------------------------
1899*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
1900*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1901*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1902*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
    ; Low 8 bytes from the left buffer; k1 keeps word lanes 0-2 while
    ; the rest of m16 is loaded from the row.
1903*c0909341SAndroid Build Coastguard Worker    movq          xm16, [leftq+2]
1904*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
1905*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
1906*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
; No left neighbors: replicate the first pixel of the row instead.
1907*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
1908*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
1909*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
1910*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
; Alternate entry that does not read leftq.
1911*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
1912*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
1913*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1914*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1915*c0909341SAndroid Build Coastguard Worker.hv0_loop:
1916*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10- 2]
1917*c0909341SAndroid Build Coastguard Worker.hv0_main:
1918*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+14]
1919*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1920*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
1921*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -68
1922*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right
    ; Right-edge padding: bitwise select (0xe4) keeps a loaded pixel
    ; where the r13 mask bit is 1 and substitutes the replicated word
    ; from [lpfq-2] elsewhere.
1923*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
1924*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
1925*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
1926*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
    ; Horizontal 3-tap sum and sum of squares from the windows shifted
    ; by 2, 4 and 6 bytes.
1927*c0909341SAndroid Build Coastguard Worker    palignr         m3, m17, m16, 2
1928*c0909341SAndroid Build Coastguard Worker    palignr         m0, m17, m16, 4
1929*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, m0
1930*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m0
1931*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1932*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m0
1933*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m3
1934*c0909341SAndroid Build Coastguard Worker    palignr         m0, m17, m16, 6
1935*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0             ; h sum3
1936*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m0, m6
1937*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m4, m4         ; h sumsq3
1938*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m6
1939*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m0, m0
    ; m17 becomes the window shifted by 8 bytes (outer right tap).
1940*c0909341SAndroid Build Coastguard Worker    shufpd         m17, m16, m17, 0x55
    ; m4/m5 = this row + the row stored in t1's 3-tap slots; those t1
    ; slots are then overwritten with this row's values.
1941*c0909341SAndroid Build Coastguard Worker    paddw           m4, m1, [t1+r10+416* 6]
1942*c0909341SAndroid Build Coastguard Worker    paddd           m5, m2, [t1+r10+416* 8]
1943*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 6], m1
1944*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 8], m2
1945*c0909341SAndroid Build Coastguard Worker    paddw           m1, m16
1946*c0909341SAndroid Build Coastguard Worker    paddw           m1, m17            ; h sum5
1947*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m17, m16
1948*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m0, m0         ; h sumsq5
    ; m0 is reused: high-half sumsq3 of this row + the t1 row.
1949*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3, [t1+r10+416*10]
1950*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*10], m3
1951*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m16
1952*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
1953*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*8+ 8], m1       ; we need a clean copy of the last row
1954*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*0+ 8], m2       ; in case height is odd
1955*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*0+72], m3
    ; Accumulate this row's 5-tap sums into t1's running totals.
1956*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10+416* 0]
1957*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+416* 2]
1958*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10+416* 4]
1959*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 0], m1
1960*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 2], m2
1961*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416* 4], m3
    ; Adding t2's 3-tap slots to the two-row partials gives the totals
    ; used for a3/b3 (m17 = sum, m2/m3 = sum of squares); t2 is then
    ; updated with the two-row partials.
1962*c0909341SAndroid Build Coastguard Worker    paddw          m17, m4, [t2+r10+416* 6]
1963*c0909341SAndroid Build Coastguard Worker    paddd           m2, m5, [t2+r10+416* 8]
1964*c0909341SAndroid Build Coastguard Worker    paddd           m3, m0, [t2+r10+416*10]
1965*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 6], m4
1966*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 8], m5
1967*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*10], m0
1968*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
1969*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
1970*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
1971*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
1972*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m9             ; -((a3 + 8) >> 4) * 9
1973*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m9
    ; (b >> 1) averaged with zero rounds up, giving the scaled b3.
1974*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m17, 1
1975*c0909341SAndroid Build Coastguard Worker    pavgw           m5, m6             ; (b3 + 2) >> 2
1976*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
1977*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m4, m4         ; -p3
1978*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
1979*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m5, m5
    ; Unpacking zero first puts b3 in the high word of every dword.
1980*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m17        ; b3
1981*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m17
    ; Clamp -p3 to be <= 0 before multiplying by -s1.
1982*c0909341SAndroid Build Coastguard Worker    pminsd          m2, m6
1983*c0909341SAndroid Build Coastguard Worker    pminsd          m3, m6
1984*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m12            ; p3 * s1
1985*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m12
1986*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m13            ; b3 * 455
1987*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m13
    ; Under k2, the low word of each m3 dword is replaced by the high
    ; word of the matching m2 dword, so m3 holds the upper 16 bits of
    ; both product vectors.
1988*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2
1989*c0909341SAndroid Build Coastguard Worker    mova            m2, m20
1990*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m14
1991*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z3, 255) - 256
    ; 256-entry byte lookup in two halves: k3 takes each index byte's
    ; sign bit and selects which half's result is kept.
1992*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
1993*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3
1994*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
1995*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x3
    ; Split x3: m2 = m3 with the bits of m15 cleared, m3 = m3 >> 16.
    ; Each half scales its own b3 * 455 vector.
1996*c0909341SAndroid Build Coastguard Worker    pandn           m2, m15, m3
1997*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16
1998*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
1999*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
2000*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
2001*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*2+4], m2
2002*c0909341SAndroid Build Coastguard Worker    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2003*c0909341SAndroid Build Coastguard Worker    psubd          m17, m15
2004*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
2005*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
    ; Store with the 128-bit lanes of m16 and m17 interleaved
    ; (m16 lane 0, m17 lane 0, m16 lane 1, ...).
2006*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+  8], xm16
2007*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+ 24], xm17
2008*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
2009*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
2010*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
2011*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
2012*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
2013*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
    ; Next 64 bytes of the row; done when the offset reaches 0.
2014*c0909341SAndroid Build Coastguard Worker    add            r10, 64
2015*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
2016*c0909341SAndroid Build Coastguard Worker    ret
2017*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2018*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
2019*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
2020*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2021*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2022*c0909341SAndroid Build Coastguard Worker    movq          xm16, [leftq+2]
2023*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
2024*c0909341SAndroid Build Coastguard Worker    add          leftq, 8
2025*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2026*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
2027*c0909341SAndroid Build Coastguard Worker    vpbroadcastw  xm16, [lpfq+wq]
2028*c0909341SAndroid Build Coastguard Worker    vmovdqu16  m16{k1}, [lpfq+wq-6]
2029*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
2030*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
2031*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]
2032*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
2033*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
2034*c0909341SAndroid Build Coastguard Worker.hv1_loop:
2035*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10- 2]
2036*c0909341SAndroid Build Coastguard Worker.hv1_main:
2037*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+14]
2038*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
2039*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
2040*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -68
2041*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
2042*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m0, [lpfq-2]
2043*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r13+r10+ 0], 0xe4
2044*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r13+r10+16], 0xe4
2045*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
2046*c0909341SAndroid Build Coastguard Worker    palignr         m1, m17, m16, 2
2047*c0909341SAndroid Build Coastguard Worker    palignr         m3, m17, m16, 4
2048*c0909341SAndroid Build Coastguard Worker    paddw           m2, m1, m3
2049*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m3
2050*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
2051*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m3
2052*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
2053*c0909341SAndroid Build Coastguard Worker    palignr         m3, m17, m16, 6
2054*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3             ; h sum3
2055*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m3, m6
2056*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m5, m5         ; h sumsq3
2057*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2058*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m3, m3
2059*c0909341SAndroid Build Coastguard Worker    shufpd          m3, m16, m17, 0x55
2060*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m16, m3
2061*c0909341SAndroid Build Coastguard Worker    paddw           m4, m16, m3
2062*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m3
2063*c0909341SAndroid Build Coastguard Worker    paddw          m17, m2, [t2+r10+416* 6]
2064*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 6], m2
2065*c0909341SAndroid Build Coastguard Worker    paddw           m4, m2             ; h sum5
2066*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t2+r10+416* 8]
2067*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t2+r10+416*10]
2068*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 8], m0
2069*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*10], m1
2070*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m5, m5         ; h sumsq5
2071*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m16, m16
2072*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
2073*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
2074*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a3 + 8) >> 4
2075*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2076*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m9             ; -((a3 + 8) >> 4) * 9
2077*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m9
2078*c0909341SAndroid Build Coastguard Worker    psrlw          m16, m17, 1
2079*c0909341SAndroid Build Coastguard Worker    pavgw          m16, m6             ; (b3 + 2) >> 2
2080*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m16, m6
2081*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m5, m5         ; -p3
2082*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m6
2083*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m16, m16
2084*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m17        ; b3
2085*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m17
2086*c0909341SAndroid Build Coastguard Worker    pminsd          m2, m6
2087*c0909341SAndroid Build Coastguard Worker    pminsd          m3, m6
2088*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m12            ; p3 * s1
2089*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m12
2090*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m13            ; b3 * 455
2091*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m13
2092*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2
2093*c0909341SAndroid Build Coastguard Worker    mova            m2, m20
2094*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m14
2095*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z3, 255) - 256
2096*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
2097*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3
2098*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
2099*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x3
2100*c0909341SAndroid Build Coastguard Worker    pandn           m2, m15, m3
2101*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16
2102*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
2103*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
2104*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
2105*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*4+4], m2
2106*c0909341SAndroid Build Coastguard Worker    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2107*c0909341SAndroid Build Coastguard Worker    psubd          m17, m15
2108*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
2109*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
2110*c0909341SAndroid Build Coastguard Worker    paddw           m5, m4, [t2+r10+416*0]
2111*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t2+r10+416*2]
2112*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t2+r10+416*4]
2113*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t1+r10+416*0]
2114*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+416*2]
2115*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10+416*4]
2116*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m4
2117*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m0
2118*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m1
2119*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*8+  8], xm16
2120*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*8+ 24], xm17
2121*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*8+ 40], ym16, 1
2122*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*8+ 56], ym17, 1
2123*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
2124*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
2125*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+104], m16, 3
2126*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+120], m17, 3
2127*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
2128*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
2129*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a5 + 8) >> 4
2130*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2131*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m10            ; -((a5 + 8) >> 4) * 25
2132*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m10
2133*c0909341SAndroid Build Coastguard Worker    psrlw          m17, m5, 1
2134*c0909341SAndroid Build Coastguard Worker    pavgw          m17, m6             ; (b5 + 2) >> 2
2135*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m6
2136*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16       ; -p5
2137*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6
2138*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
2139*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m5, m6         ; b5
2140*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m5, m6
2141*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m11            ; p5 * s0
2142*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m11
2143*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m13            ; b5 * 164
2144*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m13
2145*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2
2146*c0909341SAndroid Build Coastguard Worker    mova            m2, m20
2147*c0909341SAndroid Build Coastguard Worker    pmaxsw          m3, m6
2148*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m14
2149*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z5, 255) - 256
2150*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
2151*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3
2152*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
2153*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x5
2154*c0909341SAndroid Build Coastguard Worker    pandn           m2, m15, m3
2155*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16
2156*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
2157*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
2158*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
2159*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*0+4], m2
2160*c0909341SAndroid Build Coastguard Worker    psubd          m16, m15            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
2161*c0909341SAndroid Build Coastguard Worker    psubd          m17, m15
2162*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
2163*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
2164*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+  8], xm16
2165*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+ 24], xm17
2166*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
2167*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
2168*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
2169*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
2170*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
2171*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
2172*c0909341SAndroid Build Coastguard Worker    add            r10, 64
2173*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
2174*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
2175*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2176*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2177*c0909341SAndroid Build Coastguard Worker    ret
2178*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows)
    ; Builds a3/b3 from the row sums at [t1] and [t2] (word row 416*6 -> b3,
    ; dword rows 416*8 and 416*10 -> a3), turns them into the x3 / scaled-b3
    ; pair and stores those at t4+416*2 (words) and t3+416*4 (dwords).
    ; The [t1] rows 416*0/2/4 are copied into t3 scratch slots and then doubled
    ; in place; .v1 reads the same three t3 slots back when it forms a5/b5.
    ; m6, m8, m9, m12-m15, m18-m21 and k2 are loaded outside this view; the
    ; roles mentioned below are inferred from the existing comments.
    ; Clobbers r10, m2-m5, m16, m17, k3. Handles 64 bytes of r10 per iteration.
2179*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]         ; start offset; the loop repeats only while r10 < 0
2180*c0909341SAndroid Build Coastguard Worker.v0_loop:
2181*c0909341SAndroid Build Coastguard Worker    mova           m16, [t1+r10+416* 6]
2182*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+416* 8]
2183*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+416*10]
2184*c0909341SAndroid Build Coastguard Worker    paddw          m16, m16            ; the [t1] rows are counted twice here
2185*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
2186*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
2187*c0909341SAndroid Build Coastguard Worker    paddw          m17, m16, [t2+r10+416* 6] ; b3 = 2*[t1] + [t2]
2188*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+r10+416* 8]  ; a3 = 2*[t1] + [t2]
2189*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+r10+416*10]
2190*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 6], m16          ; [t2] now holds the doubled [t1] rows
2191*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 8], m2
2192*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*10], m3
2193*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8             ; rounding term (m8 presumably holds 8 - confirm)
2194*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
2195*c0909341SAndroid Build Coastguard Worker    psrld           m4, 4              ; (a3 + 8) >> 4
2196*c0909341SAndroid Build Coastguard Worker    psrld           m5, 4
2197*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9             ; -((a3 + 8) >> 4) * 9
2198*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
2199*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m17, 1
2200*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b3 + 2) >> 2
2201*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6         ; widen words to dwords by interleaving with m6
2202*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m4, m2, m2         ; -p3
2203*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2204*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m5, m3, m3
2205*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m17        ; b3
2206*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m17        ; b3 sits in the upper word of each dword here
2207*c0909341SAndroid Build Coastguard Worker    pminsd          m4, m6             ; signed clamp against m6 (presumably zero - confirm)
2208*c0909341SAndroid Build Coastguard Worker    pminsd          m5, m6
2209*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m12            ; p3 * s1
2210*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m12
2211*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m13            ; b3 * 455
2212*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m13
2213*c0909341SAndroid Build Coastguard Worker    vpalignr    m5{k2}, m4, m4, 2      ; merge m4 bytes (rotated down by 2) into m5 under k2
2214*c0909341SAndroid Build Coastguard Worker    mova            m4, m20            ; vpermt2b overwrites its first operand, so use a copy
2215*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m14
2216*c0909341SAndroid Build Coastguard Worker    psraw           m5, 4              ; min(z3, 255) - 256
2217*c0909341SAndroid Build Coastguard Worker    vpermt2b        m4, m5, m21        ; sgr_x_by_x[128..255]
2218*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m5             ; k3 = top bit of every index byte
2219*c0909341SAndroid Build Coastguard Worker    vpermi2b        m5, m18, m19       ; sgr_x_by_x[  0..127]
2220*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m5{k3}, m4             ; x3
2221*c0909341SAndroid Build Coastguard Worker    pandn           m4, m15, m5        ; m4 = ~m15 & m5 (keeps the lower-word x3 values)
2222*c0909341SAndroid Build Coastguard Worker    psrld           m5, 16             ; m5 = upper-word x3 values
2223*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m4
2224*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m5
2225*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5
2226*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*2+4], m4        ; x3 words for this row
2227*c0909341SAndroid Build Coastguard Worker    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2228*c0909341SAndroid Build Coastguard Worker    psubd          m17, m15
2229*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
2230*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
2231*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+416*0]
2232*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1+r10+416*2]
2233*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10+416*4]
2234*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*8+ 8], m3       ; undoubled [t1] rows kept in t3 scratch slots
2235*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*0+ 8], m4
2236*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*0+72], m5
2237*c0909341SAndroid Build Coastguard Worker    paddw           m3, m3              ; cc5
2238*c0909341SAndroid Build Coastguard Worker    paddd           m4, m4
2239*c0909341SAndroid Build Coastguard Worker    paddd           m5, m5
2240*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*0], m3            ; [t1] rows 0/2/4 are left doubled
2241*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*2], m4
2242*c0909341SAndroid Build Coastguard Worker    mova [t1+r10+416*4], m5
    ; m16/m17 are written to t3 in alternating 16-byte lanes.
2243*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+  8], xm16
2244*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*4+ 24], xm17
2245*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
2246*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
2247*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
2248*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
2249*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
2250*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
2251*c0909341SAndroid Build Coastguard Worker    add            r10, 64
2252*c0909341SAndroid Build Coastguard Worker    jl .v0_loop                        ; repeat while r10 is still negative
2253*c0909341SAndroid Build Coastguard Worker    ret
2254*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
    ; Two stages per 64-byte step:
    ;   1. a3/b3 = [t1] + [t2] rows 416*6/8/10 (no doubling, unlike .v0);
    ;      results go to t4+416*4 (x3 words) and t3+416*8 (dwords).
    ;   2. a5/b5 = t3 scratch rows + [t2] + [t1] rows 416*0/2/4;
    ;      results go to t4+416*0 (x5 words) and t3+416*0 (dwords).
    ; On exit t1 and t2 are exchanged.
    ; m6, m8-m15, m18-m21 and k2 are loaded outside this view; the roles
    ; mentioned below are inferred from the existing comments.
    ; Clobbers r10, m0-m5, m16, m17, k3.
2255*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-4]         ; start offset; the loop repeats only while r10 < 0
2256*c0909341SAndroid Build Coastguard Worker.v1_loop:
2257*c0909341SAndroid Build Coastguard Worker    mova           m16, [t1+r10+416* 6]
2258*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10+416* 8]
2259*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10+416*10]
2260*c0909341SAndroid Build Coastguard Worker    paddw          m17, m16, [t2+r10+416* 6] ; b3 = [t1] + [t2]
2261*c0909341SAndroid Build Coastguard Worker    paddd           m4, m2, [t2+r10+416* 8]  ; a3 = [t1] + [t2]
2262*c0909341SAndroid Build Coastguard Worker    paddd           m5, m3, [t2+r10+416*10]
2263*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 6], m16          ; copy the [t1] rows into [t2]
2264*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416* 8], m2
2265*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*10], m3
2266*c0909341SAndroid Build Coastguard Worker    paddd           m4, m8             ; rounding term (m8 presumably holds 8 - confirm)
2267*c0909341SAndroid Build Coastguard Worker    paddd           m5, m8
2268*c0909341SAndroid Build Coastguard Worker    psrld           m4, 4              ; (a3 + 8) >> 4
2269*c0909341SAndroid Build Coastguard Worker    psrld           m5, 4
2270*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m9              ; -((a3 + 8) >> 4) * 9
2271*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m9
2272*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m17, 1
2273*c0909341SAndroid Build Coastguard Worker    pavgw           m3, m6             ; (b3 + 2) >> 2
2274*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m6         ; widen words to dwords by interleaving with m6
2275*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m4, m2, m2         ; -p3
2276*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m6
2277*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m5, m3, m3
2278*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m6, m17        ; b3
2279*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m6, m17        ; b3 sits in the upper word of each dword here
2280*c0909341SAndroid Build Coastguard Worker    pminsd          m4, m6             ; signed clamp against m6 (presumably zero - confirm)
2281*c0909341SAndroid Build Coastguard Worker    pminsd          m5, m6
2282*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m12            ; p3 * s1
2283*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m12
2284*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m13            ; b3 * 455
2285*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m13
2286*c0909341SAndroid Build Coastguard Worker    vpalignr    m5{k2}, m4, m4, 2      ; merge m4 bytes (rotated down by 2) into m5 under k2
2287*c0909341SAndroid Build Coastguard Worker    mova            m4, m20            ; vpermt2b overwrites its first operand, so use a copy
2288*c0909341SAndroid Build Coastguard Worker    paddusw         m5, m14
2289*c0909341SAndroid Build Coastguard Worker    psraw           m5, 4              ; min(z3, 255) - 256
2290*c0909341SAndroid Build Coastguard Worker    vpermt2b        m4, m5, m21        ; sgr_x_by_x[128..255]
2291*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m5             ; k3 = top bit of every index byte
2292*c0909341SAndroid Build Coastguard Worker    vpermi2b        m5, m18, m19       ; sgr_x_by_x[  0..127]
2293*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m5{k3}, m4             ; x3
2294*c0909341SAndroid Build Coastguard Worker    pandn           m4, m15, m5        ; m4 = ~m15 & m5 (keeps the lower-word x3 values)
2295*c0909341SAndroid Build Coastguard Worker    psrld           m5, 16             ; m5 = upper-word x3 values
2296*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m4
2297*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m5
2298*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m5
2299*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*4+4], m4        ; x3 words for this row
2300*c0909341SAndroid Build Coastguard Worker    psubd          m16, m15            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
2301*c0909341SAndroid Build Coastguard Worker    psubd          m17, m15
2302*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
2303*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
    ; These three t3 slots are the ones .v0 fills with [t1] rows 0/2/4. They
    ; must be read here, before the stores below reuse the same addresses.
2304*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*2+416*8+ 8]
2305*c0909341SAndroid Build Coastguard Worker    mova            m4, [t3+r10*2+416*0+ 8]
2306*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*2+416*0+72]
2307*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10+416*0]   ; b5 = scratch + [t2] + [t1]
2308*c0909341SAndroid Build Coastguard Worker    paddd           m2, m4, [t2+r10+416*2]   ; a5 = scratch + [t2] + [t1]
2309*c0909341SAndroid Build Coastguard Worker    paddd           m3, m5, [t2+r10+416*4]
2310*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10+416*0]
2311*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10+416*2]
2312*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10+416*4]
2313*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*0], m0            ; scratch rows become the new [t2] rows 0/2/4
2314*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*2], m4
2315*c0909341SAndroid Build Coastguard Worker    mova [t2+r10+416*4], m5
    ; m16/m17 (stage 1 result) are written to t3 in alternating 16-byte lanes.
2316*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*8+  8], xm16
2317*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*8+ 24], xm17
2318*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*8+ 40], ym16, 1
2319*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*8+ 56], ym17, 1
2320*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
2321*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
2322*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+104], m16, 3
2323*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*8+120], m17, 3
2324*c0909341SAndroid Build Coastguard Worker    paddd           m2, m8
2325*c0909341SAndroid Build Coastguard Worker    paddd           m3, m8
2326*c0909341SAndroid Build Coastguard Worker    psrld           m2, 4              ; (a5 + 8) >> 4
2327*c0909341SAndroid Build Coastguard Worker    psrld           m3, 4
2328*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m10            ; -((a5 + 8) >> 4) * 25
2329*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m10
2330*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m1, 1
2331*c0909341SAndroid Build Coastguard Worker    pavgw           m5, m6             ; (b5 + 2) >> 2
2332*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5, m6
2333*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m4, m4         ; -p5
2334*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m6
2335*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m5, m5
2336*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m1, m6         ; b5
2337*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m1, m6         ; b5 sits in the lower word here (b3 used the upper)
2338*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m11            ; p5 * s0
2339*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m11
2340*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m13            ; b5 * 164
2341*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m13
2342*c0909341SAndroid Build Coastguard Worker    vpalignr    m3{k2}, m2, m2, 2      ; merge m2 bytes (rotated down by 2) into m3 under k2
2343*c0909341SAndroid Build Coastguard Worker    mova            m2, m20
2344*c0909341SAndroid Build Coastguard Worker    pmaxsw          m3, m6             ; signed word max against m6 before the saturating add
2345*c0909341SAndroid Build Coastguard Worker    paddusw         m3, m14
2346*c0909341SAndroid Build Coastguard Worker    psraw           m3, 4              ; min(z5, 255) - 256
2347*c0909341SAndroid Build Coastguard Worker    vpermt2b        m2, m3, m21        ; sgr_x_by_x[128..255]
2348*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m3
2349*c0909341SAndroid Build Coastguard Worker    vpermi2b        m3, m18, m19       ; sgr_x_by_x[  0..127]
2350*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k3}, m2             ; x5
2351*c0909341SAndroid Build Coastguard Worker    pandn           m2, m15, m3
2352*c0909341SAndroid Build Coastguard Worker    psrld           m3, 16
2353*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2
2354*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3
2355*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
2356*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*0+4], m2        ; x5 words for this row
2357*c0909341SAndroid Build Coastguard Worker    psubd          m16, m15            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
2358*c0909341SAndroid Build Coastguard Worker    psubd          m17, m15
2359*c0909341SAndroid Build Coastguard Worker    psrld          m16, 12
2360*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
2361*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+  8], xm16
2362*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*2+416*0+ 24], xm17
2363*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 40], ym16, 1
2364*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*2+416*0+ 56], ym17, 1
2365*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
2366*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
2367*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
2368*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
2369*c0909341SAndroid Build Coastguard Worker    add            r10, 64
2370*c0909341SAndroid Build Coastguard Worker    jl .v1_loop                        ; repeat while r10 is still negative
2371*c0909341SAndroid Build Coastguard Worker    mov            r10, t2             ; exchange t1 and t2, using r10 as the temporary
2372*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2373*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2374*c0909341SAndroid Build Coastguard Worker    ret
2375*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
    ; Seeds the horizontally weighted neighbor rows that .n0/.n1 later combine.
    ; Each step covers 32 bytes of t4 (words) and 64 bytes of t3 (dwords).
    ; Weights are applied to the entries at byte offsets +0 / +2 / +4 in t4
    ; (+0 / +4 / +8 in t3), i.e. left / center / right:
    ;   row t4+416*0, t3+416*0 -> 5,6,5 -> t4+416* 6, t3+416*12
    ;   row t4+416*2, t3+416*4 -> 3,4,3 -> t4+416* 8, t3+416*16
    ;   row t4+416*4, t3+416*8 -> 4,4,4 -> t4+416*10, t3+416*20
    ;                             3,4,3 -> t4+416*12, t3+416*24
    ; Clobbers r10, m0-m3.
2376*c0909341SAndroid Build Coastguard Worker    mov            r10, wq             ; start offset; the loop repeats only while r10 < 0
2377*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
2378*c0909341SAndroid Build Coastguard Worker    movu           ym0, [t4+r10*1+416*0+2]   ; center
2379*c0909341SAndroid Build Coastguard Worker    paddw          ym2, ym0, [t4+r10*1+416*0+0]
2380*c0909341SAndroid Build Coastguard Worker    paddw          ym2, [t4+r10*1+416*0+4]   ; left + center + right
2381*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+416*0+4]
2382*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+416*0+0]
2383*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*2+416*0+8]
2384*c0909341SAndroid Build Coastguard Worker    paddw          ym0, ym2              ; center + (l + c + r)
2385*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
2386*c0909341SAndroid Build Coastguard Worker    psllw          ym2, 2                ; 4 * (l + c + r)
2387*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2
2388*c0909341SAndroid Build Coastguard Worker    paddw          ym0, ym2              ; a5 565
2389*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3               ; b5 565
2390*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416* 6], ym0
2391*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*12], m1
2392*c0909341SAndroid Build Coastguard Worker    mova           ym0, [t4+r10*1+416*2+0]
2393*c0909341SAndroid Build Coastguard Worker    paddw          ym0, [t4+r10*1+416*2+4]   ; left + right
2394*c0909341SAndroid Build Coastguard Worker    paddw          ym2, ym0, [t4+r10*1+416*2+2]
2395*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+416*4+0]
2396*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+416*4+8]
2397*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+416*4+4]
2398*c0909341SAndroid Build Coastguard Worker    psllw          ym2, 2                ; a3[-1] 444
2399*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2                ; b3[-1] 444
2400*c0909341SAndroid Build Coastguard Worker    psubw          ym2, ym0              ; a3[-1] 343
2401*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1               ; b3[-1] 343
2402*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416* 8], ym2
2403*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*16], m3
2404*c0909341SAndroid Build Coastguard Worker    mova           ym0, [t4+r10*1+416*4+0]
2405*c0909341SAndroid Build Coastguard Worker    paddw          ym0, [t4+r10*1+416*4+4]   ; left + right
2406*c0909341SAndroid Build Coastguard Worker    paddw          ym2, ym0, [t4+r10*1+416*4+2]
2407*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+416*8+0]
2408*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+416*8+8]
2409*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*2+416*8+4]
2410*c0909341SAndroid Build Coastguard Worker    psllw          ym2, 2                 ; a3[ 0] 444
2411*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2                 ; b3[ 0] 444
2412*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*10], ym2           ; the 444 form is stored as well as the 343 form
2413*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*20], m3
2414*c0909341SAndroid Build Coastguard Worker    psubw          ym2, ym0               ; a3[ 0] 343
2415*c0909341SAndroid Build Coastguard Worker    psubd           m3, m1                ; b3[ 0] 343
2416*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*12], ym2
2417*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*24], m3
2418*c0909341SAndroid Build Coastguard Worker    add            r10, 32
2419*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop                       ; repeat while r10 is still negative
2420*c0909341SAndroid Build Coastguard Worker    ret
2421*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2422*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
    ; Produces one output row, 32 bytes of [dstq] per step:
    ;   a5/b5 = new 565 row (from t4+0 / t3+0) + stored 565 row
    ;           (t4+416*6 / t3+416*12); the new row replaces the stored one.
    ;   a3/b3 = new 343 row (from t4+416*2 / t3+416*4) + stored 343 row
    ;           (t4+416*8 / t3+416*16) + stored 444 row (t4+416*10 / t3+416*20);
    ;           the new 343 and 444 rows replace the stored ones.
    ; Both terms are applied to the values read from [dstq] (called src in the
    ; comments), combined through m7, clipped and written back to [dstq].
    ; dstq is advanced by strideq on exit.
    ; m6, m7, m22 and k2 are loaded outside this view.
    ; Clobbers r10, m0-m5, m16.
2423*c0909341SAndroid Build Coastguard Worker    mov            r10, wq             ; start offset; the loop repeats only while r10 < 0
2424*c0909341SAndroid Build Coastguard Worker.n0_loop:
2425*c0909341SAndroid Build Coastguard Worker    movu           ym2, [t4+r10*1+2]     ; center
2426*c0909341SAndroid Build Coastguard Worker    paddw          ym0, ym2, [t4+r10*1+0]
2427*c0909341SAndroid Build Coastguard Worker    paddw          ym0, [t4+r10*1+4]     ; left + center + right
2428*c0909341SAndroid Build Coastguard Worker    paddw          ym2, ym0
2429*c0909341SAndroid Build Coastguard Worker    psllw          ym0, 2
2430*c0909341SAndroid Build Coastguard Worker    paddw          ym0, ym2              ; a5
2431*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*2+4]
2432*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1, [t3+r10*2+0]
2433*c0909341SAndroid Build Coastguard Worker    paddd           m4, [t3+r10*2+8]
2434*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4
2435*c0909341SAndroid Build Coastguard Worker    pslld           m4, 2
2436*c0909341SAndroid Build Coastguard Worker    paddd           m4, m1               ; b5
2437*c0909341SAndroid Build Coastguard Worker    paddw          ym2, ym0, [t4+r10*1+416* 6] ; new + stored a5 row
2438*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416* 6], ym0
2439*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4, [t3+r10*2+416*12]  ; new + stored b5 row
2440*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*12], m4
2441*c0909341SAndroid Build Coastguard Worker    mova           ym3, [t4+r10*1+416*2+0]
2442*c0909341SAndroid Build Coastguard Worker    paddw          ym3, [t4+r10*1+416*2+4]
2443*c0909341SAndroid Build Coastguard Worker    paddw          ym5, ym3, [t4+r10*1+416*2+2]
2444*c0909341SAndroid Build Coastguard Worker    psllw          ym5, 2                ; a3[ 1] 444
2445*c0909341SAndroid Build Coastguard Worker    psubw          ym4, ym5, ym3         ; a3[ 1] 343
2446*c0909341SAndroid Build Coastguard Worker    paddw          ym3, ym4, [t4+r10*1+416* 8]
2447*c0909341SAndroid Build Coastguard Worker    paddw          ym3, [t4+r10*1+416*10]
2448*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416* 8], ym4
2449*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*10], ym5
2450*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+416*4+0]
2451*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+416*4+8]
2452*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1, [t3+r10*2+416*4+4]
2453*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
2454*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m1           ; b3[ 1] 343
2455*c0909341SAndroid Build Coastguard Worker    paddd           m1, m4, [t3+r10*2+416*16]
2456*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*2+416*20]
2457*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*16], m4
2458*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*20], m5
2459*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m4, [dstq+r10]       ; src, zero-extended to dwords
2460*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m2, ym2              ; a5
2461*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m3, ym3              ; a3
2462*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4               ; a5 * src
2463*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4               ; a3 * src
2464*c0909341SAndroid Build Coastguard Worker    vpshldd         m4, m22, 13          ; funnel-shift src left by 13, filling from m22
2465*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
2466*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
2467*c0909341SAndroid Build Coastguard Worker    psrld           m0, 9
2468*c0909341SAndroid Build Coastguard Worker    pslld           m1, 7
2469*c0909341SAndroid Build Coastguard Worker    vpblendmb   m0{k2}, m1, m0           ; bytes selected by k2 keep m0, the rest come from m1
2470*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m4, m0, m7           ; accumulate word-pair products with m7 (presumably the filter weights)
2471*c0909341SAndroid Build Coastguard Worker    psrad           m4, 7
2472*c0909341SAndroid Build Coastguard Worker    pmaxsd          m4, m6               ; signed lower bound from m6 (presumably zero - confirm)
2473*c0909341SAndroid Build Coastguard Worker    vpmovusdw     ym16, m4               ; clip
2474*c0909341SAndroid Build Coastguard Worker    psrlw         ym16, 6
2475*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], ym16
2476*c0909341SAndroid Build Coastguard Worker    add            r10, 32
2477*c0909341SAndroid Build Coastguard Worker    jl .n0_loop                          ; repeat while r10 is still negative
2478*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq          ; step to the next output row
2479*c0909341SAndroid Build Coastguard Worker    ret
2480*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2481*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
    ; Produces one output row, 32 bytes of [dstq] per step:
    ;   a5/b5 = the stored 565 row at t4+416*6 / t3+416*12, used as is
    ;           (nothing is recomputed or written back for it here).
    ;   a3/b3 = new 343 row (from t4+416*4 / t3+416*8) + stored 343 row
    ;           (t4+416*12 / t3+416*24) + stored 444 row (t4+416*10 / t3+416*20);
    ;           the new 343 and 444 rows replace the stored ones.
    ; Both terms are applied to the values read from [dstq] (called src in the
    ; comments), combined through m7, clipped and written back to [dstq].
    ; dstq is advanced by strideq on exit.
    ; m6, m7, m22 and k2 are loaded outside this view.
    ; Clobbers r10, m0-m5, m16.
2482*c0909341SAndroid Build Coastguard Worker    mov            r10, wq             ; start offset; the loop repeats only while r10 < 0
2483*c0909341SAndroid Build Coastguard Worker.n1_loop:
2484*c0909341SAndroid Build Coastguard Worker    mova           ym3, [t4+r10*1+416*4+0]
2485*c0909341SAndroid Build Coastguard Worker    paddw          ym3, [t4+r10*1+416*4+4]   ; left + right
2486*c0909341SAndroid Build Coastguard Worker    paddw          ym5, ym3, [t4+r10*1+416*4+2]
2487*c0909341SAndroid Build Coastguard Worker    psllw          ym5, 2                ; a3[ 1] 444
2488*c0909341SAndroid Build Coastguard Worker    psubw          ym4, ym5, ym3         ; a3[ 1] 343
2489*c0909341SAndroid Build Coastguard Worker    paddw          ym3, ym4, [t4+r10*1+416*12]
2490*c0909341SAndroid Build Coastguard Worker    paddw          ym3, [t4+r10*1+416*10]
2491*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*10], ym5
2492*c0909341SAndroid Build Coastguard Worker    mova [t4+r10*1+416*12], ym4
2493*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*2+416*8+0]
2494*c0909341SAndroid Build Coastguard Worker    paddd           m0, [t3+r10*2+416*8+8]
2495*c0909341SAndroid Build Coastguard Worker    paddd           m5, m0, [t3+r10*2+416*8+4]
2496*c0909341SAndroid Build Coastguard Worker    pslld           m5, 2                ; b3[ 1] 444
2497*c0909341SAndroid Build Coastguard Worker    psubd           m4, m5, m0           ; b3[ 1] 343
2498*c0909341SAndroid Build Coastguard Worker    paddd           m0, m4, [t3+r10*2+416*24]
2499*c0909341SAndroid Build Coastguard Worker    paddd           m0, [t3+r10*2+416*20]
2500*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*20], m5
2501*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*2+416*24], m4
2502*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m4, [dstq+r10]       ; src, zero-extended to dwords
2503*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m2, [t4+r10*1+416* 6] ; stored a5 row
2504*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m3, ym3              ; a3
2505*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+416*12] ; stored b5 row
2506*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m4               ; a5 * src
2507*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m4               ; a3 * src
2508*c0909341SAndroid Build Coastguard Worker    vpshldd         m4, m22, 13          ; funnel-shift src left by 13, filling from m22
2509*c0909341SAndroid Build Coastguard Worker    psubd           m1, m2               ; b5 - a5 * src + (1 << 8)
2510*c0909341SAndroid Build Coastguard Worker    psubd           m0, m3               ; b3 - a3 * src + (1 << 8)
2511*c0909341SAndroid Build Coastguard Worker    pslld           m0, 7
2512*c0909341SAndroid Build Coastguard Worker    vpalignr    m0{k2}, m1, m1, 1        ; merge m1 bytes (rotated down by 1) into m0 under k2
2513*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m4, m0, m7           ; accumulate word-pair products with m7 (presumably the filter weights)
2514*c0909341SAndroid Build Coastguard Worker    psrad           m4, 7
2515*c0909341SAndroid Build Coastguard Worker    pmaxsd          m4, m6               ; signed lower bound from m6 (presumably zero - confirm)
2516*c0909341SAndroid Build Coastguard Worker    vpmovusdw     ym16, m4               ; clip
2517*c0909341SAndroid Build Coastguard Worker    psrlw         ym16, 6
2518*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], ym16
2519*c0909341SAndroid Build Coastguard Worker    add            r10, 32
2520*c0909341SAndroid Build Coastguard Worker    jl .n1_loop                          ; repeat while r10 is still negative
2521*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq          ; step to the next output row
2522*c0909341SAndroid Build Coastguard Worker    ret
2523*c0909341SAndroid Build Coastguard Worker
2524*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
2525