xref: /aosp_15_r20/external/libdav1d/src/x86/looprestoration_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 32
32*c0909341SAndroid Build Coastguard Worker
; pshufb controls for the horizontal Wiener pass. Each table is broadcast to
; every 128-bit lane (vbroadcasti32x4) and used to gather source bytes into the
; 4-byte groups consumed by vpdpbusd.
33*c0909341SAndroid Build Coastguard Workerwiener_shufA:  db  1,  2,  7,  6,  3,  4,  9,  8,  5,  6, 11, 10,  7,  8, 13, 12
34*c0909341SAndroid Build Coastguard Workerwiener_shufB:  db  2,  3,  8,  7,  4,  5, 10,  9,  6,  7, 12, 11,  8,  9, 14, 13
35*c0909341SAndroid Build Coastguard Workerwiener_shufC:  db  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11
36*c0909341SAndroid Build Coastguard Workerwiener_shufD:  db  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12
; 32-byte permutation loaded into ym18 by the small-width (.w32) path.
37*c0909341SAndroid Build Coastguard Workerwiener_perm32: db  1,  9,  3, 11,  5, 13,  7, 15, 33, 41, 35, 43, 37, 45, 39, 47
38*c0909341SAndroid Build Coastguard Worker               db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
; NOTE(review): the sgr_* tables are not referenced by the Wiener code in this
; part of the file; presumably they belong to the self-guided filter - confirm.
39*c0909341SAndroid Build Coastguard Workersgr_shuf:      db 128, 1, -1,  2,132,  3, -1,  4,136,  5, -1,  6,140,  7, -1,  8
40*c0909341SAndroid Build Coastguard Worker               db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1,  0,128
41*c0909341SAndroid Build Coastguard Workersgr_mix_perm:  db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
; 68 bytes of 0xff followed by 4 bytes of 0x00. Read at a variable offset to
; form the "keep pixel / use padding" select mask for right-edge extension.
42*c0909341SAndroid Build Coastguard Workerr_ext_mask:    times 68 db -1
43*c0909341SAndroid Build Coastguard Worker               times  4 db  0
; Applied (pshufb, then paddb) to the broadcast coefficients at [flt+4] to
; produce the byte layout "x2 x3+1 x2 127" noted at the use site.
44*c0909341SAndroid Build Coastguard Workerwiener_x_shuf: db  0,  2, -1,  0
45*c0909341SAndroid Build Coastguard Workerwiener_x_add:  db  0,  1,127,  0
46*c0909341SAndroid Build Coastguard Worker
; Scalar constants, broadcast with vpbroadcastd where used.
;   pd_m16380  - initial accumulator value for the horizontal vpdpbusd sums
;   pd_8421376 - initial accumulator value for the vertical vpdpwssd sums
; The remaining constants are not referenced by the code visible here.
47*c0909341SAndroid Build Coastguard Workerpw_61448:      times 2 dw 61448
48*c0909341SAndroid Build Coastguard Workerpw_164_455:    dw 164, 455
49*c0909341SAndroid Build Coastguard Workerpd_m16380:     dd -16380
50*c0909341SAndroid Build Coastguard Workerpd_m4096:      dd -4096
51*c0909341SAndroid Build Coastguard Workerpd_m25:        dd -25
52*c0909341SAndroid Build Coastguard Workerpd_m9:         dd -9
53*c0909341SAndroid Build Coastguard Workerpd_34816:      dd 34816
54*c0909341SAndroid Build Coastguard Workerpd_8421376:    dd 8421376
55*c0909341SAndroid Build Coastguard Worker
56*c0909341SAndroid Build Coastguard Workercextern sgr_x_by_x
57*c0909341SAndroid Build Coastguard Worker
58*c0909341SAndroid Build Coastguard WorkerSECTION .text
59*c0909341SAndroid Build Coastguard Worker
60*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
61*c0909341SAndroid Build Coastguard Worker
62*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
; wiener_filter7_8bpc(dst, stride, left, lpf, w, h, edge, flt)
;
; Entry point and row driver of the 7-tap Wiener filter for 8-bit pixels
; (AVX-512, ZMM registers). cglobal operands follow the x86inc convention:
; 4 arguments loaded into registers, 15 GPRs, 20 vector registers, and a stack
; request of 384*12+16 bytes (see x86inc.asm for the meaning of the negative
; sign). w, h, edge and flt are fetched explicitly below.
;
; Stack layout as used here:
;   [rsp]      saved pointer to the rows below the block ("below")
;   [rsp+16..] intermediate 16-bit rows, 384*2 bytes apart, addressed through
;              the ring-buffer pointers t0-t6
;
; Constant registers set up in the prologue:
;   m6-m9   wiener_shufA..D             k1  0xfffe (all dwords but the first)
;   m10     pd_m16380                   k2  0xaaaa... (every odd byte)
;   m11     horizontal coeffs x0 x1     m12 horizontal coeffs x2 x3+1 x2 127
;   m13     pd_8421376
;   m14     vertical coeffs y0 y1 << 5  m15 vertical coeffs y2 y3 << 5
;
; Widths <= 32 branch to .w32. Otherwise lpf and dst are advanced by w and w is
; negated, so the row loops run a column index from -w up to 0.
63*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
64*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, flt
65*c0909341SAndroid Build Coastguard Worker    mov           fltq, r6mp
66*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
67*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
68*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
69*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m6, [wiener_shufA]
70*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m7, [wiener_shufB]
71*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfffe
72*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m8, [wiener_shufC]
73*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m9, [wiener_shufD]
74*c0909341SAndroid Build Coastguard Worker    kmovw           k1, r10d
75*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [wiener_x_shuf]
76*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m1, [wiener_x_add]
77*c0909341SAndroid Build Coastguard Worker    mov            r10, 0xaaaaaaaaaaaaaaaa
78*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [fltq+ 0]
79*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [fltq+ 4]
80*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10
81*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pd_m16380]
82*c0909341SAndroid Build Coastguard Worker    packsswb       m11, m11 ; x0   x1   x0   x1
83*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [fltq+16]
84*c0909341SAndroid Build Coastguard Worker    pshufb         m12, m0
85*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [fltq+20]
86*c0909341SAndroid Build Coastguard Worker    paddb          m12, m1  ; x2   x3+1 x2   127
87*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [pd_8421376]
88*c0909341SAndroid Build Coastguard Worker    psllw          m14, 5   ; y0 y1
89*c0909341SAndroid Build Coastguard Worker    psllw          m15, 5   ; y2 y3
90*c0909341SAndroid Build Coastguard Worker    cmp             wd, 32  ; the minimum lr unit size for chroma in 4:2:0 is 32
91*c0909341SAndroid Build Coastguard Worker    jle .w32                ; pixels, so we need a special case for small widths
; t1 points at the end of the first intermediate row so that negative column
; offsets (r10*2) index into it.
92*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+16]
93*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
94*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
95*c0909341SAndroid Build Coastguard Worker    neg             wq
96*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
97*c0909341SAndroid Build Coastguard Worker    jz .no_top
; Top rows available: filter two rows from lpf horizontally (t6/t5 share the
; first one, t4 gets the second), then switch lpf over to the dst rows.
98*c0909341SAndroid Build Coastguard Worker    call .h_top
99*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
100*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
101*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
102*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
103*c0909341SAndroid Build Coastguard Worker    call .h_top
; lpf was already advanced by one stride, so r10 ends up 6 strides past the
; incoming lpf - the same address the .no_top path stores.
104*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
105*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
106*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
107*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
108*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
109*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
; Prime the ring with up to three horizontally filtered dst rows. Blocks with
; h of 1, 2 or 3 leave early for the matching number of vertical-only passes.
110*c0909341SAndroid Build Coastguard Worker    call .h
111*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
112*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
113*c0909341SAndroid Build Coastguard Worker    dec             hd
114*c0909341SAndroid Build Coastguard Worker    jz .v1
115*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
116*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
117*c0909341SAndroid Build Coastguard Worker    call .h
118*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
119*c0909341SAndroid Build Coastguard Worker    dec             hd
120*c0909341SAndroid Build Coastguard Worker    jz .v2
121*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
122*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
123*c0909341SAndroid Build Coastguard Worker    call .h
124*c0909341SAndroid Build Coastguard Worker    dec             hd
125*c0909341SAndroid Build Coastguard Worker    jz .v3
; Steady state: each .hv call filters one new row into t0, emits one output
; row, and rotates the ring pointers.
126*c0909341SAndroid Build Coastguard Worker.main:
127*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
128*c0909341SAndroid Build Coastguard Worker.main_loop:
129*c0909341SAndroid Build Coastguard Worker    call .hv
130*c0909341SAndroid Build Coastguard Worker    dec             hd
131*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
132*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
133*c0909341SAndroid Build Coastguard Worker    jz .v3
; Bottom rows available: run two more .hv passes on the rows whose address was
; saved at [rsp], then fall through to one final vertical-only pass.
134*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
135*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
136*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
137*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
138*c0909341SAndroid Build Coastguard Worker.v1:
139*c0909341SAndroid Build Coastguard Worker    call .v
140*c0909341SAndroid Build Coastguard Worker    RET
; No top rows: the first dst row stands in for every row above it, so t6..t2
; all start out pointing at the same buffer.
141*c0909341SAndroid Build Coastguard Worker.no_top:
142*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
143*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
144*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
145*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
146*c0909341SAndroid Build Coastguard Worker    call .h
147*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
148*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
149*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
150*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
151*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
152*c0909341SAndroid Build Coastguard Worker    dec             hd
153*c0909341SAndroid Build Coastguard Worker    jz .v1
154*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
155*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
156*c0909341SAndroid Build Coastguard Worker    call .h
157*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
158*c0909341SAndroid Build Coastguard Worker    dec             hd
159*c0909341SAndroid Build Coastguard Worker    jz .v2
160*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
161*c0909341SAndroid Build Coastguard Worker    add             t1, 384*2
162*c0909341SAndroid Build Coastguard Worker    call .h
163*c0909341SAndroid Build Coastguard Worker    dec             hd
164*c0909341SAndroid Build Coastguard Worker    jz .v3
165*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+384*2]
166*c0909341SAndroid Build Coastguard Worker    call .hv
167*c0909341SAndroid Build Coastguard Worker    dec             hd
168*c0909341SAndroid Build Coastguard Worker    jz .v3
; .hv returned with t0 equal to the shared first-row buffer, which t5 and t4
; still reference here; step 4 rows ahead to a buffer that is not yet in use.
169*c0909341SAndroid Build Coastguard Worker    add             t0, 384*8
170*c0909341SAndroid Build Coastguard Worker    call .hv
171*c0909341SAndroid Build Coastguard Worker    dec             hd
172*c0909341SAndroid Build Coastguard Worker    jnz .main
; Tail without new input rows: .v3 runs three vertical-only passes, .v2 two,
; .v1 one (the last one returns through RET at .v1).
173*c0909341SAndroid Build Coastguard Worker.v3:
174*c0909341SAndroid Build Coastguard Worker    call .v
175*c0909341SAndroid Build Coastguard Worker.v2:
176*c0909341SAndroid Build Coastguard Worker    call .v
177*c0909341SAndroid Build Coastguard Worker    jmp .v1
; .h / .h_top: horizontal filter pass for one row.
;
; In:  lpfq = end of the source row (row start + w), wq = -w, t1 = end of the
;      destination row buffer, leftq = left-edge pixels (.h only).
; Out: 16-bit intermediate values stored at [t1+r10*2], 64 columns per
;      iteration; .h advances leftq by 4 when LR_HAVE_LEFT is set.
; Clobbers r10, m0-m4, m16, m17, flags.
;
; .h      takes the 4 bytes left of the row from [leftq] when LR_HAVE_LEFT.
; .h_top  reads those bytes straight from memory at [lpfq+r10-4].
; Without LR_HAVE_LEFT both entries replicate the first pixel instead.
178*c0909341SAndroid Build Coastguard Worker.h:
179*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
180*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
181*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; dword 0 comes from left[]; k1 (0xfffe) loads only dwords 1-15 from the row.
182*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq]
183*c0909341SAndroid Build Coastguard Worker    vmovdqu32  m16{k1}, [lpfq+r10-4]
184*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
185*c0909341SAndroid Build Coastguard Worker    jmp .h_main
186*c0909341SAndroid Build Coastguard Worker.h_extend_left:
187*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm16, [lpfq+r10]   ; the masked load ensures that no exception
188*c0909341SAndroid Build Coastguard Worker    vmovdqu32  m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
189*c0909341SAndroid Build Coastguard Worker    jmp .h_main
190*c0909341SAndroid Build Coastguard Worker.h_top:
191*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
192*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
193*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
194*c0909341SAndroid Build Coastguard Worker.h_loop:
195*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10-4]
196*c0909341SAndroid Build Coastguard Worker.h_main:
197*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+4]
198*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
199*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
200*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66
201*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
; Right-edge padding (no LR_HAVE_RIGHT, at most 66 columns left): bytes past
; the row end are replaced by the last pixel [lpfq-1]. r_ext_mask supplies the
; select mask; r0 is borrowed as its base register and restored afterwards.
202*c0909341SAndroid Build Coastguard Worker    push            r0
203*c0909341SAndroid Build Coastguard Worker    lea             r0, [r_ext_mask+65]
204*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
205*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
206*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r0+r10+8], 0xe4
207*c0909341SAndroid Build Coastguard Worker    pop             r0
208*c0909341SAndroid Build Coastguard Worker.h_have_right:
; Each accumulator starts at m10 (pd_m16380). vpdpbusd adds the dot product of
; 4 shuffled pixel bytes with 4 coefficient bytes: first the shufA/shufB
; groups against m11, then the shufC/shufD groups against m12.
; m0/m2 are built from m16, m1/m3 from m17.
209*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m6
210*c0909341SAndroid Build Coastguard Worker    mova            m0, m10
211*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m4, m11
212*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m7
213*c0909341SAndroid Build Coastguard Worker    mova            m2, m10
214*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m2, m4, m11
215*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m17, m6
216*c0909341SAndroid Build Coastguard Worker    mova            m1, m10
217*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m1, m4, m11
218*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m17, m7
219*c0909341SAndroid Build Coastguard Worker    mova            m3, m10
220*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m3, m4, m11
221*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m8
222*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m4, m12
223*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m9
224*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m2, m16, m12
225*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m17, m8
226*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m1, m4, m12
227*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m9
228*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m3, m17, m12
; Narrow the dword sums to words, scale down by 8, and store 2 x 32 words.
229*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m2
230*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m3
231*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
232*c0909341SAndroid Build Coastguard Worker    psraw           m1, 3
233*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+ 0], m0
234*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+64], m1
235*c0909341SAndroid Build Coastguard Worker    add            r10, 64
236*c0909341SAndroid Build Coastguard Worker    jl .h_loop
237*c0909341SAndroid Build Coastguard Worker    ret
; .hv / .hv_bottom: horizontal pass on one new row fused with the vertical
; pass that produces one output row.
;
; In:  lpfq = end of the previous source row (.hv adds strideq first) or of the
;      row to filter (.hv_bottom), wq = -w, dstq = end of the output row,
;      t1-t6 = the six previous intermediate rows, t0 = buffer for the new row.
; Out: 64 output bytes per iteration at [dstq+r10]; new intermediate row at
;      [t0+r10*2]; ring pointers rotated; dstq advanced by strideq;
;      .hv advances leftq by 4 when LR_HAVE_LEFT is set.
; Clobbers r10, m0-m5, m16-m19, flags.
;
; .hv_bottom differs from .hv in that it neither advances lpfq nor reads
; left[]: with LR_HAVE_LEFT it loads the left bytes directly from memory.
238*c0909341SAndroid Build Coastguard WorkerALIGN function_align
239*c0909341SAndroid Build Coastguard Worker.hv:
240*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
241*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
242*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
243*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
; dword 0 comes from left[]; k1 (0xfffe) loads only dwords 1-15 from the row.
244*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq]
245*c0909341SAndroid Build Coastguard Worker    vmovdqu32  m16{k1}, [lpfq+r10-4]
246*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
247*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
; No left edge: replicate the first pixel into dword 0; the masked load keeps
; the bytes before the row start from being read.
248*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
249*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm16, [lpfq+r10]
250*c0909341SAndroid Build Coastguard Worker    vmovdqu32  m16{k1}, [lpfq+r10-4]
251*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
252*c0909341SAndroid Build Coastguard Worker.hv_bottom:
253*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
254*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
255*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
256*c0909341SAndroid Build Coastguard Worker.hv_loop:
257*c0909341SAndroid Build Coastguard Worker    movu           m16, [lpfq+r10-4]
258*c0909341SAndroid Build Coastguard Worker.hv_main:
259*c0909341SAndroid Build Coastguard Worker    movu           m17, [lpfq+r10+4]
260*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
261*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
262*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -66
263*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
; Right-edge padding, same scheme as in .h: select between the loaded bytes
; and the broadcast last pixel using r_ext_mask; r0 is saved and restored.
264*c0909341SAndroid Build Coastguard Worker    push            r0
265*c0909341SAndroid Build Coastguard Worker    lea             r0, [r_ext_mask+65]
266*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
267*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
268*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, [r0+r10+8], 0xe4
269*c0909341SAndroid Build Coastguard Worker    pop             r0
270*c0909341SAndroid Build Coastguard Worker.hv_have_right:
; Horizontal stage, identical to .h: m0/m1 end up holding the new row as
; 2 x 32 words.
271*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m6
272*c0909341SAndroid Build Coastguard Worker    mova            m0, m10
273*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m4, m11
274*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m7
275*c0909341SAndroid Build Coastguard Worker    mova            m2, m10
276*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m2, m4, m11
277*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m17, m6
278*c0909341SAndroid Build Coastguard Worker    mova            m1, m10
279*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m1, m4, m11
280*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m17, m7
281*c0909341SAndroid Build Coastguard Worker    mova            m3, m10
282*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m3, m4, m11
283*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m8
284*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m4, m12
285*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m9
286*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m2, m16, m12
287*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m17, m8
288*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m1, m4, m12
289*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m9
290*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m3, m17, m12
291*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m2
292*c0909341SAndroid Build Coastguard Worker    packssdw        m1, m3
293*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
294*c0909341SAndroid Build Coastguard Worker    psraw           m1, 3
; Vertical stage, inner taps: interleave (t4+t2) with t3 and accumulate
; against m15 (y2 y3) into m2-m5, each initialised from m13 (pd_8421376).
295*c0909341SAndroid Build Coastguard Worker    mova           m16, [t4+r10*2]
296*c0909341SAndroid Build Coastguard Worker    paddw          m16, [t2+r10*2]
297*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*2]
298*c0909341SAndroid Build Coastguard Worker    mova           m17, [t4+r10*2+64]
299*c0909341SAndroid Build Coastguard Worker    paddw          m17, [t2+r10*2+64]
300*c0909341SAndroid Build Coastguard Worker    mova            m5, [t3+r10*2+64]
301*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m16, m3
302*c0909341SAndroid Build Coastguard Worker    mova            m2, m13
303*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m4, m15
304*c0909341SAndroid Build Coastguard Worker    punpcklwd      m18, m17, m5
305*c0909341SAndroid Build Coastguard Worker    mova            m4, m13
306*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m4, m18, m15
307*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m3
308*c0909341SAndroid Build Coastguard Worker    mova            m3, m13
309*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m16, m15
310*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m5
311*c0909341SAndroid Build Coastguard Worker    mova            m5, m13
312*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m5, m17, m15
; Outer taps: interleave (new row + t6) with (t5+t1) and accumulate against
; m14 (y0 y1). t6 is read before the new row is stored to t0, which matters
; because the rotation below can leave t0 aliasing t6.
313*c0909341SAndroid Build Coastguard Worker    mova           m17, [t5+r10*2]
314*c0909341SAndroid Build Coastguard Worker    paddw          m17, [t1+r10*2]
315*c0909341SAndroid Build Coastguard Worker    paddw          m16, m0, [t6+r10*2]
316*c0909341SAndroid Build Coastguard Worker    mova           m19, [t5+r10*2+64]
317*c0909341SAndroid Build Coastguard Worker    paddw          m19, [t1+r10*2+64]
318*c0909341SAndroid Build Coastguard Worker    paddw          m18, m1, [t6+r10*2+64]
319*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+ 0], m0
320*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+64], m1
321*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m16, m17
322*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m0, m14
323*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m18, m19
324*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m4, m1, m14
325*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m17
326*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m16, m14
327*c0909341SAndroid Build Coastguard Worker    punpckhwd      m18, m19
328*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m5, m18, m14
; Reduce the four dword accumulators to 64 output bytes: pack m2/m4, shift
; each word right by 8, then pack m3/m5 into the odd bytes only (mask k2).
329*c0909341SAndroid Build Coastguard Worker    packuswb        m2, m4
330*c0909341SAndroid Build Coastguard Worker    psrlw           m2, 8
331*c0909341SAndroid Build Coastguard Worker    vpackuswb   m2{k2}, m3, m5
332*c0909341SAndroid Build Coastguard Worker    movu    [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
333*c0909341SAndroid Build Coastguard Worker    add            r10, 64 ; function is used for chroma as well, and in some
334*c0909341SAndroid Build Coastguard Worker    jl .hv_loop            ; esoteric edge cases chroma dst pointers may only
335*c0909341SAndroid Build Coastguard Worker    mov             t6, t5 ; have a 32-byte alignment despite having a width
336*c0909341SAndroid Build Coastguard Worker    mov             t5, t4 ; larger than 32, so use an unaligned store here.
337*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
338*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
339*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
340*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
; t6 has already been overwritten, so t0 receives the old t5 value.
341*c0909341SAndroid Build Coastguard Worker    mov             t0, t6
342*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
343*c0909341SAndroid Build Coastguard Worker    ret
; .v: vertical-only pass, used for the last output rows when no further input
; row is filtered. The newest row (t1) is used twice in the outer taps, paired
; once with t6 and once with t5.
;
; In:  wq = -w, dstq = end of the output row, t1-t6 = intermediate rows.
; Out: 64 output bytes per iteration at [dstq+r10]; t6..t2 rotated (t1 is
;      left unchanged); dstq advanced by strideq.
; Clobbers r10, m0-m8, flags. This overwrites m6-m8, which hold the horizontal
; shuffle tables; the driver only calls .v again or returns after a .v call.
344*c0909341SAndroid Build Coastguard Worker.v:
345*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
346*c0909341SAndroid Build Coastguard Worker.v_loop:
; Inner taps: (t4+t2) interleaved with t3, accumulated against m15 (y2 y3)
; into m0-m3, each initialised from m13 (pd_8421376).
347*c0909341SAndroid Build Coastguard Worker    mova            m4, [t4+r10*2+ 0]
348*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t2+r10*2+ 0]
349*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*2+ 0]
350*c0909341SAndroid Build Coastguard Worker    mova            m5, [t4+r10*2+64]
351*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t2+r10*2+64]
352*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*2+64]
353*c0909341SAndroid Build Coastguard Worker    punpcklwd       m6, m4, m1
354*c0909341SAndroid Build Coastguard Worker    mova            m0, m13
355*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m6, m15
356*c0909341SAndroid Build Coastguard Worker    punpcklwd       m6, m5, m3
357*c0909341SAndroid Build Coastguard Worker    mova            m2, m13
358*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m6, m15
359*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m1
360*c0909341SAndroid Build Coastguard Worker    mova            m1, m13
361*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m4, m15
362*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m3
363*c0909341SAndroid Build Coastguard Worker    mova            m3, m13
364*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m5, m15
; Outer taps: (t1+t6) interleaved with (t1+t5), accumulated against m14
; (y0 y1).
365*c0909341SAndroid Build Coastguard Worker    mova            m5, [t1+r10*2+ 0]
366*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5, [t6+r10*2+ 0]
367*c0909341SAndroid Build Coastguard Worker    paddw           m5, [t5+r10*2+ 0]
368*c0909341SAndroid Build Coastguard Worker    mova            m7, [t1+r10*2+64]
369*c0909341SAndroid Build Coastguard Worker    paddw           m6, m7, [t6+r10*2+64]
370*c0909341SAndroid Build Coastguard Worker    paddw           m7, [t5+r10*2+64]
371*c0909341SAndroid Build Coastguard Worker    punpcklwd       m8, m4, m5
372*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m8, m14
373*c0909341SAndroid Build Coastguard Worker    punpcklwd       m8, m6, m7
374*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m8, m14
375*c0909341SAndroid Build Coastguard Worker    punpckhwd       m4, m5
376*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m4, m14
377*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m7
378*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m6, m14
; Same reduction to bytes as in .hv; unaligned store to dst.
379*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m2
380*c0909341SAndroid Build Coastguard Worker    psrlw           m0, 8
381*c0909341SAndroid Build Coastguard Worker    vpackuswb   m0{k2}, m1, m3
382*c0909341SAndroid Build Coastguard Worker    movu    [dstq+r10], m0
383*c0909341SAndroid Build Coastguard Worker    add            r10, 64
384*c0909341SAndroid Build Coastguard Worker    jl .v_loop
; Shift the window down by one row; t1 stays, so the next .v call repeats the
; newest row once more.
385*c0909341SAndroid Build Coastguard Worker    mov             t6, t5
386*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
387*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
388*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
389*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
390*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
391*c0909341SAndroid Build Coastguard Worker    ret
392*c0909341SAndroid Build Coastguard Worker.w32:
393*c0909341SAndroid Build Coastguard Worker    lea            r10, [r_ext_mask+73]
394*c0909341SAndroid Build Coastguard Worker    mova          ym18, [wiener_perm32]
395*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+16]
396*c0909341SAndroid Build Coastguard Worker    sub            r10, wq
397*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
398*c0909341SAndroid Build Coastguard Worker    jz .w32_no_top
399*c0909341SAndroid Build Coastguard Worker    call .w32_h_top
400*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
401*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
402*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
403*c0909341SAndroid Build Coastguard Worker    add             t1, 32*2
404*c0909341SAndroid Build Coastguard Worker    call .w32_h_top
405*c0909341SAndroid Build Coastguard Worker    lea             r9, [lpfq+strideq*4]
406*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
407*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
408*c0909341SAndroid Build Coastguard Worker    add             t1, 32*2
409*c0909341SAndroid Build Coastguard Worker    add             r9, strideq
410*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r9 ; below
411*c0909341SAndroid Build Coastguard Worker    call .w32_h
412*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
413*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
414*c0909341SAndroid Build Coastguard Worker    dec             hd
415*c0909341SAndroid Build Coastguard Worker    jz .w32_v1
416*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
417*c0909341SAndroid Build Coastguard Worker    add             t1, 32*2
418*c0909341SAndroid Build Coastguard Worker    call .w32_h
419*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
420*c0909341SAndroid Build Coastguard Worker    dec             hd
421*c0909341SAndroid Build Coastguard Worker    jz .w32_v2
422*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
423*c0909341SAndroid Build Coastguard Worker    add             t1, 32*2
424*c0909341SAndroid Build Coastguard Worker    call .w32_h
425*c0909341SAndroid Build Coastguard Worker    dec             hd
426*c0909341SAndroid Build Coastguard Worker    jz .w32_v3
427*c0909341SAndroid Build Coastguard Worker.w32_main:
428*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+32*2]
429*c0909341SAndroid Build Coastguard Worker.w32_main_loop:
430*c0909341SAndroid Build Coastguard Worker    call .w32_hv
431*c0909341SAndroid Build Coastguard Worker    dec             hd
432*c0909341SAndroid Build Coastguard Worker    jnz .w32_main_loop
433*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
434*c0909341SAndroid Build Coastguard Worker    jz .w32_v3
435*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
436*c0909341SAndroid Build Coastguard Worker    call .w32_hv_bottom
437*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
438*c0909341SAndroid Build Coastguard Worker    call .w32_hv_bottom
439*c0909341SAndroid Build Coastguard Worker.w32_v1:
440*c0909341SAndroid Build Coastguard Worker    call .w32_v
441*c0909341SAndroid Build Coastguard Worker    RET
442*c0909341SAndroid Build Coastguard Worker.w32_no_top:
443*c0909341SAndroid Build Coastguard Worker    lea             r9, [lpfq+strideq*4]
444*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
445*c0909341SAndroid Build Coastguard Worker    lea             r9, [r9+strideq*2]
446*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r9
447*c0909341SAndroid Build Coastguard Worker    call .w32_h
448*c0909341SAndroid Build Coastguard Worker    mov             t6, t1
449*c0909341SAndroid Build Coastguard Worker    mov             t5, t1
450*c0909341SAndroid Build Coastguard Worker    mov             t4, t1
451*c0909341SAndroid Build Coastguard Worker    mov             t3, t1
452*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
453*c0909341SAndroid Build Coastguard Worker    dec             hd
454*c0909341SAndroid Build Coastguard Worker    jz .w32_v1
455*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
456*c0909341SAndroid Build Coastguard Worker    add             t1, 32*2
457*c0909341SAndroid Build Coastguard Worker    call .w32_h
458*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
459*c0909341SAndroid Build Coastguard Worker    dec             hd
460*c0909341SAndroid Build Coastguard Worker    jz .w32_v2
461*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
462*c0909341SAndroid Build Coastguard Worker    add             t1, 32*2
463*c0909341SAndroid Build Coastguard Worker    call .w32_h
464*c0909341SAndroid Build Coastguard Worker    dec             hd
465*c0909341SAndroid Build Coastguard Worker    jz .w32_v3
466*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+32*2]
467*c0909341SAndroid Build Coastguard Worker    call .w32_hv
468*c0909341SAndroid Build Coastguard Worker    dec             hd
469*c0909341SAndroid Build Coastguard Worker    jz .w32_v3
470*c0909341SAndroid Build Coastguard Worker    add             t0, 32*8
471*c0909341SAndroid Build Coastguard Worker    call .w32_hv
472*c0909341SAndroid Build Coastguard Worker    dec             hd
473*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
474*c0909341SAndroid Build Coastguard Worker.w32_v3:
475*c0909341SAndroid Build Coastguard Worker    call .w32_v
476*c0909341SAndroid Build Coastguard Worker.w32_v2:
477*c0909341SAndroid Build Coastguard Worker    call .w32_v
478*c0909341SAndroid Build Coastguard Worker    jmp .w32_v1
; .w32_h / .w32_h_top: horizontally filter one row and store the 64-byte
; result (words) at [t1].
;   .w32_h     - with LR_HAVE_LEFT, takes the first 4 bytes from [leftq] and
;                advances leftq by 4.
;   .w32_h_top - with LR_HAVE_LEFT, reads everything from the row itself
;                ([lpfq-4]) and leaves leftq untouched.
; Without LR_HAVE_LEFT both entries replicate the byte at [lpfq] instead.
; Reads k1, r10 and m6-m12, all of which are set up outside the part of the
; file shown here (presumably load mask, edge-mask pointer, shuffle patterns,
; rounding bias and filter taps - confirm against the function prologue).
; Writes m0-m2, m16, m17 (m17 only without LR_HAVE_RIGHT) and the flags.
479*c0909341SAndroid Build Coastguard Worker.w32_h:
480*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
481*c0909341SAndroid Build Coastguard Worker    jz .w32_h_extend_left
482*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq] ; dword 0 = the 4 bytes supplied through leftq
483*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym16{k1}, [lpfq-4] ; merge in the dwords selected by k1 from the row
484*c0909341SAndroid Build Coastguard Worker    add          leftq, 4 ; step to the next 4-byte left entry
485*c0909341SAndroid Build Coastguard Worker    jmp .w32_h_main
486*c0909341SAndroid Build Coastguard Worker.w32_h_extend_left:
487*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm16, [lpfq]   ; the masked load ensures that no exception
488*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
489*c0909341SAndroid Build Coastguard Worker    jmp .w32_h_main
490*c0909341SAndroid Build Coastguard Worker.w32_h_top:
491*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
492*c0909341SAndroid Build Coastguard Worker    jz .w32_h_extend_left
493*c0909341SAndroid Build Coastguard Worker    movu          ym16, [lpfq-4] ; unmasked: all 32 bytes come from the row
494*c0909341SAndroid Build Coastguard Worker.w32_h_main:
495*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [lpfq+4], 1 ; upper 256 bits = 32 bytes starting at lpfq+4
496*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
497*c0909341SAndroid Build Coastguard Worker    jnz .w32_h_have_right
498*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq+wq-1] ; replicate the byte at lpfq+wq-1
499*c0909341SAndroid Build Coastguard Worker    movu          ym17, [r10-8] ; m17 = 64-byte select mask read from r10-8 and r10+0
500*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [r10+0], 1
501*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m17, 0xe4 ; c ? a : b (m17 ? m16 : m0)
502*c0909341SAndroid Build Coastguard Worker.w32_h_have_right:
503*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m16, m6 ; first byte arrangement of the input
504*c0909341SAndroid Build Coastguard Worker    mova            m0, m10 ; accumulator 0 starts from m10
505*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m2, m11 ; += 4-byte dot products, unsigned m2 x signed m11
506*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m16, m7
507*c0909341SAndroid Build Coastguard Worker    mova            m1, m10 ; accumulator 1 starts from m10
508*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m1, m2, m11
509*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m16, m8
510*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m2, m12 ; second dot-product group, multiplier m12
511*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m9 ; last use of the input, shuffled in place
512*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m1, m16, m12
513*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1 ; dwords -> signed saturated words
514*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3
515*c0909341SAndroid Build Coastguard Worker    mova          [t1], m0 ; store the filtered row
516*c0909341SAndroid Build Coastguard Worker    ret
; .w32_hv / .w32_hv_bottom: horizontally filter one new row, combine it with
; the six stored rows at [t1]..[t6] in the vertical pass, and write 32 output
; bytes to [dstq].
;   .w32_hv        - advances lpfq by strideq first and, with LR_HAVE_LEFT,
;                    takes the first 4 bytes from [leftq] (leftq += 4).
;   .w32_hv_bottom - uses lpfq as given and never touches leftq.
; The new horizontal result is also stored at [t0]; afterwards the row
; pointers are rotated by one row and dstq advances by strideq.
; Reads k1, r10, m6-m15 and m18, all set up outside the part of the file
; shown here. Writes m0-m5, m16, m17 (m17 only without LR_HAVE_RIGHT),
; t0-t6, dstq and the flags.
517*c0909341SAndroid Build Coastguard Worker.w32_hv:
518*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq ; move on to the next input row
519*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
520*c0909341SAndroid Build Coastguard Worker    jz .w32_hv_extend_left
521*c0909341SAndroid Build Coastguard Worker    movd          xm16, [leftq] ; dword 0 = the 4 bytes supplied through leftq
522*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym16{k1}, [lpfq-4] ; merge in the dwords selected by k1 from the row
523*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
524*c0909341SAndroid Build Coastguard Worker    jmp .w32_hv_main
525*c0909341SAndroid Build Coastguard Worker.w32_hv_extend_left:
526*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm16, [lpfq] ; replicate the byte at [lpfq] ...
527*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym16{k1}, [lpfq-4] ; ... then overwrite only the dwords selected by k1
528*c0909341SAndroid Build Coastguard Worker    jmp .w32_hv_main
529*c0909341SAndroid Build Coastguard Worker.w32_hv_bottom:
530*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
531*c0909341SAndroid Build Coastguard Worker    jz .w32_hv_extend_left
532*c0909341SAndroid Build Coastguard Worker    movu          ym16, [lpfq-4] ; unmasked: all 32 bytes come from the row
533*c0909341SAndroid Build Coastguard Worker.w32_hv_main:
534*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [lpfq+4], 1 ; upper 256 bits = 32 bytes starting at lpfq+4
535*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
536*c0909341SAndroid Build Coastguard Worker    jnz .w32_hv_have_right
537*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq+wq-1] ; replicate the byte at lpfq+wq-1
538*c0909341SAndroid Build Coastguard Worker    movu          ym17, [r10-8] ; m17 = 64-byte select mask read from r10-8 and r10+0
539*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [r10+0], 1
540*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m17, 0xe4 ; m17 ? m16 : m0
541*c0909341SAndroid Build Coastguard Worker.w32_hv_have_right:
542*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4]
543*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t2] ; m3 = row t4 + row t2
544*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3] ; m2 = row t3
545*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m6 ; horizontal pass on the new row (interleaved with the vertical one)
546*c0909341SAndroid Build Coastguard Worker    mova            m0, m10 ; horizontal accumulator 0 starts from m10
547*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m4, m11
548*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m7
549*c0909341SAndroid Build Coastguard Worker    mova            m5, m10 ; horizontal accumulator 1 starts from m10
550*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m5, m4, m11
551*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m3, m2 ; interleave (t4+t2, t3) word pairs
552*c0909341SAndroid Build Coastguard Worker    mova            m1, m13 ; vertical accumulators start from m13
553*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m4, m15 ; += signed word-pair dot products with m15
554*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m2
555*c0909341SAndroid Build Coastguard Worker    mova            m2, m13
556*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m3, m15
557*c0909341SAndroid Build Coastguard Worker    pshufb          m4, m16, m8
558*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m0, m4, m12
559*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m9
560*c0909341SAndroid Build Coastguard Worker    vpdpbusd        m5, m16, m12
561*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m5
562*c0909341SAndroid Build Coastguard Worker    psraw           m0, 3 ; m0 = horizontally filtered new row
563*c0909341SAndroid Build Coastguard Worker    mova            m4, [t5]
564*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t1] ; m4 = row t5 + row t1
565*c0909341SAndroid Build Coastguard Worker    paddw           m3, m0, [t6] ; m3 = new row + row t6; must precede the store below, t0 can alias t6
566*c0909341SAndroid Build Coastguard Worker    mova          [t0], m0 ; keep the new row for later calls
567*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m3, m4 ; interleave (new+t6, t5+t1) word pairs
568*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m0, m14 ; += signed word-pair dot products with m14
569*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
570*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m3, m14
571*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m2 ; narrow to unsigned saturated bytes
572*c0909341SAndroid Build Coastguard Worker    vpermb         m16, m18, m1 ; reorder bytes using the indices in m18
573*c0909341SAndroid Build Coastguard Worker    mova        [dstq], ym16 ; 32 output bytes
574*c0909341SAndroid Build Coastguard Worker    mov             t6, t5 ; rotate the row pointers by one row
575*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
576*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
577*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
578*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
579*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
580*c0909341SAndroid Build Coastguard Worker    mov             t0, t6 ; t0 = new t6: the oldest row's buffer receives the next new row
581*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
582*c0909341SAndroid Build Coastguard Worker    ret
; .w32_v: vertical pass only. Produces 32 output bytes at [dstq] from the
; stored rows [t1]..[t6] without reading a new input row; [t1] is used both
; with [t5] and in the position where .w32_hv adds the freshly filtered row
; to [t6]. Afterwards t6..t2 are shifted by one row (t1 and t0 keep their
; values) and dstq advances by strideq.
; Reads m13-m15 and m18, which are set up outside the part of the file shown
; here. Writes m0-m5, m16, t2-t6 and dstq.
583*c0909341SAndroid Build Coastguard Worker.w32_v:
584*c0909341SAndroid Build Coastguard Worker    mova            m2, [t4]
585*c0909341SAndroid Build Coastguard Worker    paddw           m2, [t2] ; m2 = row t4 + row t2
586*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3] ; m1 = row t3
587*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1]
588*c0909341SAndroid Build Coastguard Worker    paddw           m3, m4, [t6] ; m3 = row t1 + row t6
589*c0909341SAndroid Build Coastguard Worker    paddw           m4, [t5] ; m4 = row t1 + row t5
590*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m2, m1 ; interleave (t4+t2, t3) word pairs
591*c0909341SAndroid Build Coastguard Worker    mova            m0, m13 ; accumulators start from m13
592*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m5, m15 ; += signed word-pair dot products with m15
593*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m1
594*c0909341SAndroid Build Coastguard Worker    mova            m1, m13
595*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m2, m15
596*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4 ; interleave (t1+t6, t1+t5) word pairs
597*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m2, m14 ; += signed word-pair dot products with m14
598*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
599*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m3, m14
600*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m1 ; narrow to unsigned saturated bytes
601*c0909341SAndroid Build Coastguard Worker    vpermb         m16, m18, m0 ; reorder bytes using the indices in m18
602*c0909341SAndroid Build Coastguard Worker    mova        [dstq], ym16 ; 32 output bytes
603*c0909341SAndroid Build Coastguard Worker    mov             t6, t5 ; shift the row pointers; t1 stays as the newest row
604*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
605*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
606*c0909341SAndroid Build Coastguard Worker    mov             t3, t2
607*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
608*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
609*c0909341SAndroid Build Coastguard Worker    ret
610*c0909341SAndroid Build Coastguard Worker
; sgr_filter_5x5_8bpc(dst, stride, left, lpf, w, h, edge, params)
; Entry point and row scheduler. The first part loads the arguments and the
; constants kept in m4-m15, m18-m22, k1, k2 and r12; the second part calls
; the local subroutines .h/.h_top, .top_fixup, .hv/.hv_bottom, .v, .prep_n,
; .n0 and .n1 row by row (.v, .prep_n, .n0 and .n1 are defined outside the
; part of the file shown here).
; .main_loop consumes two rows per iteration (hd -= 2); a single remaining
; row is handled at .odd_height. With LR_HAVE_BOTTOM the two rows addressed
; through [rsp] are processed after the loop, otherwise .v is called instead.
; Bit 16 of edge is used internally as a "y > 0" flag: once it is set, .h
; adds its row sums to the contents of t1 instead of overwriting them.
611*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
612*c0909341SAndroid Build Coastguard Worker                                                   w, h, edge, params
613*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp ; load the arguments not already in registers
614*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
615*c0909341SAndroid Build Coastguard Worker    mov             hd, hm
616*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
617*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m5, [sgr_shuf+1] ; m5-m8 = pshufb patterns used by .h and .hv
618*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq ; lpfq += w; columns are later addressed with negative offsets
619*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m6, [sgr_shuf+9]
620*c0909341SAndroid Build Coastguard Worker    add           dstq, wq ; dstq += w, same scheme
621*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m7, [sgr_shuf+3]
622*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+16+416*12] ; computed while wq is still positive
623*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m8, [sgr_shuf+7]
624*c0909341SAndroid Build Coastguard Worker    pxor            m4, m4 ; m4 = 0
625*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [pd_m25]
626*c0909341SAndroid Build Coastguard Worker    vpsubd         m11, m4, [paramsq+0] {1to16} ; -s0
627*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m15, [paramsq+8]             ; w0
628*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+20] ; computed while wq is still positive
629*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pw_164_455]
630*c0909341SAndroid Build Coastguard Worker    neg             wq ; wq = -w from here on
631*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [pw_61448]              ; (15 << 12) + (1 << 3)
632*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfe
633*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [pd_m4096]
634*c0909341SAndroid Build Coastguard Worker    kmovb           k1, r10d ; k1 = 0xfe: every element except element 0 (masked loads in .h/.hv)
635*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [pd_34816]              ; (1 << 11) + (1 << 15)
636*c0909341SAndroid Build Coastguard Worker    mov            r10, 0x3333333333333333
637*c0909341SAndroid Build Coastguard Worker    mova           m18, [sgr_x_by_x+64*0] ; m18-m21 = the four 64-byte parts of sgr_x_by_x
638*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10 ; k2 = 0x3333...: mask used by the vpalignr in .hv
639*c0909341SAndroid Build Coastguard Worker    mova           m19, [sgr_x_by_x+64*1]
640*c0909341SAndroid Build Coastguard Worker    lea            r12, [r_ext_mask+75] ; base for the right-edge masks read in .h/.hv
641*c0909341SAndroid Build Coastguard Worker    mova           m20, [sgr_x_by_x+64*2]
642*c0909341SAndroid Build Coastguard Worker    psllw          m15, 4 ; m15 = w0 << 4
643*c0909341SAndroid Build Coastguard Worker    mova           m21, [sgr_x_by_x+64*3]
644*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
645*c0909341SAndroid Build Coastguard Worker    mova          ym22, [sgr_shuf]
646*c0909341SAndroid Build Coastguard Worker    add            r10, strideq ; r10 = lpfq + stride*5
647*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below; NOTE(review): both paths that follow store [rsp] again before it is read - confirm this store is needed
648*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
649*c0909341SAndroid Build Coastguard Worker    jz .no_top
650*c0909341SAndroid Build Coastguard Worker    call .h_top ; row sums of the row at lpfq -> t1
651*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
652*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
653*c0909341SAndroid Build Coastguard Worker    call .top_fixup ; t2 == t1 here, so the sums are doubled in place
654*c0909341SAndroid Build Coastguard Worker    add             t1, 416*6 ; switch t1 to the next set of sum buffers
655*c0909341SAndroid Build Coastguard Worker    call .h_top ; row sums of the second row -> t1
656*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
657*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; continue reading from dst
658*c0909341SAndroid Build Coastguard Worker    add            r10, strideq ; r10 = previous lpfq + stride*5
659*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below (reloaded into lpfq when LR_HAVE_BOTTOM is set)
660*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
661*c0909341SAndroid Build Coastguard Worker    dec             hd
662*c0909341SAndroid Build Coastguard Worker    jz .height1
663*c0909341SAndroid Build Coastguard Worker    or           edged, 16 ; set the "y > 0" flag: .h accumulates into t1 from now on
664*c0909341SAndroid Build Coastguard Worker    call .h
665*c0909341SAndroid Build Coastguard Worker.main:
666*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
667*c0909341SAndroid Build Coastguard Worker    call .hv
668*c0909341SAndroid Build Coastguard Worker    call .prep_n
669*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
670*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom ; hd went negative: no input rows left
671*c0909341SAndroid Build Coastguard Worker.main_loop:
672*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
673*c0909341SAndroid Build Coastguard Worker    test            hd, hd
674*c0909341SAndroid Build Coastguard Worker    jz .odd_height ; hd == 0: one row left
675*c0909341SAndroid Build Coastguard Worker    call .h
676*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
677*c0909341SAndroid Build Coastguard Worker    call .hv
678*c0909341SAndroid Build Coastguard Worker    call .n0
679*c0909341SAndroid Build Coastguard Worker    call .n1
680*c0909341SAndroid Build Coastguard Worker    sub             hd, 2 ; two rows consumed per iteration
681*c0909341SAndroid Build Coastguard Worker    jge .main_loop
682*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
683*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
684*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp] ; switch to the rows saved as "below"
685*c0909341SAndroid Build Coastguard Worker    call .h_top
686*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
687*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
688*c0909341SAndroid Build Coastguard Worker.end:
689*c0909341SAndroid Build Coastguard Worker    call .n0
690*c0909341SAndroid Build Coastguard Worker    call .n1
691*c0909341SAndroid Build Coastguard Worker.end2:
692*c0909341SAndroid Build Coastguard Worker    RET
693*c0909341SAndroid Build Coastguard Worker.height1: ; h == 1 with LR_HAVE_TOP
694*c0909341SAndroid Build Coastguard Worker    call .hv
695*c0909341SAndroid Build Coastguard Worker    call .prep_n
696*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
697*c0909341SAndroid Build Coastguard Worker.odd_height:
698*c0909341SAndroid Build Coastguard Worker    call .hv
699*c0909341SAndroid Build Coastguard Worker    call .n0
700*c0909341SAndroid Build Coastguard Worker    call .n1
701*c0909341SAndroid Build Coastguard Worker.odd_height_end:
702*c0909341SAndroid Build Coastguard Worker    call .v
703*c0909341SAndroid Build Coastguard Worker    call .n0
704*c0909341SAndroid Build Coastguard Worker    jmp .end2
705*c0909341SAndroid Build Coastguard Worker.extend_bottom:
706*c0909341SAndroid Build Coastguard Worker    call .v ; no further input row is read on this path
707*c0909341SAndroid Build Coastguard Worker    jmp .end
708*c0909341SAndroid Build Coastguard Worker.no_top:
709*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
710*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq ; start reading from dst right away
711*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2] ; r10 = original lpfq + stride*6
712*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; reloaded into lpfq when LR_HAVE_BOTTOM is set
713*c0909341SAndroid Build Coastguard Worker    call .h
714*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+416*6]
715*c0909341SAndroid Build Coastguard Worker    call .top_fixup ; t2 buffers = 2 * the sums just written at t1
716*c0909341SAndroid Build Coastguard Worker    dec             hd
717*c0909341SAndroid Build Coastguard Worker    jz .no_top_height1
718*c0909341SAndroid Build Coastguard Worker    or           edged, 16 ; set the "y > 0" flag
719*c0909341SAndroid Build Coastguard Worker    mov             t0, t1
720*c0909341SAndroid Build Coastguard Worker    mov             t1, t2
721*c0909341SAndroid Build Coastguard Worker    jmp .main
722*c0909341SAndroid Build Coastguard Worker.no_top_height1: ; h == 1 without LR_HAVE_TOP
723*c0909341SAndroid Build Coastguard Worker    call .v
724*c0909341SAndroid Build Coastguard Worker    call .prep_n
725*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; .h / .h_top: horizontal box sums for one input row, 32 columns per loop
; iteration. For every column, five shuffled copies of the input are added
; up (sum, words) and their squares are added up (sumsq, dwords). The
; results go to three buffers addressed from t1: sum at +416*0, and the two
; halves of sumsq at +416*2 and +416*4.
;   .h     - with LR_HAVE_LEFT, takes the first 4 bytes from [leftq] and
;            advances leftq by 4.
;   .h_top - with LR_HAVE_LEFT, reads everything from the row itself and
;            leaves leftq untouched.
; Without LR_HAVE_LEFT both entries replicate the byte at [lpfq+wq] instead.
; When bit 16 of edge ("y > 0") is set, the new sums are added to what the
; buffers already hold; otherwise the buffers are overwritten.
; Expects wq to be negative (the loop runs r10 from wq-2 upward in steps of
; 32 while it stays negative). Writes r10, m0-m3, m16, m17 and the flags.
726*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
727*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2] ; r10 = column offset, starts at wq-2
728*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
729*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
730*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq] ; dword 0 = the 4 bytes supplied through leftq
731*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4] ; merge in the dwords selected by k1 from the row
732*c0909341SAndroid Build Coastguard Worker    add          leftq, 4 ; step to the next 4-byte left entry
733*c0909341SAndroid Build Coastguard Worker    jmp .h_main
734*c0909341SAndroid Build Coastguard Worker.h_extend_left:
735*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq] ; replicate the byte at lpfq+wq ...
736*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4] ; ... then overwrite only the dwords selected by k1
737*c0909341SAndroid Build Coastguard Worker    jmp .h_main
738*c0909341SAndroid Build Coastguard Worker.h_top:
739*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
740*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
741*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
742*c0909341SAndroid Build Coastguard Worker.h_loop:
743*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2] ; 32 bytes starting 2 bytes before the current offset
744*c0909341SAndroid Build Coastguard Worker.h_main:
745*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1 ; upper 256 bits = 32 bytes starting 8 bytes further on
746*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
747*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
748*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
749*c0909341SAndroid Build Coastguard Worker    jl .h_have_right ; r10 < -34: skip the right-edge padding for this chunk
750*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1] ; replicate the byte at lpfq-1
751*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r12+r10-8] ; m16 = 64-byte select mask read relative to r12+r10
752*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r12+r10+0], 1
753*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, m16, 0xe4 ; m16 ? m17 : m0
754*c0909341SAndroid Build Coastguard Worker.h_have_right:
755*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m17, m5 ; term 1
756*c0909341SAndroid Build Coastguard Worker    pmullw          m2, m3, m3 ; term 1 squared (words)
757*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m17, m6 ; term 2
758*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3, m1
759*c0909341SAndroid Build Coastguard Worker    shufps          m3, m1, q2121 ; term 3, assembled from terms 1 and 2
760*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
761*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m3, m1 ; pair terms 3 and 2 for the word dot product
762*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m1
763*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m4 ; widen term 1 squared to dwords (m4 = 0)
764*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m16, m16 ; += term3^2 + term2^2
765*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m4
766*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m3, m3
767*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m17, m7 ; term 4
768*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16
769*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m8 ; term 5 (the input register is consumed here)
770*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17              ; sum
771*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m16, m17
772*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m3, m3           ; sumsq
773*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m17
774*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16
775*c0909341SAndroid Build Coastguard Worker    test         edgeb, 16 ; y > 0
776*c0909341SAndroid Build Coastguard Worker    jz .h_loop_end
777*c0909341SAndroid Build Coastguard Worker    paddw           m0, [t1+r10*2+416*0] ; accumulate onto the sums already stored at t1
778*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t1+r10*2+416*2]
779*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10*2+416*4]
780*c0909341SAndroid Build Coastguard Worker.h_loop_end:
781*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*0], m0 ; sum
782*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*2], m1 ; sumsq, low halves
783*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*4], m2 ; sumsq, high halves
784*c0909341SAndroid Build Coastguard Worker    add            r10, 32 ; next 32 columns
785*c0909341SAndroid Build Coastguard Worker    jl .h_loop ; continue while the offset is still negative
786*c0909341SAndroid Build Coastguard Worker    ret
; .top_fixup: read the three sum buffers addressed from t1 (+416*0, +416*2,
; +416*4), double every element and store the result at the same offsets
; from t2. t2 may equal t1, in which case the doubling happens in place.
; Runs r10 from wq-2 upward in steps of 32 while it stays negative.
; Writes r10, m0-m2 and the flags.
787*c0909341SAndroid Build Coastguard Worker.top_fixup:
788*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
789*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row need to be doubled
790*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+416*0]
791*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+416*2]
792*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+416*4]
793*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0 ; x + x on the word sums
794*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1 ; x + x on the dword sums of squares
795*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
796*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*0], m0
797*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*2], m1
798*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*4], m2
799*c0909341SAndroid Build Coastguard Worker    add            r10, 32 ; next 32 columns
800*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop ; continue while the offset is still negative
801*c0909341SAndroid Build Coastguard Worker    ret
802*c0909341SAndroid Build Coastguard WorkerALIGN function_align
803*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
804*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
805*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
806*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
807*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq]
808*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
809*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
810*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
811*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
812*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq]
813*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
814*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
815*c0909341SAndroid Build Coastguard Worker.hv_bottom:
816*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
817*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
818*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
819*c0909341SAndroid Build Coastguard Worker.hv_loop:
820*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2]
821*c0909341SAndroid Build Coastguard Worker.hv_main:
822*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1
823*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
824*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
825*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
826*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
827*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
828*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r12+r10-8]
829*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r12+r10+0], 1
830*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, m16, 0xe4
831*c0909341SAndroid Build Coastguard Worker.hv_have_right:
832*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m17, m5
833*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m1, m1
834*c0909341SAndroid Build Coastguard Worker    pshufb          m2, m17, m6
835*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, m2
836*c0909341SAndroid Build Coastguard Worker    shufps          m1, m2, q2121
837*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
838*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m1, m2
839*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m2
840*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4
841*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16
842*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
843*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m1, m1
844*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m17, m7
845*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16
846*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m8
847*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17              ; h sum
848*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m16, m17
849*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m1, m1           ; h sumsq
850*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m17
851*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m16, m16
852*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t1+r10*2+416*0]
853*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t1+r10*2+416*2]
854*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t1+r10*2+416*4]
855*c0909341SAndroid Build Coastguard Worker    test            hd, hd
856*c0909341SAndroid Build Coastguard Worker    jz .hv_last_row
857*c0909341SAndroid Build Coastguard Worker.hv_main2:
858*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t2+r10*2+416*2] ; hv sumsq
859*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t2+r10*2+416*4]
860*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10*2+416*0] ; hv sum
861*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+416*2], m2
862*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+416*4], m3
863*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+416*0], m0
864*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9               ; -a * 25
865*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
866*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m4           ; b
867*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0           ; -p
868*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4
869*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
870*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10              ; b * 164
871*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
872*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m11              ; p * s
873*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m11
874*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
875*c0909341SAndroid Build Coastguard Worker    mova           m16, m20
876*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
877*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4                ; min(z, 255) - 256
878*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
879*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
880*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
881*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16              ; x
882*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17
883*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
884*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
885*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
886*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
887*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
888*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
889*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m13, 0xd8
890*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+  8], m16    ; The neighbor calculations requires
891*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 24], xm17   ; 13 bits for a and 21 bits for b.
892*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
893*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 72], m17    ; that gets us most of the way.
894*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+ 72], ym16, 1
895*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+104], m16, 3
896*c0909341SAndroid Build Coastguard Worker    add            r10, 32
897*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
898*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
899*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
900*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
901*c0909341SAndroid Build Coastguard Worker    ret
902*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights
903*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*0], m1
904*c0909341SAndroid Build Coastguard Worker    paddw              m1, m0
905*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*2], m16
906*c0909341SAndroid Build Coastguard Worker    paddd             m16, m2
907*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*4], m17
908*c0909341SAndroid Build Coastguard Worker    paddd             m17, m3
909*c0909341SAndroid Build Coastguard Worker    jmp .hv_main2
; 5x5 .v: vertical box sum + a/b for one row, built only from the row sums
; already stored at t1 and t2 (no pixels are read here). The t1 row is counted
; three times: result = [t2] + 3*[t1], for both the 16-bit sums and the 32-bit
; sums of squares. The packed a/b dwords are written to the row at t3.
; NOTE(review): m4, m9-m14, m18-m21 and k2 are initialised by the 5x5 prologue,
; which is outside this excerpt; presumably analogous to the 3x3 prologue.
910*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
911*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
    ; r10 is the column index; the loop adds 32 per pass and exits once it is
    ; no longer negative.
912*c0909341SAndroid Build Coastguard Worker.v_loop:
    ; m16/m17 = [t2] + [t1]; m2/m3 are then doubled and added => [t2] + 3*[t1]
913*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+416*2]
914*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t2+r10*2+416*2]
915*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+416*4]
916*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t2+r10*2+416*4]
917*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
918*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
919*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2               ; hv sumsq
920*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3
921*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9               ; -a * 25
922*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
    ; same [t2] + 3*[t1] weighting for the 16-bit pixel sums
923*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+416*0]
924*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+416*0]
925*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
926*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0               ; hv sum
    ; Interleave b with m4 (presumably zero, as in the 3x3 prologue) so that
    ; each dword holds one b word; vpdpwssd then accumulates b*b into m16/m17.
927*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m4           ; b
928*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0           ; -p
929*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4
930*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
931*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10              ; b * 164
932*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
933*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m11              ; p * s
934*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m11
    ; Byte-masked merge: where k2 is set, m17 receives m16 rotated right by two
    ; bytes within each 128-bit lane. Assuming k2 = 0x3333... as in the 3x3
    ; prologue, the low word of every m17 dword becomes the high word of the
    ; matching m16 dword, i.e. m17 holds the upper 16 bits of both products.
935*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
936*c0909341SAndroid Build Coastguard Worker    mova           m16, m20
937*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
938*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4                ; min(z, 255) - 256
    ; Two 128-entry byte lookups indexed by m17; k3 (the sign bit of each index
    ; byte) selects the result of the upper-half lookup.
939*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
940*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
941*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
942*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16              ; x
    ; m16 = m17 & ~m13 (multiplier for m0), m17 = upper word of each dword
    ; (multiplier for m1)
943*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17
944*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
945*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
946*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
947*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
948*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
    ; imm 0xd8 = bitwise select: bits set in m13 come from m0/m1, the remaining
    ; bits keep the x value already in m16/m17.
949*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
950*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m13, 0xd8
    ; The six stores interleave the 128-bit lanes of m16 and m17. The narrow
    ; stores overwrite parts of the full-width ones, so the order matters.
    ; Final layout from +8 in 16-byte steps:
    ;   m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3
951*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+  8], m16
952*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 24], xm17
953*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+ 56], m17, 2
954*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 72], m17
955*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+ 72], ym16, 1
956*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+104], m16, 3
957*c0909341SAndroid Build Coastguard Worker    add            r10, 32
958*c0909341SAndroid Build Coastguard Worker    jl .v_loop
959*c0909341SAndroid Build Coastguard Worker    ret
; 5x5 .prep_n: first neighbour pass. For each column, three horizontally
; adjacent packed a/b dwords of the row at t3 (byte offsets 0, 4 and 8) are
; combined with weights 5, 6, 5. The result is split into its a and b parts,
; which are stored at t3+416*4 (a) and t3+416*8 (b) for the later .n0 call.
; Processes 32 columns (two zmm registers of dwords) per iteration.
960*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
961*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
962*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
    ; m0/m1 = centre dword c; m2/m3 = l + c + r
963*c0909341SAndroid Build Coastguard Worker    movu            m0, [t3+r10*4+ 4]
964*c0909341SAndroid Build Coastguard Worker    movu            m1, [t3+r10*4+68]
965*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0, [t3+r10*4+ 0]
966*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1, [t3+r10*4+64]
967*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t3+r10*4+ 8]
968*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*4+72]
    ; 4*(l+c+r) + (l+c+r) + c = 5*l + 6*c + 5*r
969*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2
970*c0909341SAndroid Build Coastguard Worker    pslld           m2, 2
971*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
972*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2
973*c0909341SAndroid Build Coastguard Worker    paddd           m2, m0                ; ab 565
974*c0909341SAndroid Build Coastguard Worker    paddd           m3, m1
    ; a = value & ~m13, b = value >> 12 (m13 is presumably -4096 as in the 3x3
    ; prologue, which makes a the low 12 bits)
975*c0909341SAndroid Build Coastguard Worker    pandn           m0, m13, m2           ; a
976*c0909341SAndroid Build Coastguard Worker    psrld           m2, 12                ; b
977*c0909341SAndroid Build Coastguard Worker    pandn           m1, m13, m3
978*c0909341SAndroid Build Coastguard Worker    psrld           m3, 12
979*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4+ 0], m0
980*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*8+ 0], m2
981*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4+64], m1
982*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*8+64], m3
983*c0909341SAndroid Build Coastguard Worker    add            r10, 32
984*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
985*c0909341SAndroid Build Coastguard Worker    ret
; 5x5 .n0: neighbour pass + output for even rows. Computes the 5-6-5 weighted
; a/b of the current t3 row (same arithmetic as .prep_n), adds the a/b values
; previously saved at t3+416*4 / t3+416*8, replaces the saved values with the
; current ones, and filters 32 pixels of dst in place per iteration.
; dstq is advanced by strideq before returning.
986*c0909341SAndroid Build Coastguard WorkerALIGN function_align
987*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
988*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
989*c0909341SAndroid Build Coastguard Worker.n0_loop:
    ; m16/m17 = centre dword c; m0/m1 = l + c + r, then 5*l + 6*c + 5*r
990*c0909341SAndroid Build Coastguard Worker    movu           m16, [t3+r10*4+ 4]
991*c0909341SAndroid Build Coastguard Worker    movu           m17, [t3+r10*4+68]
992*c0909341SAndroid Build Coastguard Worker    paddd           m0, m16, [t3+r10*4+ 0]
993*c0909341SAndroid Build Coastguard Worker    paddd           m1, m17, [t3+r10*4+64]
994*c0909341SAndroid Build Coastguard Worker    paddd           m0, [t3+r10*4+ 8]
995*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*4+72]
996*c0909341SAndroid Build Coastguard Worker    paddd          m16, m0
997*c0909341SAndroid Build Coastguard Worker    pslld           m0, 2
998*c0909341SAndroid Build Coastguard Worker    paddd          m17, m1
999*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2
1000*c0909341SAndroid Build Coastguard Worker    paddd           m0, m16
1001*c0909341SAndroid Build Coastguard Worker    paddd           m1, m17
    ; split: m16/m17 = a (value & ~m13), m0/m1 = b (value >> 12)
1002*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m0
1003*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
1004*c0909341SAndroid Build Coastguard Worker    pandn          m17, m13, m1
1005*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
    ; Add the saved a/b to the current a/b, then overwrite the saved copies
    ; with the current values.
1006*c0909341SAndroid Build Coastguard Worker    paddd           m2, m16, [t3+r10*4+416*4+ 0] ; a
1007*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17, [t3+r10*4+416*4+64]
1008*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4+ 0], m16
1009*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4+64], m17
1010*c0909341SAndroid Build Coastguard Worker    paddd          m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
1011*c0909341SAndroid Build Coastguard Worker    paddd          m17, m1, [t3+r10*4+416*8+64]
1012*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*8+ 0], m0
1013*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*8+64], m1
    ; Load 2 x 16 pixels zero-extended to dwords. Since the upper word of each
    ; src dword is zero, pmaddwd yields (low word of a) * src per dword.
1014*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m0, [dstq+r10+ 0]
1015*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m1, [dstq+r10+16]
1016*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0                      ; a * src
1017*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1
    ; src as words, in the same per-lane order as the packssdw of m16/m17 below
1018*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
1019*c0909341SAndroid Build Coastguard Worker    psubd          m16, m2                      ; b - a * src + (1 << 8)
1020*c0909341SAndroid Build Coastguard Worker    psubd          m17, m3
1021*c0909341SAndroid Build Coastguard Worker    psrad          m16, 9
1022*c0909341SAndroid Build Coastguard Worker    psrad          m17, 9
1023*c0909341SAndroid Build Coastguard Worker    packssdw       m16, m17
    ; Scale by m15 with rounding and add the source pixel. m15 is presumably
    ; the filter weight set up by the 5x5 prologue (not visible here).
1024*c0909341SAndroid Build Coastguard Worker    pmulhrsw       m16, m15
1025*c0909341SAndroid Build Coastguard Worker    paddw          m16, m0
1026*c0909341SAndroid Build Coastguard Worker    packuswb       m16, m16
    ; NOTE(review): m22 is a dword permutation defined outside this excerpt;
    ; presumably it gathers the 32 result bytes into the low 256 bits.
1027*c0909341SAndroid Build Coastguard Worker    vpermd         m16, m22, m16
1028*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], ym16
1029*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1030*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
1031*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1032*c0909341SAndroid Build Coastguard Worker    ret
; 5x5 .n1: output for odd rows. Performs no neighbour computation of its own:
; it reads the a/b values currently stored at t3+416*4 / t3+416*8 (presumably
; written by the preceding .n0) and uses a shift of 8 where .n0 uses 9.
; Filters 32 pixels of dst in place per iteration, then advances dstq by
; strideq.
1033*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1034*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
1035*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1036*c0909341SAndroid Build Coastguard Worker.n1_loop:
    ; 2 x 16 pixels zero-extended to dwords, multiplied by the stored a
1037*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m0, [dstq+r10+ 0]
1038*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m1, [dstq+r10+16]
1039*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m0, [t3+r10*4+416*4+ 0] ; a * src
1040*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m1, [t3+r10*4+416*4+64]
1041*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*4+416*8+ 0]     ; b + (1 << 7)
1042*c0909341SAndroid Build Coastguard Worker    mova           m17, [t3+r10*4+416*8+64]
    ; src as words, in the same per-lane order as the packssdw of m16/m17 below
1043*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
1044*c0909341SAndroid Build Coastguard Worker    psubd          m16, m2                      ; b - a * src + (1 << 7)
1045*c0909341SAndroid Build Coastguard Worker    psubd          m17, m3
1046*c0909341SAndroid Build Coastguard Worker    psrad          m16, 8
1047*c0909341SAndroid Build Coastguard Worker    psrad          m17, 8
1048*c0909341SAndroid Build Coastguard Worker    packssdw       m16, m17
    ; scale by m15 with rounding, add the source pixel, saturate to bytes
1049*c0909341SAndroid Build Coastguard Worker    pmulhrsw       m16, m15
1050*c0909341SAndroid Build Coastguard Worker    paddw          m16, m0
1051*c0909341SAndroid Build Coastguard Worker    packuswb       m16, m16
    ; NOTE(review): m22 is a dword permutation defined outside this excerpt;
    ; presumably it gathers the 32 result bytes into the low 256 bits.
1052*c0909341SAndroid Build Coastguard Worker    vpermd         m16, m22, m16
1053*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], ym16
1054*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1055*c0909341SAndroid Build Coastguard Worker    jl .n1_loop
1056*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1057*c0909341SAndroid Build Coastguard Worker    ret
1058*c0909341SAndroid Build Coastguard Worker
; sgr_filter_3x3_8bpc(dst, stride, left, lpf, w, h, edge, params)
; Prologue and row driver of the 3x3 SGR filter for 8 bpc. The per-row work is
; done by the local routines .h/.h_top, .hv/.hv_bottom, .v, .prep_n and .n.
;
; edge bit flags tested here and in the helpers:
;   1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT, 4 = LR_HAVE_TOP, 8 = LR_HAVE_BOTTOM
;
; State established by the prologue:
;   m4       = 0
;   m5/m6/m7 = sgr_shuf+3 / +5 / +7, broadcast to every 128-bit lane
;   m8       = pd_m9            m10 = pw_164_455      m12 = pw_61448
;   m11      = 0 - dword [params+4], broadcast (-s1)
;   m13      = pd_m4096         m14 = pd_34816
;   m15      = word [params+10] (w1), broadcast and shifted left by 4
;   m18-m21  = the four 64-byte quarters of sgr_x_by_x
;   k1       = 0xfe (dwords 1..7 of a ymm), k2 = 0x3333333333333333 (byte mask)
;   r14      = r_ext_mask+75
;   lpfq and dstq are advanced by w, then wq is negated, so rows are indexed
;   with a negative column counter that runs up towards zero.
;   t1       = rsp + w*2 + 20 (box-sum row buffer, 416*6 bytes between rows)
;   t3       = rsp + w*4 + 16 + 416*12 (a/b row buffer)
;   [rsp]    = pointer to the rows used when LR_HAVE_BOTTOM is set
1059*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
1060*c0909341SAndroid Build Coastguard Worker                                                    w, h, edge, params
    ; params, w, h and edge are loaded explicitly from the r6mp / wm / hm / r7m
    ; argument slots
1061*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1062*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
1063*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1064*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
1065*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m5, [sgr_shuf+3]
1066*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1067*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m6, [sgr_shuf+5]
1068*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1069*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m7, [sgr_shuf+7]
1070*c0909341SAndroid Build Coastguard Worker    pxor            m4, m4
1071*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m8, [pd_m9]
1072*c0909341SAndroid Build Coastguard Worker    vpsubd         m11, m4, [paramsq+4] {1to16} ; -s1
1073*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m15, [paramsq+10]            ; w1
1074*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+20]
1075*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pw_164_455]
1076*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+16+416*12]
1077*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [pw_61448]              ; (15 << 12) + (1 << 3)
    ; from here on wq = -w
1078*c0909341SAndroid Build Coastguard Worker    neg             wq
1079*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [pd_m4096]
    ; r10 is only a scratch register for the two mask constants
1080*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfe
1081*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [pd_34816]              ; (1 << 11) + (1 << 15)
1082*c0909341SAndroid Build Coastguard Worker    kmovb           k1, r10d
1083*c0909341SAndroid Build Coastguard Worker    mova           m18, [sgr_x_by_x+64*0]
1084*c0909341SAndroid Build Coastguard Worker    mov            r10, 0x3333333333333333
1085*c0909341SAndroid Build Coastguard Worker    mova           m19, [sgr_x_by_x+64*1]
1086*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10
1087*c0909341SAndroid Build Coastguard Worker    mova           m20, [sgr_x_by_x+64*2]
1088*c0909341SAndroid Build Coastguard Worker    psllw          m15, 4
1089*c0909341SAndroid Build Coastguard Worker    mova           m21, [sgr_x_by_x+64*3]
1090*c0909341SAndroid Build Coastguard Worker    lea            r14, [r_ext_mask+75]
1091*c0909341SAndroid Build Coastguard Worker    mova           ym9, [sgr_shuf]
1092*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1093*c0909341SAndroid Build Coastguard Worker    jz .no_top
    ; LR_HAVE_TOP: horizontal sums of two consecutive lpf rows go into two row
    ; buffers (t2 = first, t1 = first + 416*6). [rsp] receives the row pointer
    ; for the bottom edge, lpfq switches to dst, and the first dst row is
    ; processed by .hv with t0 aliasing t2.
1094*c0909341SAndroid Build Coastguard Worker    call .h_top
1095*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1096*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1097*c0909341SAndroid Build Coastguard Worker    add             t1, 416*6
1098*c0909341SAndroid Build Coastguard Worker    call .h_top
1099*c0909341SAndroid Build Coastguard Worker    lea             t4, [lpfq+strideq*4]
1100*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1101*c0909341SAndroid Build Coastguard Worker    add             t4, strideq
1102*c0909341SAndroid Build Coastguard Worker    mov          [rsp], t4 ; below
1103*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1104*c0909341SAndroid Build Coastguard Worker    call .hv
1105*c0909341SAndroid Build Coastguard Worker.main:
    ; t5 keeps the current a/b row pointer; t3 moves on by 416*4 bytes.
    ; hd counts the remaining rows; h == 1 is handled separately in .height1.
1106*c0909341SAndroid Build Coastguard Worker    mov             t5, t3
1107*c0909341SAndroid Build Coastguard Worker    add             t3, 416*4
1108*c0909341SAndroid Build Coastguard Worker    dec             hd
1109*c0909341SAndroid Build Coastguard Worker    jz .height1
1110*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1111*c0909341SAndroid Build Coastguard Worker    call .hv
1112*c0909341SAndroid Build Coastguard Worker    call .prep_n
1113*c0909341SAndroid Build Coastguard Worker    dec             hd
1114*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
    ; steady state: one .hv (sums + a/b) and one .n (output) per remaining row
1115*c0909341SAndroid Build Coastguard Worker.main_loop:
1116*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1117*c0909341SAndroid Build Coastguard Worker    call .hv
1118*c0909341SAndroid Build Coastguard Worker    call .n
1119*c0909341SAndroid Build Coastguard Worker    dec             hd
1120*c0909341SAndroid Build Coastguard Worker    jnz .main_loop
1121*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1122*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
    ; LR_HAVE_BOTTOM: process two more rows starting at the pointer saved in
    ; [rsp]
1123*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
1124*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
1125*c0909341SAndroid Build Coastguard Worker    call .n
1126*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1127*c0909341SAndroid Build Coastguard Worker    call .hv_bottom
1128*c0909341SAndroid Build Coastguard Worker.end:
1129*c0909341SAndroid Build Coastguard Worker    call .n
1130*c0909341SAndroid Build Coastguard Worker    RET
    ; single-row image: no further pixel rows are read, both remaining a/b rows
    ; come from .v
1131*c0909341SAndroid Build Coastguard Worker.height1:
1132*c0909341SAndroid Build Coastguard Worker    call .v
1133*c0909341SAndroid Build Coastguard Worker    call .prep_n
1134*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1135*c0909341SAndroid Build Coastguard Worker    call .v
1136*c0909341SAndroid Build Coastguard Worker    jmp .end
    ; no rows below (or none left to read): the last two a/b rows are produced
    ; by .v from the stored row sums; t2 = t1 makes the second call use the
    ; same row for both inputs
1137*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1138*c0909341SAndroid Build Coastguard Worker    call .v
1139*c0909341SAndroid Build Coastguard Worker    call .n
1140*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1141*c0909341SAndroid Build Coastguard Worker    call .v
1142*c0909341SAndroid Build Coastguard Worker    jmp .end
    ; no rows above: [rsp] = lpf + stride*6, the first dst row goes through .h,
    ; and .v runs with t2 = t1 (same row for both inputs) before joining .main
1143*c0909341SAndroid Build Coastguard Worker.no_top:
1144*c0909341SAndroid Build Coastguard Worker    lea             t4, [lpfq+strideq*4]
1145*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1146*c0909341SAndroid Build Coastguard Worker    lea             t4, [t4+strideq*2]
1147*c0909341SAndroid Build Coastguard Worker    mov          [rsp], t4
1148*c0909341SAndroid Build Coastguard Worker    call .h
1149*c0909341SAndroid Build Coastguard Worker    lea             t0, [t1+416*6]
1150*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1151*c0909341SAndroid Build Coastguard Worker    call .v
1152*c0909341SAndroid Build Coastguard Worker    jmp .main
; 3x3 horizontal box sums for the pixel row at lpfq, 32 columns per iteration.
; Results are written to the row buffer at t1:
;   t1+416*0 : 16-bit sums of three shuffled views of the row (m5, m6, m7)
;   t1+416*2 : 32-bit sums of squares, from the punpcklwd halves
;   t1+416*4 : 32-bit sums of squares, from the punpckhwd halves
; Entry points:
;   .h     - the left-edge pixels come from [leftq] (4 bytes, leftq += 4) when
;            LR_HAVE_LEFT is set
;   .h_top - when LR_HAVE_LEFT is set, the left neighbours are read straight
;            from the row at [lpfq+r10-2]
; Without LR_HAVE_LEFT both entries replicate the first pixel of the row.
1153*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum
1154*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1155*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1156*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
    ; dword 0 = 4 bytes from left[]; k1 (0xfe) then loads dwords 1..7 from the
    ; row, leaving dword 0 untouched
1157*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq]
1158*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1159*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1160*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1161*c0909341SAndroid Build Coastguard Worker.h_extend_left:
    ; no left neighbours: fill the low lane with the first pixel of the row,
    ; then load dwords 1..7 as above
1162*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq]
1163*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1164*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1165*c0909341SAndroid Build Coastguard Worker.h_top:
1166*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1167*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1168*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1169*c0909341SAndroid Build Coastguard Worker.h_loop:
1170*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2]
1171*c0909341SAndroid Build Coastguard Worker.h_main:
    ; The upper 256 bits are loaded only 8 bytes further on, so the four
    ; 128-bit lanes hold overlapping 16-byte windows of the row.
1172*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1
1173*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1174*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
    ; right-edge padding is only applied once r10 >= -33
1175*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -33
1176*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
    ; m0 = last pixel of the row (lpfq points just past it); m16 = byte mask
    ; taken from the r_ext_mask table. imm 0xe4 is a bitwise select: mask bit
    ; set keeps the loaded pixel, mask bit clear takes the padding byte.
1177*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
1178*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r14+r10-8]
1179*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r14+r10+0], 1
1180*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, m16, 0xe4
1181*c0909341SAndroid Build Coastguard Worker.h_have_right:
    ; Three shuffled views of the row: m0 (via m5), m16 (via m6), m17 (via m7).
    ; They are treated as 16-bit values from here on.
1182*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m17, m5
1183*c0909341SAndroid Build Coastguard Worker    pmullw          m2, m0, m0
1184*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m17, m6
1185*c0909341SAndroid Build Coastguard Worker    paddw           m0, m16
1186*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m7
1187*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17    ; sum
    ; sumsq = (first view)^2 widened to dwords with m4 (zero), plus the squares
    ; of the other two views accumulated by vpdpwssd on the interleaved pairs
1188*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m16, m17
1189*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m4
1190*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m3, m3 ; sumsq
1191*c0909341SAndroid Build Coastguard Worker    punpckhwd      m16, m17
1192*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m4
1193*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16
1194*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*0], m0
1195*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*2], m1
1196*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*4], m2
1197*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1198*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1199*c0909341SAndroid Build Coastguard Worker    ret
; 3x3 .hv: horizontal box sums of the pixel row at lpfq, vertical sum with the
; two previous rows (t1 and t2), and computation of the packed a/b values,
; 32 columns per iteration.
;   in : lpfq = pixel row, t1/t2 = previous row sums, t0 = destination row
;        buffer for this row's horizontal sums, t3 = a/b output row
;   out: [t0] = horizontal sums of this row, [t3] = packed a | (b << 12) dwords
; Entry points:
;   .hv        - the left-edge pixels come from [leftq] (leftq += 4) when
;                LR_HAVE_LEFT is set
;   .hv_bottom - when LR_HAVE_LEFT is set, the left neighbours are read
;                straight from the row
; On return the row buffers are rotated: t2 = old t1, t1 = old t0, and t0 is
; set equal to the new t2. The next call therefore overwrites the oldest row
; in place; this is safe because each iteration reads [t2] before storing to
; [t0].
1200*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1201*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab
1202*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1203*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1204*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
    ; dword 0 = 4 bytes from left[]; k1 (0xfe) loads dwords 1..7 from the row
1205*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq]
1206*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1207*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1208*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1209*c0909341SAndroid Build Coastguard Worker.hv_extend_left:
    ; no left neighbours: replicate the first pixel of the row into the low lane
1210*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq]
1211*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1212*c0909341SAndroid Build Coastguard Worker    jmp .hv_main
1213*c0909341SAndroid Build Coastguard Worker.hv_bottom:
1214*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1215*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1216*c0909341SAndroid Build Coastguard Worker    jz .hv_extend_left
1217*c0909341SAndroid Build Coastguard Worker.hv_loop:
1218*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2]
1219*c0909341SAndroid Build Coastguard Worker.hv_main:
    ; upper 256 bits start 8 bytes after the lower ones => overlapping 16-byte
    ; windows per 128-bit lane
1220*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1
1221*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1222*c0909341SAndroid Build Coastguard Worker    jnz .hv_have_right
    ; right-edge padding is only applied once r10 >= -33
1223*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -33
1224*c0909341SAndroid Build Coastguard Worker    jl .hv_have_right
    ; bitwise select (imm 0xe4): mask bit set keeps the loaded pixel, mask bit
    ; clear takes the last pixel of the row broadcast in m0
1225*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
1226*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r14+r10-8]
1227*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r14+r10+0], 1
1228*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, m16, 0xe4
1229*c0909341SAndroid Build Coastguard Worker.hv_have_right:
    ; horizontal part: sum of three shuffled views in m0, sums of squares in
    ; m2 (punpcklwd halves) and m3 (punpckhwd halves)
1230*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m17, m5
1231*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m0, m0
1232*c0909341SAndroid Build Coastguard Worker    pshufb          m1, m17, m6
1233*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1234*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m7
1235*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17              ; h sum
1236*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m1
1237*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4
1238*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16         ; h sumsq
1239*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m1
1240*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
1241*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m17
    ; vertical part: this row + [t2] + [t1]; all loads from t2 happen before
    ; the stores to t0 below (t0 may alias t2)
1242*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+416*0]
1243*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10*2+416*0] ; hv sum
1244*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t2+r10*2+416*2]
1245*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t2+r10*2+416*4]
1246*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t1+r10*2+416*2] ; hv sumsq
1247*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t1+r10*2+416*4]
1248*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+416*0], m0
1249*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+416*2], m2
1250*c0909341SAndroid Build Coastguard Worker    mova [t0+r10*2+416*4], m3
1251*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m8               ; -a * 9
1252*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m8
    ; b is placed in the upper word of each dword (lower word = 0 from m4), so
    ; vpdpwssd adds b*b and pmaddwd multiplies b by the upper word of m10
1253*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4, m1           ; b
1254*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0           ; -p
1255*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m1
1256*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
1257*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10              ; b * 455
1258*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
1259*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m11              ; p * s
1260*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m11
    ; k2 (0x3333...) selects bytes 0-1 of every dword: the low word of each m17
    ; dword is replaced by the high word of the matching m16 dword, so m17 ends
    ; up holding the upper 16 bits of both products
1261*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
1262*c0909341SAndroid Build Coastguard Worker    mova           m16, m20
1263*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
1264*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4                ; min(z, 255) - 256
    ; two 128-entry byte lookups in sgr_x_by_x (m18-m21); k3 (sign bit of each
    ; index byte) selects the result of the upper-half lookup
1265*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
1266*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
1267*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
1268*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16              ; x
    ; m16 = low 12 bits of each dword (m13 = -4096), multiplier for m0;
    ; m17 = upper word of each dword, multiplier for m1
1269*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17
1270*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
1271*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1272*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1273*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1274*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
    ; imm 0xd8 = bitwise select: the upper 20 bits (set in m13) come from
    ; m0/m1, the low 12 bits keep the x value already in m16/m17
1275*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
1276*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m13, 0xd8
    ; Lane-interleaving stores; the narrow stores overwrite parts of the
    ; full-width ones, so the order matters. Final layout from +8 in 16-byte
    ; steps: m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3
1277*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+  8], m16
1278*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 24], xm17
1279*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+ 56], m17, 2
1280*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 72], m17
1281*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+ 72], ym16, 1
1282*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+104], m16, 3
1283*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1284*c0909341SAndroid Build Coastguard Worker    jl .hv_loop
    ; rotate row buffers: t2 = old t1, t1 = old t0, t0 = new t2
1285*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1286*c0909341SAndroid Build Coastguard Worker    mov             t1, t0
1287*c0909341SAndroid Build Coastguard Worker    mov             t0, t2
1288*c0909341SAndroid Build Coastguard Worker    ret
; .v: for each group of 32 columns, combine the box sums held in the t1 and t2
; row buffers as 2*t1 + t2, derive the packed a/b values from them and store
; 32 dwords to the t3 row. Returns with a plain `ret`.
; NOTE(review): the constants in m4, m8, m10-m14 and m18-m21 and the mask in
; k2 are loaded outside this chunk; notes about their meaning follow the
; existing inline comments - confirm against the function prologue.
1289*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab
1290*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
; r10 is the column index: it starts at wq-2 and is advanced by 32 until it is
; no longer negative (presumably wq holds the negated width - TODO confirm).
1291*c0909341SAndroid Build Coastguard Worker.v_loop:
; sumsq is kept as two 64-byte halves (t1+416*2 and t1+416*4); each half is
; doubled before the matching t2 half is added.
1292*c0909341SAndroid Build Coastguard Worker    mova           m16, [t1+r10*2+416*2]
1293*c0909341SAndroid Build Coastguard Worker    mova           m17, [t1+r10*2+416*4]
1294*c0909341SAndroid Build Coastguard Worker    paddd          m16, m16
1295*c0909341SAndroid Build Coastguard Worker    paddd          m17, m17
1296*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t2+r10*2+416*2] ; hv sumsq
1297*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t2+r10*2+416*4]
1298*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m8               ; -a * 9
1299*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m8
; The word sums at t1+416*0 get the same 2*t1 + t2 treatment.
1300*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+416*0]
1301*c0909341SAndroid Build Coastguard Worker    paddw           m1, m1
1302*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t2+r10*2+416*0] ; hv sum
; Interleaving with m4 puts each sum in the upper word of a dword, so
; vpdpwssd adds sum*sum (plus the square of the m4 word) per dword.
1303*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4, m1           ; b
1304*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0           ; -p
1305*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m1
1306*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
1307*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m10              ; b * 455
1308*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m10
1309*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m11              ; p * s
1310*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m11
; Bytes of m17 selected by k2 are replaced with m16 rotated right by two bytes
; within each 128-bit lane. With a 0x3333... byte mask this places the upper
; word of every m16 dword in the lower word of the matching m17 dword
; (k2 is set outside this chunk - TODO confirm its value here).
1311*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
1312*c0909341SAndroid Build Coastguard Worker    mova           m16, m20
1313*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m12
1314*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4                ; min(z, 255) - 256
; Two byte-table lookups indexed by m17: one into m20:m21 (via the m16 copy)
; and one into m18:m19. k3 captures the top bit of every index byte before the
; indices are overwritten and selects the m20:m21 result in the final merge.
1315*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
1316*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
1317*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
1318*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16              ; x
; m16 = m17 & ~m13 and m17 >>= 16 separate the two values packed per dword.
1319*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m17
1320*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
1321*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1322*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1323*c0909341SAndroid Build Coastguard Worker    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
1324*c0909341SAndroid Build Coastguard Worker    paddd           m1, m14
; imm8 0xd8 is a bitwise select: bits that are set in m13 are taken from
; m0/m1, all other bits keep the current m16/m17 value.
1325*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
1326*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m13, 0xd8
; Six overlapping stores fill the 128 bytes at t3+r10*4+8. Later stores
; overwrite lanes written by earlier ones, so the final 16-byte slots hold
; m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3 (128-bit lane
; numbers); presumably this undoes the lane interleaving of the inputs.
1327*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+  8], m16
1328*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 24], xm17
1329*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+ 56], m17, 2
1330*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+ 72], m17
1331*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+ 72], ym16, 1
1332*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+104], m16, 3
1333*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1334*c0909341SAndroid Build Coastguard Worker    jl .v_loop
1335*c0909341SAndroid Build Coastguard Worker    ret
; .prep_n: prime the neighbor buffers before the first output row.
; t4 takes the old value of t3 and t3 is advanced by 416*4. For each group of
; 16 dwords, with l/c/r being the dwords at byte offsets +0/+4/+8:
;   [t5]       <- 3*l + 4*c + 3*r of the t5 row ("343", in place)
;   [t4]       <- 3*l + 4*c + 3*r of the t4 row ("343", in place)
;   [t3+416*4] <- 2*(l + c + r)   of the t4 row ("222", t3 already advanced)
1336*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
1337*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1338*c0909341SAndroid Build Coastguard Worker    mov             t4, t3
1339*c0909341SAndroid Build Coastguard Worker    add             t3, 416*4
1340*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
; m2/m3 = l + r, m0/m1 = l + c + r (t5 row / t4 row respectively).
1341*c0909341SAndroid Build Coastguard Worker    mova            m2, [t5+r10*4+0]
1342*c0909341SAndroid Build Coastguard Worker    mova            m3, [t4+r10*4+0]
1343*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t5+r10*4+8]
1344*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t4+r10*4+8]
1345*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2, [t5+r10*4+4]
1346*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3, [t4+r10*4+4]
; 343 is formed as 4*(l + c + r) - (l + r).
1347*c0909341SAndroid Build Coastguard Worker    pslld           m0, 2
1348*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1                ; ab[ 0] 222
1349*c0909341SAndroid Build Coastguard Worker    psubd           m0, m2                ; ab[-1] 343
1350*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4], m1
; Doubling the stored 222 value gives 4*(l + c + r) for the second 343.
1351*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
1352*c0909341SAndroid Build Coastguard Worker    mova    [t5+r10*4], m0
1353*c0909341SAndroid Build Coastguard Worker    psubd           m1, m3                ; ab[ 0] 343
1354*c0909341SAndroid Build Coastguard Worker    mova    [t4+r10*4], m1
; 16 dwords per iteration; the loop runs while r10 is negative.
1355*c0909341SAndroid Build Coastguard Worker    add            r10, 16
1356*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
1357*c0909341SAndroid Build Coastguard Worker    ret
1358*c0909341SAndroid Build Coastguard Worker; a+b are packed together in a single dword, but we can't do the
1359*c0909341SAndroid Build Coastguard Worker; full neighbor calculations before splitting them since we don't
1360*c0909341SAndroid Build Coastguard Worker; have sufficient precision. The solution is to do the calculations
1361*c0909341SAndroid Build Coastguard Worker; in two equal halves and split a and b before doing the final sum.
1362*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n: produce one row of output pixels, 32 per iteration.
; Buffers read and updated in place:
;   [t3]        packed a/b dwords of the newest row (ab[+1]), read only
;   [t3+416*4]  saved "222" sums of ab[0]; replaced by those of ab[+1]
;   [t5]        saved "343" sums of ab[-1]; replaced by those of ab[+1]
; Each 32-pixel group is handled as two 64-byte halves (offsets +0 and +64).
; On exit t4 and t5 are swapped and dstq is advanced by strideq.
; NOTE(review): m9, m13 and m15 are loaded outside this chunk.
1363*c0909341SAndroid Build Coastguard Worker.n: ; neighbor + output
1364*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
1365*c0909341SAndroid Build Coastguard Worker.n_loop:
; First half: m16 = l + r, m17 = 2*(l + c + r) of the newest row.
1366*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*4+ 0]
1367*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*4+ 8]
1368*c0909341SAndroid Build Coastguard Worker    paddd          m17, m16, [t3+r10*4+ 4]
1369*c0909341SAndroid Build Coastguard Worker    paddd          m17, m17               ; ab[+1] 222
; The saved values of both halves are loaded before they are overwritten.
1370*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10*4+416*4+ 0]
1371*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
1372*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*4+416*4+64]
1373*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3, [t5+r10*4+64]
1374*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4+ 0], m17
; 343 = 2*222 - (l + r).
1375*c0909341SAndroid Build Coastguard Worker    paddd          m17, m17
1376*c0909341SAndroid Build Coastguard Worker    psubd          m17, m16               ; ab[+1] 343
1377*c0909341SAndroid Build Coastguard Worker    mova [t5+r10*4+ 0], m17
1378*c0909341SAndroid Build Coastguard Worker    paddd           m2, m17               ; ab[ 0] 222 + ab[+1] 343
; Second half: the same computation at offset +64.
1379*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*4+64]
1380*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*4+72]
1381*c0909341SAndroid Build Coastguard Worker    paddd          m17, m16, [t3+r10*4+68]
1382*c0909341SAndroid Build Coastguard Worker    paddd          m17, m17
1383*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*4+64], m17
1384*c0909341SAndroid Build Coastguard Worker    paddd          m17, m17
1385*c0909341SAndroid Build Coastguard Worker    psubd          m17, m16
1386*c0909341SAndroid Build Coastguard Worker    mova [t5+r10*4+64], m17
; Split each partial sum: the bits outside m13 go to the "a" accumulator and
; the value shifted right by 12 goes to "b"; the two partial sums are then
; added per component (see the comment above this routine).
1387*c0909341SAndroid Build Coastguard Worker    pandn          m16, m13, m0
1388*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12
1389*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17
1390*c0909341SAndroid Build Coastguard Worker    pandn          m17, m13, m2
1391*c0909341SAndroid Build Coastguard Worker    psrld           m2, 12
1392*c0909341SAndroid Build Coastguard Worker    paddd          m16, m17               ; a
1393*c0909341SAndroid Build Coastguard Worker    pandn          m17, m13, m1
1394*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12
1395*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                ; b + (1 << 8)
1396*c0909341SAndroid Build Coastguard Worker    pandn           m2, m13, m3
1397*c0909341SAndroid Build Coastguard Worker    psrld           m3, 12
1398*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2
; Source pixels are read back from dst, zero-extended from bytes to dwords.
; Their upper words are zero, so pmaddwd yields the plain product a * src.
1399*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m2, [dstq+r10+ 0]
1400*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3
1401*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [dstq+r10+16]
1402*c0909341SAndroid Build Coastguard Worker    pmaddwd        m16, m2                ; a * src
1403*c0909341SAndroid Build Coastguard Worker    pmaddwd        m17, m3
; m2 now holds the source pixels as words, in the same lane order that the
; packssdw of m0/m1 below produces.
1404*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1405*c0909341SAndroid Build Coastguard Worker    psubd           m0, m16               ; b - a * src + (1 << 8)
1406*c0909341SAndroid Build Coastguard Worker    psubd           m1, m17
1407*c0909341SAndroid Build Coastguard Worker    psrad           m0, 9
1408*c0909341SAndroid Build Coastguard Worker    psrad           m1, 9
1409*c0909341SAndroid Build Coastguard Worker    packssdw        m0, m1
; Scale the correction by m15 (presumably the filter weight - TODO confirm),
; add it to the source pixels and saturate to unsigned bytes.
1410*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m15
1411*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1412*c0909341SAndroid Build Coastguard Worker    packuswb        m0, m0
; vpermd with the indices in m9 gathers the result dwords so that the low
; 32 bytes can be stored contiguously (index table defined outside this chunk).
1413*c0909341SAndroid Build Coastguard Worker    vpermd         m16, m9, m0
1414*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], ym16
1415*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1416*c0909341SAndroid Build Coastguard Worker    jl .n_loop
; Swap t4 and t5 (r10 is free again as scratch) and step to the next dst row.
1417*c0909341SAndroid Build Coastguard Worker    mov            r10, t5
1418*c0909341SAndroid Build Coastguard Worker    mov             t5, t4
1419*c0909341SAndroid Build Coastguard Worker    mov             t4, r10
1420*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
1421*c0909341SAndroid Build Coastguard Worker    ret
1422*c0909341SAndroid Build Coastguard Worker
; sgr_filter_mix_8bpc: entry point, constant setup and row scheduling.
; Named arguments: dst, stride, left, lpf, w, h, edge, params. Per the x86inc
; cglobal convention the numeric fields request 4 auto-loaded arguments,
; 13 general-purpose registers, 28 vector registers and 416*56+8 bytes of
; stack (see x86inc for the exact semantics).
; The prologue:
;   - loads the remaining arguments (params, w, h, edge) by hand,
;   - fills m4-m15 and m20-m27 plus the masks k1 (0xfe) and k2 (0x3333...)
;     with the constants the helper routines below rely on,
;   - advances lpfq and dstq by w and then negates wq, so that columns are
;     addressed with negative offsets that count up towards zero,
;   - points t1 and t3 into the stack buffer.
; The rest of the block only sequences calls to the helper routines; several
; of them (.v0, .v1, .n0, .n1, .hv1, .hv1_bottom) are defined outside this
; chunk.
1423*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
1424*c0909341SAndroid Build Coastguard Worker                                                  w, h, edge, params
1425*c0909341SAndroid Build Coastguard Worker    mov        paramsq, r6mp
1426*c0909341SAndroid Build Coastguard Worker    mov             wd, wm
1427*c0909341SAndroid Build Coastguard Worker    movifnidn       hd, hm
1428*c0909341SAndroid Build Coastguard Worker    mov          edged, r7m
; Scalar pointer setup is interleaved with the constant loads below.
1429*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m5, [sgr_shuf+1]
1430*c0909341SAndroid Build Coastguard Worker    add           lpfq, wq
1431*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m6, [sgr_shuf+9]
1432*c0909341SAndroid Build Coastguard Worker    add           dstq, wq
1433*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m7, [sgr_shuf+3]
1434*c0909341SAndroid Build Coastguard Worker    lea             t3, [rsp+wq*4+416*24+8]
1435*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m8, [sgr_shuf+7]
; m4 stays zero; it is also used here to negate the two strengths read from
; params (0 - s0, 0 - s1).
1436*c0909341SAndroid Build Coastguard Worker    pxor            m4, m4
1437*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [pd_m9]
1438*c0909341SAndroid Build Coastguard Worker    vpsubd         m11, m4, [paramsq+0] {1to16} ; -s0
1439*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m14, [pw_61448]
1440*c0909341SAndroid Build Coastguard Worker    vpsubd         m12, m4, [paramsq+4] {1to16} ; -s1
1441*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m26, [paramsq+8]             ; w0 w1
1442*c0909341SAndroid Build Coastguard Worker    lea             t1, [rsp+wq*2+12]
1443*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pd_m25]
; From here on wq is negative.
1444*c0909341SAndroid Build Coastguard Worker    neg             wq
1445*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m13, [pw_164_455]
; r10 is used as scratch to build the two opmask constants.
1446*c0909341SAndroid Build Coastguard Worker    mov           r10d, 0xfe
1447*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [pd_34816]
1448*c0909341SAndroid Build Coastguard Worker    kmovb           k1, r10d
1449*c0909341SAndroid Build Coastguard Worker    mova           m20, [sgr_x_by_x+64*0]
1450*c0909341SAndroid Build Coastguard Worker    mov            r10, 0x3333333333333333
1451*c0909341SAndroid Build Coastguard Worker    mova           m21, [sgr_x_by_x+64*1]
1452*c0909341SAndroid Build Coastguard Worker    kmovq           k2, r10
1453*c0909341SAndroid Build Coastguard Worker    mova           m22, [sgr_x_by_x+64*2]
; r12 is the base for the right-edge mask loads in the horizontal routines.
1454*c0909341SAndroid Build Coastguard Worker    lea            r12, [r_ext_mask+75]
1455*c0909341SAndroid Build Coastguard Worker    mova           m23, [sgr_x_by_x+64*3]
1456*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m24, [pd_m4096]
1457*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m25, [sgr_shuf+28]           ; 0x8000____
1458*c0909341SAndroid Build Coastguard Worker    psllw          m26, 5
1459*c0909341SAndroid Build Coastguard Worker    mova          xm27, [sgr_mix_perm]
1460*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; LR_HAVE_TOP
1461*c0909341SAndroid Build Coastguard Worker    jz .no_top
; LR_HAVE_TOP: run the horizontal pass on the two rows at lpf, calling the
; 5x5 function's .top_fixup routine (defined elsewhere) in between and moving
; t1 forward by 416*12 for the second row.
1462*c0909341SAndroid Build Coastguard Worker    call .h_top
1463*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1464*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1465*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
1466*c0909341SAndroid Build Coastguard Worker    add             t1, 416*12
1467*c0909341SAndroid Build Coastguard Worker    call .h_top
; [rsp] receives the original lpf + 6*stride (lpfq was already advanced by one
; stride above); the .no_top path stores the same address. It is reloaded into
; lpfq before .hv0_bottom. lpfq itself switches to the dst rows.
1468*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1469*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1470*c0909341SAndroid Build Coastguard Worker    add            r10, strideq
1471*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10 ; below
1472*c0909341SAndroid Build Coastguard Worker    call .hv0
1473*c0909341SAndroid Build Coastguard Worker.main:
; One row has been consumed so far. If it was the only one go to .height1,
; otherwise process a second row and prime the neighbor buffers.
1474*c0909341SAndroid Build Coastguard Worker    dec             hd
1475*c0909341SAndroid Build Coastguard Worker    jz .height1
1476*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1477*c0909341SAndroid Build Coastguard Worker    call .hv1
1478*c0909341SAndroid Build Coastguard Worker    call .prep_n
1479*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1480*c0909341SAndroid Build Coastguard Worker    jl .extend_bottom
1481*c0909341SAndroid Build Coastguard Worker.main_loop:
; Steady state: two input rows (.hv0, .hv1) followed by two output calls
; (.n0, .n1) per iteration. hd == 0 after .hv0 means no row is left to pair
; with it, which is handled at .odd_height.
1482*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1483*c0909341SAndroid Build Coastguard Worker    call .hv0
1484*c0909341SAndroid Build Coastguard Worker    test            hd, hd
1485*c0909341SAndroid Build Coastguard Worker    jz .odd_height
1486*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1487*c0909341SAndroid Build Coastguard Worker    call .hv1
1488*c0909341SAndroid Build Coastguard Worker    call .n0
1489*c0909341SAndroid Build Coastguard Worker    call .n1
1490*c0909341SAndroid Build Coastguard Worker    sub             hd, 2
1491*c0909341SAndroid Build Coastguard Worker    jge .main_loop
; Rows from dst are exhausted. With LR_HAVE_BOTTOM the next two rows are read
; starting at the pointer saved in [rsp]; otherwise .extend_bottom is used.
1492*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; LR_HAVE_BOTTOM
1493*c0909341SAndroid Build Coastguard Worker    jz .extend_bottom
1494*c0909341SAndroid Build Coastguard Worker    mov           lpfq, [rsp]
1495*c0909341SAndroid Build Coastguard Worker    call .hv0_bottom
1496*c0909341SAndroid Build Coastguard Worker    add           lpfq, strideq
1497*c0909341SAndroid Build Coastguard Worker    call .hv1_bottom
1498*c0909341SAndroid Build Coastguard Worker.end:
1499*c0909341SAndroid Build Coastguard Worker    call .n0
1500*c0909341SAndroid Build Coastguard Worker    call .n1
1501*c0909341SAndroid Build Coastguard Worker.end2:
1502*c0909341SAndroid Build Coastguard Worker    RET
; h == 1: no second input row, so the vertical-only routine .v1 takes the
; place of .hv1 before the neighbor buffers are primed.
1503*c0909341SAndroid Build Coastguard Worker.height1:
1504*c0909341SAndroid Build Coastguard Worker    call .v1
1505*c0909341SAndroid Build Coastguard Worker    call .prep_n
1506*c0909341SAndroid Build Coastguard Worker    jmp .odd_height_end
; Odd remaining row count: finish the pending pair with .v1, emit two rows,
; then fall through to emit the last one.
1507*c0909341SAndroid Build Coastguard Worker.odd_height:
1508*c0909341SAndroid Build Coastguard Worker    call .v1
1509*c0909341SAndroid Build Coastguard Worker    call .n0
1510*c0909341SAndroid Build Coastguard Worker    call .n1
1511*c0909341SAndroid Build Coastguard Worker.odd_height_end:
1512*c0909341SAndroid Build Coastguard Worker    call .v0
1513*c0909341SAndroid Build Coastguard Worker    call .v1
1514*c0909341SAndroid Build Coastguard Worker    call .n0
1515*c0909341SAndroid Build Coastguard Worker    jmp .end2
; No further input rows are read: the vertical-only routines run instead of
; .hv0/.hv1 before the final two output calls at .end.
1516*c0909341SAndroid Build Coastguard Worker.extend_bottom:
1517*c0909341SAndroid Build Coastguard Worker    call .v0
1518*c0909341SAndroid Build Coastguard Worker    call .v1
1519*c0909341SAndroid Build Coastguard Worker    jmp .end
; No LR_HAVE_TOP: save lpf + 6*stride in [rsp], run the horizontal pass on the
; first dst row and derive the contents of the t2 buffer from it.
1520*c0909341SAndroid Build Coastguard Worker.no_top:
1521*c0909341SAndroid Build Coastguard Worker    lea            r10, [lpfq+strideq*4]
1522*c0909341SAndroid Build Coastguard Worker    mov           lpfq, dstq
1523*c0909341SAndroid Build Coastguard Worker    lea            r10, [r10+strideq*2]
1524*c0909341SAndroid Build Coastguard Worker    mov          [rsp], r10
1525*c0909341SAndroid Build Coastguard Worker    call .h
1526*c0909341SAndroid Build Coastguard Worker    lea             t2, [t1+416*12]
1527*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1528*c0909341SAndroid Build Coastguard Worker.top_fixup_loop:
; Copy the six arrays written by .h from t1 to t2. The sum5/sumsq5 arrays
; (416*0/2/4) are doubled on the way, the sum3/sumsq3 arrays (416*6/8/10) are
; copied unchanged.
1529*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+416* 0]
1530*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+416* 2]
1531*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+416* 4]
1532*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1533*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+416* 6]
1534*c0909341SAndroid Build Coastguard Worker    paddd           m1, m1
1535*c0909341SAndroid Build Coastguard Worker    mova           m16, [t1+r10*2+416* 8]
1536*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1537*c0909341SAndroid Build Coastguard Worker    mova           m17, [t1+r10*2+416*10]
1538*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 0], m0
1539*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 2], m1
1540*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 4], m2
1541*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 6], m3
1542*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 8], m16
1543*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*10], m17
1544*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1545*c0909341SAndroid Build Coastguard Worker    jl .top_fixup_loop
1546*c0909341SAndroid Build Coastguard Worker    call .v0
1547*c0909341SAndroid Build Coastguard Worker    jmp .main
; .h / .h_top: horizontal box sums for the row at lpfq, 32 columns per
; iteration, written to the t1 buffer:
;   416*6        sum3 (words)      416*8, 416*10  sumsq3 (dwords, two halves)
;   416*0        sum5 (words)      416*2, 416*4   sumsq5 (dwords, two halves)
; Entry points:
;   .h      with LR_HAVE_LEFT the four bytes left of the row come from leftq,
;           which is then advanced by 4.
;   .h_top  with LR_HAVE_LEFT those bytes are read from memory in front of
;           the row (the first load is the regular .h_loop load).
; Without LR_HAVE_LEFT both entries replicate the first pixel of the row.
; Relies on the prologue state: lpfq biased by w, wq negative, k1 = 0xfe,
; r12 = r_ext_mask+75, m4 = 0 and the shuffle masks in m5-m8.
1548*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsums
1549*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1550*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1551*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
; Dword 0 = bytes from leftq; k1 (0xfe) then merges dwords 1-7 from the row,
; so dword 0 is neither read from memory nor overwritten.
1552*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq]
1553*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1554*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1555*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1556*c0909341SAndroid Build Coastguard Worker.h_extend_left:
; [lpfq+wq] is the first pixel of the row; its broadcast remains in dword 0
; after the masked load.
1557*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq]
1558*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1559*c0909341SAndroid Build Coastguard Worker    jmp .h_main
1560*c0909341SAndroid Build Coastguard Worker.h_top:
1561*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1562*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1563*c0909341SAndroid Build Coastguard Worker    jz .h_extend_left
1564*c0909341SAndroid Build Coastguard Worker.h_loop:
1565*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2]
1566*c0909341SAndroid Build Coastguard Worker.h_main:
; The upper 256 bits start 8 bytes after the lower 256 bits, so the two
; halves of m17 overlap in the source row.
1567*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1
1568*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1569*c0909341SAndroid Build Coastguard Worker    jnz .h_have_right
1570*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
1571*c0909341SAndroid Build Coastguard Worker    jl .h_have_right
; No LR_HAVE_RIGHT and r10 >= -34: pad past the end of the row. m0 is the last
; pixel of the row ([lpfq-1]); imm8 0xe4 keeps the bits of m17 where the mask
; loaded relative to r12 is set and takes m0 elsewhere.
1572*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
1573*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r12+r10-8]
1574*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r12+r10+0], 1
1575*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, m16, 0xe4
1576*c0909341SAndroid Build Coastguard Worker.h_have_right:
; m3, m18, m19 and m17 become four shuffled copies of the row (masks m5-m8)
; and m0 a fifth one built from m3/m18; presumably the five taps of the
; window as words - the sgr_shuf table is defined outside this chunk.
1577*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m17, m5
1578*c0909341SAndroid Build Coastguard Worker    pshufb         m18, m17, m6
1579*c0909341SAndroid Build Coastguard Worker    shufps          m0, m3, m18, q2121
1580*c0909341SAndroid Build Coastguard Worker    pmullw          m2, m0, m0
1581*c0909341SAndroid Build Coastguard Worker    pshufb         m19, m17, m7
1582*c0909341SAndroid Build Coastguard Worker    paddw           m0, m19
1583*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m8
1584*c0909341SAndroid Build Coastguard Worker    paddw           m0, m17           ; sum3
; sumsq3: the squares in m2 are widened to dwords with m4 (zero) and
; vpdpwssd adds the squares of the m19/m17 word pairs on top.
1585*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m19, m17
1586*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m2, m4
1587*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m16, m16      ; sumsq3
1588*c0909341SAndroid Build Coastguard Worker    punpckhwd      m19, m17
1589*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m4
1590*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m19, m19
1591*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 6], m0
1592*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 8], m1
1593*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*10], m2
; The 5-wide sums extend the 3-wide ones with the m3 and m18 copies.
1594*c0909341SAndroid Build Coastguard Worker    punpcklwd      m19, m3, m18
1595*c0909341SAndroid Build Coastguard Worker    paddw           m0, m3
1596*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m19, m19      ; sumsq5
1597*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m18
1598*c0909341SAndroid Build Coastguard Worker    paddw           m0, m18           ; sum5
1599*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m3, m3
1600*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 0], m0
1601*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 2], m1
1602*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 4], m2
1603*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1604*c0909341SAndroid Build Coastguard Worker    jl .h_loop
1605*c0909341SAndroid Build Coastguard Worker    ret
1606*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv0 / .hv0_bottom: for the row at lpfq, 32 columns per iteration:
;   1. compute the horizontal sum3/sumsq3 and sum5/sumsq5 (same scheme as .h),
;   2. update the running vertical sums kept in the t1 and t2 buffers,
;   3. turn the 3-row sums into packed a3/b3 dwords stored at t3+416*4.
; The unaccumulated sum5/sumsq5 of this row are also saved to the t3 buffer
; (416*8 and 416*0).
; Entry points: .hv0 takes the bytes left of the row from leftq when
; LR_HAVE_LEFT is set (advancing leftq by 4); .hv0_bottom reads them from
; memory in front of the row. Without LR_HAVE_LEFT the first pixel is
; replicated. Relies on the constants loaded by the function prologue.
1607*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
1608*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1609*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1610*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
; Dword 0 from leftq, dwords 1-7 merged from the row under k1 (0xfe).
1611*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq]
1612*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1613*c0909341SAndroid Build Coastguard Worker    add          leftq, 4
1614*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1615*c0909341SAndroid Build Coastguard Worker.hv0_extend_left:
; Dword 0 keeps the broadcast of the first pixel of the row.
1616*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq]
1617*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1618*c0909341SAndroid Build Coastguard Worker    jmp .hv0_main
1619*c0909341SAndroid Build Coastguard Worker.hv0_bottom:
1620*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1621*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1622*c0909341SAndroid Build Coastguard Worker    jz .hv0_extend_left
1623*c0909341SAndroid Build Coastguard Worker.hv0_loop:
1624*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2]
1625*c0909341SAndroid Build Coastguard Worker.hv0_main:
1626*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1
1627*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1628*c0909341SAndroid Build Coastguard Worker    jnz .hv0_have_right
1629*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
1630*c0909341SAndroid Build Coastguard Worker    jl .hv0_have_right
; Right-edge padding: keep the bits of m17 where the r12-relative mask is set,
; take the last pixel of the row ([lpfq-1]) elsewhere (imm8 0xe4).
1631*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
1632*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r12+r10-8]
1633*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r12+r10+0], 1
1634*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m0, m16, 0xe4
1635*c0909341SAndroid Build Coastguard Worker.hv0_have_right:
; Horizontal pass: m1 = sum3, m2/m3 = sumsq3; m18/m19 hold the two extra
; shuffled copies that are needed for the 5-wide sums further down.
1636*c0909341SAndroid Build Coastguard Worker    pshufb         m18, m17, m5
1637*c0909341SAndroid Build Coastguard Worker    pshufb         m19, m17, m6
1638*c0909341SAndroid Build Coastguard Worker    shufps          m1, m18, m19, q2121
1639*c0909341SAndroid Build Coastguard Worker    pmullw          m3, m1, m1
1640*c0909341SAndroid Build Coastguard Worker    pshufb          m0, m17, m7
1641*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0
1642*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m8
1643*c0909341SAndroid Build Coastguard Worker    paddw           m1, m17           ; sum3
1644*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m0, m17
1645*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4
1646*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m16, m16      ; sumsq3
1647*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m17
1648*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
1649*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m0, m0
; m0/m16/m17 = this row + the sum3/sumsq3 stored in t1; t1 then keeps the
; values of this row alone.
1650*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1, [t1+r10*2+416* 6]
1651*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t1+r10*2+416* 8]
1652*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t1+r10*2+416*10]
1653*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 6], m1
1654*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 8], m2
1655*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*10], m3
; sum5 of this row: saved unaccumulated at t3+416*8, then added into the
; running sum at t1+416*0.
1656*c0909341SAndroid Build Coastguard Worker    paddw           m1, m18
1657*c0909341SAndroid Build Coastguard Worker    paddw           m1, m19           ; sum5
1658*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*8+ 8], m1
1659*c0909341SAndroid Build Coastguard Worker    paddw           m1, [t1+r10*2+416* 0]
1660*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 0], m1
; sumsq5 of this row: same pattern, saved at t3+416*0 and accumulated into
; t1+416*2 / t1+416*4.
1661*c0909341SAndroid Build Coastguard Worker    punpcklwd       m1, m18, m19
1662*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m2, m1, m1        ; sumsq5
1663*c0909341SAndroid Build Coastguard Worker    punpckhwd      m18, m19
1664*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m18, m18
1665*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*0+ 8], m2      ; we need a clean copy of the last row
1666*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*0+72], m3      ; in case height is odd
1667*c0909341SAndroid Build Coastguard Worker    paddd           m2, [t1+r10*2+416* 2]
1668*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t1+r10*2+416* 4]
1669*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 2], m2
1670*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416* 4], m3
; m1/m2/m3 = the two-row sums above + the values stored in t2; t2 then
; receives the two-row sums.
1671*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+416* 6]
1672*c0909341SAndroid Build Coastguard Worker    paddd           m2, m16, [t2+r10*2+416* 8]
1673*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17, [t2+r10*2+416*10]
1674*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 6], m0
1675*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 8], m16
1676*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*10], m17
; a3/b3 computation; m9 = pd_m9, m12 = -s1, m13 = pw_164_455, m14 = pw_61448,
; m15 = pd_34816 and m24 = pd_m4096 as loaded by the prologue.
1677*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m2, m9        ; -a3 * 9
1678*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m3, m9
1679*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4, m1        ; b3
1680*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0        ; -p3
1681*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m1
1682*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
1683*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m12           ; p3 * s1
1684*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m12
1685*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13           ; b3 * 455
1686*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
; k2 (0x3333...) selects the low two bytes of every dword: they receive the
; upper word of the matching m16 dword, so m17 carries the upper words of
; both products.
1687*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
1688*c0909341SAndroid Build Coastguard Worker    mova           m16, m22
1689*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m14
1690*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4             ; min(z3, 255) - 256
; Byte lookups into sgr_x_by_x: m22:m23 for the upper half and m20:m21 for
; the lower half; k3 (top bit of each index byte, captured before the indices
; are overwritten) picks the upper-half result in the merge.
1691*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1692*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
1693*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1694*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16           ; x3
1695*c0909341SAndroid Build Coastguard Worker    pandn          m16, m24, m17
1696*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
1697*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1698*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1699*c0909341SAndroid Build Coastguard Worker    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1700*c0909341SAndroid Build Coastguard Worker    paddd           m1, m15
; imm8 0xd8: bits set in m24 come from m0/m1, the rest keep m16/m17.
1701*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1702*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m24, 0xd8
; Six overlapping stores fill 128 bytes at t3+r10*4+416*4+8; the final
; 16-byte slots hold m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3
; (128-bit lane numbers).
1703*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*4+  8], m16
1704*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*4+ 24], xm17
1705*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
1706*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*4+ 72], m17
1707*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+416*4+ 72], ym16, 1
1708*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
1709*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1710*c0909341SAndroid Build Coastguard Worker    jl .hv0_loop
1711*c0909341SAndroid Build Coastguard Worker    ret
1712*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .hv1 -- odd-row pass. Each iteration loads source bytes from lpfq+r10,
; forms the horizontal sums labelled sum3/sumsq3 and sum5/sumsq5, adds the
; row sums kept in the t1/t2 buffers, stores the new row sums to t2, and
; writes two packed results to t3: "a3 | (b3 << 12)" at +416*8 and
; "a5 | (b5 << 12)" at +416*0. r10 starts at wq-2 and advances by 32 until it
; is no longer negative. On exit t1 and t2 are swapped (r10 is the scratch).
; .hv1_bottom is a second entry point that never reads leftq.
1713*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
1714*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1715*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1716*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
; Left edge available: the first 4 bytes come from [leftq], the remaining
; elements selected by k1 from the row itself.
1717*c0909341SAndroid Build Coastguard Worker    movd          xm17, [leftq]
1718*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1719*c0909341SAndroid Build Coastguard Worker    add          leftq, 4 ; step past the 4 bytes just consumed
1720*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1721*c0909341SAndroid Build Coastguard Worker.hv1_extend_left:
; No left edge: replicate the byte at [lpfq+wq] in place of the left bytes.
1722*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  xm17, [lpfq+wq]
1723*c0909341SAndroid Build Coastguard Worker    vmovdqu32 ym17{k1}, [lpfq+wq-4]
1724*c0909341SAndroid Build Coastguard Worker    jmp .hv1_main
1725*c0909341SAndroid Build Coastguard Worker.hv1_bottom:
1726*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1727*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; LR_HAVE_LEFT
1728*c0909341SAndroid Build Coastguard Worker    jz .hv1_extend_left
; With LR_HAVE_LEFT set this entry falls through and reads the left bytes
; straight from the row (lpfq+r10-2) instead of from leftq.
1729*c0909341SAndroid Build Coastguard Worker.hv1_loop:
1730*c0909341SAndroid Build Coastguard Worker    movu          ym17, [lpfq+r10-2]
1731*c0909341SAndroid Build Coastguard Worker.hv1_main:
1732*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m17, [lpfq+r10+6], 1
1733*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; LR_HAVE_RIGHT
1734*c0909341SAndroid Build Coastguard Worker    jnz .hv1_have_right
1735*c0909341SAndroid Build Coastguard Worker    cmp           r10d, -34
1736*c0909341SAndroid Build Coastguard Worker    jl .hv1_have_right
; Right-edge padding (only when LR_HAVE_RIGHT is clear and r10d >= -34):
; m0 = byte at [lpfq-1] broadcast, m16 = selection mask read relative to r12.
; vpternlogd 0xe4 keeps the m17 bits where m16 is set and takes m0 elsewhere.
; NOTE(review): r12 presumably points at a right-extension mask table that is
; set up outside this chunk -- confirm against the function prologue.
1737*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, [lpfq-1]
1738*c0909341SAndroid Build Coastguard Worker    movu          ym16, [r12+r10-8]
1739*c0909341SAndroid Build Coastguard Worker    vinserti32x8   m16, [r12+r10+0], 1
1740*c0909341SAndroid Build Coastguard Worker    vpternlogd    m17, m0, m16, 0xe4
1741*c0909341SAndroid Build Coastguard Worker.hv1_have_right:
; Horizontal stage. m5-m8 are byte-shuffle controls and m4 is the other
; operand of the word interleaves; all are defined outside this chunk.
1742*c0909341SAndroid Build Coastguard Worker    pshufb          m3, m17, m5
1743*c0909341SAndroid Build Coastguard Worker    pshufb         m19, m17, m6
1744*c0909341SAndroid Build Coastguard Worker    shufps          m2, m3, m19, q2121
1745*c0909341SAndroid Build Coastguard Worker    pmullw          m1, m2, m2
1746*c0909341SAndroid Build Coastguard Worker    pshufb         m18, m17, m7
1747*c0909341SAndroid Build Coastguard Worker    paddw           m2, m18
1748*c0909341SAndroid Build Coastguard Worker    pshufb         m17, m8
1749*c0909341SAndroid Build Coastguard Worker    paddw           m2, m17           ; sum3
1750*c0909341SAndroid Build Coastguard Worker    punpcklwd      m16, m17, m18
1751*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1, m4
1752*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m16, m16      ; sumsq3
1753*c0909341SAndroid Build Coastguard Worker    punpckhwd      m17, m18
1754*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4
1755*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m17, m17
; Vertical stage for the 3-tap sums of squares: add the value kept in t2,
; then overwrite t2 with this row's value.
1756*c0909341SAndroid Build Coastguard Worker    paddd          m16, m0, [t2+r10*2+416* 8]
1757*c0909341SAndroid Build Coastguard Worker    paddd          m17, m1, [t2+r10*2+416*10]
1758*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 8], m0
1759*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*10], m1
1760*c0909341SAndroid Build Coastguard Worker    punpcklwd      m18, m3, m19
1761*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m18, m18      ; sumsq5
1762*c0909341SAndroid Build Coastguard Worker    punpckhwd      m18, m3, m19
1763*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m1, m18, m18
1764*c0909341SAndroid Build Coastguard Worker    paddw           m3, m19
1765*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9            ; -a3 * 9
1766*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
; 5-tap sums of squares: this row + the t2 entry + the t1 entry.
1767*c0909341SAndroid Build Coastguard Worker    paddd          m18, m0, [t2+r10*2+416*2]
1768*c0909341SAndroid Build Coastguard Worker    paddd          m19, m1, [t2+r10*2+416*4]
1769*c0909341SAndroid Build Coastguard Worker    paddd          m18, [t1+r10*2+416*2]
1770*c0909341SAndroid Build Coastguard Worker    paddd          m19, [t1+r10*2+416*4]
1771*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*2], m0
1772*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*4], m1
1773*c0909341SAndroid Build Coastguard Worker    pmulld         m18, m10           ; -a5 * 25
1774*c0909341SAndroid Build Coastguard Worker    pmulld         m19, m10
1775*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2, [t2+r10*2+416* 6]
1776*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 6], m2
1777*c0909341SAndroid Build Coastguard Worker    paddw           m2, m3            ; sum5
1778*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t2+r10*2+416*0]
1779*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t1+r10*2+416*0]
1780*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*0], m2
1781*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4, m1        ; b3
1782*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0        ; -p3
1783*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m1
1784*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
1785*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4        ; b5
1786*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m18, m2, m2        ; -p5
1787*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
1788*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m19, m3, m3
1789*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m12           ; p3 * s1
1790*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m12
1791*c0909341SAndroid Build Coastguard Worker    pmulld         m18, m11           ; p5 * s0
1792*c0909341SAndroid Build Coastguard Worker    pmulld         m19, m11
; The same register m13 is the multiplier for both b3 and b5. b3 was
; interleaved as (m4, b3) and b5 as (b5, m4), so each product uses a
; different word of every m13 dword pair.
1793*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13           ; b3 * 455
1794*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
1795*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m13           ; b5 * 164
1796*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m13
1797*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
1798*c0909341SAndroid Build Coastguard Worker    vpalignr   m19{k2}, m18, m18, 2
; Table lookup: m20/m21 and m22/m23 hold the two halves of the table named
; sgr_x_by_x in the inline comments. k3/k4 capture the sign bit of each index
; byte and choose which half's result is kept by the masked moves below.
1799*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m14
1800*c0909341SAndroid Build Coastguard Worker    mova           m16, m22
1801*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4             ; min(z3, 255) - 256
1802*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1803*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
1804*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1805*c0909341SAndroid Build Coastguard Worker    paddusw        m19, m14
1806*c0909341SAndroid Build Coastguard Worker    mova           m18, m22
1807*c0909341SAndroid Build Coastguard Worker    psraw          m19, 4             ; min(z5, 255) - 256
1808*c0909341SAndroid Build Coastguard Worker    vpermt2b       m18, m19, m23      ; sgr_x_by_x[128..255]
1809*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k4, m19
1810*c0909341SAndroid Build Coastguard Worker    vpermi2b       m19, m20, m21      ; sgr_x_by_x[  0..127]
1811*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16           ; x3
1812*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m19{k4}, m18           ; x5
1813*c0909341SAndroid Build Coastguard Worker    pandn          m16, m24, m17
1814*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
1815*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1816*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1817*c0909341SAndroid Build Coastguard Worker    pandn          m18, m24, m19
1818*c0909341SAndroid Build Coastguard Worker    psrld          m19, 16
1819*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m18
1820*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m19
1821*c0909341SAndroid Build Coastguard Worker    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1822*c0909341SAndroid Build Coastguard Worker    paddd           m1, m15
; vpternlogd 0xd8: take the second operand's bits where m24 is set and keep
; the destination's bits elsewhere.
1823*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1824*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m24, 0xd8
; The six overlapping stores lay the 128-bit lanes out in memory as
; m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3 (128 bytes from +8).
1825*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*8+  8], m16
1826*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*8+ 24], xm17
1827*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
1828*c0909341SAndroid Build Coastguard Worker    paddd           m2, m15           ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
1829*c0909341SAndroid Build Coastguard Worker    paddd           m3, m15
1830*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*8+ 72], m17
1831*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+416*8+ 72], ym16, 1
1832*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
1833*c0909341SAndroid Build Coastguard Worker    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
1834*c0909341SAndroid Build Coastguard Worker    vpternlogd     m19, m3, m24, 0xd8
; Same lane layout for the 5x5 result, stored at +416*0.
1835*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*0+  8], m18
1836*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*0+ 24], xm19
1837*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
1838*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*0+ 72], m19
1839*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
1840*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
1841*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1842*c0909341SAndroid Build Coastguard Worker    jl .hv1_loop
; Swap the t1 and t2 buffer pointers before returning.
1843*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
1844*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
1845*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
1846*c0909341SAndroid Build Coastguard Worker    ret
; .v0 -- even-row pass that reads no source row. The 3-tap row sums in t1 are
; doubled, added to the t2 entries, and turned into "a3 | (b3 << 12)" at
; t3+416*4. The doubled values are also written back to t2. The three 5-tap
; entries of t1 are copied unmodified into t3 scratch slots (+416*8+8,
; +416*0+8, +416*0+72) and then doubled in place in t1. r10 runs from wq-2 in
; steps of 32 until it is no longer negative.
1847*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows)
1848*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1849*c0909341SAndroid Build Coastguard Worker.v0_loop:
1850*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+416* 8]
1851*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+416*10]
1852*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
1853*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
1854*c0909341SAndroid Build Coastguard Worker    paddd          m16, m2, [t2+r10*2+416* 8]
1855*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t2+r10*2+416*10]
1856*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+416* 6]
1857*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0
1858*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+416* 6]
1859*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9            ; -a3 * 9
1860*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
; t2 receives the doubled t1 values (not the totals in m16/m17/m1).
1861*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 6], m0
1862*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 8], m2
1863*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*10], m3
; Load the 5-tap entries of t1; they are stashed in t3 and doubled below.
1864*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1+r10*2+416*0]
1865*c0909341SAndroid Build Coastguard Worker    mova            m3, [t1+r10*2+416*2]
1866*c0909341SAndroid Build Coastguard Worker    mova           m18, [t1+r10*2+416*4]
1867*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4, m1        ; b3
1868*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0        ; -p3
1869*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m1
1870*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
1871*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m12           ; p3 * s1
1872*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m12
1873*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13           ; b3 * 455
1874*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
; Scratch copies of the undoubled 5-tap entries; .v1 reads these same three
; t3 offsets back.
1875*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*8+ 8], m2
1876*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*0+ 8], m3
1877*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*0+72], m18
1878*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
; Table lookup: k3 holds the sign bit of each index byte and selects between
; the results of the two table halves (m22/m23 vs. m20/m21).
1879*c0909341SAndroid Build Coastguard Worker    mova           m16, m22
1880*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m14
1881*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4             ; min(z3, 255) - 256
1882*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1883*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
1884*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1885*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16           ; x3
1886*c0909341SAndroid Build Coastguard Worker    pandn          m16, m24, m17
1887*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
1888*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1889*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1890*c0909341SAndroid Build Coastguard Worker    paddw           m2, m2            ; cc5
1891*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
1892*c0909341SAndroid Build Coastguard Worker    paddd          m18, m18
1893*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*0], m2
1894*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*2], m3
1895*c0909341SAndroid Build Coastguard Worker    mova [t1+r10*2+416*4], m18
1896*c0909341SAndroid Build Coastguard Worker    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1897*c0909341SAndroid Build Coastguard Worker    paddd           m1, m15
1898*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1899*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m24, 0xd8
; Overlapping stores; resulting 128-bit lane order in memory is
; m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3.
1900*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*4+  8], m16
1901*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*4+ 24], xm17
1902*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
1903*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*4+ 72], m17
1904*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+416*4+ 72], ym16, 1
1905*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
1906*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1907*c0909341SAndroid Build Coastguard Worker    jl .v0_loop
1908*c0909341SAndroid Build Coastguard Worker    ret
; .v1 -- odd-row pass that reads no source row. 3-tap sums are t1 + t2. The
; 5-tap sums are the values read from the t3 scratch slots (+416*0+8,
; +416*0+72, +416*8+8; the same offsets .v0 writes) plus the t2 and t1
; entries. Writes "a3 | (b3 << 12)" to t3+416*8 and "a5 | (b5 << 12)" to
; t3+416*0, updates t2, and swaps t1/t2 on exit. r10 runs from wq-2 in steps
; of 32 until it is no longer negative.
1909*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows)
1910*c0909341SAndroid Build Coastguard Worker    lea            r10, [wq-2]
1911*c0909341SAndroid Build Coastguard Worker.v1_loop:
1912*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+416* 8]
1913*c0909341SAndroid Build Coastguard Worker    paddd          m16, m0, [t2+r10*2+416* 8]
1914*c0909341SAndroid Build Coastguard Worker    mova            m1, [t1+r10*2+416*10]
1915*c0909341SAndroid Build Coastguard Worker    paddd          m17, m1, [t2+r10*2+416*10]
1916*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10*4+416*0+ 8]
1917*c0909341SAndroid Build Coastguard Worker    paddd          m18, m2, [t2+r10*2+416* 2]
1918*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*4+416*0+72]
1919*c0909341SAndroid Build Coastguard Worker    paddd          m19, m3, [t2+r10*2+416* 4]
1920*c0909341SAndroid Build Coastguard Worker    paddd          m18, [t1+r10*2+416* 2]
1921*c0909341SAndroid Build Coastguard Worker    paddd          m19, [t1+r10*2+416* 4]
; Overwrite the t2 entries with the values just read from t1 and t3.
1922*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 8], m0
1923*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*10], m1
1924*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 2], m2
1925*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 4], m3
1926*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m9            ; -a3 * 9
1927*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m9
1928*c0909341SAndroid Build Coastguard Worker    pmulld         m18, m10           ; -a5 * 25
1929*c0909341SAndroid Build Coastguard Worker    pmulld         m19, m10
1930*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1+r10*2+416* 6]
1931*c0909341SAndroid Build Coastguard Worker    paddw           m1, m0, [t2+r10*2+416* 6]
1932*c0909341SAndroid Build Coastguard Worker    mova            m2, [t3+r10*4+416*8+ 8]
1933*c0909341SAndroid Build Coastguard Worker    paddw           m3, m2, [t2+r10*2+416*0]
1934*c0909341SAndroid Build Coastguard Worker    paddw           m3, [t1+r10*2+416*0]
1935*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416* 6], m0
1936*c0909341SAndroid Build Coastguard Worker    mova [t2+r10*2+416*0], m2
1937*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m4, m1        ; b3
1938*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m0, m0        ; -p3
1939*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m4, m1
1940*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m1, m1
1941*c0909341SAndroid Build Coastguard Worker    punpcklwd       m2, m3, m4        ; b5
1942*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m18, m2, m2        ; -p5
1943*c0909341SAndroid Build Coastguard Worker    punpckhwd       m3, m4
1944*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m19, m3, m3
1945*c0909341SAndroid Build Coastguard Worker    pmulld         m16, m12           ; p3 * s1
1946*c0909341SAndroid Build Coastguard Worker    pmulld         m17, m12
1947*c0909341SAndroid Build Coastguard Worker    pmulld         m18, m11           ; p5 * s0
1948*c0909341SAndroid Build Coastguard Worker    pmulld         m19, m11
; m13 serves as the multiplier for both b3 (interleaved as m4, b3) and
; b5 (interleaved as b5, m4); each uses a different word of the dword pairs.
1949*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m13           ; b3 * 455
1950*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m13
1951*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m13           ; b5 * 164
1952*c0909341SAndroid Build Coastguard Worker    pmaddwd         m3, m13
1953*c0909341SAndroid Build Coastguard Worker    vpalignr   m17{k2}, m16, m16, 2
1954*c0909341SAndroid Build Coastguard Worker    vpalignr   m19{k2}, m18, m18, 2
; Table lookups: k3/k4 hold the sign bit of each index byte and select
; between the results of the two table halves (m22/m23 vs. m20/m21).
1955*c0909341SAndroid Build Coastguard Worker    paddusw        m17, m14
1956*c0909341SAndroid Build Coastguard Worker    mova           m16, m22
1957*c0909341SAndroid Build Coastguard Worker    psraw          m17, 4             ; min(z3, 255) - 256
1958*c0909341SAndroid Build Coastguard Worker    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
1959*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k3, m17
1960*c0909341SAndroid Build Coastguard Worker    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
1961*c0909341SAndroid Build Coastguard Worker    paddusw        m19, m14
1962*c0909341SAndroid Build Coastguard Worker    mova           m18, m22
1963*c0909341SAndroid Build Coastguard Worker    psraw          m19, 4             ; min(z5, 255) - 256
1964*c0909341SAndroid Build Coastguard Worker    vpermt2b       m18, m19, m23      ; sgr_x_by_x[128..255]
1965*c0909341SAndroid Build Coastguard Worker    vpmovb2m        k4, m19
1966*c0909341SAndroid Build Coastguard Worker    vpermi2b       m19, m20, m21      ; sgr_x_by_x[  0..127]
1967*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k3}, m16           ; x3
1968*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m19{k4}, m18           ; x5
1969*c0909341SAndroid Build Coastguard Worker    pandn          m16, m24, m17
1970*c0909341SAndroid Build Coastguard Worker    psrld          m17, 16
1971*c0909341SAndroid Build Coastguard Worker    pmulld          m0, m16
1972*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m17
1973*c0909341SAndroid Build Coastguard Worker    pandn          m18, m24, m19
1974*c0909341SAndroid Build Coastguard Worker    psrld          m19, m19, 16 ; NOTE(review): explicit 3-operand form; .hv1 writes "psrld m19, 16" here -- presumably equivalent, confirm
1975*c0909341SAndroid Build Coastguard Worker    pmulld          m2, m18
1976*c0909341SAndroid Build Coastguard Worker    pmulld          m3, m19
1977*c0909341SAndroid Build Coastguard Worker    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
1978*c0909341SAndroid Build Coastguard Worker    paddd           m1, m15
1979*c0909341SAndroid Build Coastguard Worker    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
1980*c0909341SAndroid Build Coastguard Worker    vpternlogd     m17, m1, m24, 0xd8
; Overlapping stores; resulting 128-bit lane order in memory is
; m16.0, m17.0, m16.2, m17.2, m16.1, m17.1, m16.3, m17.3.
1981*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*8+  8], m16
1982*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*8+ 24], xm17
1983*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
1984*c0909341SAndroid Build Coastguard Worker    paddd           m2, m15           ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
1985*c0909341SAndroid Build Coastguard Worker    paddd           m3, m15
1986*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*8+ 72], m17
1987*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+416*8+ 72], ym16, 1
1988*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
1989*c0909341SAndroid Build Coastguard Worker    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
1990*c0909341SAndroid Build Coastguard Worker    vpternlogd     m19, m3, m24, 0xd8
; Same lane layout for the 5x5 result; this overwrites the scratch values
; that were read from +416*0 at the top of the iteration.
1991*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*0+  8], m18
1992*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*0+ 24], xm19
1993*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
1994*c0909341SAndroid Build Coastguard Worker    mova          [t3+r10*4+416*0+ 72], m19
1995*c0909341SAndroid Build Coastguard Worker    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
1996*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
1997*c0909341SAndroid Build Coastguard Worker    add            r10, 32
1998*c0909341SAndroid Build Coastguard Worker    jl .v1_loop
; Swap the t1 and t2 buffer pointers before returning.
1999*c0909341SAndroid Build Coastguard Worker    mov            r10, t2
2000*c0909341SAndroid Build Coastguard Worker    mov             t2, t1
2001*c0909341SAndroid Build Coastguard Worker    mov             t1, r10
2002*c0909341SAndroid Build Coastguard Worker    ret
; .prep_n -- seeds the horizontally weighted neighbour sums that .n0/.n1 read.
; Per dword position, with l/c/r = the values at byte offsets +0/+4/+8:
;   row t3+416*0 : 5*l + 6*c + 5*r ("565"), split with m24 into
;                  t3+416*12 (bits where m24 is clear) and
;                  t3+416*16 (the sum shifted right by 12)
;   row t3+416*4 : 3*l + 4*c + 3*r ("343")  -> t3+416*24
;   row t3+416*8 : 2*(l + c + r)   ("222")  -> t3+416*20
;                  3*l + 4*c + 3*r ("343")  -> t3+416*28
; r10 starts at wq and advances by 16 until it is no longer negative.
; NOTE(review): m24 is presumably the mask of the upper 20 bits of each dword
; (the "b" field of the a | (b << 12) packing) -- it is set up outside this chunk.
2003*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup
2004*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
2005*c0909341SAndroid Build Coastguard Worker.prep_n_loop:
2006*c0909341SAndroid Build Coastguard Worker    movu            m0, [t3+r10*4+416*0+4]
2007*c0909341SAndroid Build Coastguard Worker    paddd           m1, m0, [t3+r10*4+416*0+0]
2008*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*4+416*4+0]
2009*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*4+416*0+8]
2010*c0909341SAndroid Build Coastguard Worker    mova           m17, [t3+r10*4+416*8+0]
2011*c0909341SAndroid Build Coastguard Worker    paddd          m16, [t3+r10*4+416*4+8]
2012*c0909341SAndroid Build Coastguard Worker    paddd          m17, [t3+r10*4+416*8+8]
2013*c0909341SAndroid Build Coastguard Worker    paddd           m2, m16, [t3+r10*4+416*4+4]
2014*c0909341SAndroid Build Coastguard Worker    paddd           m3, m17, [t3+r10*4+416*8+4]
; Here m1/m2/m3 = l+c+r of the three rows, m16/m17 = l+r, m0 = c of row 0.
2015*c0909341SAndroid Build Coastguard Worker    paddd           m0, m1
2016*c0909341SAndroid Build Coastguard Worker    pslld           m1, 2
2017*c0909341SAndroid Build Coastguard Worker    pslld           m2, 2
2018*c0909341SAndroid Build Coastguard Worker    paddd           m1, m0            ; ab5 565
2019*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3            ; ab3[ 0] 222
2020*c0909341SAndroid Build Coastguard Worker    psubd           m2, m16           ; ab3[-1] 343
2021*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*20], m3
2022*c0909341SAndroid Build Coastguard Worker    pandn           m0, m24, m1       ; a5 565
2023*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*24], m2
2024*c0909341SAndroid Build Coastguard Worker    psrld           m1, 12            ; b5 565
2025*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*12], m0
2026*c0909341SAndroid Build Coastguard Worker    paddd           m3, m3
2027*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*16], m1
2028*c0909341SAndroid Build Coastguard Worker    psubd           m3, m17           ; ab3[ 0] 343
2029*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*28], m3
2030*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2031*c0909341SAndroid Build Coastguard Worker    jl .prep_n_loop
2032*c0909341SAndroid Build Coastguard Worker    ret
2033*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n0 -- even-row neighbour pass plus output. Each iteration handles 16 bytes
; of dst (r10 starts at wq and advances by 16 until it is no longer negative):
;   * recomputes the "565" sum from row t3+416*0, adds the values stored at
;     t3+416*12/16, and replaces those stored values with the new ones;
;   * recomputes the "222"/"343" sums from row t3+416*4 and combines them with
;     the values stored at t3+416*20/24, which are likewise replaced;
;   * multiplies the a terms by the zero-extended dst bytes, subtracts from the
;     b terms, blends both results using m26, and writes 16 bytes back to dst.
; dstq is advanced by strideq before returning.
; NOTE(review): m25, m26 and m27 (bias, blend weights, byte-permute control
; per their use here) are set up outside this chunk -- confirm their contents.
2034*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows)
2035*c0909341SAndroid Build Coastguard Worker    mov            r10, wq
2036*c0909341SAndroid Build Coastguard Worker.n0_loop:
2037*c0909341SAndroid Build Coastguard Worker    movu            m2, [t3+r10*4+4]
2038*c0909341SAndroid Build Coastguard Worker    paddd           m3, m2, [t3+r10*4+0]
2039*c0909341SAndroid Build Coastguard Worker    paddd           m3, [t3+r10*4+8]
2040*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*4+416*4+0]
2041*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3
2042*c0909341SAndroid Build Coastguard Worker    pslld           m3, 2
2043*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*4+416*4+8]
2044*c0909341SAndroid Build Coastguard Worker    paddd           m3, m2 ; 5*l + 6*c + 5*r of row t3+416*0
2045*c0909341SAndroid Build Coastguard Worker    pandn           m2, m24, m3
2046*c0909341SAndroid Build Coastguard Worker    psrld           m3, 12
2047*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2, [t3+r10*4+416*12] ; a5
2048*c0909341SAndroid Build Coastguard Worker    paddd          m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
; Keep this row's split 565 sums in place of the ones just consumed.
2049*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*12], m2
2050*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*16], m3
2051*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1, [t3+r10*4+416*4+4]
2052*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2                    ; ab3[ 1] 222
2053*c0909341SAndroid Build Coastguard Worker    mova            m3, [t3+r10*4+416*20]
2054*c0909341SAndroid Build Coastguard Worker    paddd          m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
2055*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*20], m2
2056*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2
2057*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1                    ; ab3[ 1] 343
2058*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*24], m2
2059*c0909341SAndroid Build Coastguard Worker    paddd           m2, m3                    ; ab3[ 0] 222 + ab3[ 1] 343
; Split both partial sums with m24 / a 12-bit shift, then add the halves.
2060*c0909341SAndroid Build Coastguard Worker    pandn           m1, m24, m17
2061*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12
2062*c0909341SAndroid Build Coastguard Worker    pandn           m3, m24, m2
2063*c0909341SAndroid Build Coastguard Worker    psrld           m2, 12
2064*c0909341SAndroid Build Coastguard Worker    paddd           m1, m3                    ; a3
2065*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [dstq+r10]
2066*c0909341SAndroid Build Coastguard Worker    paddd          m17, m2                    ; b3 + (1 << 8)
2067*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m3                    ; a5 * src
2068*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m3                    ; a3 * src
2069*c0909341SAndroid Build Coastguard Worker    vpshldd         m3, m25, 16               ; (dst << 16) + (1 << 15)
2070*c0909341SAndroid Build Coastguard Worker    psubd          m16, m0                    ; b5 - a5 * src + (1 << 8)
2071*c0909341SAndroid Build Coastguard Worker    psubd          m17, m1                    ; b3 - a3 * src + (1 << 8)
2072*c0909341SAndroid Build Coastguard Worker    psrld          m16, 9
2073*c0909341SAndroid Build Coastguard Worker    pslld          m17, 7
; Merge the two results into one register under k2 so that a single
; vpdpwssd against m26 accumulates both onto m3.
2074*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m17{k2}, m16
2075*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m26
2076*c0909341SAndroid Build Coastguard Worker    packuswb        m3, m2
2077*c0909341SAndroid Build Coastguard Worker    vpermb         m16, m27, m3
2078*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm16
2079*c0909341SAndroid Build Coastguard Worker    add            r10, 16
2080*c0909341SAndroid Build Coastguard Worker    jl .n0_loop
2081*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq ; step dst by one stride
2082*c0909341SAndroid Build Coastguard Worker    ret
2083*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .n1 -- local routine, entered at the label below and left through `ret`.
; Per its label comment this is the "neighbor + output" pass for odd rows.
; Each loop iteration:
;   * builds the "222" and "343" weighted 3-tap sums from the dwords at
;     t3+r10*4+416*8 (byte offsets +0/+4/+8),
;   * consumes the values currently held in the 416*20 and 416*28 slots of
;     t3 and then overwrites those slots with the new "222"/"343" sums,
;   * splits each packed sum into an "a" field (the bits that are clear in
;     m24) and a "b" field (bits 12 and up),
;   * combines them with the a5/b5 terms read from the 416*12 / 416*16 slots
;     and with the bytes read from [dstq+r10],
;   * stores 16 result bytes back to [dstq+r10].
; r10 starts at wq and is stepped by 16 for as long as the result of the add
; is negative, so wq is presumably a negative count on entry -- TODO confirm
; against the caller.
; On exit dstq has been advanced by strideq.
; Reads m24, m25, m26, m27 and k2, all of which are set up outside this
; block. Overwrites m0-m3, m16, m17, r10 and the flags.
2084*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows)
2085*c0909341SAndroid Build Coastguard Worker    mov            r10, wq  ; r10 = loop index, starts at wq
2086*c0909341SAndroid Build Coastguard Worker.n1_loop:
2087*c0909341SAndroid Build Coastguard Worker    mova            m1, [t3+r10*4+416*8+0]  ; dwords at byte offset +0
2088*c0909341SAndroid Build Coastguard Worker    paddd           m1, [t3+r10*4+416*8+8]  ; m1 = [+0] + [+8] (the two outer taps)
2089*c0909341SAndroid Build Coastguard Worker    paddd           m2, m1, [t3+r10*4+416*8+4]  ; m2 = [+0] + [+4] + [+8]
    ; doubling gives every one of the three taps a weight of 2
2090*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2                    ; ab3[ 1] 222
2091*c0909341SAndroid Build Coastguard Worker    mova            m0, [t3+r10*4+416*20]  ; value currently in the 416*20 slot (overwritten below)
2092*c0909341SAndroid Build Coastguard Worker    paddd          m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
2093*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [dstq+r10]  ; bytes at [dstq+r10], zero-extended to dwords
2094*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*20], m2  ; save the new 222 sum in the slot just read
2095*c0909341SAndroid Build Coastguard Worker    paddd           m2, m2  ; 444
    ; 444 minus the two outer taps leaves weights 3, 4, 3
2096*c0909341SAndroid Build Coastguard Worker    psubd           m2, m1                    ; ab3[ 1] 343
2097*c0909341SAndroid Build Coastguard Worker    mova [t3+r10*4+416*28], m2  ; save the new 343 sum in the slot just read
2098*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                    ; ab3[ 0] 222 + ab3[ 1] 343
    ; Split both packed sums: pandn keeps the bits that are clear in m24
    ; (the "a" field), the shift keeps bits 12 and up (the "b" field).
2099*c0909341SAndroid Build Coastguard Worker    pandn           m1, m24, m17  ; m1 = m17 & ~m24
2100*c0909341SAndroid Build Coastguard Worker    psrld          m17, 12  ; m17 >>= 12
2101*c0909341SAndroid Build Coastguard Worker    pandn           m2, m24, m0  ; m2 = m0 & ~m24
2102*c0909341SAndroid Build Coastguard Worker    psrld           m0, 12  ; m0 >>= 12
2103*c0909341SAndroid Build Coastguard Worker    paddd           m1, m2                    ; a3
2104*c0909341SAndroid Build Coastguard Worker    paddd          m17, m0                    ; b3 + (1 << 8)
2105*c0909341SAndroid Build Coastguard Worker    mova           m16, [t3+r10*4+416*16]     ; b5 + (1 << 7)
    ; pmaddwd multiplies 16-bit halves and adds the two products per dword;
    ; the upper half of every m3 dword is zero after pmovzxbd, so only the
    ; low-word product contributes.
2106*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m3                    ; a3 * src
2107*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m3, [t3+r10*4+416*12] ; a5 * src
    ; per dword: m3 shifted left by 16, low 16 bits filled from the upper
    ; half of m25 (m25 is set outside this block)
2108*c0909341SAndroid Build Coastguard Worker    vpshldd         m3, m25, 16               ; (dst << 16) + (1 << 15)
2109*c0909341SAndroid Build Coastguard Worker    psubd          m17, m1                    ; b3 - a3 * src + (1 << 8)
2110*c0909341SAndroid Build Coastguard Worker    psubd          m16, m0                    ; b5 - a5 * src + (1 << 7)
2111*c0909341SAndroid Build Coastguard Worker    pslld          m17, 7  ; shift the b3 term left by 7
    ; Under byte mask k2, replace bytes of m17 with m16 rotated right by one
    ; byte inside each 128-bit lane (palignr with both sources equal).
    ; k2 is set outside this block; presumably it selects the low word of
    ; every dword, which would place (m16 >> 8) there -- TODO confirm.
2112*c0909341SAndroid Build Coastguard Worker    palignr    m17{k2}, m16, m16, 1
    ; m3 += per-dword signed dot product of the word pairs in m17 and m26
    ; (m26 is set outside this block)
2113*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m3, m17, m26
2114*c0909341SAndroid Build Coastguard Worker    packuswb        m3, m3  ; words -> bytes with unsigned saturation
2115*c0909341SAndroid Build Coastguard Worker    vpermb         m16, m27, m3  ; pick bytes of m3 using the indices in m27
2116*c0909341SAndroid Build Coastguard Worker    mova    [dstq+r10], xm16  ; store the low 16 bytes of the result
2117*c0909341SAndroid Build Coastguard Worker    add            r10, 16  ; advance by 16 bytes of dst / 16 dwords of t3
2118*c0909341SAndroid Build Coastguard Worker    jl .n1_loop  ; repeat while r10 is still negative
2119*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq  ; dstq += strideq
2120*c0909341SAndroid Build Coastguard Worker    ret
2121*c0909341SAndroid Build Coastguard Worker
2122*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
2123