xref: /aosp_15_r20/external/libdav1d/src/x86/cdef_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, label0 [, label1, ...]
;
; Emits a table of 32-bit entries at the current position, one per label
; argument, and %xdefines <name>_jmptable to the table's address.
;   %1     - function name without the private prefix and the "_avx2" suffix
;   %2...  - local labels of that function, written without the leading '.'
; Every entry holds (<mangled function>.<label> - table): an offset relative
; to the table itself, not an absolute address. A user therefore has to add
; the table address back before transferring control.
%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 ; %0 counts the name in %1 as well, hence one iteration per label.
 %rep %0 - 1
    dd %%base %+ .%2 - %%table
  ; shift the argument list so the next label becomes %2
  %rotate 1
 %endrep
%endmacro
40*c0909341SAndroid Build Coastguard Worker
; CDEF_FILTER_JMP_TABLE size
;
; Instantiates JMP_TABLE for cdef_filter_<size>_8bpc (size is e.g. 4x4).
; The 24 entries form 12 rows of two labels (k0, k1) in this direction order:
;   6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1
; PREP_REGS points dirq at row `dir` of this table and ACCUMULATE_TAP_BYTE
; adds a further row offset (its tap_offset argument) plus k.
; NOTE(review): the rows repeated at both ends presumably let that row offset
; step past direction 7 (or before 0) without masking with 7 - the call sites
; that choose tap_offset are outside this view, confirm there.
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro
48*c0909341SAndroid Build Coastguard Worker
; Read-only constants. SECTION_RODATA comes from x86inc.asm; the argument is
; the alignment requested for the section.
SECTION_RODATA 32

pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
               dd 0x80, 0x00, 0x00
blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
               dd 0x00, 0x00
blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000
blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000, 0x0000
blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_128:        times 2 dw 128
pw_2048:       times 2 dw 2048
; tap_table layout: 8 mask bytes, 6 weight bytes, then two offset bytes per
; direction (14 bytes precede the first offset pair).
tap_table:     ; masks for 8 bit shifts
               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
               ; weights
               db  4,  2,  3,  3,  2,  1
               ; offsets, one pair per direction, each written as
               ; row * 16 + column
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
               db  1 * 16 + 0,  2 * 16 + 0
               db  1 * 16 + 0,  2 * 16 - 1
               ; the last 6 are repeats of the first 6 so we don't need to & 7
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1

; One jump table per supported block size (see CDEF_FILTER_JMP_TABLE).
CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8

SECTION .text
93*c0909341SAndroid Build Coastguard Worker
; PREP_REGS w, h
;
; Register setup for the jump-table based filter path (the counterpart of
; BORDER_PREP_REGS). The names dir and table must already be mapped to
; registers by the caller, because they are used before this macro issues its
; own DEFINE_ARGS.
;   dird     = dir argument, read from the stack slot r7m
;   tableq   = address of cdef_filter_<w>x<h>_8bpc_jmptable
;   dirq     = tableq + dir*2*4: the first of the two 4-byte entries (k0, k1)
;              of row `dir` in that table
;   w == 4, h != 4: dst4q = dstq + 4*stride
;   w != 4        : hq = -8, top2q = top1q + stride; h and top1 take the
;                   3rd and 4th argument slots (left and top in the cglobal
;                   signature). stride3 is named but NOT written here.
;   w == 4        : stride3q = 3*stride
%macro PREP_REGS 2 ; w, h
    mov           dird, r7m
    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea           dirq, [tableq+dirq*2*4]
%if %1 == 4
 %if %2 == 4
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, stride3, k
 %else
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, dst4, stride3, k
    lea          dst4q, [dstq+strideq*4]
 %endif
%else
  DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
              table, dir, dirjmp, top2, stride3, k
    mov             hq, -8
    ; the added offset is stride*0, so top1q keeps its value
    lea          top1q, [top1q+strideq*0]
    lea          top2q, [top1q+strideq*1]
%endif
%if %1 == 4
    lea       stride3q, [strideq*3]
%endif
%endmacro
119*c0909341SAndroid Build Coastguard Worker
; LOAD_BLOCK w, h [, init_min_max = 0]
;
; Loads the 8-bit pixels at dstq into m4 and resets the per-block state.
;   kd  = 1
;   m15 = 0 (sum); for h == 8 also m12 = 0 (the second accumulator that the
;         h != 4 path of ACCUMULATE_TAP_BYTE adds to)
;   m4  = px, per 128-bit lane:
;           h != 8        : rows 0,1 in the low lane, rows 2,3 in the high lane
;                           (4 bytes per row; the upper 8 bytes of the high
;                           lane hold over-read bytes, those of the low lane 0)
;           w == 4, h == 8: rows 0-3 in the low lane, rows 4-7 (through dst4q)
;                           in the high lane
;           w != 4, h == 8: rows 0,1 in the low lane, rows 2,3 in the high
;                           lane (8 bytes per row, i.e. four rows per call)
;   init_min_max == 1: m7 (running max) and m8 (running min) start as px
; Needs stride3q (and dst4q for w == 4, h == 8) to be set up already.
; Clobbers m5, plus m6 and m7 as temporaries for w == 4, h == 8.
; Each vinserti128 reads 16 bytes from its row address even though only the
; low 4 or 8 bytes end up in px.
%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
    pxor           m15, m15                     ; sum
%if %2 == 8
    pxor           m12, m12
 %if %1 == 4
    movd           xm4, [dstq +strideq*0]
    movd           xm6, [dstq +strideq*1]
    movd           xm5, [dstq +strideq*2]
    movd           xm7, [dstq +stride3q ]
    vinserti128     m4, [dst4q+strideq*0], 1
    vinserti128     m6, [dst4q+strideq*1], 1
    vinserti128     m5, [dst4q+strideq*2], 1
    vinserti128     m7, [dst4q+stride3q ], 1
    punpckldq       m4, m6
    punpckldq       m5, m7
 %else
    movq           xm4, [dstq+strideq*0]
    movq           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq      m4, m5
%else
    movd           xm4, [dstq+strideq*0]
    movd           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
    punpckldq       m4, m5
%endif
%if %3 == 1
    ; m7 is updated with pmaxub and m8 with pminub in ACCUMULATE_TAP_BYTE
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro
155*c0909341SAndroid Build Coastguard Worker
; ACCUMULATE_TAP_BYTE tap_offset, shift, mask, strength, mul_tap, w, h
;                     [, clip = 0]
;
; Adds one pair of taps (p0, p1) to the running sum, working on 8-bit pixels.
;   %1 tap_offset - row offset into the jump table, in rows of two 4-byte
;                   entries, added on top of dirq
;   %2 shift      - psrlw count applied to |p - px|
;   %3 mask       - byte mask that removes the bits psrlw shifts in from the
;                   neighbouring byte, so the shift behaves per byte
;   %4 strength   - per-byte strength; strength - (|p - px| >> shift) is
;                   computed with unsigned saturation, i.e. floors at 0
;   %5 mul_tap    - per-byte tap weights
;   %6 w          - not referenced in the body
;   %7 h          - 4 selects the single-accumulator path, any other value
;                   the two-accumulator path
;   %8 clip       - 1: also fold p0/p1 into the running max (m7) / min (m8)
;
; The tap loader is found at  tableq + sign-extended dword
; [dirq + kq*4 + tap_offset*2*4]  and is called. Judging by the code that
; follows, it has to leave p0 in m5 and p1 in m6.
; Per byte the contribution is (for tap weights in 1..127)
;   sign(p - px) * mul_tap * min(|p - px|, max(0, strength - (|p - px| >> shift)))
; and pmaddubsw adds the p0 and p1 contributions of a pixel into one word.
;
; In:       m4 = px, kq, dirq, tableq; m7/m8 when clip == 1
; Out:      h == 4: m15 += sums
;           else  : m15 += sums for the low 8 bytes of each lane,
;                   m12 += sums for the high 8 bytes of each lane
; Clobbers: dirjmpq, m5, m6, m9, and on the h != 4 path also m10 and m11
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
    add        dirjmpq, tableq          ; table-relative entry -> address
    call       dirjmpq

%if %8 == 1
    pmaxub          m7, m5
    pminub          m8, m5
    pmaxub          m7, m6
    pminub          m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    punpcklbw       m5, m6     ; interleave p0/p1 bytes
    punpcklbw       m6, m4, m4 ; px duplicated to line up with each p0/p1 pair
    psubusb         m9, m5, m6
    psubusb         m5, m6, m5
    por             m9, m5     ; abs_diff_p01(p01 - px)
    pcmpeqb         m5, m9     ; 0xff where px >= p
    por             m5, %5     ; negative byte where px >= p, else mul_tap
    psignb          m6, %5, m5 ; mul_tap carrying the sign of (p - px)
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m6
    paddw          m15, m5
%else
    psubusb         m9, m5, m4
    psubusb         m5, m4, m5
    psubusb        m11, m6, m4
    psubusb         m6, m4, m6
    por             m9, m5      ; abs_diff_p0(p0 - px)
    por            m11, m6      ; abs_diff_p1(p1 - px)
    pcmpeqb         m5, m9
    pcmpeqb         m6, m11
    punpckhbw      m10, m9, m11 ; |diff| p0/p1 pairs, high 8 bytes per lane
    punpcklbw       m9, m11     ; |diff| p0/p1 pairs, low 8 bytes per lane
    por             m5, %5
    por            m11, m6, %5
    punpckhbw       m6, m5, m11 ; sign selectors, high 8 bytes per lane
    punpcklbw       m5, m11     ; sign selectors, low 8 bytes per lane
    psignb         m11, %5, m6
    psrlw           m6, m10, %2 ; emulate 8-bit shift
    pand            m6, %3
    psubusb         m6, %4, m6
    pminub          m6, m10
    pmaddubsw       m6, m11
    paddw          m12, m6
    psignb         m11, %5, m5
    psrlw           m5, m9, %2  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m11
    paddw          m15, m5
%endif
%endmacro
217*c0909341SAndroid Build Coastguard Worker
; ADJUST_PIXEL w, h, zero, pw_2048 [, clip = 0]
;
; Converts the accumulated sum into the final 8-bit pixels and stores them
; to dst.
;   %3 zero    - register holding zero on entry. The h == 4 path overwrites
;                it with the (sum < 0) mask; the other path leaves it intact.
;   %4 pw_2048 - pmulhrsw multiplier. With 2048 in every word (as the name
;                says) pmulhrsw yields (x + 8) >> 4.
;   %5 clip    - 0: pack the adjustment to signed bytes and add it to px with
;                   a wrapping byte add
;                1: widen px to words, add, pack with unsigned saturation and
;                   clamp the result to [m8, m7]
; Before the multiply the sum gets -1 added wherever it is negative (the
; all-ones compare mask is added to it).
;
; In:       m4 = px, m15 = sum (plus m12 = second sum when h != 4),
;           m7/m8 = max/min when clip == 1, dstq, strideq, stride3q,
;           dst4q for w == 4, h != 4
; Clobbers: m4, m5, m15, %3 (h == 4 path), m6 and m12 (h != 4 path)
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw       m4, %3
 %endif
    pcmpgtw         %3, m15 ; %3 = -1 where sum < 0 (zero register is lost)
    paddw          m15, %3
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m15
    paddb           m4, m15
 %else
    paddw           m4, m15
    packuswb        m4, m4 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
    ; low lane carries rows 0 and 1, high lane rows 2 and 3
    movd   [dstq+strideq*0], xm4
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    pcmpgtw         m6, %3, m12
    pcmpgtw         m5, %3, m15
    paddw          m12, m6
    paddw          m15, m5
 %if %5 == 1
    punpckhbw       m5, m4, %3
    punpcklbw       m4, %3
 %endif
    pmulhrsw       m12, %4
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m12
    paddb           m4, m15
 %else
    paddw           m5, m12
    paddw           m4, m15
    packuswb        m4, m5 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
 %if %1 == 4
    ; low lane carries rows 0-3, high lane rows 4-7 (written through dst4q)
    movd   [dstq +strideq*0], xm4
    movd   [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    ; low lane carries rows 0 and 1, high lane rows 2 and 3
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro
279*c0909341SAndroid Build Coastguard Worker
; BORDER_PREP_REGS w, h
;
; Register setup for the path that filters from the 16-bit copy of the block
; at px (px itself is defined outside this macro).
;   dird = dir argument, read from the stack slot r7m
;   dirq = tableq + dir*2 + 14: the two offset bytes (k = 0, 1) for this
;          direction. tableq must be set by the caller; the +14 matches the
;          8 mask + 6 weight bytes that precede the offsets in tap_table, so
;          presumably tableq = tap_table - confirm at the call site.
;   hd   = w*h*2/mmsize when that is > 1: the number of mmsize-byte registers
;          needed for the block as 16-bit samples (presumably the loop count;
;          the loop is outside this macro). Not defined otherwise.
;   stkq = address of px
;   m11  = 0 (BORDER_ADJUST_PIXEL compares against it)
; stride3 is named for w == 4 (and when no h is needed) but not written here.
%macro BORDER_PREP_REGS 2 ; w, h
    mov           dird, r7m
    lea           dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov             hd, %1*%2*2/mmsize
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea           stkq, [px]
    pxor           m11, m11
%endmacro
297*c0909341SAndroid Build Coastguard Worker
; BORDER_LOAD_BLOCK w, h [, init_min_max = 0]
;
; Loads 16-bit px samples from the buffer at stkq (rows are 32 bytes apart)
; into m4 and resets the per-iteration state.
;   kd  = 1
;   m4  = w == 4: four rows of 4 words (rows 0,1 in the low lane, 2,3 high)
;         else  : two rows of 8 words (row 0 low lane, row 1 high lane)
;   m15 = 0 (sum)
;   init_min_max == 1: m7 (running max) and m8 (running min) start as px
; %2 (h) is not referenced in the body. The w != 4 load uses mova, so
; stkq must be 16-byte aligned there. Clobbers m5 when w == 4.
%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
%if %1 == 4
    movq           xm4, [stkq+32*0]
    movhps         xm4, [stkq+32*1]
    movq           xm5, [stkq+32*2]
    movhps         xm5, [stkq+32*3]
    vinserti128     m4, xm5, 1
%else
    mova           xm4, [stkq+32*0]             ; px
    vinserti128     m4, [stkq+32*1], 1
%endif
    pxor           m15, m15                     ; sum
%if %3 == 1
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro
316*c0909341SAndroid Build Coastguard Worker
; ACCUMULATE_TAP_WORD tap_offset, shift, mask, strength, mul_tap, w
;                     [, clip = 0]
;
; Adds one pair of taps (p0, p1) to the running sum, reading 16-bit samples
; from the buffer at stkq (32 bytes per row).
;   %1 tap_offset - byte offset added to dirq + kq when fetching the signed
;                   offset byte
;   %2 shift      - psrlw count applied to |diff|
;   %3 mask       - byte mask that makes the word shift behave per byte
;   %4 strength   - per-byte strength; strength - (|diff| >> shift) floors
;                   at 0 (unsigned saturation)
;   %5 mul_tap    - per-byte tap weights
;   %6 w          - 4: four rows of 4 samples, else two rows of 8 samples
;   %7 clip       - 1: also fold p0/p1 into the running max (m7) / min (m8)
;
; p0 is read at stkq + off*2 and p1 at stkq - off*2, where off is the
; sign-extended byte at [dirq + kq + tap_offset].
;
; In:       m4 = px (words), m12 = byte shuffle that regroups the packed
;           p0/p1 differences into pairs for pmaddubsw (loaded by the caller;
;           presumably shufb_lohi - confirm), kq, dirq, stkq
; Out:      m15 += sums; offq is left holding -off
; Clobbers: offq, m5, m6, m9, m10
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1]       ; off1
%if %6 == 4
    movq           xm5, [stkq+offq*2+32*0]      ; p0
    movq           xm6, [stkq+offq*2+32*2]
    movhps         xm5, [stkq+offq*2+32*1]
    movhps         xm6, [stkq+offq*2+32*3]
    vinserti128     m5, xm6, 1
%else
    movu           xm5, [stkq+offq*2+32*0]      ; p0
    vinserti128     m5, [stkq+offq*2+32*1], 1
%endif
    neg           offq                          ; -off1
%if %6 == 4
    movq           xm6, [stkq+offq*2+32*0]      ; p1
    movq           xm9, [stkq+offq*2+32*2]
    movhps         xm6, [stkq+offq*2+32*1]
    movhps         xm9, [stkq+offq*2+32*3]
    vinserti128     m6, xm9, 1
%else
    movu           xm6, [stkq+offq*2+32*0]      ; p1
    vinserti128     m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5                      ; max after p0
    pminuw          m8, m5                      ; min after p0
    pmaxsw          m7, m6                      ; max after p1
    pminuw          m8, m6                      ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw           m5, m4                      ; diff_p0(p0 - px)
    psubw           m6, m4                      ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb        m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb          m5, m12
    pabsb           m9, m5
    psignb         m10, %5, m5
    psrlw           m5, m9, %2                  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5

    ; use unsigned min since abs diff can equal 0x80
    pminub          m5, m9
    pmaddubsw       m5, m10
    paddw          m15, m5
%endmacro
374*c0909341SAndroid Build Coastguard Worker
; BORDER_ADJUST_PIXEL w, pw_2048 [, clip = 0]
;
; Converts the accumulated sum into final pixels for the 16-bit px path and
; stores them to dst as bytes.
;   %1 w       - 4: store four rows of 4 pixels, else two rows of 8 pixels
;   %2 pw_2048 - pmulhrsw multiplier. With 2048 in every word (as the name
;                says) pmulhrsw yields (x + 8) >> 4.
;   %3 clip    - 1: clamp px + adjustment to [m8, m7] with signed word
;                min/max before packing
; The sum gets -1 added wherever it is negative (the all-ones compare mask
; is added to it) before the multiply. packuswb then saturates to 0..255.
;
; In:       m4 = px (words), m15 = sum, m11 = 0 (set by BORDER_PREP_REGS),
;           m7/m8 = max/min when clip == 1, dstq, strideq, stride3q for w == 4
; Clobbers: m4, m5, m9, m15
%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw         m9, m11, m15 ; -1 where sum < 0
    paddw          m15, m9
    pmulhrsw       m15, %2
    paddw           m4, m15
%if %3 == 1
    pminsw          m4, m7
    pmaxsw          m4, m8
%endif
    packuswb        m4, m4
    vextracti128   xm5, m4, 1
%if %1 == 4
    ; low lane carries rows 0 and 1, high lane rows 2 and 3
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+stride3q ], xm5, 1
%else
    ; low lane carries row 0, high lane row 1
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*1], xm5
%endif
%endmacro
396*c0909341SAndroid Build Coastguard Worker
397*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER 2 ; w, h
398*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
; ---------------------------------------------------------------------------
; cdef_filter_%1x%2_8bpc(dst, stride, left, top, bot, pri, sec, dir,
;                        damping, edge)
; AVX2 entry point for one %1 x %2 block. What this section does:
;   * edge != 0xf  -> .border_block (defined outside this section;
;                     presumably the path for blocks with a missing neighbour).
;   * otherwise a stack frame is allocated and widened copies of the left-edge
;     bytes are parked on it for the .dXkY loaders to blend from.
;   * pri == 0 -> .sec_only, sec == 0 -> .pri_only, else the combined
;     primary+secondary loop below runs.
; Stack slots written here: [rsp+0] = pri_shift, [rsp+8] = sec_shift,
; [rsp+16] = caller's bot pointer (8x8 layout only).
; ---------------------------------------------------------------------------
399*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
400*c0909341SAndroid Build Coastguard Worker                                          pri, sec, dir, damping, edge
401*c0909341SAndroid Build Coastguard Worker    mov          edged, edgem
402*c0909341SAndroid Build Coastguard Worker    cmp          edged, 0xf
403*c0909341SAndroid Build Coastguard Worker    jne .border_block
404*c0909341SAndroid Build Coastguard Worker
; r11/r12 (and r13/r14 below) are pushed by hand; regs_used is then raised to
; match -- presumably so the x86inc RET epilogue restores them (TODO confirm
; against x86inc.asm).
405*c0909341SAndroid Build Coastguard Worker    PUSH           r11
406*c0909341SAndroid Build Coastguard Worker    PUSH           r12
407*c0909341SAndroid Build Coastguard Worker%if %2 == 4
; Height 4: zero-extend 8 bytes at left+1 to words and store three
; permuted/shifted copies. They are read back by .d1k0 (+0x10), .d0k0 (+0x28)
; and .d4k0 (+0x40).
408*c0909341SAndroid Build Coastguard Worker%assign regs_used 13
409*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   0x60, 16
410*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm0, [leftq+1]
411*c0909341SAndroid Build Coastguard Worker    vpermq          m0, m0, q0110
412*c0909341SAndroid Build Coastguard Worker    psrldq          m1, m0, 4
413*c0909341SAndroid Build Coastguard Worker    vpalignr        m2, m0, m0, 12
414*c0909341SAndroid Build Coastguard Worker    movu    [rsp+0x10], m0
415*c0909341SAndroid Build Coastguard Worker    movu    [rsp+0x28], m1
416*c0909341SAndroid Build Coastguard Worker    movu    [rsp+0x40], m2
417*c0909341SAndroid Build Coastguard Worker%elif %1 == 4
; Width 4, height != 4: one copy of the left data with every 16-bit word
; zero-extended to a dword (read back by .d0k1 at +0x10+8).
418*c0909341SAndroid Build Coastguard Worker%assign regs_used 14
419*c0909341SAndroid Build Coastguard Worker    PUSH           r13
420*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK 8*2+%1*%2*1, 16
421*c0909341SAndroid Build Coastguard Worker    pmovzxwd        m0, [leftq]
422*c0909341SAndroid Build Coastguard Worker    mova    [rsp+0x10], m0
423*c0909341SAndroid Build Coastguard Worker%else
; Remaining case (width != 4, height != 4):
;   rsp+0x20/0x40 : left words  zero-extended to qwords (from left+0 / left+8)
;   rsp+0x60/0x80 : left bytes  zero-extended to dwords (from left+1 / left+9)
;   rsp+0xa0      : dst rows 2 and 3 (16 bytes each); the row loop below later
;                   points top1q/top2q at them
;   rsp+16        : original bot pointer; botq itself is redirected to
;                   dst + 4*stride for the first pass
424*c0909341SAndroid Build Coastguard Worker%assign regs_used 15
425*c0909341SAndroid Build Coastguard Worker    PUSH           r13
426*c0909341SAndroid Build Coastguard Worker    PUSH           r14
427*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK 8*4+%1*%2*2+32, 16
428*c0909341SAndroid Build Coastguard Worker    lea            r11, [strideq*3]
429*c0909341SAndroid Build Coastguard Worker    movu           xm4, [dstq+strideq*2]
430*c0909341SAndroid Build Coastguard Worker    pmovzxwq        m0, [leftq+0]
431*c0909341SAndroid Build Coastguard Worker    pmovzxwq        m1, [leftq+8]
432*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [dstq+r11], 1
433*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m2, [leftq+1]
434*c0909341SAndroid Build Coastguard Worker    pmovzxbd        m3, [leftq+9]
435*c0909341SAndroid Build Coastguard Worker    mov       [rsp+16], botq
436*c0909341SAndroid Build Coastguard Worker    mova    [rsp+0x20], m0
437*c0909341SAndroid Build Coastguard Worker    mova    [rsp+0x40], m1
438*c0909341SAndroid Build Coastguard Worker    mova    [rsp+0x60], m2
439*c0909341SAndroid Build Coastguard Worker    mova    [rsp+0x80], m3
440*c0909341SAndroid Build Coastguard Worker    mova    [rsp+0xa0], m4
441*c0909341SAndroid Build Coastguard Worker    lea           botq, [dstq+strideq*4]
442*c0909341SAndroid Build Coastguard Worker%endif
443*c0909341SAndroid Build Coastguard Worker
; Register names are remapped from here on: the slots that held sec/dir now
; go by secdmp/zero, and two further slots are named pridmp/damping.
; r8m is the 9th argument, i.e. damping in the cglobal argument list.
; Shifts computed below:
;   pri_shift = max(0, damping - 31 + lzcnt(pri))
;   sec_shift =        damping - 31 + lzcnt(sec)      (no clamp applied)
444*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
445*c0909341SAndroid Build Coastguard Worker    mov       dampingd, r8m
446*c0909341SAndroid Build Coastguard Worker    xor          zerod, zerod
447*c0909341SAndroid Build Coastguard Worker    movifnidn     prid, prim
448*c0909341SAndroid Build Coastguard Worker    sub       dampingd, 31
449*c0909341SAndroid Build Coastguard Worker    movifnidn  secdmpd, secdmpm
450*c0909341SAndroid Build Coastguard Worker    test          prid, prid
451*c0909341SAndroid Build Coastguard Worker    jz .sec_only
452*c0909341SAndroid Build Coastguard Worker    movd           xm0, prid
453*c0909341SAndroid Build Coastguard Worker    lzcnt      pridmpd, prid
454*c0909341SAndroid Build Coastguard Worker    add        pridmpd, dampingd
; a negative sum is clamped to 0 (sign flag from the add above)
455*c0909341SAndroid Build Coastguard Worker    cmovs      pridmpd, zerod
456*c0909341SAndroid Build Coastguard Worker    mov        [rsp+0], pridmpq                 ; pri_shift
457*c0909341SAndroid Build Coastguard Worker    test       secdmpd, secdmpd
458*c0909341SAndroid Build Coastguard Worker    jz .pri_only
459*c0909341SAndroid Build Coastguard Worker    movd           xm1, secdmpd
460*c0909341SAndroid Build Coastguard Worker    lzcnt      secdmpd, secdmpd
461*c0909341SAndroid Build Coastguard Worker    add        secdmpd, dampingd
462*c0909341SAndroid Build Coastguard Worker    mov        [rsp+8], secdmpq                 ; sec_shift
463*c0909341SAndroid Build Coastguard Worker
; The shift values double as byte indices into tap_table to fetch the masks.
464*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
465*c0909341SAndroid Build Coastguard Worker    lea         tableq, [tap_table]
466*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
467*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
468*c0909341SAndroid Build Coastguard Worker
469*c0909341SAndroid Build Coastguard Worker    ; pri/sec_taps[k] [4 total]
470*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
471*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, xm0                     ; pri_strength
472*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m1, xm1                     ; sec_strength
; priq becomes a pointer: tap_table + 8 + 2*(pri & 1); secq = tap_table + 12.
473*c0909341SAndroid Build Coastguard Worker    and           prid, 1
474*c0909341SAndroid Build Coastguard Worker    lea           priq, [tableq+priq*2+8]       ; pri_taps
475*c0909341SAndroid Build Coastguard Worker    lea           secq, [tableq+12]             ; sec_taps
476*c0909341SAndroid Build Coastguard Worker
; PREP_REGS / LOAD_BLOCK / ACCUMULATE_TAP_BYTE / ADJUST_PIXEL are macros
; defined outside this section; kq, hq, top1q and top2q are not named by any
; DEFINE_ARGS here, so presumably PREP_REGS introduces them -- TODO confirm.
; The row loop label is only emitted when the block has more than mmsize
; pixels (%1*%2 > mmsize).
477*c0909341SAndroid Build Coastguard Worker    PREP_REGS       %1, %2
478*c0909341SAndroid Build Coastguard Worker%if %1*%2 > mmsize
479*c0909341SAndroid Build Coastguard Worker.v_loop:
480*c0909341SAndroid Build Coastguard Worker%endif
481*c0909341SAndroid Build Coastguard Worker    LOAD_BLOCK      %1, %2, 1
; Tap loop: kq counts down and the loop exits once it goes negative.
482*c0909341SAndroid Build Coastguard Worker.k_loop:
483*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m2, [priq+kq]                          ; pri_taps
484*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m3, [secq+kq]                          ; sec_taps
485*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
486*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
487*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
488*c0909341SAndroid Build Coastguard Worker    dec             kq
489*c0909341SAndroid Build Coastguard Worker    jge .k_loop
490*c0909341SAndroid Build Coastguard Worker
; m10 = broadcast pw_2048 constant, m9 = 0; both are handed to ADJUST_PIXEL.
491*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pw_2048]
492*c0909341SAndroid Build Coastguard Worker    pxor            m9, m9
493*c0909341SAndroid Build Coastguard Worker    ADJUST_PIXEL    %1, %2, m9, m10, 1
494*c0909341SAndroid Build Coastguard Worker%if %1*%2 > mmsize
; Next 4 rows: advance dst, take the two "top" rows from the dst rows saved at
; rsp+0xa0 / rsp+0xb0, and restore the caller's bot pointer from rsp+16.
; hq is advanced by 4 and the loop repeats while it is still negative.
495*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*4]
496*c0909341SAndroid Build Coastguard Worker    lea          top1q, [rsp+0xa0]
497*c0909341SAndroid Build Coastguard Worker    lea          top2q, [rsp+0xb0]
498*c0909341SAndroid Build Coastguard Worker    mov           botq, [rsp+16]
499*c0909341SAndroid Build Coastguard Worker    add             hq, 4
500*c0909341SAndroid Build Coastguard Worker    jl .v_loop
501*c0909341SAndroid Build Coastguard Worker%endif
502*c0909341SAndroid Build Coastguard Worker    RET
503*c0909341SAndroid Build Coastguard Worker
; .pri_only: reached from the entry code when sec == 0 (pri != 0).
; On entry xm0 holds pri (movd) and [rsp+0] already holds pri_shift; the
; pridmp register still holds the same shift value and is used as a byte index
; into tap_table. Only the primary tap ("dir + 0") is accumulated.
504*c0909341SAndroid Build Coastguard Worker.pri_only:
505*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
506*c0909341SAndroid Build Coastguard Worker    lea         tableq, [tap_table]
507*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
508*c0909341SAndroid Build Coastguard Worker    ; pri/sec_taps[k] [4 total]
509*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
510*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, xm0                     ; pri_strength
; priq becomes a pointer: tap_table + 8 + 2*(pri & 1).
511*c0909341SAndroid Build Coastguard Worker    and           prid, 1
512*c0909341SAndroid Build Coastguard Worker    lea           priq, [tableq+priq*2+8]       ; pri_taps
513*c0909341SAndroid Build Coastguard Worker    PREP_REGS       %1, %2
; Loop-invariant operands for ADJUST_PIXEL: m3 = broadcast pw_2048, m1 = 0.
514*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m3, [pw_2048]
515*c0909341SAndroid Build Coastguard Worker    pxor            m1, m1
516*c0909341SAndroid Build Coastguard Worker%if %1*%2 > mmsize
517*c0909341SAndroid Build Coastguard Worker.pri_v_loop:
518*c0909341SAndroid Build Coastguard Worker%endif
; LOAD_BLOCK / ACCUMULATE_TAP_BYTE / ADJUST_PIXEL are invoked here without the
; trailing "1" argument that the combined pri+sec path passes.
519*c0909341SAndroid Build Coastguard Worker    LOAD_BLOCK      %1, %2
; Tap loop: kq counts down and the loop exits once it goes negative.
520*c0909341SAndroid Build Coastguard Worker.pri_k_loop:
521*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m2, [priq+kq]                       ; pri_taps
522*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
523*c0909341SAndroid Build Coastguard Worker    dec             kq
524*c0909341SAndroid Build Coastguard Worker    jge .pri_k_loop
525*c0909341SAndroid Build Coastguard Worker    ADJUST_PIXEL    %1, %2, m1, m3
526*c0909341SAndroid Build Coastguard Worker%if %1*%2 > mmsize
; Next 4 rows: advance dst, take the two "top" rows from the dst rows saved at
; rsp+0xa0 / rsp+0xb0, restore the caller's bot pointer, repeat while hq < 0.
527*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*4]
528*c0909341SAndroid Build Coastguard Worker    lea          top1q, [rsp+0xa0]
529*c0909341SAndroid Build Coastguard Worker    lea          top2q, [rsp+0xb0]
530*c0909341SAndroid Build Coastguard Worker    mov           botq, [rsp+16]
531*c0909341SAndroid Build Coastguard Worker    add             hq, 4
532*c0909341SAndroid Build Coastguard Worker    jl .pri_v_loop
533*c0909341SAndroid Build Coastguard Worker%endif
534*c0909341SAndroid Build Coastguard Worker    RET
535*c0909341SAndroid Build Coastguard Worker
; .sec_only: reached from the entry code when pri == 0.
; At that point dampingd already holds (damping - 31) and secdmpd holds sec,
; so this path computes sec_shift = damping - 31 + lzcnt(sec) itself (no clamp
; to zero is applied) and stores it at [rsp+8]. Only the two secondary taps
; ("dir + 2" and "dir - 2") are accumulated.
536*c0909341SAndroid Build Coastguard Worker.sec_only:
537*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
538*c0909341SAndroid Build Coastguard Worker    movd           xm1, secdmpd
539*c0909341SAndroid Build Coastguard Worker    lzcnt      secdmpd, secdmpd
540*c0909341SAndroid Build Coastguard Worker    add        secdmpd, dampingd
541*c0909341SAndroid Build Coastguard Worker    mov        [rsp+8], secdmpq                 ; sec_shift
542*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
543*c0909341SAndroid Build Coastguard Worker    lea         tableq, [tap_table]
; the shift value doubles as a byte index into tap_table for the mask
544*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
545*c0909341SAndroid Build Coastguard Worker    ; pri/sec_taps[k] [4 total]
546*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
547*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m1, xm1                     ; sec_strength
548*c0909341SAndroid Build Coastguard Worker    lea           secq, [tableq+12]             ; sec_taps
549*c0909341SAndroid Build Coastguard Worker    PREP_REGS       %1, %2
; Loop-invariant operands for ADJUST_PIXEL: m2 = broadcast pw_2048, m0 = 0.
550*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m2, [pw_2048]
551*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
552*c0909341SAndroid Build Coastguard Worker%if %1*%2 > mmsize
553*c0909341SAndroid Build Coastguard Worker.sec_v_loop:
554*c0909341SAndroid Build Coastguard Worker%endif
; LOAD_BLOCK / ACCUMULATE_TAP_BYTE / ADJUST_PIXEL are invoked here without the
; trailing "1" argument that the combined pri+sec path passes.
555*c0909341SAndroid Build Coastguard Worker    LOAD_BLOCK      %1, %2
; Tap loop: kq counts down and the loop exits once it goes negative.
556*c0909341SAndroid Build Coastguard Worker.sec_k_loop:
557*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m3, [secq+kq]                       ; sec_taps
558*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
559*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
560*c0909341SAndroid Build Coastguard Worker    dec             kq
561*c0909341SAndroid Build Coastguard Worker    jge .sec_k_loop
562*c0909341SAndroid Build Coastguard Worker    ADJUST_PIXEL    %1, %2, m0, m2
563*c0909341SAndroid Build Coastguard Worker%if %1*%2 > mmsize
; Next 4 rows: advance dst, take the two "top" rows from the dst rows saved at
; rsp+0xa0 / rsp+0xb0, restore the caller's bot pointer, repeat while hq < 0.
564*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*4]
565*c0909341SAndroid Build Coastguard Worker    lea          top1q, [rsp+0xa0]
566*c0909341SAndroid Build Coastguard Worker    lea          top2q, [rsp+0xb0]
567*c0909341SAndroid Build Coastguard Worker    mov           botq, [rsp+16]
568*c0909341SAndroid Build Coastguard Worker    add             hq, 4
569*c0909341SAndroid Build Coastguard Worker    jl .sec_v_loop
570*c0909341SAndroid Build Coastguard Worker%endif
571*c0909341SAndroid Build Coastguard Worker    RET
572*c0909341SAndroid Build Coastguard Worker
; .d0k0: neighbour loader. Ends in a plain `ret`, so it is reached via `call`;
; the call site is outside this section (the label name suggests
; "direction 0, tap 0" -- NOTE(review): confirm against the dispatch code).
; Out: m5 = for each block row y, the pixels of row y-1 shifted one column right
;      m6 = for each block row y, the pixels of row y+1 shifted one column left
; Rows outside the block come from top/top2 (row -1) and bot (row below the
; last loaded row). The x = -1 column of m6 is patched in from the left-edge
; data with a byte blend. Clobbers m9-m12 (size dependent) and, on the
; non-4-wide path, r13.
; stride3q / dst4q / top2q / hq are set up outside this section.
573*c0909341SAndroid Build Coastguard Worker.d0k0:
574*c0909341SAndroid Build Coastguard Worker%if %1 == 4
575*c0909341SAndroid Build Coastguard Worker %if %2 == 4
576*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m6, [dstq+strideq*1-1]
577*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m10, [dstq+strideq*2-1]
578*c0909341SAndroid Build Coastguard Worker    movd           xm5, [topq+strideq*1+1]
579*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dstq+strideq*0+1]
; shifting the x-1 loads right by 2 bytes yields the same rows at x+1
580*c0909341SAndroid Build Coastguard Worker    psrldq         m11, m6, 2
581*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m10, 2
582*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+stride3q -1], 1
583*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq          -1], 1
584*c0909341SAndroid Build Coastguard Worker    vpblendd        m5, m11, 0x10
585*c0909341SAndroid Build Coastguard Worker    vpblendd        m9, m12, 0x10
586*c0909341SAndroid Build Coastguard Worker    movu           m11, [blend_4x4+16]
587*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m10
588*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m9
; left-edge copy stored by the entry code at rsp+0x28; the extra gprsize
; presumably accounts for the return address pushed by the call
589*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x28], m11
590*c0909341SAndroid Build Coastguard Worker %else
; 4 wide, height != 4: the inline comments below give the row offsets held by
; each shuffled register.
591*c0909341SAndroid Build Coastguard Worker    movd           xm5, [topq +strideq*1+1]
592*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq +strideq*1-1]
593*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dstq +stride3q -1]
594*c0909341SAndroid Build Coastguard Worker    movq          xm11, [dst4q+strideq*1-1]
595*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dstq +strideq*0+1], 1
596*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq +strideq*2-1]
597*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*0-1]
598*c0909341SAndroid Build Coastguard Worker    movhps        xm11, [dst4q+strideq*2-1]
599*c0909341SAndroid Build Coastguard Worker    psrldq         xm9, xm6, 2
600*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
601*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
602*c0909341SAndroid Build Coastguard Worker    psrldq         xm9, xm11, 2
603*c0909341SAndroid Build Coastguard Worker    psrldq        xm10, 2
604*c0909341SAndroid Build Coastguard Worker    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
605*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dst4q+stride3q -1]
606*c0909341SAndroid Build Coastguard Worker    pinsrd         xm9, [botq           -1], 1
607*c0909341SAndroid Build Coastguard Worker    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
; left bytes are read straight from the left buffer (zero-extended to words)
; and merged under the blend_4x8_0 mask
608*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m9, [leftq+3]
609*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm11, 1
610*c0909341SAndroid Build Coastguard Worker    movu           m11, [blend_4x8_0+4]
611*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
612*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m9, m11
613*c0909341SAndroid Build Coastguard Worker %endif
614*c0909341SAndroid Build Coastguard Worker%else
; Non-4-wide path: 8 pixels per row, two rows per 128-bit lane. Both the blend
; mask and the stacked left-edge data are indexed by hq.
615*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_0+16]
616*c0909341SAndroid Build Coastguard Worker    movq           xm5, [top2q         +1]
617*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m10, [dstq+strideq*1-1]
618*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m11, [dstq+strideq*2-1]
619*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq+strideq*0+1]
620*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, m10, [dstq+stride3q-1], 1
621*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, m11, [botq         -1], 1
622*c0909341SAndroid Build Coastguard Worker    psrldq         m10, 2
623*c0909341SAndroid Build Coastguard Worker    psrldq         m11, 2
624*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m6, m9
625*c0909341SAndroid Build Coastguard Worker    movu            m9, [r13+hq*2*1+16*1]
626*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m10, m11
627*c0909341SAndroid Build Coastguard Worker    vpblendd        m5, m10, 0xF0
628*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
629*c0909341SAndroid Build Coastguard Worker%endif
630*c0909341SAndroid Build Coastguard Worker    ret
; .d1k0 / .d2k0 / .d3k0: three labels sharing one body. Neighbour loader
; reached via `call` (ends in a plain `ret`; call site outside this section).
; Out: m6 = each block row shifted one column left  (starts at x = -1)
;      m5 = each block row shifted one column right (starts at x = +1)
; No top/bot rows are touched. The x = -1 column of m6 is replaced with
; left-edge data through a byte blend.
; stride3q / dst4q / hq are set up outside this section.
631*c0909341SAndroid Build Coastguard Worker.d1k0:
632*c0909341SAndroid Build Coastguard Worker.d2k0:
633*c0909341SAndroid Build Coastguard Worker.d3k0:
634*c0909341SAndroid Build Coastguard Worker%if %1 == 4
635*c0909341SAndroid Build Coastguard Worker %if %2 == 4
636*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq+strideq*0-1]
637*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq+strideq*1-1]
638*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*2-1], 1
639*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+stride3q -1], 1
; left-edge copy stored by the entry code at rsp+0x10; the extra gprsize
; presumably accounts for the return address pushed by the call
640*c0909341SAndroid Build Coastguard Worker    movu           m11, [rsp+gprsize+0x10]
; m12 = 0x000000FF in every dword: selects byte 0 of each dword in the blend
641*c0909341SAndroid Build Coastguard Worker    pcmpeqd        m12, m12
642*c0909341SAndroid Build Coastguard Worker    psrldq          m5, m6, 2
643*c0909341SAndroid Build Coastguard Worker    psrldq         m10, m9, 2
644*c0909341SAndroid Build Coastguard Worker    psrld          m12, 24
645*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
646*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
647*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m11, m12
648*c0909341SAndroid Build Coastguard Worker %else
; 4 wide, height != 4: rows 0-3 come via dstq, rows 4-7 via dst4q; shufps with
; q2020 keeps the first dword of each loaded row.
649*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq +strideq*0-1]
650*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq +strideq*2-1]
651*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq +strideq*1-1]
652*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dstq +stride3q -1]
653*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dst4q+strideq*0-1]
654*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*1-1]
655*c0909341SAndroid Build Coastguard Worker    psrldq         xm5, xm6, 2
656*c0909341SAndroid Build Coastguard Worker    psrldq        xm11, xm9, 2
657*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm11, q2020
658*c0909341SAndroid Build Coastguard Worker    movq          xm11, [dst4q+strideq*2-1]
659*c0909341SAndroid Build Coastguard Worker    movhps        xm11, [dst4q+stride3q -1]
660*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm9, q2020
661*c0909341SAndroid Build Coastguard Worker    shufps         xm9, xm10, xm11, q2020
662*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm9, 1
; left bytes are read straight from the left buffer (zero-extended to words)
663*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m9, [leftq+1]
664*c0909341SAndroid Build Coastguard Worker    psrldq        xm10, 2
665*c0909341SAndroid Build Coastguard Worker    psrldq        xm11, 2
666*c0909341SAndroid Build Coastguard Worker    shufps        xm10, xm11, q2020
667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m11, [blend_4x8_0+4]
668*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
669*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m9, m11
670*c0909341SAndroid Build Coastguard Worker %endif
671*c0909341SAndroid Build Coastguard Worker%else
; Non-4-wide path: rows are loaded at x-1 (16 bytes each); m6 takes the low
; qwords, then both sources are shifted by 2 bytes to form m5 at x+1. The
; stacked left-edge data is indexed by hq.
672*c0909341SAndroid Build Coastguard Worker    movu           xm5, [dstq+strideq*0-1]
673*c0909341SAndroid Build Coastguard Worker    movu           xm9, [dstq+strideq*1-1]
674*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+strideq*2-1], 1
675*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+stride3q -1], 1
676*c0909341SAndroid Build Coastguard Worker    movu           m10, [blend_8x8_0+16]
677*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m6, m5, m9
678*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64], m10
679*c0909341SAndroid Build Coastguard Worker    psrldq          m5, 2
680*c0909341SAndroid Build Coastguard Worker    psrldq          m9, 2
681*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, m9
682*c0909341SAndroid Build Coastguard Worker%endif
683*c0909341SAndroid Build Coastguard Worker    ret
; .d4k0: neighbour loader reached via `call` (ends in a plain `ret`; call site
; outside this section).
; Out: m6 = for each block row y, the pixels of row y-1 shifted one column left
;      m5 = for each block row y, the pixels of row y+1 shifted one column right
; Row -1 comes from top/top2, the row below the last loaded row from bot.
; The x = -1 column of m6 is patched in from the left-edge data with a byte
; blend. Clobbers m9-m12 (size dependent) and, on the non-4-wide path, r13.
; stride3q / dst4q / top2q / hq are set up outside this section.
684*c0909341SAndroid Build Coastguard Worker.d4k0:
685*c0909341SAndroid Build Coastguard Worker%if %1 == 4
686*c0909341SAndroid Build Coastguard Worker %if %2 == 4
687*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m10, [dstq+strideq*1-1]
688*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m11, [dstq+strideq*2-1]
689*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq+strideq*1-1]
690*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dstq+strideq*0-1]
; shifting the x-1 loads right by 2 bytes yields the same rows at x+1
691*c0909341SAndroid Build Coastguard Worker    psrldq          m5, m10, 2
692*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m11, 2
693*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, m10, 0x10
694*c0909341SAndroid Build Coastguard Worker    vpblendd        m9, m11, 0x10
695*c0909341SAndroid Build Coastguard Worker    movu           m10, [blend_4x4]
696*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+stride3q +1], 1
697*c0909341SAndroid Build Coastguard Worker    vinserti128    m12, [botq          +1], 1
698*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
699*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m12
; left-edge copy stored by the entry code at rsp+0x40; the extra gprsize
; presumably accounts for the return address pushed by the call
700*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x40], m10
701*c0909341SAndroid Build Coastguard Worker %else
; 4 wide, height != 4: rows 0-3 via dstq, rows 4-7 via dst4q, row 8 via botq.
702*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*1-1]
703*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq +strideq*1-1]
704*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dstq +stride3q -1]
705*c0909341SAndroid Build Coastguard Worker    movq          xm11, [dst4q+strideq*1-1]
706*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*0-1], 1
707*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dstq +strideq*2-1]
708*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*0-1]
709*c0909341SAndroid Build Coastguard Worker    movhps        xm11, [dst4q+strideq*2-1]
710*c0909341SAndroid Build Coastguard Worker    psrldq         xm5, xm9, 2
711*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm9, q2010
712*c0909341SAndroid Build Coastguard Worker    psrldq         xm9, xm10, 2
713*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, q2020
714*c0909341SAndroid Build Coastguard Worker    shufps        xm10, xm11, q2020
715*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dst4q+stride3q +1]
716*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm10, 1
717*c0909341SAndroid Build Coastguard Worker    pinsrd         xm9, [botq           +1], 1
718*c0909341SAndroid Build Coastguard Worker    psrldq        xm11, 2
; left bytes are read straight from the left buffer, starting one byte before
; leftq, zero-extended to words and merged under the blend_4x8_0 mask
719*c0909341SAndroid Build Coastguard Worker    pmovzxbw       m10, [leftq-1]
720*c0909341SAndroid Build Coastguard Worker    shufps        xm11, xm9, q1020
721*c0909341SAndroid Build Coastguard Worker    movu            m9, [blend_4x8_0]
722*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm11, 1
723*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m10, m9
724*c0909341SAndroid Build Coastguard Worker %endif
725*c0909341SAndroid Build Coastguard Worker%else
; Non-4-wide path: 8 pixels per row, two rows per 128-bit lane. Both the blend
; mask and the stacked left-edge data are indexed by hq.
726*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_0+8]
727*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top2q         -1]
728*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m5, [dstq+strideq*1-1]
729*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [dstq+strideq*2-1]
730*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq+strideq*0-1]
731*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*1+16*1]
732*c0909341SAndroid Build Coastguard Worker    punpcklqdq     m10, m5, m9
733*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+stride3q -1], 1
734*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [botq          -1], 1
735*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, m10, 0xF0
736*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
; m5/m9 were loaded at x-1; shift by 2 bytes to obtain x+1
737*c0909341SAndroid Build Coastguard Worker    psrldq          m5, 2
738*c0909341SAndroid Build Coastguard Worker    psrldq          m9, 2
739*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, m9
740*c0909341SAndroid Build Coastguard Worker%endif
741*c0909341SAndroid Build Coastguard Worker    ret
; .d5k0 / .d6k0 / .d7k0: three labels sharing one body. Neighbour loader
; reached via `call` (ends in a plain `ret`; call site outside this section).
; Out: m6 = for each block row y, the pixels of row y-1 (same columns)
;      m5 = for each block row y, the pixels of row y+1 (same columns)
; Row -1 comes from top/top2, the row below the last loaded row from bot.
; There is no horizontal offset, so no left-edge blend is performed.
; stride3q / dst4q / top2q are set up outside this section.
742*c0909341SAndroid Build Coastguard Worker.d5k0:
743*c0909341SAndroid Build Coastguard Worker.d6k0:
744*c0909341SAndroid Build Coastguard Worker.d7k0:
745*c0909341SAndroid Build Coastguard Worker%if %1 == 4
746*c0909341SAndroid Build Coastguard Worker %if %2 == 4
747*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq+strideq*1  ]
748*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m5, [dstq+strideq*1  ]
749*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [dstq+strideq*2  ]
; The memory operand is offset so that the dword selected by the blend mask
; lands on the wanted pixels: dword 1 of [dst-4] is dst row 0.
750*c0909341SAndroid Build Coastguard Worker    vpblendd       xm6, [dstq+strideq*0-4], 0x2
751*c0909341SAndroid Build Coastguard Worker    vpblendd        m5, m9, 0x22
752*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, m5, 0x30
753*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+stride3q   ], 1
; same trick: dword 5 of [bot-20] starts exactly at bot
754*c0909341SAndroid Build Coastguard Worker    vpblendd        m5, [botq         -20], 0x20
755*c0909341SAndroid Build Coastguard Worker %else
; 4 wide, height != 4: rows 0-3 via dstq, rows 4-7 via dst4q, row 8 via botq;
; pairs of rows are packed into dword pairs and interleaved by qword.
756*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*1]
757*c0909341SAndroid Build Coastguard Worker    movd           xm5, [dstq +strideq*1]
758*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dstq +stride3q ]
759*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dst4q+strideq*1]
760*c0909341SAndroid Build Coastguard Worker    movd          xm11, [dst4q+stride3q ]
761*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*0], 1
762*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dstq +strideq*2], 1
763*c0909341SAndroid Build Coastguard Worker    pinsrd         xm9, [dst4q+strideq*0], 1
764*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [dst4q+strideq*2], 1
765*c0909341SAndroid Build Coastguard Worker    pinsrd        xm11, [botq           ], 1
766*c0909341SAndroid Build Coastguard Worker    punpcklqdq     xm6, xm5
767*c0909341SAndroid Build Coastguard Worker    punpcklqdq     xm5, xm9
768*c0909341SAndroid Build Coastguard Worker    punpcklqdq     xm9, xm10
769*c0909341SAndroid Build Coastguard Worker    punpcklqdq    xm10, xm11
770*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm9, 1
771*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
772*c0909341SAndroid Build Coastguard Worker %endif
773*c0909341SAndroid Build Coastguard Worker%else
; Non-4-wide path: m6 = {top2, row 0 | row 1, row 2},
;                  m5 = {row 1, row 2 | row 3, bot}.
774*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top2q         ]
775*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq+strideq*1]
776*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq+stride3q ]
777*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq+strideq*0]
778*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq+strideq*2]
779*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [botq          ]
780*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm5, 1
781*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm9, 1
782*c0909341SAndroid Build Coastguard Worker%endif
783*c0909341SAndroid Build Coastguard Worker    ret
; .d0k1: neighbour loader reached via `call` (ends in a plain `ret`; call site
; outside this section).
; Out: m5 = for each block row y, the pixels of row y-2 shifted two columns right
;      m6 = for each block row y, the pixels of row y+2 shifted two columns left
; Rows -2/-1 come from the two top rows (top / top1+top2); the two rows below
; the last loaded row come from bot and bot+stride. The x = -2/-1 columns of
; m6 are taken from the left-edge data for rows inside the block.
; Clobbers m9-m11 (size dependent) and, on the non-4-wide path, r13.
; stride3q / dst4q / top1q / top2q / hq are set up outside this section.
784*c0909341SAndroid Build Coastguard Worker.d0k1:
785*c0909341SAndroid Build Coastguard Worker%if %1 == 4
786*c0909341SAndroid Build Coastguard Worker %if %2 == 4
787*c0909341SAndroid Build Coastguard Worker    movd           xm6, [dstq+strideq*2-2]
788*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dstq+stride3q -2]
789*c0909341SAndroid Build Coastguard Worker    movd           xm5, [topq+strideq*0+2]
790*c0909341SAndroid Build Coastguard Worker    movd          xm10, [topq+strideq*1+2]
; overwrite the two leftmost bytes (x = -2, -1) of rows 2 and 3 with words
; taken from the left buffer
791*c0909341SAndroid Build Coastguard Worker    pinsrw         xm6, [leftq+4], 0
792*c0909341SAndroid Build Coastguard Worker    pinsrw         xm9, [leftq+6], 0
793*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+strideq*0+2], 1
794*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [dstq+strideq*1+2], 1
795*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [botq+strideq*0-2], 1
796*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [botq+strideq*1-2], 1
797*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
798*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
799*c0909341SAndroid Build Coastguard Worker %else
; 4 wide, height != 4: each 8-byte row load at x-2 also contains the x+2
; pixels, so the odd dwords (q3131) feed m5 and the even dwords (q2020) feed m6.
800*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq +strideq*2-2]
801*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dst4q+strideq*2-2]
802*c0909341SAndroid Build Coastguard Worker    movd           xm5, [topq +strideq*0+2]
803*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dst4q+strideq*0-2]
804*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq +stride3q -2]
; word 3 <- first two pixels of the dst4+3*stride row; the low word of that
; dword is left for the blend below to fill
805*c0909341SAndroid Build Coastguard Worker    pinsrw        xm10, [dst4q+stride3q   ], 3
806*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [topq +strideq*1+2], 1
807*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dst4q+strideq*1-2]
808*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [botq +strideq*0-2], 2
809*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dstq +strideq*0+2], 2
810*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [botq +strideq*1-2], 3
811*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dstq +strideq*1+2], 3
812*c0909341SAndroid Build Coastguard Worker    shufps        xm11, xm6, xm9, q3131
813*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm9, q2020
814*c0909341SAndroid Build Coastguard Worker    movu            m9, [blend_4x8_3+8]
815*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm10, 1
816*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm11, 1
; left-edge copy stored by the entry code at rsp+0x10 (read here from +8);
; the extra gprsize presumably accounts for the return address of the call
817*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x10+8], m9
818*c0909341SAndroid Build Coastguard Worker %endif
819*c0909341SAndroid Build Coastguard Worker%else
; Non-4-wide path: m6 = {row 2, row 3 | bot, bot+stride} at x-2,
;                  m5 = {top1, top2 | row 0, row 1} at x+2.
; Blend mask and stacked left-edge data (the copy stored at rsp+0x20) are
; indexed by hq.
820*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_1+16]
821*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq+strideq*2-2]
822*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq+stride3q -2]
823*c0909341SAndroid Build Coastguard Worker    movq           xm5, [top1q         +2]
824*c0909341SAndroid Build Coastguard Worker    movq          xm10, [top2q         +2]
825*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*2+16*2]
826*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [botq+strideq*0-2], 1
827*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [botq+strideq*1-2], 1
828*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+strideq*0+2], 1
829*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [dstq+strideq*1+2], 1
830*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m6, m9
831*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, m10
832*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
833*c0909341SAndroid Build Coastguard Worker%endif
834*c0909341SAndroid Build Coastguard Worker    ret
; .d1k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 1, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
;   m5 <- loads taken at a +2 byte offset: the top row pointer first, then
;         consecutive dst rows starting at row 0
;   m6 <- loads taken at a -2 byte offset: dst rows starting at row 1, with
;         the row at botq last
; Selected bytes of m6 are afterwards overwritten with data from leftq
; (first variant) or from a stack slot (other variants; presumably left-edge
; pixels stored by the caller - TODO confirm).
; Three variants are assembled, selected by macro arguments 1 and 2
; (presumably block width and height - TODO confirm against the macro header).
; Scratch: m9-m12 in the first variant, m9-m11 otherwise, r13 in the last one.
835*c0909341SAndroid Build Coastguard Worker.d1k1:
836*c0909341SAndroid Build Coastguard Worker%if %1 == 4
837*c0909341SAndroid Build Coastguard Worker %if %2 == 4
838*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m6, [dstq+strideq*1-2]
839*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m9, [dstq+strideq*2-2]
840*c0909341SAndroid Build Coastguard Worker    movd           xm5, [topq+strideq*1+2]
841*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dstq+strideq*0+2]
; shifting the -2 loads right by 4 bytes yields the same rows at +2
842*c0909341SAndroid Build Coastguard Worker    psrldq         m11, m6, 4
843*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m9, 4
844*c0909341SAndroid Build Coastguard Worker    vpblendd        m5, m11, 0x10
845*c0909341SAndroid Build Coastguard Worker    movq          xm11, [leftq+2]
846*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+stride3q-2], 1
847*c0909341SAndroid Build Coastguard Worker    punpckldq     xm11, xm11
848*c0909341SAndroid Build Coastguard Worker    vpblendd       m10, m12, 0x10
; m12 = 0x0000FFFF in every dword: byte mask covering the low word of each dword
849*c0909341SAndroid Build Coastguard Worker    pcmpeqd        m12, m12
850*c0909341SAndroid Build Coastguard Worker    pmovzxwd       m11, xm11
851*c0909341SAndroid Build Coastguard Worker    psrld          m12, 16
852*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
853*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [botq-2]
; low word of every dword of m6 <- words loaded from leftq+2
854*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m11, m12
855*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
; dword 5 of m6 <- the 4 bytes at botq-2
856*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, m9, 0x20
857*c0909341SAndroid Build Coastguard Worker %else
858*c0909341SAndroid Build Coastguard Worker    movd           xm5, [topq +strideq*1+2]
859*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq +strideq*1-2]
860*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq +stride3q -2]
861*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dst4q+strideq*1-2]
862*c0909341SAndroid Build Coastguard Worker    movd          xm11, [dst4q+stride3q -2]
863*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dstq +strideq*0+2], 1
864*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq +strideq*2-2]
865*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dst4q+strideq*0-2]
866*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*2-2]
867*c0909341SAndroid Build Coastguard Worker    pinsrd        xm11, [botq           -2], 1
; each 8-byte load holds a row at -2 (even dword) and the same row at +2
; (odd dword); the shuffles below split them into the m6 / m5 halves
868*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm6, q3110
869*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm9, q2020
870*c0909341SAndroid Build Coastguard Worker    shufps         xm9, xm10, q3131
871*c0909341SAndroid Build Coastguard Worker    shufps        xm10, xm11, q1020
872*c0909341SAndroid Build Coastguard Worker    movu           m11, [blend_4x8_2+4]
873*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm10, 1
874*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm9, 1
; bytes selected by the blend_4x8_2 mask are replaced with the stack data
875*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x10+4], m11
876*c0909341SAndroid Build Coastguard Worker %endif
877*c0909341SAndroid Build Coastguard Worker%else
; r13 = base of the blend mask table; the mask actually used is indexed by hq
878*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_1+16]
879*c0909341SAndroid Build Coastguard Worker    movq           xm5, [top2q         +2]
880*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m6, [dstq+strideq*1-2]
881*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m9, [dstq+strideq*2-2]
882*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq+strideq*0+2]
; bytes 4..11 of the -2 loads are the same rows at +2
883*c0909341SAndroid Build Coastguard Worker    shufps         m10, m6, m9, q2121
884*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+stride3q -2], 1
885*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [botq          -2], 1
886*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*1+16*1]
887*c0909341SAndroid Build Coastguard Worker    vpblendd        m5, m10, 0xF0
888*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m6, m9
889*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
890*c0909341SAndroid Build Coastguard Worker%endif
891*c0909341SAndroid Build Coastguard Worker    ret
; .d2k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 2, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
; Every row is loaded once starting at dst-2; the bytes at offset -2 end up
; in m6 and the bytes at offset +2 of the same row end up in m5.
; The low word of the m6 entries is then replaced with data from leftq
; (first two variants) or from a stack slot (last variant; presumably
; left-edge pixels stored by the caller - TODO confirm).
; Variants are selected by macro arguments 1 and 2 (presumably block width
; and height - TODO confirm). Scratch: m9, m10, m11.
892*c0909341SAndroid Build Coastguard Worker.d2k1:
893*c0909341SAndroid Build Coastguard Worker%if %1 == 4
894*c0909341SAndroid Build Coastguard Worker %if %2 == 4
895*c0909341SAndroid Build Coastguard Worker    movq          xm11, [leftq]
896*c0909341SAndroid Build Coastguard Worker    movq           xm6, [dstq+strideq*0-2]
897*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq+strideq*1-2]
898*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*2-2], 1
899*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+stride3q -2], 1
900*c0909341SAndroid Build Coastguard Worker    punpckldq     xm11, xm11
; m5/m10 = the same rows shifted by 4 bytes, i.e. read at +2
901*c0909341SAndroid Build Coastguard Worker    psrldq          m5, m6, 4
902*c0909341SAndroid Build Coastguard Worker    psrldq         m10, m9, 4
903*c0909341SAndroid Build Coastguard Worker    pmovzxwd       m11, xm11
904*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
905*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
; words 0 and 2 of each 128-bit lane of m6 <- words loaded from leftq
906*c0909341SAndroid Build Coastguard Worker    pblendw         m6, m11, 0x05
907*c0909341SAndroid Build Coastguard Worker %else
908*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq +strideq*0-2]
909*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq +strideq*2-2]
910*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dst4q+strideq*0-2]
911*c0909341SAndroid Build Coastguard Worker    movq          xm11, [dst4q+strideq*2-2]
912*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq +strideq*1-2]
913*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dstq +stride3q -2]
914*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*1-2]
915*c0909341SAndroid Build Coastguard Worker    movhps        xm11, [dst4q+stride3q -2]
; q2020 keeps the dwords read at -2, q3131 keeps the dwords read at +2
916*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm5, xm9, q2020
917*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, q3131
918*c0909341SAndroid Build Coastguard Worker    shufps         xm9, xm10, xm11, q2020
919*c0909341SAndroid Build Coastguard Worker    shufps        xm10, xm11, q3131
920*c0909341SAndroid Build Coastguard Worker    pmovzxwd       m11, [leftq]
921*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm9, 1
922*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
; low word of every dword of m6 <- words loaded from leftq
923*c0909341SAndroid Build Coastguard Worker    pblendw         m6, m11, 0x55
924*c0909341SAndroid Build Coastguard Worker %endif
925*c0909341SAndroid Build Coastguard Worker%else
; m11 = stack slot indexed by hq, blended into m6 at the end
926*c0909341SAndroid Build Coastguard Worker    mova           m11, [rsp+gprsize+0x20+hq*8+64]
927*c0909341SAndroid Build Coastguard Worker    movu           xm5, [dstq+strideq*0-2]
928*c0909341SAndroid Build Coastguard Worker    movu           xm9, [dstq+strideq*1-2]
929*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+strideq*2-2], 1
930*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+stride3q -2], 1
; bytes 0..7 of each 16-byte load go to m6 (-2), bytes 4..11 go to m5 (+2)
931*c0909341SAndroid Build Coastguard Worker    shufps          m6, m5, m9, q1010
932*c0909341SAndroid Build Coastguard Worker    shufps          m5, m9, q2121
; words 0 and 4 of each 128-bit lane of m6 <- m11
933*c0909341SAndroid Build Coastguard Worker    pblendw         m6, m11, 0x11
934*c0909341SAndroid Build Coastguard Worker%endif
935*c0909341SAndroid Build Coastguard Worker    ret
; .d3k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 3, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
;   m6 <- loads taken at a -2 byte offset: the top row pointer first, then
;         consecutive dst rows starting at row 0
;   m5 <- loads taken at a +2 byte offset: dst rows starting at row 1, with
;         the row at botq last
; Selected bytes of the -2 data are replaced with data from leftq (first
; variant) or from a stack slot (other variants; presumably left-edge pixels
; stored by the caller - TODO confirm).
; Variants are selected by macro arguments 1 and 2 (presumably block width
; and height - TODO confirm).
; Scratch: m9-m12 in the first variant, m9-m11 otherwise, r13 in the last one.
936*c0909341SAndroid Build Coastguard Worker.d3k1:
937*c0909341SAndroid Build Coastguard Worker%if %1 == 4
938*c0909341SAndroid Build Coastguard Worker %if %2 == 4
939*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m11, [dstq+strideq*1-2]
940*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m12, [dstq+strideq*2-2]
941*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq+strideq*1-2]
942*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dstq+strideq*0-2]
; imm 0x01 replaces word 0 of each 128-bit lane; with the -16 bias the upper
; lane receives the word at leftq+2 (resp. leftq+4). The lower-lane word 0 is
; never consumed: it is shifted out by psrldq and not selected by vpblendd.
943*c0909341SAndroid Build Coastguard Worker    pblendw        m11, [leftq-16+2], 0x01
944*c0909341SAndroid Build Coastguard Worker    pblendw        m12, [leftq-16+4], 0x01
945*c0909341SAndroid Build Coastguard Worker    pinsrw         xm9, [leftq- 0+0], 0
; shifted by 4 bytes: the same rows read at +2
946*c0909341SAndroid Build Coastguard Worker    psrldq          m5, m11, 4
947*c0909341SAndroid Build Coastguard Worker    psrldq         m10, m12, 4
948*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+stride3q +2], 1
949*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq          +2], 1
; dword 4 (first dword of the upper lane) <- left-patched row data
950*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, m11, 0x10
951*c0909341SAndroid Build Coastguard Worker    vpblendd        m9, m12, 0x10
952*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
953*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
954*c0909341SAndroid Build Coastguard Worker %else
955*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*1-2]
956*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq +strideq*1-2]
957*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq +stride3q -2]
958*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dst4q+strideq*1-2]
959*c0909341SAndroid Build Coastguard Worker    movd          xm11, [dst4q+stride3q +2]
; word 3 of xm6 <- first two bytes of dst row 0; word 2 stays zero from the
; movd above and is covered by the 4 stack bytes offered to the blend below
960*c0909341SAndroid Build Coastguard Worker    pinsrw         xm6, [dstq +strideq*0  ], 3
961*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq +strideq*2-2]
962*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dst4q+strideq*0-2]
963*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*2-2]
964*c0909341SAndroid Build Coastguard Worker    pinsrd        xm11, [botq           +2], 1
; even dwords of the 8-byte loads are the rows at -2, odd dwords the rows at +2
965*c0909341SAndroid Build Coastguard Worker    shufps         xm6, xm5, q2010
966*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, q3131
967*c0909341SAndroid Build Coastguard Worker    shufps         xm9, xm10, q2020
968*c0909341SAndroid Build Coastguard Worker    shufps        xm10, xm11, q1031
969*c0909341SAndroid Build Coastguard Worker    movu           m11, [blend_4x8_2]
970*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm9, 1
971*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
; bytes selected by the blend_4x8_2 mask are replaced with the stack data
972*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x10-4], m11
973*c0909341SAndroid Build Coastguard Worker %endif
974*c0909341SAndroid Build Coastguard Worker%else
; r13 = base of the blend mask table; the mask actually used is indexed by hq
975*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_1+8]
976*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top2q         -2]
977*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m5, [dstq+strideq*1-2]
978*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m10, [dstq+strideq*2-2]
979*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [dstq+strideq*0-2]
; m9 = low 8 bytes (offset -2) of dst rows 1 and 2, destined for the upper
; lane of m6
980*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m9, m5, m10
981*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [dstq+stride3q -2], 1
982*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq          -2], 1
983*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*1+16*1]
984*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, m9, 0xF0
; bytes 4..11 of each 16-byte load are the rows at +2
985*c0909341SAndroid Build Coastguard Worker    shufps          m5, m10, q2121
986*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
987*c0909341SAndroid Build Coastguard Worker%endif
988*c0909341SAndroid Build Coastguard Worker    ret
; .d4k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 4, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
;   m6 <- loads taken at a -2 byte offset: both top rows first, then dst rows
;         starting at row 0
;   m5 <- loads taken at a +2 byte offset: dst rows starting at row 2, with
;         the two rows at botq last
; Selected bytes of m6 are replaced with data from leftq (first variant) or
; from a stack slot (other variants; presumably left-edge pixels stored by
; the caller - TODO confirm).
; Variants are selected by macro arguments 1 and 2 (presumably block width
; and height - TODO confirm).
; Scratch: m9, m10, m11 (m11 unused in the first variant), r13 in the last one.
989*c0909341SAndroid Build Coastguard Worker.d4k1:
990*c0909341SAndroid Build Coastguard Worker%if %1 == 4
991*c0909341SAndroid Build Coastguard Worker %if %2 == 4
; only the upper lanes are written here; the lower lanes keep whatever m6/m9
; held before, and their dword 0 is filled by the vpblendd pair further down
992*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*0-2], 1
993*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+strideq*1-2], 1
994*c0909341SAndroid Build Coastguard Worker    movd           xm5, [dstq+strideq*2+2]
995*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dstq+stride3q +2]
; imm 0x01 replaces word 0 of each 128-bit lane; with the -16 bias the upper
; lane receives the word at leftq+0 (resp. leftq+2)
996*c0909341SAndroid Build Coastguard Worker    pblendw         m6, [leftq-16+0], 0x01
997*c0909341SAndroid Build Coastguard Worker    pblendw         m9, [leftq-16+2], 0x01
998*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [botq+strideq*0+2], 1
999*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq+strideq*1+2], 1
; dword 0 <- 4 bytes of the top rows at -2
1000*c0909341SAndroid Build Coastguard Worker    vpblendd        m6, [topq+strideq*0-2], 0x01
1001*c0909341SAndroid Build Coastguard Worker    vpblendd        m9, [topq+strideq*1-2], 0x01
1002*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
1003*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
1004*c0909341SAndroid Build Coastguard Worker %else
1005*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*0-2]
1006*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq +strideq*2-2]
1007*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dst4q+strideq*0-2]
1008*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dst4q+strideq*2+2]
1009*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [topq +strideq*1-2], 1
1010*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq +stride3q -2]
1011*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dst4q+strideq*1-2]
1012*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [dst4q+stride3q +2], 1
1013*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*0-2], 2
1014*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [botq +strideq*0+2], 2
1015*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*1-2], 3
1016*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [botq +strideq*1+2], 3
; even dwords of the 8-byte loads are the rows at -2 (upper half of m6),
; odd dwords the same rows at +2 (lower half of m5)
1017*c0909341SAndroid Build Coastguard Worker    shufps        xm11, xm5, xm9, q2020
1018*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, q3131
1019*c0909341SAndroid Build Coastguard Worker    movu            m9, [blend_4x8_3]
1020*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm11, 1
1021*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
; bytes selected by the blend_4x8_3 mask are replaced with the stack data
1022*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x10-8], m9
1023*c0909341SAndroid Build Coastguard Worker %endif
1024*c0909341SAndroid Build Coastguard Worker%else
; r13 = base of the blend mask table; the mask actually used is indexed by hq
1025*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_1]
1026*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*2+16*2]
1027*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top1q         -2]
1028*c0909341SAndroid Build Coastguard Worker    movq           xm9, [top2q         -2]
1029*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq+strideq*2+2]
1030*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dstq+stride3q +2]
1031*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*0-2], 1
1032*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+strideq*1-2], 1
1033*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [botq+strideq*0+2], 1
1034*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq+strideq*1+2], 1
1035*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m6, m9
1036*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
1037*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, m10
1038*c0909341SAndroid Build Coastguard Worker%endif
1039*c0909341SAndroid Build Coastguard Worker    ret
; .d5k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 5, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
;   m6 <- loads taken at a -1 byte offset: both top rows first, then dst rows
;         starting at row 0
;   m5 <- loads taken at a +1 byte offset: dst rows starting at row 2, with
;         the two rows at botq last
; Selected bytes of m6 are replaced with bytes from leftq (first two
; variants) or from a stack slot (last variant; presumably left-edge pixels
; stored by the caller - TODO confirm).
; Variants are selected by macro arguments 1 and 2 (presumably block width
; and height - TODO confirm).
; Scratch: m9-m12 in the first variant, m9-m11 otherwise, r13 in the last one.
1040*c0909341SAndroid Build Coastguard Worker.d5k1:
1041*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1042*c0909341SAndroid Build Coastguard Worker %if %2 == 4
1043*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq+strideq*0-1]
1044*c0909341SAndroid Build Coastguard Worker    movd           xm9, [topq+strideq*1-1]
1045*c0909341SAndroid Build Coastguard Worker    movd           xm5, [dstq+strideq*2+1]
1046*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dstq+stride3q +1]
1047*c0909341SAndroid Build Coastguard Worker    pcmpeqd        m12, m12
; 16 bytes starting at leftq-7, zero-extended to words; the upper lane holds
; the bytes leftq+1 .. leftq+8
1048*c0909341SAndroid Build Coastguard Worker    pmovzxbw       m11, [leftq-8+1]
; m12 = 0x000000FF in every dword: byte mask covering byte 0 of each dword
1049*c0909341SAndroid Build Coastguard Worker    psrld          m12, 24
1050*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*0-1], 1
1051*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+strideq*1-1], 1
1052*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [botq+strideq*0+1], 1
1053*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq+strideq*1+1], 1
1054*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
; clear the lower lane of the mask so that only the upper lane (the dst rows)
; is patched; the lower lane of m6 holds the top rows
1055*c0909341SAndroid Build Coastguard Worker    pxor            m9, m9
1056*c0909341SAndroid Build Coastguard Worker    vpblendd       m12, m9, 0x0F
1057*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
; byte 0 of every upper-lane dword of m6 <- byte taken from the left data
1058*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m11, m12
1059*c0909341SAndroid Build Coastguard Worker %else
1060*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*0-1]
1061*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq +strideq*2-1]
1062*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dst4q+strideq*0-1]
1063*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dst4q+strideq*2+1]
1064*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [topq +strideq*1-1], 1
1065*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq +stride3q -1]
1066*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dst4q+strideq*1-1]
1067*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [dst4q+stride3q +1], 1
1068*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*0-1], 2
1069*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [botq +strideq*0+1], 2
1070*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*1-1], 3
1071*c0909341SAndroid Build Coastguard Worker    pinsrd        xm10, [botq +strideq*1+1], 3
; xm11 = first dword of each 8-byte load: the rows at -1 (upper half of m6)
1072*c0909341SAndroid Build Coastguard Worker    shufps        xm11, xm5, xm9, q2020
1073*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm11, 1
1074*c0909341SAndroid Build Coastguard Worker    pmovzxbw       m11, [leftq-3]
; dropping 2 bytes turns the -1 loads into the same rows at +1
1075*c0909341SAndroid Build Coastguard Worker    psrldq         xm5, 2
1076*c0909341SAndroid Build Coastguard Worker    psrldq         xm9, 2
1077*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, q2020
1078*c0909341SAndroid Build Coastguard Worker    movu            m9, [blend_4x8_1]
1079*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm10, 1
; bytes selected by the blend_4x8_1 mask are replaced with the left data
1080*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, m11, m9
1081*c0909341SAndroid Build Coastguard Worker %endif
1082*c0909341SAndroid Build Coastguard Worker%else
; r13 = base of the blend mask table; the mask actually used is indexed by hq
1083*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_0]
1084*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*2+16*2]
1085*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top1q         -1]
1086*c0909341SAndroid Build Coastguard Worker    movq           xm9, [top2q         -1]
1087*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq+strideq*2+1]
1088*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dstq+stride3q +1]
1089*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*0-1], 1
1090*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+strideq*1-1], 1
1091*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [botq+strideq*0+1], 1
1092*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq+strideq*1+1], 1
1093*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m6, m9
1094*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, m10
; this routine reads the stack area based at 0x60, whereas the routines that
; use a +-2 byte column offset read the one based at 0x20
1095*c0909341SAndroid Build Coastguard Worker    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
1096*c0909341SAndroid Build Coastguard Worker%endif
1097*c0909341SAndroid Build Coastguard Worker    ret
; .d6k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 6, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
; All loads use a zero column offset, so no left-edge data is blended in:
;   m6 <- both top rows, then dst rows 0 and 1 (and 2, 3 in the second variant)
;   m5 <- dst rows starting at row 2, with the two rows at botq last
; Variants are selected by macro arguments 1 and 2 (presumably block width
; and height - TODO confirm). Scratch: m9, plus m10 in the first and last
; variants.
1098*c0909341SAndroid Build Coastguard Worker.d6k1:
1099*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1100*c0909341SAndroid Build Coastguard Worker %if %2 == 4
1101*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq+strideq*0]
1102*c0909341SAndroid Build Coastguard Worker    movd           xm9, [topq+strideq*1]
1103*c0909341SAndroid Build Coastguard Worker    movd           xm5, [dstq+strideq*2]
1104*c0909341SAndroid Build Coastguard Worker    movd          xm10, [dstq+stride3q ]
1105*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*0], 1
1106*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [dstq+strideq*1], 1
1107*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [botq+strideq*0], 1
1108*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [botq+strideq*1], 1
; interleave so that each lane starts with two consecutive 4-byte rows
1109*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m9
1110*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m10
1111*c0909341SAndroid Build Coastguard Worker %else
; xm5 collects dst rows 2..5, xm6 top rows + dst rows 0..1,
; xm9 dst rows 6..7 + the two rows at botq
1112*c0909341SAndroid Build Coastguard Worker    movd           xm5, [dstq +strideq*2]
1113*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*0]
1114*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dst4q+strideq*2]
1115*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dstq +stride3q ], 1
1116*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [topq +strideq*1], 1
1117*c0909341SAndroid Build Coastguard Worker    pinsrd         xm9, [dst4q+stride3q ], 1
1118*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dst4q+strideq*0], 2
1119*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*0], 2
1120*c0909341SAndroid Build Coastguard Worker    pinsrd         xm9, [botq +strideq*0], 2
1121*c0909341SAndroid Build Coastguard Worker    pinsrd         xm5, [dst4q+strideq*1], 3
1122*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*1], 3
1123*c0909341SAndroid Build Coastguard Worker    pinsrd         xm9, [botq +strideq*1], 3
; xm5 (dst rows 2..5) serves as the upper lane of m6 and as the lower lane of
; m5; the order of the next two instructions matters because the second one
; overwrites the upper lane of m5 only after the first has read xm5
1124*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm5, 1
1125*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm9, 1
1126*c0909341SAndroid Build Coastguard Worker %endif
1127*c0909341SAndroid Build Coastguard Worker%else
1128*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq+strideq*2]
1129*c0909341SAndroid Build Coastguard Worker    movq           xm9, [botq+strideq*0]
1130*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top1q         ]
1131*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dstq+strideq*0]
1132*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq+stride3q ]
1133*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [botq+strideq*1]
1134*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [top2q         ]
1135*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dstq+strideq*1]
1136*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm9, 1
1137*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm10, 1
1138*c0909341SAndroid Build Coastguard Worker%endif
1139*c0909341SAndroid Build Coastguard Worker    ret
; .d7k1: gather routine; returns with its two result vectors in m5 and m6.
; NOTE(review): the label presumably means "direction 7, tap k=1" - confirm
; against the table that dispatches to it (not part of this excerpt).
;   m6 <- loads taken at a +1 byte offset: both top rows first, then dst rows
;         starting at row 0
;   m5 <- loads taken at a -1 byte offset: dst rows starting at row 2, with
;         the two rows at botq last
; Here it is m5 (the -1 data) that gets bytes replaced with data from leftq
; (first two variants) or from a stack slot (last variant; presumably
; left-edge pixels stored by the caller - TODO confirm).
; Variants are selected by macro arguments 1 and 2 (presumably block width
; and height - TODO confirm).
; Scratch: m9, m10, plus m11 in the last two variants and r13 in the last one.
1140*c0909341SAndroid Build Coastguard Worker.d7k1:
1141*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1142*c0909341SAndroid Build Coastguard Worker %if %2 == 4
1143*c0909341SAndroid Build Coastguard Worker    movd           xm5, [dstq+strideq*2-1]
1144*c0909341SAndroid Build Coastguard Worker    movd           xm9, [dstq+stride3q -1]
1145*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq+strideq*0+1]
1146*c0909341SAndroid Build Coastguard Worker    movd          xm10, [topq+strideq*1+1]
; byte 0 (the one read from dst-1) is replaced with a byte from leftq
1147*c0909341SAndroid Build Coastguard Worker    pinsrb         xm5, [leftq+ 5], 0
1148*c0909341SAndroid Build Coastguard Worker    pinsrb         xm9, [leftq+ 7], 0
1149*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, [dstq+strideq*0+1], 1
1150*c0909341SAndroid Build Coastguard Worker    vinserti128    m10, [dstq+strideq*1+1], 1
1151*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [botq+strideq*0-1], 1
1152*c0909341SAndroid Build Coastguard Worker    vinserti128     m9, [botq+strideq*1-1], 1
1153*c0909341SAndroid Build Coastguard Worker    punpckldq       m6, m10
1154*c0909341SAndroid Build Coastguard Worker    punpckldq       m5, m9
1155*c0909341SAndroid Build Coastguard Worker %else
1156*c0909341SAndroid Build Coastguard Worker    movd           xm6, [topq +strideq*0+1]
1157*c0909341SAndroid Build Coastguard Worker    movq           xm9, [dstq +strideq*2-1]
1158*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dst4q+strideq*0-1]
1159*c0909341SAndroid Build Coastguard Worker    movd          xm11, [dst4q+strideq*2-1]
1160*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [topq +strideq*1+1], 1
1161*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [dstq +stride3q -1]
1162*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dst4q+strideq*1-1]
1163*c0909341SAndroid Build Coastguard Worker    pinsrd        xm11, [dst4q+stride3q -1], 1
1164*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*0+1], 2
1165*c0909341SAndroid Build Coastguard Worker    pinsrd        xm11, [botq +strideq*0-1], 2
1166*c0909341SAndroid Build Coastguard Worker    pinsrd         xm6, [dstq +strideq*1+1], 3
1167*c0909341SAndroid Build Coastguard Worker    pinsrd        xm11, [botq +strideq*1-1], 3
; xm5 = first dword of each 8-byte load: the rows at -1 (lower half of m5)
1168*c0909341SAndroid Build Coastguard Worker    shufps         xm5, xm9, xm10, q2020
1169*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm11, 1
1170*c0909341SAndroid Build Coastguard Worker    pmovzxbw       m11, [leftq+5]
; dropping 2 bytes turns the -1 loads into the same rows at +1
1171*c0909341SAndroid Build Coastguard Worker    psrldq         xm9, 2
1172*c0909341SAndroid Build Coastguard Worker    psrldq        xm10, 2
1173*c0909341SAndroid Build Coastguard Worker    shufps         xm9, xm10, q2020
1174*c0909341SAndroid Build Coastguard Worker    movu           m10, [blend_4x8_1+8]
1175*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm9, 1
; bytes selected by the blend_4x8_1 mask are replaced with the left data
1176*c0909341SAndroid Build Coastguard Worker    vpblendvb       m5, m11, m10
1177*c0909341SAndroid Build Coastguard Worker %endif
1178*c0909341SAndroid Build Coastguard Worker%else
; r13 = base of the blend mask table; the mask actually used is indexed by hq
1179*c0909341SAndroid Build Coastguard Worker    lea            r13, [blend_8x8_0+16]
1180*c0909341SAndroid Build Coastguard Worker    movq           xm5, [dstq+strideq*2-1]
1181*c0909341SAndroid Build Coastguard Worker    movq           xm9, [botq+strideq*0-1]
1182*c0909341SAndroid Build Coastguard Worker    movq           xm6, [top1q         +1]
1183*c0909341SAndroid Build Coastguard Worker    movq          xm10, [dstq+strideq*0+1]
1184*c0909341SAndroid Build Coastguard Worker    movhps         xm5, [dstq+stride3q -1]
1185*c0909341SAndroid Build Coastguard Worker    movhps         xm9, [botq+strideq*1-1]
1186*c0909341SAndroid Build Coastguard Worker    movhps         xm6, [top2q         +1]
1187*c0909341SAndroid Build Coastguard Worker    movhps        xm10, [dstq+strideq*1+1]
1188*c0909341SAndroid Build Coastguard Worker    movu           m11, [r13+hq*2*2+16*2]
1189*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, xm9, 1
1190*c0909341SAndroid Build Coastguard Worker    vinserti128     m6, xm10, 1
; this routine reads the stack area based at 0x60, whereas the routines that
; use a +-2 byte column offset read the one based at 0x20
1191*c0909341SAndroid Build Coastguard Worker    vpblendvb       m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
1192*c0909341SAndroid Build Coastguard Worker%endif
1193*c0909341SAndroid Build Coastguard Worker    ret
1194*c0909341SAndroid Build Coastguard Worker
1195*c0909341SAndroid Build Coastguard Worker.border_block:
1196*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
1197*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
1198*c0909341SAndroid Build Coastguard Worker    %assign stack_offset stack_offset - (regs_used - 11) * gprsize
1199*c0909341SAndroid Build Coastguard Worker    %assign regs_used 11
1200*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK 2*16+(%2+4)*32, 16
1201*c0909341SAndroid Build Coastguard Worker%define px rsp+2*16+2*32
1202*c0909341SAndroid Build Coastguard Worker
1203*c0909341SAndroid Build Coastguard Worker    pcmpeqw        m14, m14
1204*c0909341SAndroid Build Coastguard Worker    psllw          m14, 15                  ; 0x8000
1205*c0909341SAndroid Build Coastguard Worker
1206*c0909341SAndroid Build Coastguard Worker    ; prepare pixel buffers - body/right
1207*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1208*c0909341SAndroid Build Coastguard Worker    INIT_XMM avx2
1209*c0909341SAndroid Build Coastguard Worker%endif
1210*c0909341SAndroid Build Coastguard Worker%if %2 == 8
1211*c0909341SAndroid Build Coastguard Worker    lea          dst4q, [dstq+strideq*4]
1212*c0909341SAndroid Build Coastguard Worker%endif
1213*c0909341SAndroid Build Coastguard Worker    lea       stride3q, [strideq*3]
1214*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2                   ; have_right
1215*c0909341SAndroid Build Coastguard Worker    jz .no_right
1216*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [dstq+strideq*0]
1217*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [dstq+strideq*1]
1218*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m3, [dstq+strideq*2]
1219*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m4, [dstq+stride3q]
1220*c0909341SAndroid Build Coastguard Worker    mova     [px+0*32], m1
1221*c0909341SAndroid Build Coastguard Worker    mova     [px+1*32], m2
1222*c0909341SAndroid Build Coastguard Worker    mova     [px+2*32], m3
1223*c0909341SAndroid Build Coastguard Worker    mova     [px+3*32], m4
1224*c0909341SAndroid Build Coastguard Worker%if %2 == 8
1225*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [dst4q+strideq*0]
1226*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [dst4q+strideq*1]
1227*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m3, [dst4q+strideq*2]
1228*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m4, [dst4q+stride3q]
1229*c0909341SAndroid Build Coastguard Worker    mova     [px+4*32], m1
1230*c0909341SAndroid Build Coastguard Worker    mova     [px+5*32], m2
1231*c0909341SAndroid Build Coastguard Worker    mova     [px+6*32], m3
1232*c0909341SAndroid Build Coastguard Worker    mova     [px+7*32], m4
1233*c0909341SAndroid Build Coastguard Worker%endif
1234*c0909341SAndroid Build Coastguard Worker    jmp .body_done
1235*c0909341SAndroid Build Coastguard Worker.no_right:
1236*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1237*c0909341SAndroid Build Coastguard Worker    movd           xm1, [dstq+strideq*0]
1238*c0909341SAndroid Build Coastguard Worker    movd           xm2, [dstq+strideq*1]
1239*c0909341SAndroid Build Coastguard Worker    movd           xm3, [dstq+strideq*2]
1240*c0909341SAndroid Build Coastguard Worker    movd           xm4, [dstq+stride3q]
1241*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, xm1
1242*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, xm2
1243*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm3, xm3
1244*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm4, xm4
1245*c0909341SAndroid Build Coastguard Worker    movq     [px+0*32], xm1
1246*c0909341SAndroid Build Coastguard Worker    movq     [px+1*32], xm2
1247*c0909341SAndroid Build Coastguard Worker    movq     [px+2*32], xm3
1248*c0909341SAndroid Build Coastguard Worker    movq     [px+3*32], xm4
1249*c0909341SAndroid Build Coastguard Worker%else
1250*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, [dstq+strideq*0]
1251*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, [dstq+strideq*1]
1252*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm3, [dstq+strideq*2]
1253*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm4, [dstq+stride3q]
1254*c0909341SAndroid Build Coastguard Worker    mova     [px+0*32], xm1
1255*c0909341SAndroid Build Coastguard Worker    mova     [px+1*32], xm2
1256*c0909341SAndroid Build Coastguard Worker    mova     [px+2*32], xm3
1257*c0909341SAndroid Build Coastguard Worker    mova     [px+3*32], xm4
1258*c0909341SAndroid Build Coastguard Worker%endif
1259*c0909341SAndroid Build Coastguard Worker    movd [px+0*32+%1*2], xm14
1260*c0909341SAndroid Build Coastguard Worker    movd [px+1*32+%1*2], xm14
1261*c0909341SAndroid Build Coastguard Worker    movd [px+2*32+%1*2], xm14
1262*c0909341SAndroid Build Coastguard Worker    movd [px+3*32+%1*2], xm14
1263*c0909341SAndroid Build Coastguard Worker%if %2 == 8
1264*c0909341SAndroid Build Coastguard Worker %if %1 == 4
1265*c0909341SAndroid Build Coastguard Worker    movd           xm1, [dst4q+strideq*0]
1266*c0909341SAndroid Build Coastguard Worker    movd           xm2, [dst4q+strideq*1]
1267*c0909341SAndroid Build Coastguard Worker    movd           xm3, [dst4q+strideq*2]
1268*c0909341SAndroid Build Coastguard Worker    movd           xm4, [dst4q+stride3q]
1269*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, xm1
1270*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, xm2
1271*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm3, xm3
1272*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm4, xm4
1273*c0909341SAndroid Build Coastguard Worker    movq     [px+4*32], xm1
1274*c0909341SAndroid Build Coastguard Worker    movq     [px+5*32], xm2
1275*c0909341SAndroid Build Coastguard Worker    movq     [px+6*32], xm3
1276*c0909341SAndroid Build Coastguard Worker    movq     [px+7*32], xm4
1277*c0909341SAndroid Build Coastguard Worker %else
1278*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, [dst4q+strideq*0]
1279*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, [dst4q+strideq*1]
1280*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm3, [dst4q+strideq*2]
1281*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm4, [dst4q+stride3q]
1282*c0909341SAndroid Build Coastguard Worker    mova     [px+4*32], xm1
1283*c0909341SAndroid Build Coastguard Worker    mova     [px+5*32], xm2
1284*c0909341SAndroid Build Coastguard Worker    mova     [px+6*32], xm3
1285*c0909341SAndroid Build Coastguard Worker    mova     [px+7*32], xm4
1286*c0909341SAndroid Build Coastguard Worker %endif
1287*c0909341SAndroid Build Coastguard Worker    movd [px+4*32+%1*2], xm14
1288*c0909341SAndroid Build Coastguard Worker    movd [px+5*32+%1*2], xm14
1289*c0909341SAndroid Build Coastguard Worker    movd [px+6*32+%1*2], xm14
1290*c0909341SAndroid Build Coastguard Worker    movd [px+7*32+%1*2], xm14
1291*c0909341SAndroid Build Coastguard Worker%endif
1292*c0909341SAndroid Build Coastguard Worker.body_done:
1293*c0909341SAndroid Build Coastguard Worker
1294*c0909341SAndroid Build Coastguard Worker    ; top
1295*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4                    ; have_top
1296*c0909341SAndroid Build Coastguard Worker    jz .no_top
1297*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1                    ; have_left
1298*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
1299*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2                    ; have_right
1300*c0909341SAndroid Build Coastguard Worker    jz .top_no_right
1301*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
1302*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
1303*c0909341SAndroid Build Coastguard Worker    movu  [px-2*32-%1], m1
1304*c0909341SAndroid Build Coastguard Worker    movu  [px-1*32-%1], m2
1305*c0909341SAndroid Build Coastguard Worker    jmp .top_done
1306*c0909341SAndroid Build Coastguard Worker.top_no_right:
1307*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [topq+strideq*0-%1]
1308*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [topq+strideq*1-%1]
1309*c0909341SAndroid Build Coastguard Worker    movu [px-2*32-%1*2], m1
1310*c0909341SAndroid Build Coastguard Worker    movu [px-1*32-%1*2], m2
1311*c0909341SAndroid Build Coastguard Worker    movd [px-2*32+%1*2], xm14
1312*c0909341SAndroid Build Coastguard Worker    movd [px-1*32+%1*2], xm14
1313*c0909341SAndroid Build Coastguard Worker    jmp .top_done
1314*c0909341SAndroid Build Coastguard Worker.top_no_left:
1315*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2                   ; have_right
1316*c0909341SAndroid Build Coastguard Worker    jz .top_no_left_right
1317*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [topq+strideq*0]
1318*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [topq+strideq*1]
1319*c0909341SAndroid Build Coastguard Worker    mova   [px-2*32+0], m1
1320*c0909341SAndroid Build Coastguard Worker    mova   [px-1*32+0], m2
1321*c0909341SAndroid Build Coastguard Worker    movd   [px-2*32-4], xm14
1322*c0909341SAndroid Build Coastguard Worker    movd   [px-1*32-4], xm14
1323*c0909341SAndroid Build Coastguard Worker    jmp .top_done
1324*c0909341SAndroid Build Coastguard Worker.top_no_left_right:
1325*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1326*c0909341SAndroid Build Coastguard Worker    movd           xm1, [topq+strideq*0]
1327*c0909341SAndroid Build Coastguard Worker    pinsrd         xm1, [topq+strideq*1], 1
1328*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, xm1
1329*c0909341SAndroid Build Coastguard Worker    movq   [px-2*32+0], xm1
1330*c0909341SAndroid Build Coastguard Worker    movhps [px-1*32+0], xm1
1331*c0909341SAndroid Build Coastguard Worker%else
1332*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, [topq+strideq*0]
1333*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, [topq+strideq*1]
1334*c0909341SAndroid Build Coastguard Worker    mova   [px-2*32+0], xm1
1335*c0909341SAndroid Build Coastguard Worker    mova   [px-1*32+0], xm2
1336*c0909341SAndroid Build Coastguard Worker%endif
1337*c0909341SAndroid Build Coastguard Worker    movd   [px-2*32-4], xm14
1338*c0909341SAndroid Build Coastguard Worker    movd   [px-1*32-4], xm14
1339*c0909341SAndroid Build Coastguard Worker    movd [px-2*32+%1*2], xm14
1340*c0909341SAndroid Build Coastguard Worker    movd [px-1*32+%1*2], xm14
1341*c0909341SAndroid Build Coastguard Worker    jmp .top_done
1342*c0909341SAndroid Build Coastguard Worker.no_top:
1343*c0909341SAndroid Build Coastguard Worker    movu   [px-2*32-%1], m14
1344*c0909341SAndroid Build Coastguard Worker    movu   [px-1*32-%1], m14
1345*c0909341SAndroid Build Coastguard Worker.top_done:
1346*c0909341SAndroid Build Coastguard Worker
1347*c0909341SAndroid Build Coastguard Worker    ; left
1348*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1                   ; have_left
1349*c0909341SAndroid Build Coastguard Worker    jz .no_left
1350*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, [leftq+ 0]
1351*c0909341SAndroid Build Coastguard Worker%if %2 == 8
1352*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, [leftq+ 8]
1353*c0909341SAndroid Build Coastguard Worker%endif
1354*c0909341SAndroid Build Coastguard Worker    movd   [px+0*32-4], xm1
1355*c0909341SAndroid Build Coastguard Worker    pextrd [px+1*32-4], xm1, 1
1356*c0909341SAndroid Build Coastguard Worker    pextrd [px+2*32-4], xm1, 2
1357*c0909341SAndroid Build Coastguard Worker    pextrd [px+3*32-4], xm1, 3
1358*c0909341SAndroid Build Coastguard Worker%if %2 == 8
1359*c0909341SAndroid Build Coastguard Worker    movd   [px+4*32-4], xm2
1360*c0909341SAndroid Build Coastguard Worker    pextrd [px+5*32-4], xm2, 1
1361*c0909341SAndroid Build Coastguard Worker    pextrd [px+6*32-4], xm2, 2
1362*c0909341SAndroid Build Coastguard Worker    pextrd [px+7*32-4], xm2, 3
1363*c0909341SAndroid Build Coastguard Worker%endif
1364*c0909341SAndroid Build Coastguard Worker    jmp .left_done
1365*c0909341SAndroid Build Coastguard Worker.no_left:
1366*c0909341SAndroid Build Coastguard Worker    movd   [px+0*32-4], xm14
1367*c0909341SAndroid Build Coastguard Worker    movd   [px+1*32-4], xm14
1368*c0909341SAndroid Build Coastguard Worker    movd   [px+2*32-4], xm14
1369*c0909341SAndroid Build Coastguard Worker    movd   [px+3*32-4], xm14
1370*c0909341SAndroid Build Coastguard Worker%if %2 == 8
1371*c0909341SAndroid Build Coastguard Worker    movd   [px+4*32-4], xm14
1372*c0909341SAndroid Build Coastguard Worker    movd   [px+5*32-4], xm14
1373*c0909341SAndroid Build Coastguard Worker    movd   [px+6*32-4], xm14
1374*c0909341SAndroid Build Coastguard Worker    movd   [px+7*32-4], xm14
1375*c0909341SAndroid Build Coastguard Worker%endif
1376*c0909341SAndroid Build Coastguard Worker.left_done:
1377*c0909341SAndroid Build Coastguard Worker
1378*c0909341SAndroid Build Coastguard Worker    ; bottom
1379*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
1380*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8                   ; have_bottom
1381*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
1382*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1                   ; have_left
1383*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
1384*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2                   ; have_right
1385*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_right
1386*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [botq+strideq*0-(%1/2)]
1387*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [botq+strideq*1-(%1/2)]
1388*c0909341SAndroid Build Coastguard Worker    movu   [px+(%2+0)*32-%1], m1
1389*c0909341SAndroid Build Coastguard Worker    movu   [px+(%2+1)*32-%1], m2
1390*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
1391*c0909341SAndroid Build Coastguard Worker.bottom_no_right:
1392*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [botq+strideq*0-%1]
1393*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [botq+strideq*1-%1]
1394*c0909341SAndroid Build Coastguard Worker    movu  [px+(%2+0)*32-%1*2], m1
1395*c0909341SAndroid Build Coastguard Worker    movu  [px+(%2+1)*32-%1*2], m2
1396*c0909341SAndroid Build Coastguard Worker%if %1 == 8
1397*c0909341SAndroid Build Coastguard Worker    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
1398*c0909341SAndroid Build Coastguard Worker%endif
1399*c0909341SAndroid Build Coastguard Worker    movd  [px+(%2+0)*32+%1*2], xm14
1400*c0909341SAndroid Build Coastguard Worker    movd  [px+(%2+1)*32+%1*2], xm14
1401*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
1402*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
1403*c0909341SAndroid Build Coastguard Worker    test          edgeb, 2                  ; have_right
1404*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left_right
1405*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m1, [botq+strideq*0]
1406*c0909341SAndroid Build Coastguard Worker    pmovzxbw        m2, [botq+strideq*1]
1407*c0909341SAndroid Build Coastguard Worker    mova   [px+(%2+0)*32+0], m1
1408*c0909341SAndroid Build Coastguard Worker    mova   [px+(%2+1)*32+0], m2
1409*c0909341SAndroid Build Coastguard Worker    movd   [px+(%2+0)*32-4], xm14
1410*c0909341SAndroid Build Coastguard Worker    movd   [px+(%2+1)*32-4], xm14
1411*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
1412*c0909341SAndroid Build Coastguard Worker.bottom_no_left_right:
1413*c0909341SAndroid Build Coastguard Worker%if %1 == 4
1414*c0909341SAndroid Build Coastguard Worker    movd           xm1, [botq+strideq*0]
1415*c0909341SAndroid Build Coastguard Worker    pinsrd         xm1, [botq+strideq*1], 1
1416*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, xm1
1417*c0909341SAndroid Build Coastguard Worker    movq   [px+(%2+0)*32+0], xm1
1418*c0909341SAndroid Build Coastguard Worker    movhps [px+(%2+1)*32+0], xm1
1419*c0909341SAndroid Build Coastguard Worker%else
1420*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm1, [botq+strideq*0]
1421*c0909341SAndroid Build Coastguard Worker    pmovzxbw       xm2, [botq+strideq*1]
1422*c0909341SAndroid Build Coastguard Worker    mova   [px+(%2+0)*32+0], xm1
1423*c0909341SAndroid Build Coastguard Worker    mova   [px+(%2+1)*32+0], xm2
1424*c0909341SAndroid Build Coastguard Worker%endif
1425*c0909341SAndroid Build Coastguard Worker    movd   [px+(%2+0)*32-4], xm14
1426*c0909341SAndroid Build Coastguard Worker    movd   [px+(%2+1)*32-4], xm14
1427*c0909341SAndroid Build Coastguard Worker    movd  [px+(%2+0)*32+%1*2], xm14
1428*c0909341SAndroid Build Coastguard Worker    movd  [px+(%2+1)*32+%1*2], xm14
1429*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
1430*c0909341SAndroid Build Coastguard Worker.no_bottom:
1431*c0909341SAndroid Build Coastguard Worker    movu   [px+(%2+0)*32-%1], m14
1432*c0909341SAndroid Build Coastguard Worker    movu   [px+(%2+1)*32-%1], m14
1433*c0909341SAndroid Build Coastguard Worker.bottom_done:
1434*c0909341SAndroid Build Coastguard Worker
1435*c0909341SAndroid Build Coastguard Worker    ; actual filter
1436*c0909341SAndroid Build Coastguard Worker INIT_YMM avx2
1437*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
1438*c0909341SAndroid Build Coastguard Worker%undef edged
1439*c0909341SAndroid Build Coastguard Worker    ; register to shuffle values into after packing
1440*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m12, [shufb_lohi]
1441*c0909341SAndroid Build Coastguard Worker
1442*c0909341SAndroid Build Coastguard Worker    mov       dampingd, r8m
1443*c0909341SAndroid Build Coastguard Worker    xor          zerod, zerod
1444*c0909341SAndroid Build Coastguard Worker    movifnidn     prid, prim
1445*c0909341SAndroid Build Coastguard Worker    sub       dampingd, 31
1446*c0909341SAndroid Build Coastguard Worker    movifnidn  secdmpd, secdmpm
1447*c0909341SAndroid Build Coastguard Worker    test          prid, prid
1448*c0909341SAndroid Build Coastguard Worker    jz .border_sec_only
1449*c0909341SAndroid Build Coastguard Worker    movd           xm0, prid
1450*c0909341SAndroid Build Coastguard Worker    lzcnt      pridmpd, prid
1451*c0909341SAndroid Build Coastguard Worker    add        pridmpd, dampingd
1452*c0909341SAndroid Build Coastguard Worker    cmovs      pridmpd, zerod
1453*c0909341SAndroid Build Coastguard Worker    mov        [rsp+0], pridmpq                 ; pri_shift
1454*c0909341SAndroid Build Coastguard Worker    test       secdmpd, secdmpd
1455*c0909341SAndroid Build Coastguard Worker    jz .border_pri_only
1456*c0909341SAndroid Build Coastguard Worker    movd           xm1, secdmpd
1457*c0909341SAndroid Build Coastguard Worker    lzcnt      secdmpd, secdmpd
1458*c0909341SAndroid Build Coastguard Worker    add        secdmpd, dampingd
1459*c0909341SAndroid Build Coastguard Worker    mov        [rsp+8], secdmpq                 ; sec_shift
1460*c0909341SAndroid Build Coastguard Worker
1461*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
1462*c0909341SAndroid Build Coastguard Worker    lea         tableq, [tap_table]
1463*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
1464*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
1465*c0909341SAndroid Build Coastguard Worker
1466*c0909341SAndroid Build Coastguard Worker    ; pri/sec_taps[k] [4 total]
1467*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
1468*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, xm0                     ; pri_strength
1469*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m1, xm1                     ; sec_strength
1470*c0909341SAndroid Build Coastguard Worker    and           prid, 1
1471*c0909341SAndroid Build Coastguard Worker    lea           priq, [tableq+priq*2+8]       ; pri_taps
1472*c0909341SAndroid Build Coastguard Worker    lea           secq, [tableq+12]             ; sec_taps
1473*c0909341SAndroid Build Coastguard Worker
1474*c0909341SAndroid Build Coastguard Worker    BORDER_PREP_REGS %1, %2
1475*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1
1476*c0909341SAndroid Build Coastguard Worker.border_v_loop:
1477*c0909341SAndroid Build Coastguard Worker%endif
1478*c0909341SAndroid Build Coastguard Worker    BORDER_LOAD_BLOCK %1, %2, 1
1479*c0909341SAndroid Build Coastguard Worker.border_k_loop:
1480*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m2, [priq+kq]               ; pri_taps
1481*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m3, [secq+kq]               ; sec_taps
1482*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
1483*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
1484*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
1485*c0909341SAndroid Build Coastguard Worker    dec             kq
1486*c0909341SAndroid Build Coastguard Worker    jge .border_k_loop
1487*c0909341SAndroid Build Coastguard Worker
1488*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m10, [pw_2048]
1489*c0909341SAndroid Build Coastguard Worker    BORDER_ADJUST_PIXEL %1, m10, 1
1490*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1
1491*c0909341SAndroid Build Coastguard Worker %define vloop_lines (mmsize/(%1*2))
1492*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*vloop_lines]
1493*c0909341SAndroid Build Coastguard Worker    add           stkq, 32*vloop_lines
1494*c0909341SAndroid Build Coastguard Worker    dec             hd
1495*c0909341SAndroid Build Coastguard Worker    jg .border_v_loop
1496*c0909341SAndroid Build Coastguard Worker%endif
1497*c0909341SAndroid Build Coastguard Worker    RET
1498*c0909341SAndroid Build Coastguard Worker
1499*c0909341SAndroid Build Coastguard Worker.border_pri_only:
1500*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
1501*c0909341SAndroid Build Coastguard Worker    lea         tableq, [tap_table]
1502*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
1503*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
1504*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m0, xm0                     ; pri_strength
1505*c0909341SAndroid Build Coastguard Worker    and           prid, 1
1506*c0909341SAndroid Build Coastguard Worker    lea           priq, [tableq+priq*2+8]       ; pri_taps
1507*c0909341SAndroid Build Coastguard Worker    BORDER_PREP_REGS %1, %2
1508*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m1, [pw_2048]
1509*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1
1510*c0909341SAndroid Build Coastguard Worker.border_pri_v_loop:
1511*c0909341SAndroid Build Coastguard Worker%endif
1512*c0909341SAndroid Build Coastguard Worker    BORDER_LOAD_BLOCK %1, %2
1513*c0909341SAndroid Build Coastguard Worker.border_pri_k_loop:
1514*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m2, [priq+kq]               ; pri_taps
1515*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
1516*c0909341SAndroid Build Coastguard Worker    dec             kq
1517*c0909341SAndroid Build Coastguard Worker    jge .border_pri_k_loop
1518*c0909341SAndroid Build Coastguard Worker    BORDER_ADJUST_PIXEL %1, m1
1519*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1
1520*c0909341SAndroid Build Coastguard Worker %define vloop_lines (mmsize/(%1*2))
1521*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*vloop_lines]
1522*c0909341SAndroid Build Coastguard Worker    add           stkq, 32*vloop_lines
1523*c0909341SAndroid Build Coastguard Worker    dec             hd
1524*c0909341SAndroid Build Coastguard Worker    jg .border_pri_v_loop
1525*c0909341SAndroid Build Coastguard Worker%endif
1526*c0909341SAndroid Build Coastguard Worker    RET
1527*c0909341SAndroid Build Coastguard Worker
1528*c0909341SAndroid Build Coastguard Worker.border_sec_only:
1529*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
1530*c0909341SAndroid Build Coastguard Worker    movd           xm1, secdmpd
1531*c0909341SAndroid Build Coastguard Worker    lzcnt      secdmpd, secdmpd
1532*c0909341SAndroid Build Coastguard Worker    add        secdmpd, dampingd
1533*c0909341SAndroid Build Coastguard Worker    mov        [rsp+8], secdmpq                 ; sec_shift
1534*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
1535*c0909341SAndroid Build Coastguard Worker    lea         tableq, [tap_table]
1536*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
1537*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
1538*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m1, xm1                     ; sec_strength
1539*c0909341SAndroid Build Coastguard Worker    lea           secq, [tableq+12]             ; sec_taps
1540*c0909341SAndroid Build Coastguard Worker    BORDER_PREP_REGS %1, %2
1541*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [pw_2048]
1542*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1
1543*c0909341SAndroid Build Coastguard Worker.border_sec_v_loop:
1544*c0909341SAndroid Build Coastguard Worker%endif
1545*c0909341SAndroid Build Coastguard Worker    BORDER_LOAD_BLOCK %1, %2
1546*c0909341SAndroid Build Coastguard Worker.border_sec_k_loop:
1547*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    m3, [secq+kq]               ; sec_taps
1548*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
1549*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
1550*c0909341SAndroid Build Coastguard Worker    dec             kq
1551*c0909341SAndroid Build Coastguard Worker    jge .border_sec_k_loop
1552*c0909341SAndroid Build Coastguard Worker    BORDER_ADJUST_PIXEL %1, m0
1553*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1
1554*c0909341SAndroid Build Coastguard Worker %define vloop_lines (mmsize/(%1*2))
1555*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*vloop_lines]
1556*c0909341SAndroid Build Coastguard Worker    add           stkq, 32*vloop_lines
1557*c0909341SAndroid Build Coastguard Worker    dec             hd
1558*c0909341SAndroid Build Coastguard Worker    jg .border_sec_v_loop
1559*c0909341SAndroid Build Coastguard Worker%endif
1560*c0909341SAndroid Build Coastguard Worker    RET
1561*c0909341SAndroid Build Coastguard Worker%endmacro
1562*c0909341SAndroid Build Coastguard Worker
; Instantiate the CDEF_FILTER macro once for each block size handled here.
; Within the macro body the first argument (%1) selects how many pixels are
; loaded per row (`%if %1 == 4` uses 4-byte movd loads) and where the
; right-edge padding is written (`px+%1*2`), while `%if %2 == 8` enables the
; extra loads/stores for rows 4-7.  The arguments are therefore the block
; width and the block height in pixels: 8x8, 4x8 and 4x4.
; NOTE(review): the macro header is not visible from here, so the exact symbol
; names emitted per instantiation should be confirmed against its cglobal line.
; The instantiation order determines the order of the generated code.
1563*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 8, 8
1564*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 8
1565*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 4
1566*c0909341SAndroid Build Coastguard Worker
1567*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
1568*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
1569*c0909341SAndroid Build Coastguard Worker    lea       stride3q, [strideq*3]
1570*c0909341SAndroid Build Coastguard Worker    movq           xm0, [srcq+strideq*0]
1571*c0909341SAndroid Build Coastguard Worker    movq           xm1, [srcq+strideq*1]
1572*c0909341SAndroid Build Coastguard Worker    movq           xm2, [srcq+strideq*2]
1573*c0909341SAndroid Build Coastguard Worker    movq           xm3, [srcq+stride3q ]
1574*c0909341SAndroid Build Coastguard Worker    lea           srcq, [srcq+strideq*4]
1575*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m4, [srcq+stride3q ]
1576*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m5, [srcq+strideq*2]
1577*c0909341SAndroid Build Coastguard Worker    vpblendd        m0, m4, 0xf0
1578*c0909341SAndroid Build Coastguard Worker    vpblendd        m1, m5, 0xf0
1579*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m4, [srcq+strideq*1]
1580*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m5, [srcq+strideq*0]
1581*c0909341SAndroid Build Coastguard Worker    vpblendd        m2, m4, 0xf0
1582*c0909341SAndroid Build Coastguard Worker    vpblendd        m3, m5, 0xf0
1583*c0909341SAndroid Build Coastguard Worker    pxor            m4, m4
1584*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m4
1585*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m4
1586*c0909341SAndroid Build Coastguard Worker    punpcklbw       m2, m4
1587*c0909341SAndroid Build Coastguard Worker    punpcklbw       m3, m4
1588*c0909341SAndroid Build Coastguard Workercglobal_label .main
1589*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [pw_128]
1590*c0909341SAndroid Build Coastguard Worker    PROLOGUE 3, 4, 15
1591*c0909341SAndroid Build Coastguard Worker    psubw           m0, m4
1592*c0909341SAndroid Build Coastguard Worker    psubw           m1, m4
1593*c0909341SAndroid Build Coastguard Worker    psubw           m2, m4
1594*c0909341SAndroid Build Coastguard Worker    psubw           m3, m4
1595*c0909341SAndroid Build Coastguard Worker
1596*c0909341SAndroid Build Coastguard Worker    ; shuffle registers to generate partial_sum_diag[0-1] together
1597*c0909341SAndroid Build Coastguard Worker    vperm2i128      m7, m0, m0, 0x01
1598*c0909341SAndroid Build Coastguard Worker    vperm2i128      m6, m1, m1, 0x01
1599*c0909341SAndroid Build Coastguard Worker    vperm2i128      m5, m2, m2, 0x01
1600*c0909341SAndroid Build Coastguard Worker    vperm2i128      m4, m3, m3, 0x01
1601*c0909341SAndroid Build Coastguard Worker
1602*c0909341SAndroid Build Coastguard Worker    ; start with partial_sum_hv[0-1]
1603*c0909341SAndroid Build Coastguard Worker    paddw           m8, m0, m1
1604*c0909341SAndroid Build Coastguard Worker    paddw           m9, m2, m3
1605*c0909341SAndroid Build Coastguard Worker    phaddw         m10, m0, m1
1606*c0909341SAndroid Build Coastguard Worker    phaddw         m11, m2, m3
1607*c0909341SAndroid Build Coastguard Worker    paddw           m8, m9
1608*c0909341SAndroid Build Coastguard Worker    phaddw         m10, m11
1609*c0909341SAndroid Build Coastguard Worker    vextracti128   xm9, m8, 1
1610*c0909341SAndroid Build Coastguard Worker    vextracti128  xm11, m10, 1
1611*c0909341SAndroid Build Coastguard Worker    paddw          xm8, xm9                 ; partial_sum_hv[1]
1612*c0909341SAndroid Build Coastguard Worker    phaddw        xm10, xm11                ; partial_sum_hv[0]
1613*c0909341SAndroid Build Coastguard Worker    vinserti128     m8, xm10, 1
1614*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [div_table+44]
1615*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m8
1616*c0909341SAndroid Build Coastguard Worker    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]
1617*c0909341SAndroid Build Coastguard Worker
1618*c0909341SAndroid Build Coastguard Worker    ; create aggregates [lower half]:
1619*c0909341SAndroid Build Coastguard Worker    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
1620*c0909341SAndroid Build Coastguard Worker    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
1621*c0909341SAndroid Build Coastguard Worker    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
1622*c0909341SAndroid Build Coastguard Worker    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
1623*c0909341SAndroid Build Coastguard Worker    ; and [upper half]:
1624*c0909341SAndroid Build Coastguard Worker    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
1625*c0909341SAndroid Build Coastguard Worker    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
1626*c0909341SAndroid Build Coastguard Worker    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
1627*c0909341SAndroid Build Coastguard Worker    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
1628*c0909341SAndroid Build Coastguard Worker    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
1629*c0909341SAndroid Build Coastguard Worker
1630*c0909341SAndroid Build Coastguard Worker    pslldq          m9, m1, 2
1631*c0909341SAndroid Build Coastguard Worker    psrldq         m10, m1, 14
1632*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m2, 4
1633*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m2, 12
1634*c0909341SAndroid Build Coastguard Worker    pslldq         m13, m3, 6
1635*c0909341SAndroid Build Coastguard Worker    psrldq         m14, m3, 10
1636*c0909341SAndroid Build Coastguard Worker    paddw           m9, m11
1637*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
1638*c0909341SAndroid Build Coastguard Worker    paddw           m9, m13
1639*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
1640*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m4, 8
1641*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m4, 8
1642*c0909341SAndroid Build Coastguard Worker    pslldq         m13, m5, 10
1643*c0909341SAndroid Build Coastguard Worker    psrldq         m14, m5, 6
1644*c0909341SAndroid Build Coastguard Worker    paddw           m9, m11
1645*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
1646*c0909341SAndroid Build Coastguard Worker    paddw           m9, m13
1647*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
1648*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m6, 12
1649*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m6, 4
1650*c0909341SAndroid Build Coastguard Worker    pslldq         m13, m7, 14
1651*c0909341SAndroid Build Coastguard Worker    psrldq         m14, m7, 2
1652*c0909341SAndroid Build Coastguard Worker    paddw           m9, m11
1653*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
1654*c0909341SAndroid Build Coastguard Worker    paddw           m9, m13
1655*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
1656*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m14, [shufw_6543210x]
1657*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m13, [div_table+16]
1658*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m12, [div_table+0]
1659*c0909341SAndroid Build Coastguard Worker    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
1660*c0909341SAndroid Build Coastguard Worker    pshufb         m10, m14
1661*c0909341SAndroid Build Coastguard Worker    punpckhwd      m11, m9, m10
1662*c0909341SAndroid Build Coastguard Worker    punpcklwd       m9, m10
1663*c0909341SAndroid Build Coastguard Worker    pmaddwd        m11, m11
1664*c0909341SAndroid Build Coastguard Worker    pmaddwd         m9, m9
1665*c0909341SAndroid Build Coastguard Worker    pmulld         m11, m13
1666*c0909341SAndroid Build Coastguard Worker    pmulld          m9, m12
1667*c0909341SAndroid Build Coastguard Worker    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]
1668*c0909341SAndroid Build Coastguard Worker
1669*c0909341SAndroid Build Coastguard Worker    ; merge horizontally and vertically for partial_sum_alt[0-3]
1670*c0909341SAndroid Build Coastguard Worker    paddw          m10, m0, m1
1671*c0909341SAndroid Build Coastguard Worker    paddw          m11, m2, m3
1672*c0909341SAndroid Build Coastguard Worker    paddw          m12, m4, m5
1673*c0909341SAndroid Build Coastguard Worker    paddw          m13, m6, m7
1674*c0909341SAndroid Build Coastguard Worker    phaddw          m0, m4
1675*c0909341SAndroid Build Coastguard Worker    phaddw          m1, m5
1676*c0909341SAndroid Build Coastguard Worker    phaddw          m2, m6
1677*c0909341SAndroid Build Coastguard Worker    phaddw          m3, m7
1678*c0909341SAndroid Build Coastguard Worker
1679*c0909341SAndroid Build Coastguard Worker    ; create aggregates [lower half]:
1680*c0909341SAndroid Build Coastguard Worker    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
1681*c0909341SAndroid Build Coastguard Worker    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
1682*c0909341SAndroid Build Coastguard Worker    ; and [upper half]:
1683*c0909341SAndroid Build Coastguard Worker    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
1684*c0909341SAndroid Build Coastguard Worker    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
1685*c0909341SAndroid Build Coastguard Worker    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
1686*c0909341SAndroid Build Coastguard Worker
1687*c0909341SAndroid Build Coastguard Worker    pslldq          m4, m11, 2
1688*c0909341SAndroid Build Coastguard Worker    psrldq         m11, 14
1689*c0909341SAndroid Build Coastguard Worker    pslldq          m5, m12, 4
1690*c0909341SAndroid Build Coastguard Worker    psrldq         m12, 12
1691*c0909341SAndroid Build Coastguard Worker    pslldq          m6, m13, 6
1692*c0909341SAndroid Build Coastguard Worker    psrldq         m13, 10
1693*c0909341SAndroid Build Coastguard Worker    paddw           m4, m10
1694*c0909341SAndroid Build Coastguard Worker    paddw          m11, m12
1695*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m12, [div_table+44]
1696*c0909341SAndroid Build Coastguard Worker    paddw           m5, m6
1697*c0909341SAndroid Build Coastguard Worker    paddw          m11, m13                 ; partial_sum_alt[3/2] right
1698*c0909341SAndroid Build Coastguard Worker    vbroadcasti128 m13, [div_table+32]
1699*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5                  ; partial_sum_alt[3/2] left
1700*c0909341SAndroid Build Coastguard Worker    pshuflw         m5, m11, q3012
1701*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m11, m4
1702*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m5
1703*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
1704*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1705*c0909341SAndroid Build Coastguard Worker    pmulld          m6, m12
1706*c0909341SAndroid Build Coastguard Worker    pmulld          m4, m13
1707*c0909341SAndroid Build Coastguard Worker    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]
1708*c0909341SAndroid Build Coastguard Worker
1709*c0909341SAndroid Build Coastguard Worker    ; create aggregates [lower half]:
1710*c0909341SAndroid Build Coastguard Worker    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
1711*c0909341SAndroid Build Coastguard Worker    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
1712*c0909341SAndroid Build Coastguard Worker    ; and [upper half]:
1713*c0909341SAndroid Build Coastguard Worker    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
1714*c0909341SAndroid Build Coastguard Worker    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
1715*c0909341SAndroid Build Coastguard Worker    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
1716*c0909341SAndroid Build Coastguard Worker
1717*c0909341SAndroid Build Coastguard Worker    pslldq          m5, m1, 2
1718*c0909341SAndroid Build Coastguard Worker    psrldq          m1, 14
1719*c0909341SAndroid Build Coastguard Worker    pslldq          m6, m2, 4
1720*c0909341SAndroid Build Coastguard Worker    psrldq          m2, 12
1721*c0909341SAndroid Build Coastguard Worker    pslldq          m7, m3, 6
1722*c0909341SAndroid Build Coastguard Worker    psrldq          m3, 10
1723*c0909341SAndroid Build Coastguard Worker    paddw           m5, m0
1724*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
1725*c0909341SAndroid Build Coastguard Worker    paddw           m6, m7
1726*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3                  ; partial_sum_alt[0/1] right
1727*c0909341SAndroid Build Coastguard Worker    paddw           m5, m6                  ; partial_sum_alt[0/1] left
1728*c0909341SAndroid Build Coastguard Worker    pshuflw         m0, m1, q3012
1729*c0909341SAndroid Build Coastguard Worker    punpckhwd       m1, m5
1730*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m0
1731*c0909341SAndroid Build Coastguard Worker    pmaddwd         m1, m1
1732*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
1733*c0909341SAndroid Build Coastguard Worker    pmulld          m1, m12
1734*c0909341SAndroid Build Coastguard Worker    pmulld          m5, m13
1735*c0909341SAndroid Build Coastguard Worker    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]
1736*c0909341SAndroid Build Coastguard Worker
1737*c0909341SAndroid Build Coastguard Worker    mova           xm0, [pd_47130256+ 16]
1738*c0909341SAndroid Build Coastguard Worker    mova            m1, [pd_47130256]
1739*c0909341SAndroid Build Coastguard Worker    phaddd          m9, m8
1740*c0909341SAndroid Build Coastguard Worker    phaddd          m5, m4
1741*c0909341SAndroid Build Coastguard Worker    phaddd          m9, m5
1742*c0909341SAndroid Build Coastguard Worker    vpermd          m0, m9                  ; cost[0-3]
1743*c0909341SAndroid Build Coastguard Worker    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]
1744*c0909341SAndroid Build Coastguard Worker
1745*c0909341SAndroid Build Coastguard Worker    ; now find the best cost
1746*c0909341SAndroid Build Coastguard Worker    pmaxsd         xm2, xm0, xm1
1747*c0909341SAndroid Build Coastguard Worker    pshufd         xm3, xm2, q1032
1748*c0909341SAndroid Build Coastguard Worker    pmaxsd         xm2, xm3
1749*c0909341SAndroid Build Coastguard Worker    pshufd         xm3, xm2, q2301
1750*c0909341SAndroid Build Coastguard Worker    pmaxsd         xm2, xm3 ; best cost
1751*c0909341SAndroid Build Coastguard Worker
1752*c0909341SAndroid Build Coastguard Worker    ; find the idx using minpos
1753*c0909341SAndroid Build Coastguard Worker    ; make everything other than the best cost negative via subtraction
1754*c0909341SAndroid Build Coastguard Worker    ; find the min of unsigned 16-bit ints to sort out the negative values
1755*c0909341SAndroid Build Coastguard Worker    psubd          xm4, xm1, xm2
1756*c0909341SAndroid Build Coastguard Worker    psubd          xm3, xm0, xm2
1757*c0909341SAndroid Build Coastguard Worker    packssdw       xm3, xm4
1758*c0909341SAndroid Build Coastguard Worker    phminposuw     xm3, xm3
1759*c0909341SAndroid Build Coastguard Worker
1760*c0909341SAndroid Build Coastguard Worker    ; convert idx to 32-bits
1761*c0909341SAndroid Build Coastguard Worker    psrld          xm3, 16
1762*c0909341SAndroid Build Coastguard Worker    movd           eax, xm3
1763*c0909341SAndroid Build Coastguard Worker
1764*c0909341SAndroid Build Coastguard Worker    ; get idx^4 complement
1765*c0909341SAndroid Build Coastguard Worker    vpermd          m3, m1
1766*c0909341SAndroid Build Coastguard Worker    psubd          xm2, xm3
1767*c0909341SAndroid Build Coastguard Worker    psrld          xm2, 10
1768*c0909341SAndroid Build Coastguard Worker    movd        [varq], xm2
1769*c0909341SAndroid Build Coastguard Worker    RET
1770*c0909341SAndroid Build Coastguard Worker
1771*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
1772