; xref: /aosp_15_r20/external/libdav1d/src/x86/cdef16_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA
32*c0909341SAndroid Build Coastguard Worker
; DIR_TABLE stride
;
; Emits 12 (k0, k1) pairs of signed byte offsets, 24 bytes in total. Every
; value has the form rows * stride + cols: stride is the macro argument and
; cols is an even constant (the filter code that consumes the table works on
; 16-bit words, so 2 is one sample). The values are stored with db and read
; back with movsx, so 2 * stride + 4 has to fit in a signed byte.
;
; The filter code points dirq at pair [dir] and then reads pairs [dir+0],
; [dir+2] and [dir+4]. Pairs 8-11 repeat pairs 0-3 so that those reads stay
; inside the table for dir values up to 7.
%macro DIR_TABLE 1 ; stride
    db  1 * %1 + 0,  2 * %1 + 0 ; pair  0
    db  1 * %1 + 0,  2 * %1 - 2 ; pair  1
    db -1 * %1 + 2, -2 * %1 + 4 ; pair  2
    db  0 * %1 + 2, -1 * %1 + 4 ; pair  3
    db  0 * %1 + 2,  0 * %1 + 4 ; pair  4
    db  0 * %1 + 2,  1 * %1 + 4 ; pair  5
    db  1 * %1 + 2,  2 * %1 + 4 ; pair  6
    db  1 * %1 + 0,  2 * %1 + 2 ; pair  7
    db  1 * %1 + 0,  2 * %1 + 0 ; pair  8 (= pair 0)
    db  1 * %1 + 0,  2 * %1 - 2 ; pair  9 (= pair 1)
    db -1 * %1 + 2, -2 * %1 + 4 ; pair 10 (= pair 2)
    db  0 * %1 + 2, -1 * %1 + 4 ; pair 11 (= pair 3)
%endmacro
47*c0909341SAndroid Build Coastguard Worker
; Offset tables for the two block widths. The argument is the row stride, in
; bytes, that the filter routines assume for the buffer addressed through
; tmpq: they fetch 32 bytes at a time, which is two rows for the 4-wide case
; (stride 16) and one row for the 8-wide case (stride 32).
dir_table4: DIR_TABLE 16
dir_table8: DIR_TABLE 32

; Primary tap weights, stored as word pairs so a single vpbroadcastd fills a
; register. CDEF_FILTER reads [pri_taps+priq] for k0 and [pri_taps+priq+8] for
; k1 with priq = 0 or 4, which yields (k0, k1) = (4, 2) or (3, 3).
pri_taps:   dw  4, 4, 3, 3, 2, 2, 3, 3

; 1 << 14 and 1 << 12, each stored twice (one dword per value).
; NOTE(review): not read by the CDEF_FILTER .pri/.sec paths; presumably
; consumed by the cdef_dir code - confirm.
dir_shift:  times 2 dw 0x4000
            times 2 dw 0x1000

; pw_2048 is the pmulhrsw multiplier used for the final rounding of the filter
; sum: (x * 2048 + 0x4000) >> 15 == (x + 8) >> 4.
pw_2048:    times 2 dw 2048
; NOTE(review): consumer of pw_m16384 is not among the CDEF_FILTER .pri/.sec
; paths - confirm where it is used.
pw_m16384:  times 2 dw -16384

; Label inside the 8 bpc AVX2 cdef_dir function, declared external here.
cextern cdef_dir_8bpc_avx2.main

SECTION .text
61*c0909341SAndroid Build Coastguard Worker
62*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER 2 ; w, h
; Entry code of every cdef_filter_WxH_16bpc function (%1 = w, %2 = h).
;
; Loads the pri/sec strengths, the direction and the damping from their
; argument slots, derives the shift counts, and then makes one call per 16
; output samples (%1*%2/16 calls) to the matching routine of the square
; %1x%1 function: .pri_sec when both strengths are non-zero, .pri when only
; pri is, .sec when pri is zero.
;
; base, px, pri_shift and sec_shift are not defined inside this macro; they
; must be provided by the code that invokes it.
;
; State handed to the filter routines:
;   m6       broadcast pri strength (sec strength on the .sec_only path)
;   m8       pw_2048, the final rounding multiplier
;   m9, m10  primary tap weights for k0 and k1 (pri paths only)
;   m13      broadcast sec strength (.pri_sec path only)
;   dirq     pointer to offset pair [dir] of dir_table%1
;   tmpq     address of px
;   [pri_shift] = max(0, damping - 31 + lzcnt(pri))
;   [sec_shift] = damping - 31 + lzcnt(sec), or damping - tzcnt(sec) on the
;                 .sec_only path
    DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
    movifnidn     prid, r5m              ; pri strength (skipped if already in place)
    movifnidn     secd, r6m              ; sec strength (skipped if already in place)
    mov           dird, r7m
    vpbroadcastd    m8, [base+pw_2048]
    lea           dirq, [base+dir_table%1+dirq*2] ; 2 bytes per (k0, k1) pair
    test          prid, prid
    jz .sec_only
%if WIN64
    vpbroadcastw    m6, prim
    ; xmm6-xmm15 are callee-saved on Win64: park xmm9/xmm10 on the stack.
    movaps  [rsp+16*0], xmm9
    movaps  [rsp+16*1], xmm10
%else
    movd           xm6, prid
    vpbroadcastw    m6, xm6
%endif
    lzcnt      pridmpd, prid             ; taken before pri is modified below
    rorx          tmpd, prid, 2          ; leaves the flags untouched
    cmp     dword r10m, 0xfff ; if (bpc == 12)
    cmove         prid, tmpd  ;     pri >>= 2
    mov           tmpd, r8m   ; damping
    and           prid, 4                ; byte offset 0 or 4 into pri_taps
    sub           tmpd, 31               ; tmpd = damping - 31
    vpbroadcastd    m9, [base+pri_taps+priq+8*0]
    vpbroadcastd   m10, [base+pri_taps+priq+8*1]
    test          secd, secd
    jz .pri_only
%if WIN64
    ; The argument slots have all been read by the time each one is
    ; overwritten (damping above, sec just before r6m is reused), so they
    ; serve as spill space for xmm11-xmm13.
    movaps         r8m, xmm13
    vpbroadcastw   m13, secm
    movaps         r4m, xmm11
    movaps         r6m, xmm12
%else
    movd           xm0, secd
    vpbroadcastw   m13, xm0
%endif
    lzcnt         secd, secd
    xor           prid, prid             ; zero source for the clamp below
    add        pridmpd, tmpd             ; lzcnt(pri) + damping - 31
    cmovs      pridmpd, prid             ; clamp a negative shift to 0
    add           secd, tmpd             ; lzcnt(sec) + damping - 31 (not clamped)
    lea           tmpq, [px]
    mov    [pri_shift], pridmpq
    mov    [sec_shift], secq
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps       xmm11, r4m
    movaps       xmm12, r6m
    movaps       xmm13, r8m
%endif
    jmp .pri_end
.pri_only:
    add        pridmpd, tmpd             ; lzcnt(pri) + damping - 31
    cmovs      pridmpd, secd             ; secd is known to be 0 on this path
    lea           tmpq, [px]
    mov    [pri_shift], pridmpq
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.pri_end:
%if WIN64
    movaps        xmm9, [rsp+16*0]
    movaps       xmm10, [rsp+16*1]
%endif
.end:
    RET
.sec_only:
    mov           tmpd, r8m ; damping
%if WIN64
    vpbroadcastw    m6, secm
%else
    movd           xm6, secd
    vpbroadcastw    m6, xm6
%endif
    ; NOTE(review): tzcnt is used here where the pri+sec path uses lzcnt;
    ; the two agree only if sec is a power of two - confirm with the caller.
    tzcnt         secd, secd
    sub           tmpd, secd             ; damping - tzcnt(sec)
    mov    [sec_shift], tmpq
    lea           tmpq, [px]
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
147*c0909341SAndroid Build Coastguard Worker%if %1 == %2
; .pri: primary-only filter for 16 samples (4 rows of 4 when %1 == 4,
; otherwise 2 rows of 8).
;
; In:    tmpq          source rows (centre samples at tmpq+32*0 / tmpq+32*1)
;        dirq          pointer to offset pair [dir]; pair [dir+2] is used here
;        dstq, strideq output pointer and stride
;        m6 = pri strength, m8 = pw_2048, m9/m10 = tap weights for k0/k1
;        [pri_shift]   shift count applied to |diff|
; Out:   tmpq += 64, dstq advanced past the rows written
; Clobb: offq, m0-m5, m7, flags
;
; Each tap contributes
;   constrain(diff) = sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift)))
; and the result is px + ((8 + sum - (sum < 0)) >> 4).
;
; NOTE(review): the shift is read from [pri_shift+gprsize]; presumably
; pri_shift is rsp-relative and gprsize compensates for the return address
; pushed by the call - pri_shift is defined outside this macro, confirm.
; NOTE(review): r9 is expected to hold strideq*3 for the 4-wide store; it is
; not set anywhere in this macro's entry code - confirm.
ALIGN function_align
.pri:
    movsx         offq, byte [dirq+4]    ; off_k0
%if %1 == 4
    ; 32 bytes hold two rows; punpcklqdq keeps 4 samples of each row.
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]      ; 0 2 1 3
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+5]    ; off_k1
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m0, m6, m5           ; saturates at 0
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m0, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    psignw          m2, m3               ; constrain(diff_k0p1)
    pabsw           m3, m4               ; adiff_k1p0
    paddw           m0, m2               ; constrain(diff_k0)
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m0, m9               ; pri_tap_k0
    pmullw          m7, m10              ; pri_tap_k1
    paddw           m0, m7               ; sum
    psraw           m2, m0, 15           ; -1 where sum < 0, else 0
    paddw           m0, m2               ; sum - (sum < 0)
    pmulhrsw        m0, m8               ; (x + 8) >> 4
    add           tmpq, 32*2
    paddw           m0, m1               ; add the centre samples back
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
; .sec: secondary-only filter for 16 samples (4 rows of 4 when %1 == 4,
; otherwise 2 rows of 8).
;
; In:    tmpq          source rows (centre samples at tmpq+32*0 / tmpq+32*1)
;        dirq          pointer to offset pair [dir]; pairs [dir+4] and
;                      [dir+0] are used here, each offset with both signs
;        dstq, strideq output pointer and stride
;        m6 = sec strength, m8 = pw_2048
;        [sec_shift]   shift count applied to |diff|
; Out:   tmpq += 64, dstq advanced past the rows written
; Clobb: offq, m0-m5, m7, flags
;
; Each of the eight taps contributes
;   constrain(diff) = sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift)))
; The four k0 taps are weighted by 2 (the partial sum is doubled), the four
; k1 taps by 1, and the result is px + ((8 + sum - (sum < 0)) >> 4).
;
; NOTE(review): the shift is read from [sec_shift+gprsize]; presumably
; sec_shift is rsp-relative and gprsize compensates for the return address
; pushed by the call - sec_shift is defined outside this macro, confirm.
; NOTE(review): r9 is expected to hold strideq*3 for the 4-wide store; it is
; not set anywhere in this macro's entry code - confirm.
ALIGN function_align
.sec:
    movsx         offq, byte [dirq+8]    ; off1_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+0]    ; off2_k0
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5           ; saturates at 0
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+9]    ; off1_k1
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    psignw          m2, m3               ; constrain(diff_k0s1)
    pabsw           m3, m4               ; adiff_k0s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+1]    ; off2_k1
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k0s3)
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    paddw           m0, m7
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    psignw          m2, m3               ; constrain(diff_k1s1)
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k1s3)
    paddw           m0, m4               ; sum
    psraw           m2, m0, 15           ; -1 where sum < 0, else 0
    paddw           m0, m2               ; sum - (sum < 0)
    pmulhrsw        m0, m8               ; (x + 8) >> 4
    add           tmpq, 32*2
    paddw           m0, m1               ; add the centre samples back
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
365*c0909341SAndroid Build Coastguard WorkerALIGN function_align
366*c0909341SAndroid Build Coastguard Worker.pri_sec:
367*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+8]    ; off2_k0
368*c0909341SAndroid Build Coastguard Worker%if %1 == 4
369*c0909341SAndroid Build Coastguard Worker    mova            m1, [tmpq+32*0]
370*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m1, [tmpq+32*1]
371*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq+32*0]
372*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
373*c0909341SAndroid Build Coastguard Worker    neg           offq
374*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq+32*0]
375*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
376*c0909341SAndroid Build Coastguard Worker%else
377*c0909341SAndroid Build Coastguard Worker    mova           xm1, [dstq+strideq*0]
378*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, [dstq+strideq*1], 1
379*c0909341SAndroid Build Coastguard Worker    movu           xm2, [tmpq+offq+32*0]
380*c0909341SAndroid Build Coastguard Worker    vinserti128     m2, [tmpq+offq+32*1], 1
381*c0909341SAndroid Build Coastguard Worker    neg           offq
382*c0909341SAndroid Build Coastguard Worker    movu           xm3, [tmpq+offq+32*0]
383*c0909341SAndroid Build Coastguard Worker    vinserti128     m3, [tmpq+offq+32*1], 1
384*c0909341SAndroid Build Coastguard Worker%endif
385*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+0]    ; off3_k0
386*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m2, m3
387*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m2, m3
388*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k0s0
389*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k0s1
390*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k0s0
391*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [sec_shift+gprsize]
392*c0909341SAndroid Build Coastguard Worker    psubusw         m0, m13, m5
393*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k0s1
394*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m4
395*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [sec_shift+gprsize]
396*c0909341SAndroid Build Coastguard Worker    psignw          m0, m2               ; constrain(diff_k0s0)
397*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m13, m4
398*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
399*c0909341SAndroid Build Coastguard Worker%if %1 == 4
400*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq+32*0]
401*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
402*c0909341SAndroid Build Coastguard Worker    neg           offq
403*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq+32*0]
404*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
405*c0909341SAndroid Build Coastguard Worker%else
406*c0909341SAndroid Build Coastguard Worker    movu           xm4, [tmpq+offq+32*0]
407*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [tmpq+offq+32*1], 1
408*c0909341SAndroid Build Coastguard Worker    neg           offq
409*c0909341SAndroid Build Coastguard Worker    movu           xm5, [tmpq+offq+32*0]
410*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [tmpq+offq+32*1], 1
411*c0909341SAndroid Build Coastguard Worker%endif
412*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+9]    ; off2_k1
413*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k0s1)
414*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m4
415*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m4
416*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m5
417*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m5
418*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k0s2
419*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k0s3
420*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
421*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k0s2
422*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [sec_shift+gprsize]
423*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m13, m2
424*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k0s3
425*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
426*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [sec_shift+gprsize]
427*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k0s2)
428*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m13, m3
429*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
430*c0909341SAndroid Build Coastguard Worker%if %1 == 4
431*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq+32*0]
432*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
433*c0909341SAndroid Build Coastguard Worker    neg           offq
434*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq+32*0]
435*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
436*c0909341SAndroid Build Coastguard Worker%else
437*c0909341SAndroid Build Coastguard Worker    movu           xm2, [tmpq+offq+32*0]
438*c0909341SAndroid Build Coastguard Worker    vinserti128     m2, [tmpq+offq+32*1], 1
439*c0909341SAndroid Build Coastguard Worker    neg           offq
440*c0909341SAndroid Build Coastguard Worker    movu           xm3, [tmpq+offq+32*0]
441*c0909341SAndroid Build Coastguard Worker    vinserti128     m3, [tmpq+offq+32*1], 1
442*c0909341SAndroid Build Coastguard Worker%endif
443*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+1]    ; off3_k1
444*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
445*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k0s3)
446*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m2
447*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m2
448*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m3
449*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m3
450*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; constrain(diff_k0)
451*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k1s0
452*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k1s1
453*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0               ; sec_tap_k0
454*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k1s0
455*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [sec_shift+gprsize]
456*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m13, m5
457*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k1s1
458*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m4
459*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [sec_shift+gprsize]
460*c0909341SAndroid Build Coastguard Worker    psignw          m7, m2               ; constrain(diff_k1s0)
461*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m13, m4
462*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
463*c0909341SAndroid Build Coastguard Worker%if %1 == 4
464*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq+32*0]
465*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
466*c0909341SAndroid Build Coastguard Worker    neg           offq
467*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq+32*0]
468*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
469*c0909341SAndroid Build Coastguard Worker%else
470*c0909341SAndroid Build Coastguard Worker    movu           xm4, [tmpq+offq+32*0]
471*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [tmpq+offq+32*1], 1
472*c0909341SAndroid Build Coastguard Worker    neg           offq
473*c0909341SAndroid Build Coastguard Worker    movu           xm5, [tmpq+offq+32*0]
474*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [tmpq+offq+32*1], 1
475*c0909341SAndroid Build Coastguard Worker%endif
476*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+4]    ; off1_k0
477*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
478*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k1s1)
479*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m4
480*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m4
481*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m5
482*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m5
483*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k1s2
484*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k1s3
485*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k1s2
486*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
487*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [sec_shift+gprsize]
488*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m13, m2
489*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k1s3
490*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
491*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [sec_shift+gprsize]
492*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k1s2)
493*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m13, m3
494*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
495*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
496*c0909341SAndroid Build Coastguard Worker%if %1 == 4
497*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq+32*0]
498*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
499*c0909341SAndroid Build Coastguard Worker    neg           offq
500*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq+32*0]
501*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
502*c0909341SAndroid Build Coastguard Worker%else
503*c0909341SAndroid Build Coastguard Worker    movu           xm2, [tmpq+offq+32*0]
504*c0909341SAndroid Build Coastguard Worker    vinserti128     m2, [tmpq+offq+32*1], 1
505*c0909341SAndroid Build Coastguard Worker    neg           offq
506*c0909341SAndroid Build Coastguard Worker    movu           xm3, [tmpq+offq+32*0]
507*c0909341SAndroid Build Coastguard Worker    vinserti128     m3, [tmpq+offq+32*1], 1
508*c0909341SAndroid Build Coastguard Worker%endif
509*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+5]    ; off1_k1
510*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k1s3)
511*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m2
512*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m2
513*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m3
514*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m3
515*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k0p0
516*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k0p1
517*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
518*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k0p0
519*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [pri_shift+gprsize]
520*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m5
521*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k0p1
522*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m4
523*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [pri_shift+gprsize]
524*c0909341SAndroid Build Coastguard Worker    psignw          m7, m2               ; constrain(diff_k0p0)
525*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m6, m4
526*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
527*c0909341SAndroid Build Coastguard Worker%if %1 == 4
528*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq+32*0]
529*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
530*c0909341SAndroid Build Coastguard Worker    neg           offq
531*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq+32*0]
532*c0909341SAndroid Build Coastguard Worker    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
533*c0909341SAndroid Build Coastguard Worker%else
534*c0909341SAndroid Build Coastguard Worker    movu           xm4, [tmpq+offq+32*0]
535*c0909341SAndroid Build Coastguard Worker    vinserti128     m4, [tmpq+offq+32*1], 1
536*c0909341SAndroid Build Coastguard Worker    neg           offq
537*c0909341SAndroid Build Coastguard Worker    movu           xm5, [tmpq+offq+32*0]
538*c0909341SAndroid Build Coastguard Worker    vinserti128     m5, [tmpq+offq+32*1], 1
539*c0909341SAndroid Build Coastguard Worker%endif
540*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k0p1)
541*c0909341SAndroid Build Coastguard Worker    paddw           m7, m2               ; constrain(diff_k0)
542*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m4
543*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m4
544*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m5
545*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m5
546*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k1p0
547*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k1p1
548*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k1p0
549*c0909341SAndroid Build Coastguard Worker    pmullw          m7, m9               ; pri_tap_k0
550*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
551*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [pri_shift+gprsize]
552*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m2
553*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k1p1
554*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
555*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [pri_shift+gprsize]
556*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k1p0)
557*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m6, m3
558*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
559*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k1p1)
560*c0909341SAndroid Build Coastguard Worker    paddw           m7, m4               ; constrain(diff_k1)
561*c0909341SAndroid Build Coastguard Worker    pmullw          m7, m10              ; pri_tap_k1
562*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7               ; sum
563*c0909341SAndroid Build Coastguard Worker    psraw           m2, m0, 15
564*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
565*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m8
566*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32*2
567*c0909341SAndroid Build Coastguard Worker    pmaxsw         m11, m1
568*c0909341SAndroid Build Coastguard Worker    pminuw         m12, m1
569*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
570*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m11
571*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m12
572*c0909341SAndroid Build Coastguard Worker%if %1 == 4
573*c0909341SAndroid Build Coastguard Worker    vextracti128   xm1, m0, 1
574*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
575*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
576*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
577*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r9       ], xm1
578*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*4]
579*c0909341SAndroid Build Coastguard Worker%else
580*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
581*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
582*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*2]
583*c0909341SAndroid Build Coastguard Worker%endif
584*c0909341SAndroid Build Coastguard Worker    ret
585*c0909341SAndroid Build Coastguard Worker%endif
586*c0909341SAndroid Build Coastguard Worker%endmacro
587*c0909341SAndroid Build Coastguard Worker
588*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
; ---------------------------------------------------------------------------
; cdef_filter_4x4_16bpc(dst, stride, left, top, bot, pri, sec, edge)
;
; Entry point of the 4x4 CDEF filter. The code written out here only builds
; a padded copy of the block on the stack at `px` (one row every 16 bytes)
; and then expands CDEF_FILTER, which does the actual filtering:
;   rows  0..3                  <- dst
;   rows -2..-1                 <- top   when edge bit 4 (HAVE_TOP) is set
;   rows  4..5                  <- bot   when edge bit 8 (HAVE_BOTTOM) is set
;   4 bytes left of every row   <- left/top/bot when edge bit 1 (HAVE_LEFT)
;   4 bytes at +8 in every row  <- fill, only when edge bit 2 (HAVE_RIGHT)
;                                  is clear
; Every area whose source is unavailable is overwritten with the constant
; pw_m16384 broadcast into m7 (constant defined outside this chunk;
; presumably words of -16384, going by its name).
;
; NOTE(review): the numeric cglobal operands (5, 10, 9, 16*10) follow the
; x86inc convention of #args loaded, #GPRs, #vector regs, stack bytes --
; confirm against x86inc.asm.
; ---------------------------------------------------------------------------
589*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
590*c0909341SAndroid Build Coastguard Worker                                                pri, sec, edge
; Stack layout and scratch register differ per ABI. pri_shift, sec_shift and
; offq are consumed by the CDEF_FILTER body; px is the origin (row 0, col 0)
; of the padded pixel buffer, placed so that rows -2/-1 and the 4 bytes to
; the left of each row are still inside the reserved stack area.
591*c0909341SAndroid Build Coastguard Worker%if WIN64
592*c0909341SAndroid Build Coastguard Worker    %define         px  rsp+16*6
593*c0909341SAndroid Build Coastguard Worker    %define       offq  r8
594*c0909341SAndroid Build Coastguard Worker    %define  pri_shift  rsp+16*2
595*c0909341SAndroid Build Coastguard Worker    %define  sec_shift  rsp+16*3
596*c0909341SAndroid Build Coastguard Worker%else
597*c0909341SAndroid Build Coastguard Worker    %define         px  rsp+16*4
598*c0909341SAndroid Build Coastguard Worker    %define       offq  r4
599*c0909341SAndroid Build Coastguard Worker    %define  pri_shift  rsp+16*0
600*c0909341SAndroid Build Coastguard Worker    %define  sec_shift  rsp+16*1
601*c0909341SAndroid Build Coastguard Worker%endif
; Constants are addressed relative to r8, which is pointed at dir_table4
; just below.
602*c0909341SAndroid Build Coastguard Worker    %define       base  r8-dir_table4
; The edge flags are fetched from argument slot r9m into the `edge` register.
603*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m
604*c0909341SAndroid Build Coastguard Worker    lea             r8, [dir_table4]
; Load the four block rows (16 bytes each) and copy them to px rows 0..3.
; r9 = 3*stride, kept for addressing the fourth row.
605*c0909341SAndroid Build Coastguard Worker    movu           xm0, [dstq+strideq*0]
606*c0909341SAndroid Build Coastguard Worker    movu           xm1, [dstq+strideq*1]
607*c0909341SAndroid Build Coastguard Worker    lea             r9, [strideq*3]
608*c0909341SAndroid Build Coastguard Worker    movu           xm2, [dstq+strideq*2]
609*c0909341SAndroid Build Coastguard Worker    movu           xm3, [dstq+r9       ]
; m7 = fill value used for every unavailable edge area below.
610*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m7, [base+pw_m16384]
611*c0909341SAndroid Build Coastguard Worker    mova   [px+16*0+0], xm0
612*c0909341SAndroid Build Coastguard Worker    mova   [px+16*1+0], xm1
613*c0909341SAndroid Build Coastguard Worker    mova   [px+16*2+0], xm2
614*c0909341SAndroid Build Coastguard Worker    mova   [px+16*3+0], xm3
; --- top: rows -2 and -1 ---
615*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; HAVE_TOP
616*c0909341SAndroid Build Coastguard Worker    jz .no_top
617*c0909341SAndroid Build Coastguard Worker    movu           xm0, [topq+strideq*0]
618*c0909341SAndroid Build Coastguard Worker    movu           xm1, [topq+strideq*1]
619*c0909341SAndroid Build Coastguard Worker    mova   [px-16*2+0], xm0
620*c0909341SAndroid Build Coastguard Worker    mova   [px-16*1+0], xm1
621*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
622*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
; Top-left corner: the 4 bytes preceding each top row.
623*c0909341SAndroid Build Coastguard Worker    movd           xm0, [topq+strideq*0-4]
624*c0909341SAndroid Build Coastguard Worker    movd           xm1, [topq+strideq*1-4]
625*c0909341SAndroid Build Coastguard Worker    movd   [px-16*2-4], xm0
626*c0909341SAndroid Build Coastguard Worker    movd   [px-16*1-4], xm1
627*c0909341SAndroid Build Coastguard Worker    jmp .top_done
628*c0909341SAndroid Build Coastguard Worker.no_top:
; One full-width (32-byte) store of m7 fills both 16-byte rows -2 and -1,
; then execution falls through to fill the left columns as well.
629*c0909341SAndroid Build Coastguard Worker    mova   [px-16*2+0], m7
630*c0909341SAndroid Build Coastguard Worker.top_no_left:
631*c0909341SAndroid Build Coastguard Worker    movd   [px-16*2-4], xm7
632*c0909341SAndroid Build Coastguard Worker    movd   [px-16*1-4], xm7
633*c0909341SAndroid Build Coastguard Worker.top_done:
; --- bottom: rows 4 and 5, same scheme as the top ---
634*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; HAVE_BOTTOM
635*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
636*c0909341SAndroid Build Coastguard Worker    movu           xm0, [botq+strideq*0]
637*c0909341SAndroid Build Coastguard Worker    movu           xm1, [botq+strideq*1]
638*c0909341SAndroid Build Coastguard Worker    mova   [px+16*4+0], xm0
639*c0909341SAndroid Build Coastguard Worker    mova   [px+16*5+0], xm1
640*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
641*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
; Bottom-left corner: the 4 bytes preceding each bottom row.
642*c0909341SAndroid Build Coastguard Worker    movd           xm0, [botq+strideq*0-4]
643*c0909341SAndroid Build Coastguard Worker    movd           xm1, [botq+strideq*1-4]
644*c0909341SAndroid Build Coastguard Worker    movd   [px+16*4-4], xm0
645*c0909341SAndroid Build Coastguard Worker    movd   [px+16*5-4], xm1
646*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
647*c0909341SAndroid Build Coastguard Worker.no_bottom:
; 32-byte store fills rows 4 and 5 at once; falls through to the left fill.
648*c0909341SAndroid Build Coastguard Worker    mova   [px+16*4+0], m7
649*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
650*c0909341SAndroid Build Coastguard Worker    movd   [px+16*4-4], xm7
651*c0909341SAndroid Build Coastguard Worker    movd   [px+16*5-4], xm7
652*c0909341SAndroid Build Coastguard Worker.bottom_done:
; --- left: 4 bytes in front of rows 0..3, read from the `left` array
; (one 4-byte entry per row) ---
653*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
654*c0909341SAndroid Build Coastguard Worker    jz .no_left
655*c0909341SAndroid Build Coastguard Worker    movd           xm0, [leftq+4*0]
656*c0909341SAndroid Build Coastguard Worker    movd           xm1, [leftq+4*1]
657*c0909341SAndroid Build Coastguard Worker    movd           xm2, [leftq+4*2]
658*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4*3]
659*c0909341SAndroid Build Coastguard Worker    movd   [px+16*0-4], xm0
660*c0909341SAndroid Build Coastguard Worker    movd   [px+16*1-4], xm1
661*c0909341SAndroid Build Coastguard Worker    movd   [px+16*2-4], xm2
662*c0909341SAndroid Build Coastguard Worker    movd   [px+16*3-4], xm3
663*c0909341SAndroid Build Coastguard Worker    jmp .left_done
664*c0909341SAndroid Build Coastguard Worker.no_left:
; No left neighbours: store the fill value in front of rows 0..3.
665*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
666*c0909341SAndroid Build Coastguard Worker.left_done:
; --- right: the 16-byte row copies above already brought in whatever lies
; to the right of the block; if that data is not valid, overwrite the
; 4 bytes at offset +8 of every buffered row (-2..5) with the fill value ---
667*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; HAVE_RIGHT
668*c0909341SAndroid Build Coastguard Worker    jnz .padding_done
669*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
670*c0909341SAndroid Build Coastguard Worker.padding_done:
; Expand the shared filter body (macro defined earlier in the file) with
; the arguments 4, 4 for this block size.
671*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER      4, 4
672*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; cdef_filter_4x8_16bpc(dst, stride, left, top, bot, pri, sec, edge)
;
; Entry point of the 4x8 CDEF filter. Same structure as the 4x4 entry point:
; build a padded copy of the block on the stack at `px` (one row every
; 16 bytes), then expand CDEF_FILTER:
;   rows  0..7                  <- dst
;   rows -2..-1                 <- top   when edge bit 4 (HAVE_TOP) is set
;   rows  8..9                  <- bot   when edge bit 8 (HAVE_BOTTOM) is set
;   4 bytes left of every row   <- left/top/bot when edge bit 1 (HAVE_LEFT)
;   4 bytes at +8 in every row  <- fill, only when edge bit 2 (HAVE_RIGHT)
;                                  is clear
; px, offq, pri_shift, sec_shift and base are not redefined here; the
; %defines made for cdef_filter_4x4_16bpc are still in effect. The stack
; reservation grows to 16*14 bytes for the four additional rows.
; ---------------------------------------------------------------------------
673*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
674*c0909341SAndroid Build Coastguard Worker                                                pri, sec, edge
; The edge flags are fetched from argument slot r9m into the `edge` register.
675*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m
; Load all eight block rows (16 bytes each). r9 = 3*stride, r6 = address of
; row 4.
676*c0909341SAndroid Build Coastguard Worker    movu           xm0, [dstq+strideq*0]
677*c0909341SAndroid Build Coastguard Worker    movu           xm1, [dstq+strideq*1]
678*c0909341SAndroid Build Coastguard Worker    lea             r9, [strideq*3]
679*c0909341SAndroid Build Coastguard Worker    movu           xm2, [dstq+strideq*2]
680*c0909341SAndroid Build Coastguard Worker    movu           xm3, [dstq+r9       ]
681*c0909341SAndroid Build Coastguard Worker    lea             r6, [dstq+strideq*4]
682*c0909341SAndroid Build Coastguard Worker    movu           xm4, [r6  +strideq*0]
683*c0909341SAndroid Build Coastguard Worker    movu           xm5, [r6  +strideq*1]
684*c0909341SAndroid Build Coastguard Worker    movu           xm6, [r6  +strideq*2]
685*c0909341SAndroid Build Coastguard Worker    movu           xm7, [r6  +r9       ]
; r8 must point at dir_table4 before any [base+...] operand is used.
686*c0909341SAndroid Build Coastguard Worker    lea             r8, [dir_table4]
687*c0909341SAndroid Build Coastguard Worker    mova   [px+16*0+0], xm0
688*c0909341SAndroid Build Coastguard Worker    mova   [px+16*1+0], xm1
689*c0909341SAndroid Build Coastguard Worker    mova   [px+16*2+0], xm2
690*c0909341SAndroid Build Coastguard Worker    mova   [px+16*3+0], xm3
691*c0909341SAndroid Build Coastguard Worker    mova   [px+16*4+0], xm4
692*c0909341SAndroid Build Coastguard Worker    mova   [px+16*5+0], xm5
693*c0909341SAndroid Build Coastguard Worker    mova   [px+16*6+0], xm6
694*c0909341SAndroid Build Coastguard Worker    mova   [px+16*7+0], xm7
; Row 7 has been stored, so m7 can now be reused as the fill value for
; every unavailable edge area below.
695*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m7, [base+pw_m16384]
; --- top: rows -2 and -1 ---
696*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; HAVE_TOP
697*c0909341SAndroid Build Coastguard Worker    jz .no_top
698*c0909341SAndroid Build Coastguard Worker    movu           xm0, [topq+strideq*0]
699*c0909341SAndroid Build Coastguard Worker    movu           xm1, [topq+strideq*1]
700*c0909341SAndroid Build Coastguard Worker    mova   [px-16*2+0], xm0
701*c0909341SAndroid Build Coastguard Worker    mova   [px-16*1+0], xm1
702*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
703*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
; Top-left corner: the 4 bytes preceding each top row.
704*c0909341SAndroid Build Coastguard Worker    movd           xm0, [topq+strideq*0-4]
705*c0909341SAndroid Build Coastguard Worker    movd           xm1, [topq+strideq*1-4]
706*c0909341SAndroid Build Coastguard Worker    movd   [px-16*2-4], xm0
707*c0909341SAndroid Build Coastguard Worker    movd   [px-16*1-4], xm1
708*c0909341SAndroid Build Coastguard Worker    jmp .top_done
709*c0909341SAndroid Build Coastguard Worker.no_top:
; One full-width (32-byte) store of m7 fills both 16-byte rows -2 and -1,
; then execution falls through to fill the left columns as well.
710*c0909341SAndroid Build Coastguard Worker    mova   [px-16*2+0], m7
711*c0909341SAndroid Build Coastguard Worker.top_no_left:
712*c0909341SAndroid Build Coastguard Worker    movd   [px-16*2-4], xm7
713*c0909341SAndroid Build Coastguard Worker    movd   [px-16*1-4], xm7
714*c0909341SAndroid Build Coastguard Worker.top_done:
; --- bottom: rows 8 and 9, same scheme as the top ---
715*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; HAVE_BOTTOM
716*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
717*c0909341SAndroid Build Coastguard Worker    movu           xm0, [botq+strideq*0]
718*c0909341SAndroid Build Coastguard Worker    movu           xm1, [botq+strideq*1]
719*c0909341SAndroid Build Coastguard Worker    mova   [px+16*8+0], xm0
720*c0909341SAndroid Build Coastguard Worker    mova   [px+16*9+0], xm1
721*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
722*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
; Bottom-left corner: the 4 bytes preceding each bottom row.
723*c0909341SAndroid Build Coastguard Worker    movd           xm0, [botq+strideq*0-4]
724*c0909341SAndroid Build Coastguard Worker    movd           xm1, [botq+strideq*1-4]
725*c0909341SAndroid Build Coastguard Worker    movd   [px+16*8-4], xm0
726*c0909341SAndroid Build Coastguard Worker    movd   [px+16*9-4], xm1
727*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
728*c0909341SAndroid Build Coastguard Worker.no_bottom:
; 32-byte store fills rows 8 and 9 at once; falls through to the left fill.
729*c0909341SAndroid Build Coastguard Worker    mova   [px+16*8+0], m7
730*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
731*c0909341SAndroid Build Coastguard Worker    movd   [px+16*8-4], xm7
732*c0909341SAndroid Build Coastguard Worker    movd   [px+16*9-4], xm7
733*c0909341SAndroid Build Coastguard Worker.bottom_done:
; --- left: 4 bytes in front of rows 0..7, read from the `left` array
; (one 4-byte entry per row), handled in two batches of four rows ---
734*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
735*c0909341SAndroid Build Coastguard Worker    jz .no_left
736*c0909341SAndroid Build Coastguard Worker    movd           xm0, [leftq+4*0]
737*c0909341SAndroid Build Coastguard Worker    movd           xm1, [leftq+4*1]
738*c0909341SAndroid Build Coastguard Worker    movd           xm2, [leftq+4*2]
739*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4*3]
740*c0909341SAndroid Build Coastguard Worker    movd   [px+16*0-4], xm0
741*c0909341SAndroid Build Coastguard Worker    movd   [px+16*1-4], xm1
742*c0909341SAndroid Build Coastguard Worker    movd   [px+16*2-4], xm2
743*c0909341SAndroid Build Coastguard Worker    movd   [px+16*3-4], xm3
744*c0909341SAndroid Build Coastguard Worker    movd           xm0, [leftq+4*4]
745*c0909341SAndroid Build Coastguard Worker    movd           xm1, [leftq+4*5]
746*c0909341SAndroid Build Coastguard Worker    movd           xm2, [leftq+4*6]
747*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4*7]
748*c0909341SAndroid Build Coastguard Worker    movd   [px+16*4-4], xm0
749*c0909341SAndroid Build Coastguard Worker    movd   [px+16*5-4], xm1
750*c0909341SAndroid Build Coastguard Worker    movd   [px+16*6-4], xm2
751*c0909341SAndroid Build Coastguard Worker    movd   [px+16*7-4], xm3
752*c0909341SAndroid Build Coastguard Worker    jmp .left_done
753*c0909341SAndroid Build Coastguard Worker.no_left:
; No left neighbours: store the fill value in front of rows 0..7.
754*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
755*c0909341SAndroid Build Coastguard Worker.left_done:
; --- right: the 16-byte row copies above already brought in whatever lies
; to the right of the block; if that data is not valid, overwrite the
; 4 bytes at offset +8 of every buffered row (-2..9) with the fill value ---
756*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; HAVE_RIGHT
757*c0909341SAndroid Build Coastguard Worker    jnz .padding_done
758*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
759*c0909341SAndroid Build Coastguard Worker.padding_done:
; Expand the shared filter body (macro defined earlier in the file) with
; the arguments 4, 8 for this block size.
760*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER      4, 8
761*c0909341SAndroid Build Coastguard Worker
762*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
763*c0909341SAndroid Build Coastguard Worker                                               pri, sec, edge
764*c0909341SAndroid Build Coastguard Worker%if WIN64
765*c0909341SAndroid Build Coastguard Worker    %define         px  rsp+32*4
766*c0909341SAndroid Build Coastguard Worker%else
767*c0909341SAndroid Build Coastguard Worker    %define         px  rsp+32*3
768*c0909341SAndroid Build Coastguard Worker%endif
769*c0909341SAndroid Build Coastguard Worker    %define       base  r8-dir_table8
770*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m
771*c0909341SAndroid Build Coastguard Worker    movu            m0, [dstq+strideq*0]
772*c0909341SAndroid Build Coastguard Worker    movu            m1, [dstq+strideq*1]
773*c0909341SAndroid Build Coastguard Worker    lea             r6, [dstq+strideq*2]
774*c0909341SAndroid Build Coastguard Worker    movu            m2, [r6  +strideq*0]
775*c0909341SAndroid Build Coastguard Worker    movu            m3, [r6  +strideq*1]
776*c0909341SAndroid Build Coastguard Worker    lea             r6, [r6  +strideq*2]
777*c0909341SAndroid Build Coastguard Worker    movu            m4, [r6  +strideq*0]
778*c0909341SAndroid Build Coastguard Worker    movu            m5, [r6  +strideq*1]
779*c0909341SAndroid Build Coastguard Worker    lea             r6, [r6  +strideq*2]
780*c0909341SAndroid Build Coastguard Worker    movu            m6, [r6  +strideq*0]
781*c0909341SAndroid Build Coastguard Worker    movu            m7, [r6  +strideq*1]
782*c0909341SAndroid Build Coastguard Worker    lea             r8, [dir_table8]
783*c0909341SAndroid Build Coastguard Worker    mova   [px+32*0+0], m0
784*c0909341SAndroid Build Coastguard Worker    mova   [px+32*1+0], m1
785*c0909341SAndroid Build Coastguard Worker    mova   [px+32*2+0], m2
786*c0909341SAndroid Build Coastguard Worker    mova   [px+32*3+0], m3
787*c0909341SAndroid Build Coastguard Worker    mova   [px+32*4+0], m4
788*c0909341SAndroid Build Coastguard Worker    mova   [px+32*5+0], m5
789*c0909341SAndroid Build Coastguard Worker    mova   [px+32*6+0], m6
790*c0909341SAndroid Build Coastguard Worker    mova   [px+32*7+0], m7
791*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m7, [base+pw_m16384]
792*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; HAVE_TOP
793*c0909341SAndroid Build Coastguard Worker    jz .no_top
794*c0909341SAndroid Build Coastguard Worker    movu            m0, [topq+strideq*0]
795*c0909341SAndroid Build Coastguard Worker    movu            m1, [topq+strideq*1]
796*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m0
797*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m1
798*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
799*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
800*c0909341SAndroid Build Coastguard Worker    movd           xm0, [topq+strideq*0-4]
801*c0909341SAndroid Build Coastguard Worker    movd           xm1, [topq+strideq*1-4]
802*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], xm0
803*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], xm1
804*c0909341SAndroid Build Coastguard Worker    jmp .top_done
805*c0909341SAndroid Build Coastguard Worker.no_top:
806*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m7
807*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m7
808*c0909341SAndroid Build Coastguard Worker.top_no_left:
809*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], xm7
810*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], xm7
811*c0909341SAndroid Build Coastguard Worker.top_done:
812*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; HAVE_BOTTOM
813*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
814*c0909341SAndroid Build Coastguard Worker    movu            m0, [botq+strideq*0]
815*c0909341SAndroid Build Coastguard Worker    movu            m1, [botq+strideq*1]
816*c0909341SAndroid Build Coastguard Worker    mova   [px+32*8+0], m0
817*c0909341SAndroid Build Coastguard Worker    mova   [px+32*9+0], m1
818*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
819*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
820*c0909341SAndroid Build Coastguard Worker    movd           xm0, [botq+strideq*0-4]
821*c0909341SAndroid Build Coastguard Worker    movd           xm1, [botq+strideq*1-4]
822*c0909341SAndroid Build Coastguard Worker    movd   [px+32*8-4], xm0
823*c0909341SAndroid Build Coastguard Worker    movd   [px+32*9-4], xm1
824*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
825*c0909341SAndroid Build Coastguard Worker.no_bottom:
826*c0909341SAndroid Build Coastguard Worker    mova   [px+32*8+0], m7
827*c0909341SAndroid Build Coastguard Worker    mova   [px+32*9+0], m7
828*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
829*c0909341SAndroid Build Coastguard Worker    movd   [px+32*8-4], xm7
830*c0909341SAndroid Build Coastguard Worker    movd   [px+32*9-4], xm7
831*c0909341SAndroid Build Coastguard Worker.bottom_done:
832*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
833*c0909341SAndroid Build Coastguard Worker    jz .no_left
834*c0909341SAndroid Build Coastguard Worker    movd           xm0, [leftq+4*0]
835*c0909341SAndroid Build Coastguard Worker    movd           xm1, [leftq+4*1]
836*c0909341SAndroid Build Coastguard Worker    movd           xm2, [leftq+4*2]
837*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4*3]
838*c0909341SAndroid Build Coastguard Worker    movd   [px+32*0-4], xm0
839*c0909341SAndroid Build Coastguard Worker    movd   [px+32*1-4], xm1
840*c0909341SAndroid Build Coastguard Worker    movd   [px+32*2-4], xm2
841*c0909341SAndroid Build Coastguard Worker    movd   [px+32*3-4], xm3
842*c0909341SAndroid Build Coastguard Worker    movd           xm0, [leftq+4*4]
843*c0909341SAndroid Build Coastguard Worker    movd           xm1, [leftq+4*5]
844*c0909341SAndroid Build Coastguard Worker    movd           xm2, [leftq+4*6]
845*c0909341SAndroid Build Coastguard Worker    movd           xm3, [leftq+4*7]
846*c0909341SAndroid Build Coastguard Worker    movd   [px+32*4-4], xm0
847*c0909341SAndroid Build Coastguard Worker    movd   [px+32*5-4], xm1
848*c0909341SAndroid Build Coastguard Worker    movd   [px+32*6-4], xm2
849*c0909341SAndroid Build Coastguard Worker    movd   [px+32*7-4], xm3
850*c0909341SAndroid Build Coastguard Worker    jmp .left_done
851*c0909341SAndroid Build Coastguard Worker.no_left:
852*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
853*c0909341SAndroid Build Coastguard Worker.left_done:
854*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; HAVE_RIGHT
855*c0909341SAndroid Build Coastguard Worker    jnz .padding_done
856*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
857*c0909341SAndroid Build Coastguard Worker.padding_done:
858*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER      8, 8
859*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; cdef_dir_16bpc(src, stride, var, bdmax)
;
; High-bit-depth front end for the CDEF direction search. It loads an
; 8-row block of eight 16-bit words per row, scales every word by a
; bit-depth dependent factor taken from dir_shift, and then tail-jumps
; into the .main label of the 8bpc cdef_dir function, which does the
; remaining work and returns to the caller.
;
; cglobal operands (x86inc convention): 4 arguments, 7 GPRs, 6 vector regs.
;   src    - pointer to the first row of the block
;   stride - distance between consecutive rows (used as a byte offset)
;   var    - not touched here; presumably consumed by the 8bpc .main
;            code - TODO confirm
;   bdmax  - maximum pixel value; only (bdmax >> 11) is used
;
; State on the jump to .main:
;   m0 = row 0 (low lane) | row 7 (high lane)
;   m1 = row 1            | row 6
;   m2 = row 2            | row 5
;   m3 = row 3            | row 4
;   all four scaled by m4; srcq has been advanced by 4 rows.
; NOTE(review): the register contract expected by .main is defined
; outside this view - verify before changing the layout above.
;-----------------------------------------------------------------------
860*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
    ; r6 = base of the dir_shift table (defined elsewhere in the file).
    ; The base is loaded into a register first because the lookup below
    ; is indexed and so cannot be RIP-relative.
861*c0909341SAndroid Build Coastguard Worker    lea             r6, [dir_shift]
862*c0909341SAndroid Build Coastguard Worker    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    ; The 32-bit shift above zero-extends into bdmaxq, so the full
    ; 64-bit register is safe to use as a table index here.
    ; m4 = dir_shift[bdmax] (one dword) replicated across the register.
863*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m4, [r6+bdmaxq*4]
864*c0909341SAndroid Build Coastguard Worker    lea             r6, [strideq*3] ; r6 is reused: now stride*3
    ; Rows 0..3 -> low 128 bits of m0..m3. mova is the aligned move, so
    ; each row is assumed to be 16-byte aligned - TODO confirm with callers.
865*c0909341SAndroid Build Coastguard Worker    mova           xm0, [srcq+strideq*0]
866*c0909341SAndroid Build Coastguard Worker    mova           xm1, [srcq+strideq*1]
867*c0909341SAndroid Build Coastguard Worker    mova           xm2, [srcq+strideq*2]
868*c0909341SAndroid Build Coastguard Worker    mova           xm3, [srcq+r6       ]
869*c0909341SAndroid Build Coastguard Worker    lea           srcq, [srcq+strideq*4] ; srcq -> row 4
    ; Rows 7..4 -> high 128 bits of m0..m3, in mirrored order, so each
    ; register pairs row N (low lane) with row 7-N (high lane).
870*c0909341SAndroid Build Coastguard Worker    vinserti128     m0, [srcq+r6       ], 1
871*c0909341SAndroid Build Coastguard Worker    vinserti128     m1, [srcq+strideq*2], 1
872*c0909341SAndroid Build Coastguard Worker    vinserti128     m2, [srcq+strideq*1], 1
873*c0909341SAndroid Build Coastguard Worker    vinserti128     m3, [srcq+strideq*0], 1
    ; Unsigned 16x16 multiply keeping the high 16 bits of each product,
    ; i.e. word = (word * factor) >> 16. Presumably this brings 10/12-bit
    ; samples into the range the 8bpc code expects - the dir_shift values
    ; are not visible here, TODO confirm.
874*c0909341SAndroid Build Coastguard Worker    REPX {pmulhuw x, m4}, m0, m1, m2, m3
    ; Tail-jump (no call/ret pair): execution continues in the 8bpc
    ; implementation's .main label and does not come back here.
875*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
876*c0909341SAndroid Build Coastguard Worker
877*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
878