xref: /aosp_15_r20/external/libdav1d/src/x86/cdef16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; Copyright (c) 2017-2021, The rav1e contributors
4*c0909341SAndroid Build Coastguard Worker; Copyright (c) 2021, Nathan Egge
5*c0909341SAndroid Build Coastguard Worker; All rights reserved.
6*c0909341SAndroid Build Coastguard Worker;
7*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
8*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
9*c0909341SAndroid Build Coastguard Worker;
10*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
11*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
12*c0909341SAndroid Build Coastguard Worker;
13*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
14*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
15*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
16*c0909341SAndroid Build Coastguard Worker;
17*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker%include "config.asm"
29*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA
32*c0909341SAndroid Build Coastguard Worker
; DUP8 v0 [, v1, ...]
; Data-emitting helper: for every argument, in order, emit eight consecutive
; 16-bit words holding that value (16 bytes per argument). Accepts one or more
; arguments, so `DUP8 a, b` produces 32 bytes: 8 x a followed by 8 x b.
33*c0909341SAndroid Build Coastguard Worker%macro DUP8 1-*
; %0 is the argument count, so the body below runs once per argument.
34*c0909341SAndroid Build Coastguard Worker    %rep %0
35*c0909341SAndroid Build Coastguard Worker        times 8 dw %1 ; splat the current first argument across 8 words
36*c0909341SAndroid Build Coastguard Worker        %rotate 1 ; shift the argument list left so the next value becomes %1
37*c0909341SAndroid Build Coastguard Worker    %endrep
38*c0909341SAndroid Build Coastguard Worker%endmacro
39*c0909341SAndroid Build Coastguard Worker
; pri_taps: primary-filter tap weights, each value splatted to 8 words by DUP8
; (4 x 16 = 64 bytes). The layout is two 32-byte {k0, k1} pairs: {4, 2}
; followed by {3, 3}. The filter code masks the primary strength with 16 and
; adds twice that (0 or 32 bytes) to this label to pick a pair, then multiplies
; the k0 / k1 constrained sums by [priq+16*0] / [priq+16*1].
40*c0909341SAndroid Build Coastguard Workerpri_taps:  DUP8 4, 2, 3, 3
; dir_table: signed byte offsets into the padded pixel buffer addressed via
; tmpq, two bytes (k0, k1) per row of the table. Each entry is written as
; dy * 32 + dx, where 32 is the per-row step the filter code applies to tmpq
; and dx is a byte displacement (even, since the buffer is read as 16-bit
; words). Entries are read with a sign-extending byte load and used both as
; +offset and, after negation, as -offset.
;
; The filter code points dirq at dir_table + dir * 2 and then reads:
;   [dirq+4], [dirq+5]  (row dir + 2)  for the primary taps
;   [dirq+8], [dirq+9]  (row dir + 4)  and
;   [dirq+0], [dirq+1]  (row dir)      for the secondary taps
; The last four rows repeat the first four so those reads stay inside the
; table without wrapping the index.
; NOTE(review): 12 rows = 8 + 4, which presumably means dir is in 0..7 -
; confirm against the caller.
41*c0909341SAndroid Build Coastguard Workerdir_table: db  1 * 32 + 0,  2 * 32 + 0
42*c0909341SAndroid Build Coastguard Worker           db  1 * 32 + 0,  2 * 32 - 2
43*c0909341SAndroid Build Coastguard Worker           db -1 * 32 + 2, -2 * 32 + 4
44*c0909341SAndroid Build Coastguard Worker           db  0 * 32 + 2, -1 * 32 + 4
45*c0909341SAndroid Build Coastguard Worker           db  0 * 32 + 2,  0 * 32 + 4
46*c0909341SAndroid Build Coastguard Worker           db  0 * 32 + 2,  1 * 32 + 4
47*c0909341SAndroid Build Coastguard Worker           db  1 * 32 + 2,  2 * 32 + 4
48*c0909341SAndroid Build Coastguard Worker           db  1 * 32 + 0,  2 * 32 + 2
; Rows 8..11: copies of rows 0..3 (see the header comment above this table).
49*c0909341SAndroid Build Coastguard Worker           db  1 * 32 + 0,  2 * 32 + 0
50*c0909341SAndroid Build Coastguard Worker           db  1 * 32 + 0,  2 * 32 - 2
51*c0909341SAndroid Build Coastguard Worker           db -1 * 32 + 2, -2 * 32 + 4
52*c0909341SAndroid Build Coastguard Worker           db  0 * 32 + 2, -1 * 32 + 4
53*c0909341SAndroid Build Coastguard Worker
; dir_shift: two groups of four 16-bit words, 0x4000 (2^14) and 0x1000 (2^12),
; 16 bytes in total.
; NOTE(review): not referenced in the part of the file visible here.
; Presumably these are multipliers for a high-half 16-bit multiply, which
; would act as right shifts by 2 and by 4 - confirm at the use site.
54*c0909341SAndroid Build Coastguard Workerdir_shift: times 4 dw 0x4000
55*c0909341SAndroid Build Coastguard Worker           times 4 dw 0x1000
56*c0909341SAndroid Build Coastguard Worker
; Splatted 16-bit word constants.
;
; Layout note: the definitions before pw_2048 in this section occupy
; 64 (pri_taps) + 24 (dir_table) + 16 (dir_shift) + 8 (pw_128) = 112 bytes, so
; pw_2048 and pw_m16384 sit at multiples of 16 from the start of the section.
; pw_2048 is loaded with mova (and used directly as a memory operand on
; x86-32), so keep that in mind before reordering or resizing anything above.
; NOTE(review): assumes SECTION_RODATA itself is at least 16-byte aligned.

; pw_128: only 4 words (8 bytes); not referenced in the visible part of the file.
57*c0909341SAndroid Build Coastguard Workerpw_128:    times 4 dw 128
; pw_2048: the filter code keeps this in m8 and applies pmulhrsw to the tap
; sum; multiplying by 2048 with pmulhrsw is a rounded shift, (x + 8) >> 4.
58*c0909341SAndroid Build Coastguard Workerpw_2048:   times 8 dw 2048
; pw_m16384: 8 words of -16384; not referenced in the visible part of the file.
59*c0909341SAndroid Build Coastguard Workerpw_m16384: times 8 dw -16384
60*c0909341SAndroid Build Coastguard Worker
61*c0909341SAndroid Build Coastguard Workercextern cdef_dir_8bpc_ssse3.main
62*c0909341SAndroid Build Coastguard Workercextern cdef_dir_8bpc_sse4.main
63*c0909341SAndroid Build Coastguard Workercextern shufw_6543210x
64*c0909341SAndroid Build Coastguard Worker
65*c0909341SAndroid Build Coastguard WorkerSECTION .text
66*c0909341SAndroid Build Coastguard Worker
; Choose which two numbered x86inc registers back the temporaries declared by
; DECLARE_REG_TMP, separately for each target: x86-32, Win64, and every other
; x86-64 ABI.
; NOTE(review): DECLARE_REG_TMP is defined in x86inc.asm; presumably it creates
; the t0/t1 aliases for the listed register numbers. Those aliases are not used
; in the part of the file visible here - confirm at their use sites before
; changing the numbers.
67*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
68*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5, 3
69*c0909341SAndroid Build Coastguard Worker%elif WIN64
70*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 4
; Fallback: any non-Windows x86-64 target.
71*c0909341SAndroid Build Coastguard Worker%else
72*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 6
73*c0909341SAndroid Build Coastguard Worker%endif
74*c0909341SAndroid Build Coastguard Worker
75*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER 2 ; w, h
76*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
77*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
78*c0909341SAndroid Build Coastguard Worker    mova            m8, [base+pw_2048]
79*c0909341SAndroid Build Coastguard Worker%else
80*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
81*c0909341SAndroid Build Coastguard Worker    %define         m8  [base+pw_2048]
82*c0909341SAndroid Build Coastguard Worker    %define         m9  [rsp+16*1+gprsize]
83*c0909341SAndroid Build Coastguard Worker    %define        m10  [rsp+16*2+gprsize]
84*c0909341SAndroid Build Coastguard Worker%endif
85*c0909341SAndroid Build Coastguard Worker    movifnidn     prid, r5m
86*c0909341SAndroid Build Coastguard Worker    movifnidn     secd, r6m
87*c0909341SAndroid Build Coastguard Worker    test          prid, prid
88*c0909341SAndroid Build Coastguard Worker    jz .sec_only
89*c0909341SAndroid Build Coastguard Worker    movd            m6, r5m
90*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
91*c0909341SAndroid Build Coastguard Worker    mov       [rsp+24], pridmpd
92*c0909341SAndroid Build Coastguard Worker%endif
93*c0909341SAndroid Build Coastguard Worker    bsr        pridmpd, prid
94*c0909341SAndroid Build Coastguard Worker    lea           tmpd, [priq*4]
95*c0909341SAndroid Build Coastguard Worker    cmp     dword r10m, 0x3ff ; if (bpc == 10)
96*c0909341SAndroid Build Coastguard Worker    cmove         prid, tmpd  ;     pri <<= 2
97*c0909341SAndroid Build Coastguard Worker    mov           tmpd, r8m   ; damping
98*c0909341SAndroid Build Coastguard Worker    mov           dird, r7m
99*c0909341SAndroid Build Coastguard Worker    and           prid, 16
100*c0909341SAndroid Build Coastguard Worker    pshufb          m6, m7    ; splat
101*c0909341SAndroid Build Coastguard Worker    lea           dirq, [base+dir_table+dirq*2]
102*c0909341SAndroid Build Coastguard Worker    lea           priq, [base+pri_taps+priq*2]
103*c0909341SAndroid Build Coastguard Worker    test          secd, secd
104*c0909341SAndroid Build Coastguard Worker    jz .pri_only
105*c0909341SAndroid Build Coastguard Worker    mova         [rsp], m6
106*c0909341SAndroid Build Coastguard Worker    movd            m6, secd
107*c0909341SAndroid Build Coastguard Worker    tzcnt         secd, secd
108*c0909341SAndroid Build Coastguard Worker    sub        pridmpd, tmpd
109*c0909341SAndroid Build Coastguard Worker    sub           tmpd, secd
110*c0909341SAndroid Build Coastguard Worker    pshufb          m6, m7
111*c0909341SAndroid Build Coastguard Worker    xor           secd, secd
112*c0909341SAndroid Build Coastguard Worker    neg        pridmpd
113*c0909341SAndroid Build Coastguard Worker    cmovs      pridmpd, secd
114*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
115*c0909341SAndroid Build Coastguard Worker    mov  [pri_shift+4], secd
116*c0909341SAndroid Build Coastguard Worker    mov  [sec_shift+4], secd
117*c0909341SAndroid Build Coastguard Worker%endif
118*c0909341SAndroid Build Coastguard Worker    mov  [pri_shift+0], pridmpq
119*c0909341SAndroid Build Coastguard Worker    mov  [sec_shift+0], tmpq
120*c0909341SAndroid Build Coastguard Worker    lea           tmpq, [px]
121*c0909341SAndroid Build Coastguard Worker%if WIN64
122*c0909341SAndroid Build Coastguard Worker    movaps         r4m, m9
123*c0909341SAndroid Build Coastguard Worker    movaps         r6m, m10
124*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
125*c0909341SAndroid Build Coastguard Worker    mov        pridmpd, [rsp+24]
126*c0909341SAndroid Build Coastguard Worker%endif
127*c0909341SAndroid Build Coastguard Worker%rep %1*%2/8
128*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
129*c0909341SAndroid Build Coastguard Worker%endrep
130*c0909341SAndroid Build Coastguard Worker%if WIN64
131*c0909341SAndroid Build Coastguard Worker    movaps          m9, r4m
132*c0909341SAndroid Build Coastguard Worker    movaps         m10, r6m
133*c0909341SAndroid Build Coastguard Worker%endif
134*c0909341SAndroid Build Coastguard Worker    jmp .end
135*c0909341SAndroid Build Coastguard Worker.pri_only:
136*c0909341SAndroid Build Coastguard Worker    sub           tmpd, pridmpd
137*c0909341SAndroid Build Coastguard Worker    cmovs         tmpd, secd
138*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
139*c0909341SAndroid Build Coastguard Worker    mov        pridmpd, [rsp+24]
140*c0909341SAndroid Build Coastguard Worker    mov  [pri_shift+4], secd
141*c0909341SAndroid Build Coastguard Worker%endif
142*c0909341SAndroid Build Coastguard Worker    mov  [pri_shift+0], tmpq
143*c0909341SAndroid Build Coastguard Worker    lea           tmpq, [px]
144*c0909341SAndroid Build Coastguard Worker%rep %1*%2/8
145*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
146*c0909341SAndroid Build Coastguard Worker%endrep
147*c0909341SAndroid Build Coastguard Worker.end:
148*c0909341SAndroid Build Coastguard Worker    RET
149*c0909341SAndroid Build Coastguard Worker.sec_only:
150*c0909341SAndroid Build Coastguard Worker    mov           tmpd, r8m ; damping
151*c0909341SAndroid Build Coastguard Worker    movd            m6, r6m
152*c0909341SAndroid Build Coastguard Worker    tzcnt         secd, secd
153*c0909341SAndroid Build Coastguard Worker    mov           dird, r7m
154*c0909341SAndroid Build Coastguard Worker    pshufb          m6, m7
155*c0909341SAndroid Build Coastguard Worker    sub           tmpd, secd
156*c0909341SAndroid Build Coastguard Worker    lea           dirq, [base+dir_table+dirq*2]
157*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
158*c0909341SAndroid Build Coastguard Worker    mov  [sec_shift+4], prid
159*c0909341SAndroid Build Coastguard Worker%endif
160*c0909341SAndroid Build Coastguard Worker    mov  [sec_shift+0], tmpq
161*c0909341SAndroid Build Coastguard Worker    lea           tmpq, [px]
162*c0909341SAndroid Build Coastguard Worker%rep %1*%2/8
163*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
164*c0909341SAndroid Build Coastguard Worker%endrep
165*c0909341SAndroid Build Coastguard Worker    jmp .end
166*c0909341SAndroid Build Coastguard Worker%if %1 == %2
167*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
168*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
169*c0909341SAndroid Build Coastguard Worker %else
170*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
171*c0909341SAndroid Build Coastguard Worker %endif
172*c0909341SAndroid Build Coastguard WorkerALIGN function_align
173*c0909341SAndroid Build Coastguard Worker.pri:
174*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+4]    ; off_k0
175*c0909341SAndroid Build Coastguard Worker%if %1 == 4
176*c0909341SAndroid Build Coastguard Worker    movq            m1, [dstq+strideq*0]
177*c0909341SAndroid Build Coastguard Worker    movhps          m1, [dstq+strideq*1]
178*c0909341SAndroid Build Coastguard Worker    movq            m2, [tmpq+offq+32*0] ; k0p0
179*c0909341SAndroid Build Coastguard Worker    movhps          m2, [tmpq+offq+32*1]
180*c0909341SAndroid Build Coastguard Worker    neg           offq
181*c0909341SAndroid Build Coastguard Worker    movq            m3, [tmpq+offq+32*0] ; k0p1
182*c0909341SAndroid Build Coastguard Worker    movhps          m3, [tmpq+offq+32*1]
183*c0909341SAndroid Build Coastguard Worker%else
184*c0909341SAndroid Build Coastguard Worker    mova            m1, [dstq]
185*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq]
186*c0909341SAndroid Build Coastguard Worker    neg           offq
187*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq]
188*c0909341SAndroid Build Coastguard Worker%endif
189*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+5]    ; off_k1
190*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k0p0
191*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k0p1
192*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k0p0
193*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [pri_shift+gprsize]
194*c0909341SAndroid Build Coastguard Worker    psubusw         m0, m6, m5
195*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k0p1
196*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m4
197*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [pri_shift+gprsize]
198*c0909341SAndroid Build Coastguard Worker    psignw          m0, m2               ; constrain(diff_k0p0)
199*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m6, m4
200*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
201*c0909341SAndroid Build Coastguard Worker%if %1 == 4
202*c0909341SAndroid Build Coastguard Worker    movq            m4, [tmpq+offq+32*0] ; k1p0
203*c0909341SAndroid Build Coastguard Worker    movhps          m4, [tmpq+offq+32*1]
204*c0909341SAndroid Build Coastguard Worker    neg           offq
205*c0909341SAndroid Build Coastguard Worker    movq            m5, [tmpq+offq+32*0] ; k1p1
206*c0909341SAndroid Build Coastguard Worker    movhps          m5, [tmpq+offq+32*1]
207*c0909341SAndroid Build Coastguard Worker%else
208*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq]
209*c0909341SAndroid Build Coastguard Worker    neg           offq
210*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq]
211*c0909341SAndroid Build Coastguard Worker%endif
212*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k1p0
213*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k1p1
214*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k0p1)
215*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k1p0
216*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2               ; constrain(diff_k0)
217*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [pri_shift+gprsize]
218*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m2
219*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k1p1
220*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
221*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [pri_shift+gprsize]
222*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k1p0)
223*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m6, m3
224*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
225*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k1p1)
226*c0909341SAndroid Build Coastguard Worker    paddw           m7, m4               ; constrain(diff_k1)
227*c0909341SAndroid Build Coastguard Worker    pmullw          m0, [priq+16*0]      ; pri_tap_k0
228*c0909341SAndroid Build Coastguard Worker    pmullw          m7, [priq+16*1]      ; pri_tap_k1
229*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7               ; sum
230*c0909341SAndroid Build Coastguard Worker    psraw           m2, m0, 15
231*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
232*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m8
233*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
234*c0909341SAndroid Build Coastguard Worker%if %1 == 4
235*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32*2
236*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
237*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
238*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*2]
239*c0909341SAndroid Build Coastguard Worker%else
240*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32
241*c0909341SAndroid Build Coastguard Worker    mova        [dstq], m0
242*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
243*c0909341SAndroid Build Coastguard Worker%endif
244*c0909341SAndroid Build Coastguard Worker    ret
245*c0909341SAndroid Build Coastguard WorkerALIGN function_align
246*c0909341SAndroid Build Coastguard Worker.sec:
247*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+8]    ; off1_k0
248*c0909341SAndroid Build Coastguard Worker%if %1 == 4
249*c0909341SAndroid Build Coastguard Worker    movq            m1, [dstq+strideq*0]
250*c0909341SAndroid Build Coastguard Worker    movhps          m1, [dstq+strideq*1]
251*c0909341SAndroid Build Coastguard Worker    movq            m2, [tmpq+offq+32*0] ; k0s0
252*c0909341SAndroid Build Coastguard Worker    movhps          m2, [tmpq+offq+32*1]
253*c0909341SAndroid Build Coastguard Worker    neg           offq
254*c0909341SAndroid Build Coastguard Worker    movq            m3, [tmpq+offq+32*0] ; k0s1
255*c0909341SAndroid Build Coastguard Worker    movhps          m3, [tmpq+offq+32*1]
256*c0909341SAndroid Build Coastguard Worker%else
257*c0909341SAndroid Build Coastguard Worker    mova            m1, [dstq]
258*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq]
259*c0909341SAndroid Build Coastguard Worker    neg           offq
260*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq]
261*c0909341SAndroid Build Coastguard Worker%endif
262*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+0]    ; off2_k0
263*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k0s0
264*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k0s1
265*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k0s0
266*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [sec_shift+gprsize]
267*c0909341SAndroid Build Coastguard Worker    psubusw         m0, m6, m5
268*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k0s1
269*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m4
270*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [sec_shift+gprsize]
271*c0909341SAndroid Build Coastguard Worker    psignw          m0, m2               ; constrain(diff_k0s0)
272*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m6, m4
273*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
274*c0909341SAndroid Build Coastguard Worker%if %1 == 4
275*c0909341SAndroid Build Coastguard Worker    movq            m4, [tmpq+offq+32*0] ; k0s2
276*c0909341SAndroid Build Coastguard Worker    movhps          m4, [tmpq+offq+32*1]
277*c0909341SAndroid Build Coastguard Worker    neg           offq
278*c0909341SAndroid Build Coastguard Worker    movq            m5, [tmpq+offq+32*0] ; k0s3
279*c0909341SAndroid Build Coastguard Worker    movhps          m5, [tmpq+offq+32*1]
280*c0909341SAndroid Build Coastguard Worker%else
281*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq]
282*c0909341SAndroid Build Coastguard Worker    neg           offq
283*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq]
284*c0909341SAndroid Build Coastguard Worker%endif
285*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+9]    ; off1_k1
286*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k0s2
287*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k0s3
288*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k0s1)
289*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k0s2
290*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
291*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [sec_shift+gprsize]
292*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m2
293*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k0s3
294*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
295*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [sec_shift+gprsize]
296*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k0s2)
297*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m6, m3
298*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
299*c0909341SAndroid Build Coastguard Worker%if %1 == 4
300*c0909341SAndroid Build Coastguard Worker    movq            m2, [tmpq+offq+32*0] ; k1s0
301*c0909341SAndroid Build Coastguard Worker    movhps          m2, [tmpq+offq+32*1]
302*c0909341SAndroid Build Coastguard Worker    neg           offq
303*c0909341SAndroid Build Coastguard Worker    movq            m3, [tmpq+offq+32*0] ; k1s1
304*c0909341SAndroid Build Coastguard Worker    movhps          m3, [tmpq+offq+32*1]
305*c0909341SAndroid Build Coastguard Worker%else
306*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq]
307*c0909341SAndroid Build Coastguard Worker    neg           offq
308*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq]
309*c0909341SAndroid Build Coastguard Worker%endif
310*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+1]    ; off2_k1
311*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
312*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k0s3)
313*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; constrain(diff_k0)
314*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k1s0
315*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k1s1
316*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0               ; sec_tap_k0
317*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k1s0
318*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [sec_shift+gprsize]
319*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m5
320*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k1s1
321*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m4
322*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [sec_shift+gprsize]
323*c0909341SAndroid Build Coastguard Worker    psignw          m7, m2               ; constrain(diff_k1s0)
324*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m6, m4
325*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
326*c0909341SAndroid Build Coastguard Worker%if %1 == 4
327*c0909341SAndroid Build Coastguard Worker    movq            m4, [tmpq+offq+32*0] ; k1s2
328*c0909341SAndroid Build Coastguard Worker    movhps          m4, [tmpq+offq+32*1]
329*c0909341SAndroid Build Coastguard Worker    neg           offq
330*c0909341SAndroid Build Coastguard Worker    movq            m5, [tmpq+offq+32*0] ; k1s3
331*c0909341SAndroid Build Coastguard Worker    movhps          m5, [tmpq+offq+32*1]
332*c0909341SAndroid Build Coastguard Worker%else
333*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq]
334*c0909341SAndroid Build Coastguard Worker    neg           offq
335*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq]
336*c0909341SAndroid Build Coastguard Worker%endif
337*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
338*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k1s2
339*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k1s3
340*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k1s1)
341*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k1s2
342*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
343*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [sec_shift+gprsize]
344*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m2
345*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k1s3
346*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
347*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [sec_shift+gprsize]
348*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k1s2)
349*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m6, m3
350*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
351*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
352*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k1s3)
353*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; sum
354*c0909341SAndroid Build Coastguard Worker    psraw           m2, m0, 15
355*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
356*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m8
357*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
358*c0909341SAndroid Build Coastguard Worker%if %1 == 4
359*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32*2
360*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
361*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
362*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*2]
363*c0909341SAndroid Build Coastguard Worker%else
364*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32
365*c0909341SAndroid Build Coastguard Worker    mova        [dstq], m0
366*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
367*c0909341SAndroid Build Coastguard Worker%endif
368*c0909341SAndroid Build Coastguard Worker    ret
369*c0909341SAndroid Build Coastguard WorkerALIGN function_align
370*c0909341SAndroid Build Coastguard Worker.pri_sec:
371*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+8]    ; off2_k0
372*c0909341SAndroid Build Coastguard Worker%if %1 == 4
373*c0909341SAndroid Build Coastguard Worker    movq            m1, [dstq+strideq*0]
374*c0909341SAndroid Build Coastguard Worker    movhps          m1, [dstq+strideq*1]
375*c0909341SAndroid Build Coastguard Worker    movq            m2, [tmpq+offq+32*0] ; k0s0
376*c0909341SAndroid Build Coastguard Worker    movhps          m2, [tmpq+offq+32*1]
377*c0909341SAndroid Build Coastguard Worker    neg           offq
378*c0909341SAndroid Build Coastguard Worker    movq            m3, [tmpq+offq+32*0] ; k0s1
379*c0909341SAndroid Build Coastguard Worker    movhps          m3, [tmpq+offq+32*1]
380*c0909341SAndroid Build Coastguard Worker%else
381*c0909341SAndroid Build Coastguard Worker    mova            m1, [dstq]
382*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq]
383*c0909341SAndroid Build Coastguard Worker    neg           offq
384*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq]
385*c0909341SAndroid Build Coastguard Worker%endif
386*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+0]    ; off3_k0
387*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2
388*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
389*c0909341SAndroid Build Coastguard Worker    pabsw          m10, m3
390*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m2, m3
391*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m4
392*c0909341SAndroid Build Coastguard Worker%else
393*c0909341SAndroid Build Coastguard Worker    pabsw           m7, m3
394*c0909341SAndroid Build Coastguard Worker    pmaxsw          m5, m2, m3
395*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m7
396*c0909341SAndroid Build Coastguard Worker    mova            m9, m5
397*c0909341SAndroid Build Coastguard Worker    mova           m10, m4
398*c0909341SAndroid Build Coastguard Worker%endif
399*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k0s0
400*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k0s1
401*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k0s0
402*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [sec_shift+gprsize]
403*c0909341SAndroid Build Coastguard Worker    psubusw         m0, m6, m5
404*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k0s1
405*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m4
406*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [sec_shift+gprsize]
407*c0909341SAndroid Build Coastguard Worker    psignw          m0, m2               ; constrain(diff_k0s0)
408*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m6, m4
409*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
410*c0909341SAndroid Build Coastguard Worker%if %1 == 4
411*c0909341SAndroid Build Coastguard Worker    movq            m4, [tmpq+offq+32*0] ; k0s2
412*c0909341SAndroid Build Coastguard Worker    movhps          m4, [tmpq+offq+32*1]
413*c0909341SAndroid Build Coastguard Worker    neg           offq
414*c0909341SAndroid Build Coastguard Worker    movq            m5, [tmpq+offq+32*0] ; k0s3
415*c0909341SAndroid Build Coastguard Worker    movhps          m5, [tmpq+offq+32*1]
416*c0909341SAndroid Build Coastguard Worker%else
417*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq]
418*c0909341SAndroid Build Coastguard Worker    neg           offq
419*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq]
420*c0909341SAndroid Build Coastguard Worker%endif
421*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+9]    ; off2_k1
422*c0909341SAndroid Build Coastguard Worker    pabsw           m7, m4
423*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3
424*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m5               ; constrain(diff_k0s1)
425*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
426*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m4
427*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m7
428*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m5
429*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m3
430*c0909341SAndroid Build Coastguard Worker%else
431*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m10
432*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
433*c0909341SAndroid Build Coastguard Worker    pmaxsw          m3, m9, m4
434*c0909341SAndroid Build Coastguard Worker    pmaxsw          m3, m5
435*c0909341SAndroid Build Coastguard Worker    mova           m10, m7
436*c0909341SAndroid Build Coastguard Worker    mova            m9, m3
437*c0909341SAndroid Build Coastguard Worker%endif
438*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k0s2
439*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k0s3
440*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
441*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k0s2
442*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [sec_shift+gprsize]
443*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m2
444*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k0s3
445*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
446*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [sec_shift+gprsize]
447*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k0s2)
448*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m6, m3
449*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
450*c0909341SAndroid Build Coastguard Worker%if %1 == 4
451*c0909341SAndroid Build Coastguard Worker    movq            m2, [tmpq+offq+32*0] ; k1s0
452*c0909341SAndroid Build Coastguard Worker    movhps          m2, [tmpq+offq+32*1]
453*c0909341SAndroid Build Coastguard Worker    neg           offq
454*c0909341SAndroid Build Coastguard Worker    movq            m3, [tmpq+offq+32*0] ; k1s1
455*c0909341SAndroid Build Coastguard Worker    movhps          m3, [tmpq+offq+32*1]
456*c0909341SAndroid Build Coastguard Worker%else
457*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq]
458*c0909341SAndroid Build Coastguard Worker    neg           offq
459*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq]
460*c0909341SAndroid Build Coastguard Worker%endif
461*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+1]    ; off3_k1
462*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
463*c0909341SAndroid Build Coastguard Worker    pabsw           m7, m2
464*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k0s3)
465*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3
466*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
467*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m2
468*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m7
469*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m3
470*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m5
471*c0909341SAndroid Build Coastguard Worker%else
472*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m10
473*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m5
474*c0909341SAndroid Build Coastguard Worker    pmaxsw          m5, m9, m2
475*c0909341SAndroid Build Coastguard Worker    pmaxsw          m5, m3
476*c0909341SAndroid Build Coastguard Worker    mova           m10, m7
477*c0909341SAndroid Build Coastguard Worker    mova            m9, m5
478*c0909341SAndroid Build Coastguard Worker%endif
479*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4               ; constrain(diff_k0)
480*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k1s0
481*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k1s1
482*c0909341SAndroid Build Coastguard Worker    paddw           m0, m0               ; sec_tap_k0
483*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k1s0
484*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [sec_shift+gprsize]
485*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m5
486*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k1s1
487*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m4
488*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [sec_shift+gprsize]
489*c0909341SAndroid Build Coastguard Worker    psignw          m7, m2               ; constrain(diff_k1s0)
490*c0909341SAndroid Build Coastguard Worker    psubusw         m2, m6, m4
491*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
492*c0909341SAndroid Build Coastguard Worker%if %1 == 4
493*c0909341SAndroid Build Coastguard Worker    movq            m4, [tmpq+offq+32*0] ; k1s2
494*c0909341SAndroid Build Coastguard Worker    movhps          m4, [tmpq+offq+32*1]
495*c0909341SAndroid Build Coastguard Worker    neg           offq
496*c0909341SAndroid Build Coastguard Worker    movq            m5, [tmpq+offq+32*0] ; k1s3
497*c0909341SAndroid Build Coastguard Worker    movhps          m5, [tmpq+offq+32*1]
498*c0909341SAndroid Build Coastguard Worker%else
499*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq]
500*c0909341SAndroid Build Coastguard Worker    neg           offq
501*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq]
502*c0909341SAndroid Build Coastguard Worker%endif
503*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+4]    ; off1_k0
504*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
505*c0909341SAndroid Build Coastguard Worker    pabsw           m7, m4
506*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k1s1)
507*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m5
508*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
509*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m4
510*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m7
511*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m5
512*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m3
513*c0909341SAndroid Build Coastguard Worker%else
514*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m10
515*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
516*c0909341SAndroid Build Coastguard Worker    pmaxsw          m3, m9, m4
517*c0909341SAndroid Build Coastguard Worker    pmaxsw          m3, m5
518*c0909341SAndroid Build Coastguard Worker    mova           m10, m7
519*c0909341SAndroid Build Coastguard Worker    mova            m9, m3
520*c0909341SAndroid Build Coastguard Worker%endif
521*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k1s2
522*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k1s3
523*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k1s2
524*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
525*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [sec_shift+gprsize]
526*c0909341SAndroid Build Coastguard Worker    psubusw         m7, m6, m2
527*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k1s3
528*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
529*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [sec_shift+gprsize]
530*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k1s2)
531*c0909341SAndroid Build Coastguard Worker    psubusw         m4, m6, m3
532*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
533*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
534*c0909341SAndroid Build Coastguard Worker%if %1 == 4
535*c0909341SAndroid Build Coastguard Worker    movq            m2, [tmpq+offq+32*0] ; k0p0
536*c0909341SAndroid Build Coastguard Worker    movhps          m2, [tmpq+offq+32*1]
537*c0909341SAndroid Build Coastguard Worker    neg           offq
538*c0909341SAndroid Build Coastguard Worker    movq            m3, [tmpq+offq+32*0] ; k0p1
539*c0909341SAndroid Build Coastguard Worker    movhps          m3, [tmpq+offq+32*1]
540*c0909341SAndroid Build Coastguard Worker%else
541*c0909341SAndroid Build Coastguard Worker    movu            m2, [tmpq+offq]
542*c0909341SAndroid Build Coastguard Worker    neg           offq
543*c0909341SAndroid Build Coastguard Worker    movu            m3, [tmpq+offq]
544*c0909341SAndroid Build Coastguard Worker%endif
545*c0909341SAndroid Build Coastguard Worker    movsx         offq, byte [dirq+5]    ; off1_k1
546*c0909341SAndroid Build Coastguard Worker    pabsw           m7, m2
547*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k1s3)
548*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3
549*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
550*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m2
551*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m7
552*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m3
553*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m5
554*c0909341SAndroid Build Coastguard Worker%else
555*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m10
556*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m5
557*c0909341SAndroid Build Coastguard Worker    pmaxsw          m5, m9, m2
558*c0909341SAndroid Build Coastguard Worker    pmaxsw          m5, m3
559*c0909341SAndroid Build Coastguard Worker    mova           m10, m7
560*c0909341SAndroid Build Coastguard Worker    mova            m9, m5
561*c0909341SAndroid Build Coastguard Worker%endif
562*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1               ; diff_k0p0
563*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1               ; diff_k0p1
564*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
565*c0909341SAndroid Build Coastguard Worker    pabsw           m4, m2               ; adiff_k0p0
566*c0909341SAndroid Build Coastguard Worker    psrlw           m5, m4, [pri_shift+gprsize]
567*c0909341SAndroid Build Coastguard Worker    psubusw         m7, [rsp+gprsize], m5
568*c0909341SAndroid Build Coastguard Worker    pabsw           m5, m3               ; adiff_k0p1
569*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m4
570*c0909341SAndroid Build Coastguard Worker    psrlw           m4, m5, [pri_shift+gprsize]
571*c0909341SAndroid Build Coastguard Worker    psignw          m7, m2               ; constrain(diff_k0p0)
572*c0909341SAndroid Build Coastguard Worker    psubusw         m2, [rsp+gprsize], m4
573*c0909341SAndroid Build Coastguard Worker    pminsw          m2, m5
574*c0909341SAndroid Build Coastguard Worker%if %1 == 4
575*c0909341SAndroid Build Coastguard Worker    movq            m4, [tmpq+offq+32*0] ; k1p0
576*c0909341SAndroid Build Coastguard Worker    movhps          m4, [tmpq+offq+32*1]
577*c0909341SAndroid Build Coastguard Worker    neg           offq
578*c0909341SAndroid Build Coastguard Worker    movq            m5, [tmpq+offq+32*0] ; k1p1
579*c0909341SAndroid Build Coastguard Worker    movhps          m5, [tmpq+offq+32*1]
580*c0909341SAndroid Build Coastguard Worker%else
581*c0909341SAndroid Build Coastguard Worker    movu            m4, [tmpq+offq]
582*c0909341SAndroid Build Coastguard Worker    neg           offq
583*c0909341SAndroid Build Coastguard Worker    movu            m5, [tmpq+offq]
584*c0909341SAndroid Build Coastguard Worker%endif
585*c0909341SAndroid Build Coastguard Worker    psignw          m2, m3               ; constrain(diff_k0p1)
586*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4
587*c0909341SAndroid Build Coastguard Worker    paddw           m7, m2               ; constrain(diff_k0)
588*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5
589*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
590*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m4
591*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m3
592*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m5
593*c0909341SAndroid Build Coastguard Worker    pminsw         m10, m2
594*c0909341SAndroid Build Coastguard Worker%else
595*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m10
596*c0909341SAndroid Build Coastguard Worker    pminsw          m3, m2
597*c0909341SAndroid Build Coastguard Worker    pmaxsw          m2, m9, m4
598*c0909341SAndroid Build Coastguard Worker    pmaxsw          m2, m5
599*c0909341SAndroid Build Coastguard Worker    mova           m10, m3
600*c0909341SAndroid Build Coastguard Worker    mova            m9, m2
601*c0909341SAndroid Build Coastguard Worker%endif
602*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1               ; diff_k1p0
603*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1               ; diff_k1p1
604*c0909341SAndroid Build Coastguard Worker    pabsw           m3, m4               ; adiff_k1p0
605*c0909341SAndroid Build Coastguard Worker    pmullw          m7, [priq+16*0]      ; pri_tap_k0
606*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
607*c0909341SAndroid Build Coastguard Worker    psrlw           m2, m3, [pri_shift+gprsize]
608*c0909341SAndroid Build Coastguard Worker    psubusw         m7, [rsp+16*0+gprsize], m2
609*c0909341SAndroid Build Coastguard Worker    pabsw           m2, m5               ; adiff_k1p1
610*c0909341SAndroid Build Coastguard Worker    pminsw          m7, m3
611*c0909341SAndroid Build Coastguard Worker    psrlw           m3, m2, [pri_shift+gprsize]
612*c0909341SAndroid Build Coastguard Worker    psignw          m7, m4               ; constrain(diff_k1p0)
613*c0909341SAndroid Build Coastguard Worker    psubusw         m4, [rsp+16*0+gprsize], m3
614*c0909341SAndroid Build Coastguard Worker    pminsw          m4, m2
615*c0909341SAndroid Build Coastguard Worker    psignw          m4, m5               ; constrain(diff_k1p1)
616*c0909341SAndroid Build Coastguard Worker    paddw           m7, m4               ; constrain(diff_k1)
617*c0909341SAndroid Build Coastguard Worker    pmullw          m7, [priq+16*1]      ; pri_tap_k1
618*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7               ; sum
619*c0909341SAndroid Build Coastguard Worker    psraw           m2, m0, 15
620*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
621*c0909341SAndroid Build Coastguard Worker    pmulhrsw        m0, m8
622*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
623*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
624*c0909341SAndroid Build Coastguard Worker    pmaxsw          m9, m1
625*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m9
626*c0909341SAndroid Build Coastguard Worker%else
627*c0909341SAndroid Build Coastguard Worker    pmaxsw          m2, m9, m1
628*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m2
629*c0909341SAndroid Build Coastguard Worker%endif
630*c0909341SAndroid Build Coastguard Worker    pminsw          m1, m10
631*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m1
632*c0909341SAndroid Build Coastguard Worker%if %1 == 4
633*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32*2
634*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
635*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
636*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*2]
637*c0909341SAndroid Build Coastguard Worker%else
638*c0909341SAndroid Build Coastguard Worker    add           tmpq, 32
639*c0909341SAndroid Build Coastguard Worker    mova        [dstq], m0
640*c0909341SAndroid Build Coastguard Worker    add           dstq, strideq
641*c0909341SAndroid Build Coastguard Worker%endif
642*c0909341SAndroid Build Coastguard Worker    ret
643*c0909341SAndroid Build Coastguard Worker%endif
644*c0909341SAndroid Build Coastguard Worker%endmacro
645*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; cdef_filter_4x4_16bpc (SSSE3)
;
; Prologue of the 4x4 CDEF filter: builds a padded copy of the block and its
; neighbourhood on the stack, then expands CDEF_FILTER 4, 4, which supplies
; the remainder of the function.
;
; Scratch layout (row pitch 32 bytes, every address relative to px):
;   px-32*2, px-32*1    two rows above the block
;   px+32*0 .. px+32*3  the four block rows, copied from dst
;   px+32*4, px+32*5    two rows below the block
;   row-4               4 bytes to the left of each row
;   row+8               4 bytes to the right of each row (assuming 2-byte
;                       pixels, i.e. a 4-pixel row occupies bytes 0..7)
;   pri_shift, sec_shift  px-16*6 and px-16*5; the CDEF_FILTER body reads
;                         them as [pri_shift+gprsize] / [sec_shift+gprsize]
;
; Edge flags tested in edgeb: 1 = HAVE_LEFT, 2 = HAVE_RIGHT, 4 = HAVE_TOP,
; 8 = HAVE_BOTTOM. A region whose flag is clear is filled from m7 instead of
; being read from memory.
;
; NOTE(review): INIT_XMM, cglobal, LEA, movifnidn, REPX and the rNm/rNmp/topmp
; accessors come from the x86inc macro layer, which is outside this chunk;
; remarks about them below follow the usual x86inc conventions and should be
; confirmed against x86inc.asm.
; ---------------------------------------------------------------------------
646*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
647*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
648*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
649*c0909341SAndroid Build Coastguard Worker                                               pri, sec, edge
650*c0909341SAndroid Build Coastguard Worker    %define         px  rsp+32*4
651*c0909341SAndroid Build Coastguard Worker%else
    ; 32-bit build: fewer named registers are declared, so bot shares the
    ; register used for top (botq is simply an alias of topq).
652*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
653*c0909341SAndroid Build Coastguard Worker    %define       botq  topq
654*c0909341SAndroid Build Coastguard Worker    %define         px  rsp+32*5
655*c0909341SAndroid Build Coastguard Worker%endif
    ; base lets constants be addressed relative to t0, which is pointed at
    ; dir_table below ([base+sym] == [t0+sym-dir_table]).
656*c0909341SAndroid Build Coastguard Worker    %define       base  t0-dir_table
657*c0909341SAndroid Build Coastguard Worker    %define  pri_shift  px-16*6
658*c0909341SAndroid Build Coastguard Worker    %define  sec_shift  px-16*5
    ; Fetch the edge flags from argument slot 9 (r9m).
659*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m
660*c0909341SAndroid Build Coastguard Worker    LEA             t0, dir_table
    ; Copy the four block rows. Each unaligned 16-byte load also picks up
    ; the bytes to the right of the block; they are overwritten further down
    ; if HAVE_RIGHT is clear.
661*c0909341SAndroid Build Coastguard Worker    movu            m0, [dstq+strideq*0]
662*c0909341SAndroid Build Coastguard Worker    movu            m1, [dstq+strideq*1]
663*c0909341SAndroid Build Coastguard Worker    lea             t1, [dstq+strideq*2]
664*c0909341SAndroid Build Coastguard Worker    movu            m2, [t1  +strideq*0]
665*c0909341SAndroid Build Coastguard Worker    movu            m3, [t1  +strideq*1]
    ; m7 = fill pattern for unavailable pixels (low 8 bytes of pw_m16384
    ; duplicated into both halves). pw_m16384 is defined outside this chunk;
    ; by its name it holds words of -16384. The filter body folds taps into
    ; its running max with pmaxsw on the raw value and into its running min
    ; with pminsw on pabsw of the value, so such a fill would presumably
    ; drop out of both - TODO confirm.
666*c0909341SAndroid Build Coastguard Worker    movddup         m7, [base+pw_m16384]
667*c0909341SAndroid Build Coastguard Worker    mova   [px+32*0+0], m0
668*c0909341SAndroid Build Coastguard Worker    mova   [px+32*1+0], m1
669*c0909341SAndroid Build Coastguard Worker    mova   [px+32*2+0], m2
670*c0909341SAndroid Build Coastguard Worker    mova   [px+32*3+0], m3
    ; ---- rows -2 and -1 (above the block) ----
671*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; HAVE_TOP
672*c0909341SAndroid Build Coastguard Worker    jz .no_top
673*c0909341SAndroid Build Coastguard Worker    movifnidn     topq, topmp
674*c0909341SAndroid Build Coastguard Worker    movu            m0, [topq+strideq*0]
675*c0909341SAndroid Build Coastguard Worker    movu            m1, [topq+strideq*1]
676*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m0
677*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m1
678*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
679*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
    ; Top-left corner: the 4 bytes preceding each top row.
680*c0909341SAndroid Build Coastguard Worker    movd            m0, [topq+strideq*0-4]
681*c0909341SAndroid Build Coastguard Worker    movd            m1, [topq+strideq*1-4]
682*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], m0
683*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], m1
684*c0909341SAndroid Build Coastguard Worker    jmp .top_done
    ; No top rows: fill them, then fall through into .top_no_left so the
    ; top-left corner is filled as well.
685*c0909341SAndroid Build Coastguard Worker.no_top:
686*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m7
687*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m7
688*c0909341SAndroid Build Coastguard Worker.top_no_left:
689*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], m7
690*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], m7
691*c0909341SAndroid Build Coastguard Worker.top_done:
    ; ---- rows 4 and 5 (below the block); same structure as the top ----
692*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; HAVE_BOTTOM
693*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
694*c0909341SAndroid Build Coastguard Worker    movifnidn     botq, r4mp
695*c0909341SAndroid Build Coastguard Worker    movu            m0, [botq+strideq*0]
696*c0909341SAndroid Build Coastguard Worker    movu            m1, [botq+strideq*1]
697*c0909341SAndroid Build Coastguard Worker    mova   [px+32*4+0], m0
698*c0909341SAndroid Build Coastguard Worker    mova   [px+32*5+0], m1
699*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
700*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
    ; Bottom-left corner: the 4 bytes preceding each bottom row.
701*c0909341SAndroid Build Coastguard Worker    movd            m0, [botq+strideq*0-4]
702*c0909341SAndroid Build Coastguard Worker    movd            m1, [botq+strideq*1-4]
703*c0909341SAndroid Build Coastguard Worker    movd   [px+32*4-4], m0
704*c0909341SAndroid Build Coastguard Worker    movd   [px+32*5-4], m1
705*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
    ; No bottom rows: fill them and fall through to fill the corner too.
706*c0909341SAndroid Build Coastguard Worker.no_bottom:
707*c0909341SAndroid Build Coastguard Worker    mova   [px+32*4+0], m7
708*c0909341SAndroid Build Coastguard Worker    mova   [px+32*5+0], m7
709*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
710*c0909341SAndroid Build Coastguard Worker    movd   [px+32*4-4], m7
711*c0909341SAndroid Build Coastguard Worker    movd   [px+32*5-4], m7
712*c0909341SAndroid Build Coastguard Worker.bottom_done:
    ; ---- left column of the block rows ----
    ; The left buffer is read as one 4-byte entry per block row and stored
    ; immediately before the corresponding row.
713*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
714*c0909341SAndroid Build Coastguard Worker    jz .no_left
715*c0909341SAndroid Build Coastguard Worker    movifnidn    leftq, r2mp
716*c0909341SAndroid Build Coastguard Worker    movd            m0, [leftq+4*0]
717*c0909341SAndroid Build Coastguard Worker    movd            m1, [leftq+4*1]
718*c0909341SAndroid Build Coastguard Worker    movd            m2, [leftq+4*2]
719*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4*3]
720*c0909341SAndroid Build Coastguard Worker    movd   [px+32*0-4], m0
721*c0909341SAndroid Build Coastguard Worker    movd   [px+32*1-4], m1
722*c0909341SAndroid Build Coastguard Worker    movd   [px+32*2-4], m2
723*c0909341SAndroid Build Coastguard Worker    movd   [px+32*3-4], m3
724*c0909341SAndroid Build Coastguard Worker    jmp .left_done
725*c0909341SAndroid Build Coastguard Worker.no_left:
    ; One fill store per block row (x takes each listed row index).
726*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
727*c0909341SAndroid Build Coastguard Worker.left_done:
    ; ---- right column ----
    ; With HAVE_RIGHT set, the bytes already copied by the 16-byte row loads
    ; are kept. Otherwise the 4 bytes at offset 8 of all eight rows
    ; (top, block and bottom) are replaced with the fill pattern.
728*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; HAVE_RIGHT
729*c0909341SAndroid Build Coastguard Worker    jnz .padding_done
730*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
731*c0909341SAndroid Build Coastguard Worker.padding_done:
    ; The filter itself (and the function's exit) is emitted by the macro.
732*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER      4, 4
733*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; cdef_filter_4x8_16bpc
;
; Prologue of the 4x8 CDEF filter. Same padding scheme as the 4x4 variant,
; with eight block rows: the block occupies px+32*0 .. px+32*7, the two rows
; above it px-32*2 / px-32*1 and the two rows below it px+32*8 / px+32*9.
; Each row has 4 bytes of left context at row-4 and 4 bytes of right context
; at row+8. The function ends by expanding CDEF_FILTER 4, 8.
;
; No %define of px, base, pri_shift, sec_shift (or botq on 32-bit) appears in
; this function; it uses the definitions made earlier in the file.
;
; Edge flags tested in edgeb: 1 = HAVE_LEFT, 2 = HAVE_RIGHT, 4 = HAVE_TOP,
; 8 = HAVE_BOTTOM. Regions whose flag is clear are filled from m7.
;
; NOTE(review): cglobal, LEA, movifnidn, REPX and the rNm/rNmp/topmp
; accessors belong to the x86inc macro layer (not visible here); confirm the
; remarks about them against x86inc.asm.
; ---------------------------------------------------------------------------
734*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
735*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
736*c0909341SAndroid Build Coastguard Worker                                               pri, sec, edge
737*c0909341SAndroid Build Coastguard Worker%else
738*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
739*c0909341SAndroid Build Coastguard Worker%endif
    ; Fetch the edge flags from argument slot 9 (r9m).
740*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m
741*c0909341SAndroid Build Coastguard Worker    LEA             t0, dir_table
    ; Load all eight block rows into m0..m7, stepping t1 two rows at a time.
    ; Each 16-byte load also brings in the bytes to the right of the block.
742*c0909341SAndroid Build Coastguard Worker    movu            m0, [dstq+strideq*0]
743*c0909341SAndroid Build Coastguard Worker    movu            m1, [dstq+strideq*1]
744*c0909341SAndroid Build Coastguard Worker    lea             t1, [dstq+strideq*2]
745*c0909341SAndroid Build Coastguard Worker    movu            m2, [t1  +strideq*0]
746*c0909341SAndroid Build Coastguard Worker    movu            m3, [t1  +strideq*1]
747*c0909341SAndroid Build Coastguard Worker    lea             t1, [t1  +strideq*2]
748*c0909341SAndroid Build Coastguard Worker    movu            m4, [t1  +strideq*0]
749*c0909341SAndroid Build Coastguard Worker    movu            m5, [t1  +strideq*1]
750*c0909341SAndroid Build Coastguard Worker    lea             t1, [t1  +strideq*2]
751*c0909341SAndroid Build Coastguard Worker    movu            m6, [t1  +strideq*0]
752*c0909341SAndroid Build Coastguard Worker    movu            m7, [t1  +strideq*1]
753*c0909341SAndroid Build Coastguard Worker    mova   [px+32*0+0], m0
754*c0909341SAndroid Build Coastguard Worker    mova   [px+32*1+0], m1
755*c0909341SAndroid Build Coastguard Worker    mova   [px+32*2+0], m2
756*c0909341SAndroid Build Coastguard Worker    mova   [px+32*3+0], m3
757*c0909341SAndroid Build Coastguard Worker    mova   [px+32*4+0], m4
758*c0909341SAndroid Build Coastguard Worker    mova   [px+32*5+0], m5
759*c0909341SAndroid Build Coastguard Worker    mova   [px+32*6+0], m6
760*c0909341SAndroid Build Coastguard Worker    mova   [px+32*7+0], m7
    ; m7 held row 7 until the store above, so the fill pattern can only be
    ; loaded now. From here on m7 = pw_m16384 (low 8 bytes duplicated), the
    ; value written wherever a neighbour is unavailable.
761*c0909341SAndroid Build Coastguard Worker    movddup         m7, [base+pw_m16384]
    ; ---- rows -2 and -1 (above the block) ----
762*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; HAVE_TOP
763*c0909341SAndroid Build Coastguard Worker    jz .no_top
764*c0909341SAndroid Build Coastguard Worker    movifnidn     topq, topmp
765*c0909341SAndroid Build Coastguard Worker    movu            m0, [topq+strideq*0]
766*c0909341SAndroid Build Coastguard Worker    movu            m1, [topq+strideq*1]
767*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m0
768*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m1
769*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
770*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
    ; Top-left corner: the 4 bytes preceding each top row.
771*c0909341SAndroid Build Coastguard Worker    movd            m0, [topq+strideq*0-4]
772*c0909341SAndroid Build Coastguard Worker    movd            m1, [topq+strideq*1-4]
773*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], m0
774*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], m1
775*c0909341SAndroid Build Coastguard Worker    jmp .top_done
    ; No top rows: fill them, then fall through so the corner is filled too.
776*c0909341SAndroid Build Coastguard Worker.no_top:
777*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m7
778*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m7
779*c0909341SAndroid Build Coastguard Worker.top_no_left:
780*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], m7
781*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], m7
782*c0909341SAndroid Build Coastguard Worker.top_done:
    ; ---- rows 8 and 9 (below the block); same structure as the top ----
783*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; HAVE_BOTTOM
784*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
785*c0909341SAndroid Build Coastguard Worker    movifnidn     botq, r4mp
786*c0909341SAndroid Build Coastguard Worker    movu            m0, [botq+strideq*0]
787*c0909341SAndroid Build Coastguard Worker    movu            m1, [botq+strideq*1]
788*c0909341SAndroid Build Coastguard Worker    mova   [px+32*8+0], m0
789*c0909341SAndroid Build Coastguard Worker    mova   [px+32*9+0], m1
790*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
791*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
    ; Bottom-left corner: the 4 bytes preceding each bottom row.
792*c0909341SAndroid Build Coastguard Worker    movd            m0, [botq+strideq*0-4]
793*c0909341SAndroid Build Coastguard Worker    movd            m1, [botq+strideq*1-4]
794*c0909341SAndroid Build Coastguard Worker    movd   [px+32*8-4], m0
795*c0909341SAndroid Build Coastguard Worker    movd   [px+32*9-4], m1
796*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
    ; No bottom rows: fill them and fall through to fill the corner too.
797*c0909341SAndroid Build Coastguard Worker.no_bottom:
798*c0909341SAndroid Build Coastguard Worker    mova   [px+32*8+0], m7
799*c0909341SAndroid Build Coastguard Worker    mova   [px+32*9+0], m7
800*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
801*c0909341SAndroid Build Coastguard Worker    movd   [px+32*8-4], m7
802*c0909341SAndroid Build Coastguard Worker    movd   [px+32*9-4], m7
803*c0909341SAndroid Build Coastguard Worker.bottom_done:
    ; ---- left column of the block rows ----
    ; Eight 4-byte entries are read from the left buffer, four at a time
    ; through m0..m3, and stored immediately before the matching row.
804*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
805*c0909341SAndroid Build Coastguard Worker    jz .no_left
806*c0909341SAndroid Build Coastguard Worker    movifnidn    leftq, r2mp
807*c0909341SAndroid Build Coastguard Worker    movd            m0, [leftq+4*0]
808*c0909341SAndroid Build Coastguard Worker    movd            m1, [leftq+4*1]
809*c0909341SAndroid Build Coastguard Worker    movd            m2, [leftq+4*2]
810*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4*3]
811*c0909341SAndroid Build Coastguard Worker    movd   [px+32*0-4], m0
812*c0909341SAndroid Build Coastguard Worker    movd   [px+32*1-4], m1
813*c0909341SAndroid Build Coastguard Worker    movd   [px+32*2-4], m2
814*c0909341SAndroid Build Coastguard Worker    movd   [px+32*3-4], m3
815*c0909341SAndroid Build Coastguard Worker    movd            m0, [leftq+4*4]
816*c0909341SAndroid Build Coastguard Worker    movd            m1, [leftq+4*5]
817*c0909341SAndroid Build Coastguard Worker    movd            m2, [leftq+4*6]
818*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4*7]
819*c0909341SAndroid Build Coastguard Worker    movd   [px+32*4-4], m0
820*c0909341SAndroid Build Coastguard Worker    movd   [px+32*5-4], m1
821*c0909341SAndroid Build Coastguard Worker    movd   [px+32*6-4], m2
822*c0909341SAndroid Build Coastguard Worker    movd   [px+32*7-4], m3
823*c0909341SAndroid Build Coastguard Worker    jmp .left_done
824*c0909341SAndroid Build Coastguard Worker.no_left:
    ; One fill store per block row (x takes each listed row index).
825*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
826*c0909341SAndroid Build Coastguard Worker.left_done:
    ; ---- right column ----
    ; With HAVE_RIGHT set, the bytes already copied by the 16-byte row loads
    ; are kept. Otherwise the 4 bytes at offset 8 of all twelve rows
    ; (top, block and bottom) are replaced with the fill pattern.
827*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; HAVE_RIGHT
828*c0909341SAndroid Build Coastguard Worker    jnz .padding_done
829*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
830*c0909341SAndroid Build Coastguard Worker.padding_done:
    ; The filter itself (and the function's exit) is emitted by the macro.
831*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER      4, 8
832*c0909341SAndroid Build Coastguard Worker
; cdef_filter_8x8_16bpc: assemble the padded source block for the 8x8 CDEF
; filter and then expand CDEF_FILTER 8, 8 (macro defined outside this chunk).
;
; The code fills a scratch buffer addressed through px (defined outside this
; chunk) using a row stride of 32 bytes:
;   rows -2..-1 : two rows read from topq, or the fill value if HAVE_TOP is clear
;   rows  0..7  : eight rows read from dstq
;   rows  8..9  : two rows read from botq, or the fill value if HAVE_BOTTOM is clear
; Every row receives 4 bytes at offset -4 (left columns), 16 bytes at offset 0
; and 4 bytes at offset +16 (right columns).  Edges that are not available are
; overwritten with the contents of m7, which is loaded from pw_m16384.
;
; Edge flag bits tested below: 1 = HAVE_LEFT, 2 = HAVE_RIGHT, 4 = HAVE_TOP,
; 8 = HAVE_BOTTOM.
833*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
834*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
835*c0909341SAndroid Build Coastguard Worker                                               pri, sec, edge
836*c0909341SAndroid Build Coastguard Worker%else
837*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
838*c0909341SAndroid Build Coastguard Worker%endif
839*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m ; edge flags come from argument slot r9m
840*c0909341SAndroid Build Coastguard Worker    LEA             t0, dir_table ; t0 is not read again in this block; presumably consumed by CDEF_FILTER
; Rows 0-3 of the block: 16 bytes + 4 bytes per row, read from dst.
841*c0909341SAndroid Build Coastguard Worker    mova            m0, [dstq+strideq*0+ 0]
842*c0909341SAndroid Build Coastguard Worker    movd            m1, [dstq+strideq*0+16]
843*c0909341SAndroid Build Coastguard Worker    mova            m2, [dstq+strideq*1+ 0]
844*c0909341SAndroid Build Coastguard Worker    movd            m3, [dstq+strideq*1+16]
845*c0909341SAndroid Build Coastguard Worker    lea             t1, [dstq+strideq*2] ; t1 walks down dst two rows at a time; dstq itself is left untouched
846*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1  +strideq*0+ 0]
847*c0909341SAndroid Build Coastguard Worker    movd            m5, [t1  +strideq*0+16]
848*c0909341SAndroid Build Coastguard Worker    mova            m6, [t1  +strideq*1+ 0]
849*c0909341SAndroid Build Coastguard Worker    movd            m7, [t1  +strideq*1+16]
850*c0909341SAndroid Build Coastguard Worker    lea             t1, [t1  +strideq*2]
851*c0909341SAndroid Build Coastguard Worker    mova  [px+32*0+ 0], m0
852*c0909341SAndroid Build Coastguard Worker    movd  [px+32*0+16], m1
853*c0909341SAndroid Build Coastguard Worker    mova  [px+32*1+ 0], m2
854*c0909341SAndroid Build Coastguard Worker    movd  [px+32*1+16], m3
855*c0909341SAndroid Build Coastguard Worker    mova  [px+32*2+ 0], m4
856*c0909341SAndroid Build Coastguard Worker    movd  [px+32*2+16], m5
857*c0909341SAndroid Build Coastguard Worker    mova  [px+32*3+ 0], m6
858*c0909341SAndroid Build Coastguard Worker    movd  [px+32*3+16], m7
; Rows 4-7 of the block, same layout.
859*c0909341SAndroid Build Coastguard Worker    mova            m0, [t1  +strideq*0+ 0]
860*c0909341SAndroid Build Coastguard Worker    movd            m1, [t1  +strideq*0+16]
861*c0909341SAndroid Build Coastguard Worker    mova            m2, [t1  +strideq*1+ 0]
862*c0909341SAndroid Build Coastguard Worker    movd            m3, [t1  +strideq*1+16]
863*c0909341SAndroid Build Coastguard Worker    lea             t1, [t1  +strideq*2]
864*c0909341SAndroid Build Coastguard Worker    mova            m4, [t1  +strideq*0+ 0]
865*c0909341SAndroid Build Coastguard Worker    movd            m5, [t1  +strideq*0+16]
866*c0909341SAndroid Build Coastguard Worker    mova            m6, [t1  +strideq*1+ 0]
867*c0909341SAndroid Build Coastguard Worker    movd            m7, [t1  +strideq*1+16]
868*c0909341SAndroid Build Coastguard Worker    mova  [px+32*4+ 0], m0
869*c0909341SAndroid Build Coastguard Worker    movd  [px+32*4+16], m1
870*c0909341SAndroid Build Coastguard Worker    mova  [px+32*5+ 0], m2
871*c0909341SAndroid Build Coastguard Worker    movd  [px+32*5+16], m3
872*c0909341SAndroid Build Coastguard Worker    mova  [px+32*6+ 0], m4
873*c0909341SAndroid Build Coastguard Worker    movd  [px+32*6+16], m5
874*c0909341SAndroid Build Coastguard Worker    mova  [px+32*7+ 0], m6
875*c0909341SAndroid Build Coastguard Worker    movd  [px+32*7+16], m7
; From here on m7 holds the fill value written into every unavailable edge.
; It is not reloaded before CDEF_FILTER, so none of the paths below may clobber it.
876*c0909341SAndroid Build Coastguard Worker    movddup         m7, [base+pw_m16384] ; low 8 bytes of the constant duplicated into both halves
877*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; HAVE_TOP
878*c0909341SAndroid Build Coastguard Worker    jz .no_top
879*c0909341SAndroid Build Coastguard Worker    movifnidn     topq, topmp
; NOTE(review): the +16 loads below are full 16-byte mova loads although only
; the low 4 bytes are stored afterwards (the dst and bottom paths use movd).
; Presumably the top rows are guaranteed to be aligned and readable that far;
; confirm against the caller.
880*c0909341SAndroid Build Coastguard Worker    mova            m0, [topq+strideq*0+ 0]
881*c0909341SAndroid Build Coastguard Worker    mova            m1, [topq+strideq*0+16]
882*c0909341SAndroid Build Coastguard Worker    mova            m2, [topq+strideq*1+ 0]
883*c0909341SAndroid Build Coastguard Worker    mova            m3, [topq+strideq*1+16]
884*c0909341SAndroid Build Coastguard Worker    mova  [px-32*2+ 0], m0
885*c0909341SAndroid Build Coastguard Worker    movd  [px-32*2+16], m1
886*c0909341SAndroid Build Coastguard Worker    mova  [px-32*1+ 0], m2
887*c0909341SAndroid Build Coastguard Worker    movd  [px-32*1+16], m3
888*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
889*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
; Top-left corner: the 4 bytes immediately left of each top row.
890*c0909341SAndroid Build Coastguard Worker    movd            m0, [topq+strideq*0-4]
891*c0909341SAndroid Build Coastguard Worker    movd            m1, [topq+strideq*1-4]
892*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], m0
893*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], m1
894*c0909341SAndroid Build Coastguard Worker    jmp .top_done
895*c0909341SAndroid Build Coastguard Worker.no_top:
; No top rows: fill rows -2 and -1, then fall through to fill their left
; columns as well.
896*c0909341SAndroid Build Coastguard Worker    mova  [px-32*2+ 0], m7
897*c0909341SAndroid Build Coastguard Worker    movd  [px-32*2+16], m7
898*c0909341SAndroid Build Coastguard Worker    mova  [px-32*1+ 0], m7
899*c0909341SAndroid Build Coastguard Worker    movd  [px-32*1+16], m7
900*c0909341SAndroid Build Coastguard Worker.top_no_left:
901*c0909341SAndroid Build Coastguard Worker    movd  [px-32*2- 4], m7
902*c0909341SAndroid Build Coastguard Worker    movd  [px-32*1- 4], m7
903*c0909341SAndroid Build Coastguard Worker.top_done:
904*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; HAVE_BOTTOM
905*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
906*c0909341SAndroid Build Coastguard Worker    movifnidn     botq, r4mp
; Bottom rows go to buffer rows 8 and 9, mirroring the top handling.
907*c0909341SAndroid Build Coastguard Worker    mova            m0, [botq+strideq*0+ 0]
908*c0909341SAndroid Build Coastguard Worker    movd            m1, [botq+strideq*0+16]
909*c0909341SAndroid Build Coastguard Worker    mova            m2, [botq+strideq*1+ 0]
910*c0909341SAndroid Build Coastguard Worker    movd            m3, [botq+strideq*1+16]
911*c0909341SAndroid Build Coastguard Worker    mova  [px+32*8+ 0], m0
912*c0909341SAndroid Build Coastguard Worker    movd  [px+32*8+16], m1
913*c0909341SAndroid Build Coastguard Worker    mova  [px+32*9+ 0], m2
914*c0909341SAndroid Build Coastguard Worker    movd  [px+32*9+16], m3
915*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
916*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
; Bottom-left corner: the 4 bytes immediately left of each bottom row.
917*c0909341SAndroid Build Coastguard Worker    movd            m0, [botq+strideq*0-4]
918*c0909341SAndroid Build Coastguard Worker    movd            m1, [botq+strideq*1-4]
919*c0909341SAndroid Build Coastguard Worker    movd  [px+32*8- 4], m0
920*c0909341SAndroid Build Coastguard Worker    movd  [px+32*9- 4], m1
921*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
922*c0909341SAndroid Build Coastguard Worker.no_bottom:
; No bottom rows: fill rows 8 and 9, then fall through to fill their left
; columns as well.
923*c0909341SAndroid Build Coastguard Worker    mova  [px+32*8+ 0], m7
924*c0909341SAndroid Build Coastguard Worker    movd  [px+32*8+16], m7
925*c0909341SAndroid Build Coastguard Worker    mova  [px+32*9+ 0], m7
926*c0909341SAndroid Build Coastguard Worker    movd  [px+32*9+16], m7
927*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
928*c0909341SAndroid Build Coastguard Worker    movd  [px+32*8- 4], m7
929*c0909341SAndroid Build Coastguard Worker    movd  [px+32*9- 4], m7
930*c0909341SAndroid Build Coastguard Worker.bottom_done:
931*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; HAVE_LEFT
932*c0909341SAndroid Build Coastguard Worker    jz .no_left
933*c0909341SAndroid Build Coastguard Worker    movifnidn    leftq, r2mp
; left is read as eight consecutive 4-byte entries, one per block row, and
; each entry is placed at offset -4 of the matching buffer row.
934*c0909341SAndroid Build Coastguard Worker    movd            m0, [leftq+4*0]
935*c0909341SAndroid Build Coastguard Worker    movd            m1, [leftq+4*1]
936*c0909341SAndroid Build Coastguard Worker    movd            m2, [leftq+4*2]
937*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4*3]
938*c0909341SAndroid Build Coastguard Worker    movd  [px+32*0- 4], m0
939*c0909341SAndroid Build Coastguard Worker    movd  [px+32*1- 4], m1
940*c0909341SAndroid Build Coastguard Worker    movd  [px+32*2- 4], m2
941*c0909341SAndroid Build Coastguard Worker    movd  [px+32*3- 4], m3
942*c0909341SAndroid Build Coastguard Worker    movd            m0, [leftq+4*4]
943*c0909341SAndroid Build Coastguard Worker    movd            m1, [leftq+4*5]
944*c0909341SAndroid Build Coastguard Worker    movd            m2, [leftq+4*6]
945*c0909341SAndroid Build Coastguard Worker    movd            m3, [leftq+4*7]
946*c0909341SAndroid Build Coastguard Worker    movd  [px+32*4- 4], m0
947*c0909341SAndroid Build Coastguard Worker    movd  [px+32*5- 4], m1
948*c0909341SAndroid Build Coastguard Worker    movd  [px+32*6- 4], m2
949*c0909341SAndroid Build Coastguard Worker    movd  [px+32*7- 4], m3
950*c0909341SAndroid Build Coastguard Worker    jmp .left_done
951*c0909341SAndroid Build Coastguard Worker.no_left:
; No left neighbour: fill the left columns of rows 0-7.
952*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
953*c0909341SAndroid Build Coastguard Worker.left_done:
954*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; HAVE_RIGHT
955*c0909341SAndroid Build Coastguard Worker    jnz .padding_done
; No right neighbour: overwrite the right columns of all twelve rows (-2..9)
; with the fill value, replacing whatever was copied there above.
956*c0909341SAndroid Build Coastguard Worker    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
957*c0909341SAndroid Build Coastguard Worker.padding_done:
958*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER      8, 8
959*c0909341SAndroid Build Coastguard Worker
; CDEF_DIR: emits cdef_dir_16bpc for the currently selected instruction set.
;
; Both variants read eight 16-byte rows from src, multiply every word by a
; factor taken from dir_shift (indexed by bdmax >> 11) keeping the high 16
; bits of each product (pmulhuw), compute per-row byte sums with
; packuswb + psadbw against zero, and then tail-jump into the .main label of
; the 8bpc cdef_dir function for the same SUFFIX.  The register/stack state
; expected by that .main label is defined outside this chunk.
960*c0909341SAndroid Build Coastguard Worker%macro CDEF_DIR 0
961*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
962*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
963*c0909341SAndroid Build Coastguard Worker    lea             r6, [dir_shift]
964*c0909341SAndroid Build Coastguard Worker    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
965*c0909341SAndroid Build Coastguard Worker    movddup         m7, [r6+bdmaxq*8] ; 8-byte dir_shift entry duplicated into both halves
966*c0909341SAndroid Build Coastguard Worker    lea             r6, [strideq*3] ; r6 reused as the row-3 offset
967*c0909341SAndroid Build Coastguard Worker    mova            m0, [srcq+strideq*0]
968*c0909341SAndroid Build Coastguard Worker    mova            m1, [srcq+strideq*1]
969*c0909341SAndroid Build Coastguard Worker    mova            m2, [srcq+strideq*2]
970*c0909341SAndroid Build Coastguard Worker    mova            m3, [srcq+r6       ]
971*c0909341SAndroid Build Coastguard Worker    lea           srcq, [srcq+strideq*4]
972*c0909341SAndroid Build Coastguard Worker    mova            m4, [srcq+strideq*0]
973*c0909341SAndroid Build Coastguard Worker    mova            m5, [srcq+strideq*1]
974*c0909341SAndroid Build Coastguard Worker    mova            m6, [srcq+strideq*2]
975*c0909341SAndroid Build Coastguard Worker    REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
976*c0909341SAndroid Build Coastguard Worker    pmulhuw         m7, [srcq+r6       ] ; row 7 is scaled in place, consuming the scale factor
977*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
; Pack two scaled rows per register as bytes; psadbw against zero then yields
; one byte-sum per 8-byte half, i.e. one sum per row.
978*c0909341SAndroid Build Coastguard Worker    packuswb        m9, m0, m1
979*c0909341SAndroid Build Coastguard Worker    packuswb       m10, m2, m3
980*c0909341SAndroid Build Coastguard Worker    packuswb       m11, m4, m5
981*c0909341SAndroid Build Coastguard Worker    packuswb       m12, m6, m7
982*c0909341SAndroid Build Coastguard Worker    REPX {psadbw x, m8}, m9, m10, m11, m12
; Narrow the eight row sums into the eight words of m9 (rows 0..7 in order).
983*c0909341SAndroid Build Coastguard Worker    packssdw        m9, m10
984*c0909341SAndroid Build Coastguard Worker    packssdw       m11, m12
985*c0909341SAndroid Build Coastguard Worker    packssdw        m9, m11
; m0-m7 still hold the scaled rows and m8 is zero at this point.
986*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
987*c0909341SAndroid Build Coastguard Worker%else
988*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
989*c0909341SAndroid Build Coastguard Worker    mov         bdmaxd, bdmaxm
990*c0909341SAndroid Build Coastguard Worker    LEA             r2, dir_shift ; r2 also serves as the base for addressing pw_128 below
991*c0909341SAndroid Build Coastguard Worker    shr         bdmaxd, 11
992*c0909341SAndroid Build Coastguard Worker    movddup         m7, [r2+bdmaxq*8]
993*c0909341SAndroid Build Coastguard Worker    lea             r3, [strideq*3]
994*c0909341SAndroid Build Coastguard Worker    pmulhuw         m3, m7, [srcq+strideq*0]
995*c0909341SAndroid Build Coastguard Worker    pmulhuw         m4, m7, [srcq+strideq*1]
996*c0909341SAndroid Build Coastguard Worker    pmulhuw         m5, m7, [srcq+strideq*2]
997*c0909341SAndroid Build Coastguard Worker    pmulhuw         m6, m7, [srcq+r3       ]
998*c0909341SAndroid Build Coastguard Worker    movddup         m1, [r2-dir_shift+pw_128]
999*c0909341SAndroid Build Coastguard Worker    lea           srcq, [srcq+strideq*4]
1000*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
; Rows 0-3: the row sums are taken from the scaled values (packuswb happens
; before the psubw), while the copies spilled to the stack have pw_128
; subtracted.
; NOTE(review): the spill slots are 0x00/0x10/0x20/0x50, i.e. not contiguous;
; presumably this matches the layout the 8bpc .main expects - confirm there.
1001*c0909341SAndroid Build Coastguard Worker    packuswb        m2, m3, m4
1002*c0909341SAndroid Build Coastguard Worker    psubw           m3, m1
1003*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1
1004*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x00], m3
1005*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x10], m4
1006*c0909341SAndroid Build Coastguard Worker    packuswb        m3, m5, m6
1007*c0909341SAndroid Build Coastguard Worker    psadbw          m2, m0
1008*c0909341SAndroid Build Coastguard Worker    psadbw          m3, m0
1009*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1
1010*c0909341SAndroid Build Coastguard Worker    psubw           m6, m1
1011*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3 ; m2 = sums of rows 0-3
1012*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x20], m5
1013*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x50], m6
; Rows 4-7: scaled into m4-m7 and left in registers (no pw_128 subtraction
; and no spill in this block).
1014*c0909341SAndroid Build Coastguard Worker    pmulhuw         m4, m7, [srcq+strideq*0]
1015*c0909341SAndroid Build Coastguard Worker    pmulhuw         m5, m7, [srcq+strideq*1]
1016*c0909341SAndroid Build Coastguard Worker    pmulhuw         m6, m7, [srcq+strideq*2]
1017*c0909341SAndroid Build Coastguard Worker    pmulhuw         m7,     [srcq+r3       ] ; row 7 scaled in place, consuming the scale factor
1018*c0909341SAndroid Build Coastguard Worker    packuswb        m3, m4, m5
1019*c0909341SAndroid Build Coastguard Worker    packuswb        m1, m6, m7 ; m1 is used as scratch here, hence the pw_128 reload below
1020*c0909341SAndroid Build Coastguard Worker    psadbw          m3, m0
1021*c0909341SAndroid Build Coastguard Worker    psadbw          m1, m0
1022*c0909341SAndroid Build Coastguard Worker    packssdw        m3, m1 ; m3 = sums of rows 4-7
1023*c0909341SAndroid Build Coastguard Worker    movddup         m1, [r2-dir_shift+pw_128]
1024*c0909341SAndroid Build Coastguard Worker    LEA             r2, shufw_6543210x ; r2 is repointed last, after its final use as the dir_shift base
1025*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
1026*c0909341SAndroid Build Coastguard Worker%endif
1027*c0909341SAndroid Build Coastguard Worker%endmacro
1028*c0909341SAndroid Build Coastguard Worker
; Expand CDEF_DIR once per instruction-set selection (ssse3, then sse4).
; The macro's tail jump is built with "%+ SUFFIX", so each expansion jumps to
; the 8bpc cdef_dir .main label carrying the SUFFIX in effect at that point
; (presumably set by INIT_XMM; defined outside this chunk).
1029*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
1030*c0909341SAndroid Build Coastguard WorkerCDEF_DIR
1031*c0909341SAndroid Build Coastguard Worker
1032*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse4
1033*c0909341SAndroid Build Coastguard WorkerCDEF_DIR
1034