; xref: /aosp_15_r20/external/libdav1d/src/x86/cdef_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2019, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker
; Everything this file uses but does not define itself (cglobal, cpuflag(),
; the mN register aliases, const, LEA, movifnidn, ARCH_X86_32/ARCH_X86_64, ...)
; is expected to come from these two includes.
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Read-only constant data. The argument is the alignment handed to x86inc's
; SECTION_RODATA; the filter code below reads these tables with aligned
; loads (e.g. `mova m6, OUT_OF_BOUNDS_MEM`).
SECTION_RODATA 16
31*c0909341SAndroid Build Coastguard Worker
; DUP8 v0 [, v1, ...]
; Data-emission helper: writes every argument as a run of eight identical
; bytes, in argument order, for a total of 8 * (number of arguments) bytes.
%macro DUP8 1-*
    %rep %0
        times 8 db %1           ; eight copies of the current argument
        %rotate 1               ; shift the argument list: next value becomes %1
    %endrep
%endmacro
38*c0909341SAndroid Build Coastguard Worker
; Division lookup table, dword entries. Numerically the first row is 840/n for
; n = 1..8; the second row is 840/2, 840/4, 840/6, 840/8 followed by four more
; 105 entries.
; NOTE(review): the code reading these two tables is not located next to this
; definition; presumably the CDEF direction search - confirm at the use site.
div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
                 dd 420, 210, 140, 105, 105, 105, 105, 105
; Same values as div_table_sse4, stored as words with every entry written
; twice in a row.
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
                 dw 168, 168, 140, 140, 120, 120, 105, 105
                 dw 420, 420, 210, 210, 140, 140, 105, 105
                 dw 105, 105, 105, 105, 105, 105, 105, 105
; Byte-shuffle indices: output words 0..6 take input words 6..0 (reversed),
; output word 7 keeps input word 7. Declared through x86inc's `const`.
const shufw_6543210x, \
            db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
; Byte-shuffle indices that interleave byte i of the low half with byte i of
; the high half: lo0, hi0, lo1, hi1, ...
shufb_lohi: db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
; Word constants, each replicated across all eight lanes of an XMM register.
pw_8:      times 8 dw 8
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_2048:   times 8 dw 2048
; Out-of-bounds pixel markers selected by CDEF_FILTER (OUT_OF_BOUNDS_MEM):
; 0x8000 when cpuflag(sse4) is set, 0x7FFF otherwise.
pw_0x7FFF: times 8 dw 0x7FFF
pw_0x8000: times 8 dw 0x8000
; tap_table layout (byte offsets from the label):
;   [  0,  64)  8 shift masks   x 8 bytes each
;   [ 64, 112)  6 tap weights   x 8 bytes each
;   [112, ...)  tap offsets, two signed bytes per line below
; ACCUMULATE_TAP reads the tap offsets at +14*8 (= 112), i.e. just past the
; masks and weights.
tap_table: ; masks for 8-bit shift emulation
           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
           ; weights
           DUP8 4, 2, 3, 3, 2, 1
           ; taps indices
           ; Each entry is row * 16 + column in pixels. ACCUMULATE_TAP
           ; sign-extends the byte and scales it by 2, which matches the
           ; 32-byte (16-word) row pitch of the pixel buffer.
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
           db  1 * 16 + 0,  2 * 16 + 0
           db  1 * 16 + 0,  2 * 16 - 1
           ; the last 6 are repeats of the first 6 so we don't need to & 7
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
74*c0909341SAndroid Build Coastguard Worker
; End of constant data; the macros and functions below emit into .text.
SECTION .text
76*c0909341SAndroid Build Coastguard Worker
; movif32 dst, src
; Emits `mov dst, src` only when ARCH_X86_32 is nonzero; otherwise expands to
; nothing.
%macro movif32 2
 %if ARCH_X86_32
    mov     %1, %2
 %endif
%endmacro
82*c0909341SAndroid Build Coastguard Worker
; PMOVZXBW dst, src [, half]
; Loads packed bytes from src and widens them to words in dst.
;   half == 0 (default): eight source bytes. Uses pmovzxbw when cpuflag(sse4)
;                        is set, otherwise movq + punpcklbw.
;   half == 1:           four source bytes via movd + punpcklbw, on every
;                        instruction set.
; Precondition: on the punpcklbw paths the upper byte of each word comes from
; m7, so m7 must hold zero for the result to be a zero extension (CDEF_FILTER
; clears it with `pxor m7, m7` before using this macro).
%macro PMOVZXBW 2-3 0 ; %3 = half
 %if cpuflag(sse4) && %3 == 0
    pmovzxbw        %1, %2
 %else
  %if %3 == 1
    movd            %1, %2
  %else
    movq            %1, %2
  %endif
    punpcklbw       %1, m7
 %endif
%endmacro
95*c0909341SAndroid Build Coastguard Worker
; PSHUFB_0 reg, mask
; With cpuflag(ssse3): plain `pshufb reg, mask`.
; Without it: broadcasts byte 0 of reg to all 16 bytes and ignores `mask`.
; The two paths therefore agree only when `mask` is all zeroes.
; NOTE(review): callers are presumably expected to pass a zeroed register as
; the mask - confirm at the use sites.
%macro PSHUFB_0 2
 %if cpuflag(ssse3)
    pshufb          %1, %2
 %else
    punpcklbw       %1, %1          ; b0 b0 b1 b1 ...
    pshuflw         %1, %1, q0000   ; low four words = word 0 (b0 b0)
    punpcklqdq      %1, %1          ; copy the low qword into the high qword
 %endif
%endmacro
105*c0909341SAndroid Build Coastguard Worker
; MOVDDUP dst, src
; Loads 64 bits from src and duplicates them into both halves of dst.
; Uses movddup when cpuflag(ssse3) is set, otherwise movq + punpcklqdq, which
; yields the same register contents.
%macro MOVDDUP 2
%if cpuflag(ssse3)
    movddup         %1, %2
%else
    movq            %1, %2
    punpcklqdq      %1, %1          ; replicate the low qword
%endif
%endmacro
114*c0909341SAndroid Build Coastguard Worker
; ACCUMULATE_TAP tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
; Processes one tap pair (the pixel at +off and its mirror at -off) and adds
; the weighted, constrained differences to the running sum.
;   %1 tap_offset  extra byte displacement into the tap-offset table
;   %2 shift       shift count applied to |diff|
;   %3 shift_mask  byte mask that makes the 16-bit psrlw act per byte
;   %4 strength    filter strength, bytes
;   %5 mul_tap     tap weights, bytes
;   %6 w           block width; 4 gathers two 4-pixel rows per register
;   %7 minmax      nonzero: also update the running max (m7) and min (m8)
; Register usage visible in this macro:
;   in      m4 = centre pixels px (words), stkq = pixel row pointer,
;           dirq/kq = index into the tap-offset table,
;           m13 = pshufb mask (ssse3 path), m14 = value compared against
;           pixels on the non-sse4 min/max path
;   in/out  m0 = sum, m7 = max, m8 = min
;   clobber offq, m3, m5, m6, m9
; NOTE(review): m13/m14 are set up by the caller; presumably m13 = shufb_lohi
; and m14 = the out-of-bounds marker - confirm at the call site.
; Three-operand forms such as `pcmpeqw m3, m14, m5` rely on x86inc emulating
; them for non-AVX instruction sets.
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1
 %if %6 == 4
    movq            m5, [stkq+offq*2+32*0]      ; p0
    movhps          m5, [stkq+offq*2+32*1]
 %else
    movu            m5, [stkq+offq*2+32*0]      ; p0
 %endif
    neg           offq                          ; -off1
 %if %6 == 4
    movq            m6, [stkq+offq*2+32*0]      ; p1
    movhps          m6, [stkq+offq*2+32*1]
 %else
    movu            m6, [stkq+offq*2+32*0]      ; p1
 %endif
 %if %7
  %if cpuflag(sse4)
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5
    pminuw          m8, m5
    pmaxsw          m7, m6
    pminuw          m8, m6
  %else
    ; no pminuw here: zero the lanes equal to m14 before the signed max,
    ; the signed min uses the raw value
    pcmpeqw         m3, m14, m5
    pminsw          m8, m5     ; min after p0
    pandn           m3, m5
    pmaxsw          m7, m3     ; max after p0
    pcmpeqw         m3, m14, m6
    pminsw          m8, m6     ; min after p1
    pandn           m3, m6
    pmaxsw          m7, m3     ; max after p1
  %endif
 %endif

    ; accumulate sum[m0] over p0/p1
    psubw           m5, m4     ; diff_p0(p0 - px)
    psubw           m6, m4     ; diff_p1(p1 - px)
    packsswb        m5, m6     ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
    pabsb           m6, m5
    psignb          m3, %5, m5
 %else
    movlhps         m6, m5     ; p0 diffs -> high half of m6
    punpckhbw       m6, m5     ; interleave p0/p1 diffs into pairs
    pxor            m5, m5
    pcmpgtb         m5, m6     ; m5 = -1 where diff < 0
    paddb           m6, m5
    pxor            m6, m5     ; m6 = |diff|
    paddb           m3, %5, m5
    pxor            m3, m5     ; m3 = taps, negated where diff < 0
 %endif
    pand            m9, %3, m6 ; emulate 8-bit shift
    psrlw           m9, %2
    psubusb         m5, %4, m9 ; strength - (|diff| >> shift), saturating at 0
    pminub          m5, m6     ; constrain(diff_p)
 %if cpuflag(ssse3)
    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
 %else
    ; pmaddubsw emulation: multiply odd and even bytes separately, then add
    psrlw           m9, m5, 8  ; odd constrained bytes
    psraw           m6, m3, 8  ; odd taps, sign-extended
    psllw           m5, 8      ; even constrained bytes moved to the high byte
    psllw           m3, 8      ; even taps moved to the high byte
    pmullw          m9, m6     ; odd products
    pmulhw          m5, m3     ; even products (high word of the <<8 * <<8)
    paddw           m5, m9
 %endif
    paddw           m0, m5
%endmacro
187*c0909341SAndroid Build Coastguard Worker
; LOAD_BODY dst, src, block_width
; Copies four pixel rows from src (rows at strideq*0, strideq*1, strideq*2 and
; stride3q) into the word buffer at dst, widening bytes to words. Buffer rows
; are 32 bytes apart.
;   block_width == 4: eight bytes per row become eight words (one register).
;   otherwise:        sixteen bytes per row become sixteen words (two
;                     registers, at +0 and +16).
; Requirements: m7 = 0 (it is the zero operand of the unpacks, and of PMOVZXBW
; on its non-sse4 path); dst must be 16-byte aligned because of the mova
; stores.
; Clobbers m0-m3, plus m4 when block_width != 4.
%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
    PMOVZXBW        m0, [%2+strideq*0]
    PMOVZXBW        m1, [%2+strideq*1]
    PMOVZXBW        m2, [%2+strideq*2]
    PMOVZXBW        m3, [%2+stride3q]
    mova     [%1+32*0], m0
    mova     [%1+32*1], m1
    mova     [%1+32*2], m2
    mova     [%1+32*3], m3
 %else
    movu            m0, [%2+strideq*0]
    movu            m1, [%2+strideq*1]
    movu            m2, [%2+strideq*2]
    movu            m3, [%2+stride3q]
    ; per row: low eight bytes -> m4, high eight bytes stay in the row register
    punpcklbw       m4, m0, m7
    punpckhbw       m0, m7
    mova  [%1+32*0+ 0], m4
    mova  [%1+32*0+16], m0
    punpcklbw       m4, m1, m7
    punpckhbw       m1, m7
    mova  [%1+32*1+ 0], m4
    mova  [%1+32*1+16], m1
    punpcklbw       m4, m2, m7
    punpckhbw       m2, m7
    mova  [%1+32*2+ 0], m4
    mova  [%1+32*2+16], m2
    punpcklbw       m4, m3, m7
    punpckhbw       m3, m7
    mova  [%1+32*3+ 0], m4
    mova  [%1+32*3+16], m3
 %endif
%endmacro
221*c0909341SAndroid Build Coastguard Worker
; CDEF_FILTER_END w, minmax
; Turns the accumulated sum (m0) into the final pixel values and stores them.
;   1. subtract 1 from negative sums (sum - (sum < 0))
;   2. scale: pmulhrsw by m15 with cpuflag(ssse3), otherwise add m15 and
;      arithmetic-shift right by 4
;   3. add the result to the centre pixels in m4
;   4. if minmax is nonzero, clamp m4 to [m8, m7] (min, max)
;   5. pack to unsigned bytes and store
;   w == 4: writes two 4-byte rows, advances stkq by two buffer rows (2*32)
;           and dstq by two strides
;   else:   writes one 8-byte row, advances stkq by 32 and dstq by one stride
; Clobbers m0, m4, m6.
; NOTE(review): m15 is loaded by the caller; presumably pw_2048 for the
; pmulhrsw path and pw_8 for the add/shift path, which would make both paths
; compute (x + 8) >> 4 - confirm at the call site.
%macro CDEF_FILTER_END 2 ; w, minmax
    pxor            m6, m6
    pcmpgtw         m6, m0     ; m6 = -1 where sum < 0
    paddw           m0, m6
 %if cpuflag(ssse3)
    pmulhrsw        m0, m15
 %else
    paddw           m0, m15
    psraw           m0, 4
 %endif
    paddw           m4, m0
 %if %2
    pminsw          m4, m7     ; upper bound: running max
    pmaxsw          m4, m8     ; lower bound: running min
 %endif
    packuswb        m4, m4
 %if %1 == 4
    movd [dstq+strideq*0], m4
    psrlq           m4, 32     ; bring the second row's four bytes down
    movd [dstq+strideq*1], m4
    add           stkq, 32*2
    lea           dstq, [dstq+strideq*2]
 %else
    movq        [dstq], m4
    add           stkq, 32
    add           dstq, strideq
 %endif
%endmacro
250*c0909341SAndroid Build Coastguard Worker
251*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER 2 ; w, h
252*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
253*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
254*c0909341SAndroid Build Coastguard Worker                                dst, stride, left, top, bot, pri, dst4, edge, \
255*c0909341SAndroid Build Coastguard Worker                                stride3
256*c0909341SAndroid Build Coastguard Worker  %define px rsp+3*16+2*32
257*c0909341SAndroid Build Coastguard Worker  %define base 0
258*c0909341SAndroid Build Coastguard Worker %else
259*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
260*c0909341SAndroid Build Coastguard Worker                                dst, stride, left, edge, stride3
261*c0909341SAndroid Build Coastguard Worker    %define       topq  r2
262*c0909341SAndroid Build Coastguard Worker    %define       botq  r2
263*c0909341SAndroid Build Coastguard Worker    %define      dst4q  r2
264*c0909341SAndroid Build Coastguard Worker    LEA             r5, tap_table
265*c0909341SAndroid Build Coastguard Worker  %define px esp+7*16+2*32
266*c0909341SAndroid Build Coastguard Worker  %define base r5-tap_table
267*c0909341SAndroid Build Coastguard Worker %endif
268*c0909341SAndroid Build Coastguard Worker    mov          edged, r9m
269*c0909341SAndroid Build Coastguard Worker %if cpuflag(sse4)
270*c0909341SAndroid Build Coastguard Worker   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
271*c0909341SAndroid Build Coastguard Worker %else
272*c0909341SAndroid Build Coastguard Worker   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
273*c0909341SAndroid Build Coastguard Worker %endif
274*c0909341SAndroid Build Coastguard Worker    mova            m6, OUT_OF_BOUNDS_MEM
275*c0909341SAndroid Build Coastguard Worker    pxor            m7, m7
276*c0909341SAndroid Build Coastguard Worker
277*c0909341SAndroid Build Coastguard Worker    ; prepare pixel buffers - body/right
278*c0909341SAndroid Build Coastguard Worker %if %2 == 8
279*c0909341SAndroid Build Coastguard Worker    lea          dst4q, [dstq+strideq*4]
280*c0909341SAndroid Build Coastguard Worker %endif
281*c0909341SAndroid Build Coastguard Worker    lea       stride3q, [strideq*3]
282*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; have_right
283*c0909341SAndroid Build Coastguard Worker    jz .no_right
284*c0909341SAndroid Build Coastguard Worker    LOAD_BODY       px, dstq, %1
285*c0909341SAndroid Build Coastguard Worker %if %2 == 8
286*c0909341SAndroid Build Coastguard Worker    LOAD_BODY  px+4*32, dst4q, %1
287*c0909341SAndroid Build Coastguard Worker %endif
288*c0909341SAndroid Build Coastguard Worker    jmp .body_done
289*c0909341SAndroid Build Coastguard Worker.no_right:
290*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [dstq+strideq*0], %1 == 4
291*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
292*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
293*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
294*c0909341SAndroid Build Coastguard Worker    mova     [px+32*0], m0
295*c0909341SAndroid Build Coastguard Worker    mova     [px+32*1], m1
296*c0909341SAndroid Build Coastguard Worker    mova     [px+32*2], m2
297*c0909341SAndroid Build Coastguard Worker    mova     [px+32*3], m3
298*c0909341SAndroid Build Coastguard Worker    movd [px+32*0+%1*2], m6
299*c0909341SAndroid Build Coastguard Worker    movd [px+32*1+%1*2], m6
300*c0909341SAndroid Build Coastguard Worker    movd [px+32*2+%1*2], m6
301*c0909341SAndroid Build Coastguard Worker    movd [px+32*3+%1*2], m6
302*c0909341SAndroid Build Coastguard Worker %if %2 == 8
303*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
304*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
305*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
306*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
307*c0909341SAndroid Build Coastguard Worker    mova     [px+32*4], m0
308*c0909341SAndroid Build Coastguard Worker    mova     [px+32*5], m1
309*c0909341SAndroid Build Coastguard Worker    mova     [px+32*6], m2
310*c0909341SAndroid Build Coastguard Worker    mova     [px+32*7], m3
311*c0909341SAndroid Build Coastguard Worker    movd [px+32*4+%1*2], m6
312*c0909341SAndroid Build Coastguard Worker    movd [px+32*5+%1*2], m6
313*c0909341SAndroid Build Coastguard Worker    movd [px+32*6+%1*2], m6
314*c0909341SAndroid Build Coastguard Worker    movd [px+32*7+%1*2], m6
315*c0909341SAndroid Build Coastguard Worker %endif
316*c0909341SAndroid Build Coastguard Worker.body_done:
317*c0909341SAndroid Build Coastguard Worker
318*c0909341SAndroid Build Coastguard Worker    ; top
319*c0909341SAndroid Build Coastguard Worker    movifnidn     topq, r3mp
320*c0909341SAndroid Build Coastguard Worker    test         edgeb, 4 ; have_top
321*c0909341SAndroid Build Coastguard Worker    jz .no_top
322*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; have_left
323*c0909341SAndroid Build Coastguard Worker    jz .top_no_left
324*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; have_right
325*c0909341SAndroid Build Coastguard Worker    jz .top_no_right
326*c0909341SAndroid Build Coastguard Worker %if %1 == 4
327*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [topq+strideq*0-2]
328*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [topq+strideq*1-2]
329*c0909341SAndroid Build Coastguard Worker %else
330*c0909341SAndroid Build Coastguard Worker    movu            m0, [topq+strideq*0-4]
331*c0909341SAndroid Build Coastguard Worker    movu            m1, [topq+strideq*1-4]
332*c0909341SAndroid Build Coastguard Worker    punpckhbw       m2, m0, m7
333*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
334*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m1, m7
335*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m7
336*c0909341SAndroid Build Coastguard Worker    movu  [px-32*2+8], m2
337*c0909341SAndroid Build Coastguard Worker    movu  [px-32*1+8], m3
338*c0909341SAndroid Build Coastguard Worker %endif
339*c0909341SAndroid Build Coastguard Worker    movu  [px-32*2-%1], m0
340*c0909341SAndroid Build Coastguard Worker    movu  [px-32*1-%1], m1
341*c0909341SAndroid Build Coastguard Worker    jmp .top_done
342*c0909341SAndroid Build Coastguard Worker.top_no_right:
343*c0909341SAndroid Build Coastguard Worker %if %1 == 4
344*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [topq+strideq*0-%1]
345*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [topq+strideq*1-%1]
346*c0909341SAndroid Build Coastguard Worker    movu   [px-32*2-8], m0
347*c0909341SAndroid Build Coastguard Worker    movu   [px-32*1-8], m1
348*c0909341SAndroid Build Coastguard Worker %else
349*c0909341SAndroid Build Coastguard Worker    movu            m0, [topq+strideq*0-%1]
350*c0909341SAndroid Build Coastguard Worker    movu            m1, [topq+strideq*1-%2]
351*c0909341SAndroid Build Coastguard Worker    punpckhbw       m2, m0, m7
352*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
353*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m1, m7
354*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m7
355*c0909341SAndroid Build Coastguard Worker    mova  [px-32*2-16], m0
356*c0909341SAndroid Build Coastguard Worker    mova  [px-32*2+ 0], m2
357*c0909341SAndroid Build Coastguard Worker    mova  [px-32*1-16], m1
358*c0909341SAndroid Build Coastguard Worker    mova  [px-32*1+ 0], m3
359*c0909341SAndroid Build Coastguard Worker %endif
360*c0909341SAndroid Build Coastguard Worker    movd [px-32*2+%1*2], m6
361*c0909341SAndroid Build Coastguard Worker    movd [px-32*1+%1*2], m6
362*c0909341SAndroid Build Coastguard Worker    jmp .top_done
363*c0909341SAndroid Build Coastguard Worker.top_no_left:
364*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; have_right
365*c0909341SAndroid Build Coastguard Worker    jz .top_no_left_right
366*c0909341SAndroid Build Coastguard Worker %if %1 == 4
367*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [topq+strideq*0]
368*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [topq+strideq*1]
369*c0909341SAndroid Build Coastguard Worker %else
370*c0909341SAndroid Build Coastguard Worker    movu            m0, [topq+strideq*0]
371*c0909341SAndroid Build Coastguard Worker    movu            m1, [topq+strideq*1]
372*c0909341SAndroid Build Coastguard Worker    punpckhbw       m2, m0, m7
373*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
374*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m1, m7
375*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m7
376*c0909341SAndroid Build Coastguard Worker    movd  [px-32*2+16], m2
377*c0909341SAndroid Build Coastguard Worker    movd  [px-32*1+16], m3
378*c0909341SAndroid Build Coastguard Worker %endif
379*c0909341SAndroid Build Coastguard Worker    movd  [px-32*2- 4], m6
380*c0909341SAndroid Build Coastguard Worker    movd  [px-32*1- 4], m6
381*c0909341SAndroid Build Coastguard Worker    mova  [px-32*2+ 0], m0
382*c0909341SAndroid Build Coastguard Worker    mova  [px-32*1+ 0], m1
383*c0909341SAndroid Build Coastguard Worker    jmp .top_done
384*c0909341SAndroid Build Coastguard Worker.top_no_left_right:
385*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
386*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
387*c0909341SAndroid Build Coastguard Worker    movd   [px-32*2-4], m6
388*c0909341SAndroid Build Coastguard Worker    movd   [px-32*1-4], m6
389*c0909341SAndroid Build Coastguard Worker    mova   [px-32*2+0], m0
390*c0909341SAndroid Build Coastguard Worker    mova   [px-32*1+0], m1
391*c0909341SAndroid Build Coastguard Worker    movd [px-32*2+%1*2], m6
392*c0909341SAndroid Build Coastguard Worker    movd [px-32*1+%1*2], m6
393*c0909341SAndroid Build Coastguard Worker    jmp .top_done
394*c0909341SAndroid Build Coastguard Worker.no_top:
395*c0909341SAndroid Build Coastguard Worker    movu  [px-32*2- 4], m6
396*c0909341SAndroid Build Coastguard Worker    movu  [px-32*1- 4], m6
397*c0909341SAndroid Build Coastguard Worker %if %1 == 8
398*c0909341SAndroid Build Coastguard Worker    movq  [px-32*2+12], m6
399*c0909341SAndroid Build Coastguard Worker    movq  [px-32*1+12], m6
400*c0909341SAndroid Build Coastguard Worker %endif
401*c0909341SAndroid Build Coastguard Worker.top_done:
402*c0909341SAndroid Build Coastguard Worker
403*c0909341SAndroid Build Coastguard Worker    ; left
404*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; have_left
405*c0909341SAndroid Build Coastguard Worker    jz .no_left
406*c0909341SAndroid Build Coastguard Worker    movifnidn    leftq, leftmp
407*c0909341SAndroid Build Coastguard Worker %if %2 == 4
408*c0909341SAndroid Build Coastguard Worker    movq            m0, [leftq]
409*c0909341SAndroid Build Coastguard Worker %else
410*c0909341SAndroid Build Coastguard Worker    movu            m0, [leftq]
411*c0909341SAndroid Build Coastguard Worker %endif
412*c0909341SAndroid Build Coastguard Worker %if %2 == 4
413*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
414*c0909341SAndroid Build Coastguard Worker %else
415*c0909341SAndroid Build Coastguard Worker    punpckhbw       m1, m0, m7
416*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
417*c0909341SAndroid Build Coastguard Worker    movhlps         m3, m1
418*c0909341SAndroid Build Coastguard Worker    movd   [px+32*4-4], m1
419*c0909341SAndroid Build Coastguard Worker    movd   [px+32*6-4], m3
420*c0909341SAndroid Build Coastguard Worker    psrlq           m1, 32
421*c0909341SAndroid Build Coastguard Worker    psrlq           m3, 32
422*c0909341SAndroid Build Coastguard Worker    movd   [px+32*5-4], m1
423*c0909341SAndroid Build Coastguard Worker    movd   [px+32*7-4], m3
424*c0909341SAndroid Build Coastguard Worker %endif
425*c0909341SAndroid Build Coastguard Worker    movhlps         m2, m0
426*c0909341SAndroid Build Coastguard Worker    movd   [px+32*0-4], m0
427*c0909341SAndroid Build Coastguard Worker    movd   [px+32*2-4], m2
428*c0909341SAndroid Build Coastguard Worker    psrlq           m0, 32
429*c0909341SAndroid Build Coastguard Worker    psrlq           m2, 32
430*c0909341SAndroid Build Coastguard Worker    movd   [px+32*1-4], m0
431*c0909341SAndroid Build Coastguard Worker    movd   [px+32*3-4], m2
432*c0909341SAndroid Build Coastguard Worker    jmp .left_done
433*c0909341SAndroid Build Coastguard Worker.no_left:
434*c0909341SAndroid Build Coastguard Worker    movd   [px+32*0-4], m6
435*c0909341SAndroid Build Coastguard Worker    movd   [px+32*1-4], m6
436*c0909341SAndroid Build Coastguard Worker    movd   [px+32*2-4], m6
437*c0909341SAndroid Build Coastguard Worker    movd   [px+32*3-4], m6
438*c0909341SAndroid Build Coastguard Worker %if %2 == 8
439*c0909341SAndroid Build Coastguard Worker    movd   [px+32*4-4], m6
440*c0909341SAndroid Build Coastguard Worker    movd   [px+32*5-4], m6
441*c0909341SAndroid Build Coastguard Worker    movd   [px+32*6-4], m6
442*c0909341SAndroid Build Coastguard Worker    movd   [px+32*7-4], m6
443*c0909341SAndroid Build Coastguard Worker %endif
444*c0909341SAndroid Build Coastguard Worker.left_done:
445*c0909341SAndroid Build Coastguard Worker
446*c0909341SAndroid Build Coastguard Worker    ; bottom
447*c0909341SAndroid Build Coastguard Worker    movifnidn     botq, r4mp
448*c0909341SAndroid Build Coastguard Worker    test         edgeb, 8 ; have_bottom
449*c0909341SAndroid Build Coastguard Worker    jz .no_bottom
450*c0909341SAndroid Build Coastguard Worker    test         edgeb, 1 ; have_left
451*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left
452*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; have_right
453*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_right
454*c0909341SAndroid Build Coastguard Worker %if %1 == 4
455*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [botq+strideq*0-(%1/2)]
456*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [botq+strideq*1-(%1/2)]
457*c0909341SAndroid Build Coastguard Worker %else
458*c0909341SAndroid Build Coastguard Worker    movu            m0, [botq+strideq*0-4]
459*c0909341SAndroid Build Coastguard Worker    movu            m1, [botq+strideq*1-4]
460*c0909341SAndroid Build Coastguard Worker    punpckhbw       m2, m0, m7
461*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
462*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m1, m7
463*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m7
464*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+0)+8], m2
465*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+1)+8], m3
466*c0909341SAndroid Build Coastguard Worker %endif
467*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+0)-%1], m0
468*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+1)-%1], m1
469*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
470*c0909341SAndroid Build Coastguard Worker.bottom_no_right:
471*c0909341SAndroid Build Coastguard Worker %if %1 == 4
472*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [botq+strideq*0-4]
473*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [botq+strideq*1-4]
474*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+0)-8], m0
475*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+1)-8], m1
476*c0909341SAndroid Build Coastguard Worker %else
477*c0909341SAndroid Build Coastguard Worker    movu            m0, [botq+strideq*0-8]
478*c0909341SAndroid Build Coastguard Worker    movu            m1, [botq+strideq*1-8]
479*c0909341SAndroid Build Coastguard Worker    punpckhbw       m2, m0, m7
480*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
481*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m1, m7
482*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m7
483*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+0)-16], m0
484*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+0)+ 0], m2
485*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+1)-16], m1
486*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+1)+ 0], m3
487*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
488*c0909341SAndroid Build Coastguard Worker %endif
489*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+0)+%1*2], m6
490*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+1)+%1*2], m6
491*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
492*c0909341SAndroid Build Coastguard Worker.bottom_no_left:
493*c0909341SAndroid Build Coastguard Worker    test         edgeb, 2 ; have_right
494*c0909341SAndroid Build Coastguard Worker    jz .bottom_no_left_right
495*c0909341SAndroid Build Coastguard Worker %if %1 == 4
496*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [botq+strideq*0]
497*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [botq+strideq*1]
498*c0909341SAndroid Build Coastguard Worker %else
499*c0909341SAndroid Build Coastguard Worker    movu            m0, [botq+strideq*0]
500*c0909341SAndroid Build Coastguard Worker    movu            m1, [botq+strideq*1]
501*c0909341SAndroid Build Coastguard Worker    punpckhbw       m2, m0, m7
502*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m7
503*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m1, m7
504*c0909341SAndroid Build Coastguard Worker    punpcklbw       m1, m7
505*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+0)+16], m2
506*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+1)+16], m3
507*c0909341SAndroid Build Coastguard Worker %endif
508*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+0)+ 0], m0
509*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+1)+ 0], m1
510*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+0)- 4], m6
511*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+1)- 4], m6
512*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
513*c0909341SAndroid Build Coastguard Worker.bottom_no_left_right:
514*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m0, [botq+strideq*0], %1 == 4
515*c0909341SAndroid Build Coastguard Worker    PMOVZXBW        m1, [botq+strideq*1], %1 == 4
516*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+0)+ 0], m0
517*c0909341SAndroid Build Coastguard Worker    mova [px+32*(%2+1)+ 0], m1
518*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+0)+%1*2], m6
519*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+1)+%1*2], m6
520*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+0)- 4], m6
521*c0909341SAndroid Build Coastguard Worker    movd [px+32*(%2+1)- 4], m6
522*c0909341SAndroid Build Coastguard Worker    jmp .bottom_done
523*c0909341SAndroid Build Coastguard Worker.no_bottom:
524*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+0)- 4], m6
525*c0909341SAndroid Build Coastguard Worker    movu [px+32*(%2+1)- 4], m6
526*c0909341SAndroid Build Coastguard Worker %if %1 == 8
527*c0909341SAndroid Build Coastguard Worker    movq [px+32*(%2+0)+12], m6
528*c0909341SAndroid Build Coastguard Worker    movq [px+32*(%2+1)+12], m6
529*c0909341SAndroid Build Coastguard Worker %endif
530*c0909341SAndroid Build Coastguard Worker.bottom_done:
531*c0909341SAndroid Build Coastguard Worker
532*c0909341SAndroid Build Coastguard Worker    ; actual filter
533*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
534*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
535*c0909341SAndroid Build Coastguard Worker    mova           m13, [shufb_lohi]
536*c0909341SAndroid Build Coastguard Worker %if cpuflag(ssse3)
537*c0909341SAndroid Build Coastguard Worker    mova           m15, [pw_2048]
538*c0909341SAndroid Build Coastguard Worker %else
539*c0909341SAndroid Build Coastguard Worker    mova           m15, [pw_8]
540*c0909341SAndroid Build Coastguard Worker %endif
541*c0909341SAndroid Build Coastguard Worker    mova           m14, m6
542*c0909341SAndroid Build Coastguard Worker %else
543*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
544*c0909341SAndroid Build Coastguard Worker    %xdefine        m8  m1
545*c0909341SAndroid Build Coastguard Worker    %xdefine        m9  m2
546*c0909341SAndroid Build Coastguard Worker    %xdefine       m10  m0
547*c0909341SAndroid Build Coastguard Worker    %xdefine       m13  [base+shufb_lohi]
548*c0909341SAndroid Build Coastguard Worker    %xdefine       m14  OUT_OF_BOUNDS_MEM
549*c0909341SAndroid Build Coastguard Worker %if cpuflag(ssse3)
550*c0909341SAndroid Build Coastguard Worker    %xdefine       m15  [base+pw_2048]
551*c0909341SAndroid Build Coastguard Worker %else
552*c0909341SAndroid Build Coastguard Worker    %xdefine       m15  [base+pw_8]
553*c0909341SAndroid Build Coastguard Worker %endif
554*c0909341SAndroid Build Coastguard Worker %endif
555*c0909341SAndroid Build Coastguard Worker    movifnidn     prid, r5m
556*c0909341SAndroid Build Coastguard Worker    movifnidn     secd, r6m
557*c0909341SAndroid Build Coastguard Worker    mov       dampingd, r8m
558*c0909341SAndroid Build Coastguard Worker    movif32 [esp+0x3C], r1d
559*c0909341SAndroid Build Coastguard Worker    test          prid, prid
560*c0909341SAndroid Build Coastguard Worker    jz .sec_only
561*c0909341SAndroid Build Coastguard Worker    movd            m1, r5m
562*c0909341SAndroid Build Coastguard Worker    bsr        pridmpd, prid
563*c0909341SAndroid Build Coastguard Worker    test          secd, secd
564*c0909341SAndroid Build Coastguard Worker    jz .pri_only
565*c0909341SAndroid Build Coastguard Worker    movd           m10, r6m
566*c0909341SAndroid Build Coastguard Worker    tzcnt         secd, secd
567*c0909341SAndroid Build Coastguard Worker    and           prid, 1
568*c0909341SAndroid Build Coastguard Worker    sub        pridmpd, dampingd
569*c0909341SAndroid Build Coastguard Worker    sub           secd, dampingd
570*c0909341SAndroid Build Coastguard Worker    xor       dampingd, dampingd
571*c0909341SAndroid Build Coastguard Worker    add           prid, prid
572*c0909341SAndroid Build Coastguard Worker    neg        pridmpd
573*c0909341SAndroid Build Coastguard Worker    cmovs      pridmpd, dampingd
574*c0909341SAndroid Build Coastguard Worker    neg           secd
575*c0909341SAndroid Build Coastguard Worker    PSHUFB_0        m1, m7
576*c0909341SAndroid Build Coastguard Worker    PSHUFB_0       m10, m7
577*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
578*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
579*c0909341SAndroid Build Coastguard Worker    lea           tapq, [tap_table]
580*c0909341SAndroid Build Coastguard Worker    MOVDDUP        m11, [tapq+pridmpq*8] ; pri_shift_mask
581*c0909341SAndroid Build Coastguard Worker    MOVDDUP        m12, [tapq+secq*8]    ; sec_shift_mask
582*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x00], pridmpq          ; pri_shift
583*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x10], secq             ; sec_shift
584*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
585*c0909341SAndroid Build Coastguard Worker %else
586*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [tapq+pridmpq*8]
587*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m3, [tapq+secq*8]
588*c0909341SAndroid Build Coastguard Worker    mov     [esp+0x04], dampingd         ; zero upper 32 bits of psrlw
589*c0909341SAndroid Build Coastguard Worker    mov     [esp+0x34], dampingd         ; source operand in ACCUMULATE_TAP
590*c0909341SAndroid Build Coastguard Worker    mov     [esp+0x00], pridmpd
591*c0909341SAndroid Build Coastguard Worker    mov     [esp+0x30], secd
592*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
593*c0909341SAndroid Build Coastguard Worker  %define         offq  dstq
594*c0909341SAndroid Build Coastguard Worker  %define           kd  strided
595*c0909341SAndroid Build Coastguard Worker  %define           kq  strideq
596*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x10], m2
597*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x40], m3
598*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x20], m1
599*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x50], m10
600*c0909341SAndroid Build Coastguard Worker %endif
601*c0909341SAndroid Build Coastguard Worker    mov           dird, r7m
602*c0909341SAndroid Build Coastguard Worker    lea           stkq, [px]
603*c0909341SAndroid Build Coastguard Worker    lea           priq, [tapq+8*8+priq*8] ; pri_taps
604*c0909341SAndroid Build Coastguard Worker    mov             hd, %1*%2/8
605*c0909341SAndroid Build Coastguard Worker    lea           dirq, [tapq+dirq*2]
606*c0909341SAndroid Build Coastguard Worker.v_loop:
607*c0909341SAndroid Build Coastguard Worker    movif32 [esp+0x38], dstd
608*c0909341SAndroid Build Coastguard Worker    mov             kd, 1
609*c0909341SAndroid Build Coastguard Worker %if %1 == 4
610*c0909341SAndroid Build Coastguard Worker    movq            m4, [stkq+32*0]
611*c0909341SAndroid Build Coastguard Worker    movhps          m4, [stkq+32*1]
612*c0909341SAndroid Build Coastguard Worker %else
613*c0909341SAndroid Build Coastguard Worker    mova            m4, [stkq+32*0]       ; px
614*c0909341SAndroid Build Coastguard Worker %endif
615*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0                ; sum
616*c0909341SAndroid Build Coastguard Worker    mova            m7, m4                ; max
617*c0909341SAndroid Build Coastguard Worker    mova            m8, m4                ; min
618*c0909341SAndroid Build Coastguard Worker.k_loop:
619*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [priq+kq*8]
620*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
621*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
622*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [tapq+12*8+kq*8]
623*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
624*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
625*c0909341SAndroid Build Coastguard Worker %else
626*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
627*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [tapq+12*8+kq*8]
628*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
629*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [tapq+12*8+kq*8]
630*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
631*c0909341SAndroid Build Coastguard Worker %endif
632*c0909341SAndroid Build Coastguard Worker    dec             kd
633*c0909341SAndroid Build Coastguard Worker    jge .k_loop
634*c0909341SAndroid Build Coastguard Worker    movif32       dstq, [esp+0x38]
635*c0909341SAndroid Build Coastguard Worker    movif32    strideq, [esp+0x3C]
636*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_END %1, 1
637*c0909341SAndroid Build Coastguard Worker    dec             hd
638*c0909341SAndroid Build Coastguard Worker    jg .v_loop
639*c0909341SAndroid Build Coastguard Worker    RET
640*c0909341SAndroid Build Coastguard Worker
641*c0909341SAndroid Build Coastguard Worker.pri_only:
642*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
643*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
644*c0909341SAndroid Build Coastguard Worker    lea           tapq, [tap_table]
645*c0909341SAndroid Build Coastguard Worker %else
646*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
647*c0909341SAndroid Build Coastguard Worker %endif
648*c0909341SAndroid Build Coastguard Worker    and           prid, 1
649*c0909341SAndroid Build Coastguard Worker    xor          zerod, zerod
650*c0909341SAndroid Build Coastguard Worker    sub       dampingd, pridmpd
651*c0909341SAndroid Build Coastguard Worker    cmovs     dampingd, zerod
652*c0909341SAndroid Build Coastguard Worker    add           prid, prid
653*c0909341SAndroid Build Coastguard Worker    PSHUFB_0        m1, m7
654*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m7, [tapq+dampingq*8]
655*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x00], dampingq
656*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
657*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
658*c0909341SAndroid Build Coastguard Worker %else
659*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x04], zerod
660*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
661*c0909341SAndroid Build Coastguard Worker %endif
662*c0909341SAndroid Build Coastguard Worker    mov           dird, r7m
663*c0909341SAndroid Build Coastguard Worker    lea           stkq, [px]
664*c0909341SAndroid Build Coastguard Worker    lea           priq, [tapq+8*8+priq*8]
665*c0909341SAndroid Build Coastguard Worker    mov             hd, %1*%2/8
666*c0909341SAndroid Build Coastguard Worker    lea           dirq, [tapq+dirq*2]
667*c0909341SAndroid Build Coastguard Worker.pri_v_loop:
668*c0909341SAndroid Build Coastguard Worker    movif32 [esp+0x38], dstd
669*c0909341SAndroid Build Coastguard Worker    mov             kd, 1
670*c0909341SAndroid Build Coastguard Worker %if %1 == 4
671*c0909341SAndroid Build Coastguard Worker    movq            m4, [stkq+32*0]
672*c0909341SAndroid Build Coastguard Worker    movhps          m4, [stkq+32*1]
673*c0909341SAndroid Build Coastguard Worker %else
674*c0909341SAndroid Build Coastguard Worker    mova            m4, [stkq+32*0]
675*c0909341SAndroid Build Coastguard Worker %endif
676*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
677*c0909341SAndroid Build Coastguard Worker.pri_k_loop:
678*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [priq+kq*8]
679*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
680*c0909341SAndroid Build Coastguard Worker    dec             kd
681*c0909341SAndroid Build Coastguard Worker    jge .pri_k_loop
682*c0909341SAndroid Build Coastguard Worker    movif32       dstq, [esp+0x38]
683*c0909341SAndroid Build Coastguard Worker    movif32    strideq, [esp+0x3C]
684*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_END %1, 0
685*c0909341SAndroid Build Coastguard Worker    dec             hd
686*c0909341SAndroid Build Coastguard Worker    jg .pri_v_loop
687*c0909341SAndroid Build Coastguard Worker    RET
688*c0909341SAndroid Build Coastguard Worker
689*c0909341SAndroid Build Coastguard Worker.sec_only:
690*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
691*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
692*c0909341SAndroid Build Coastguard Worker%else
693*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
694*c0909341SAndroid Build Coastguard Worker%endif
695*c0909341SAndroid Build Coastguard Worker    movd            m1, r6m
696*c0909341SAndroid Build Coastguard Worker    tzcnt         secd, secd
697*c0909341SAndroid Build Coastguard Worker    mov           dird, r7m
698*c0909341SAndroid Build Coastguard Worker    xor          zerod, zerod
699*c0909341SAndroid Build Coastguard Worker    sub       dampingd, secd
700*c0909341SAndroid Build Coastguard Worker    cmovs     dampingd, zerod
701*c0909341SAndroid Build Coastguard Worker    PSHUFB_0        m1, m7
702*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
703*c0909341SAndroid Build Coastguard Worker    lea           tapq, [tap_table]
704*c0909341SAndroid Build Coastguard Worker %else
705*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x04], zerod
706*c0909341SAndroid Build Coastguard Worker %endif
707*c0909341SAndroid Build Coastguard Worker    mov     [rsp+0x00], dampingq
708*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m7, [tapq+dampingq*8]
709*c0909341SAndroid Build Coastguard Worker    lea           dirq, [tapq+dirq*2]
710*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
711*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
712*c0909341SAndroid Build Coastguard Worker %else
713*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
714*c0909341SAndroid Build Coastguard Worker %endif
715*c0909341SAndroid Build Coastguard Worker    lea           stkq, [px]
716*c0909341SAndroid Build Coastguard Worker    mov             hd, %1*%2/8
717*c0909341SAndroid Build Coastguard Worker.sec_v_loop:
718*c0909341SAndroid Build Coastguard Worker    mov             kd, 1
719*c0909341SAndroid Build Coastguard Worker %if %1 == 4
720*c0909341SAndroid Build Coastguard Worker    movq            m4, [stkq+32*0]
721*c0909341SAndroid Build Coastguard Worker    movhps          m4, [stkq+32*1]
722*c0909341SAndroid Build Coastguard Worker %else
723*c0909341SAndroid Build Coastguard Worker    mova            m4, [stkq+32*0]
724*c0909341SAndroid Build Coastguard Worker %endif
725*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
726*c0909341SAndroid Build Coastguard Worker.sec_k_loop:
727*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [tapq+12*8+kq*8]
728*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
729*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
730*c0909341SAndroid Build Coastguard Worker    MOVDDUP         m2, [tapq+12*8+kq*8]
731*c0909341SAndroid Build Coastguard Worker %endif
732*c0909341SAndroid Build Coastguard Worker    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
733*c0909341SAndroid Build Coastguard Worker    dec             kd
734*c0909341SAndroid Build Coastguard Worker    jge .sec_k_loop
735*c0909341SAndroid Build Coastguard Worker    movif32    strideq, [esp+0x3C]
736*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_END %1, 0
737*c0909341SAndroid Build Coastguard Worker    dec             hd
738*c0909341SAndroid Build Coastguard Worker    jg .sec_v_loop
739*c0909341SAndroid Build Coastguard Worker    RET
740*c0909341SAndroid Build Coastguard Worker%endmacro
741*c0909341SAndroid Build Coastguard Worker
; MULLD dst, src
;
; Per-dword multiply of dst (%1) by src (%2); the result is left in dst.
;
;  * cpuflag(sse4): a single pmulld.
;  * otherwise:     built from 16-bit multiplies. Splitting each dword lane
;                   into words, a = a_hi:a_lo (dst) and b = b_hi:b_lo (src),
;                   the sequence produces, modulo 2^32,
;                       a_lo*b_lo + (((a_hi*b_hi) & 0xffff) << 16)
;                   (a_lo*b_lo taken as a full unsigned 32-bit product).
;                   That equals the low 32 bits of a * b_lo when
;                   b_hi == b_lo, i.e. when src carries the 16-bit multiplier
;                   in both words of every dword.
;                   NOTE(review): the callers in this file pass
;                   [div_table%+SUFFIX+N]; presumably the non-SSE4 table is
;                   stored in that word-replicated layout - confirm at the
;                   table definition.
;
; Clobbers (non-SSE4 path only): m15, used as scratch. On x86-32 the name m15
; is first aliased to m1, so m1 is the register actually overwritten there,
; and the alias is not undone when the macro ends.
742*c0909341SAndroid Build Coastguard Worker%macro MULLD 2
743*c0909341SAndroid Build Coastguard Worker %if cpuflag(sse4)
744*c0909341SAndroid Build Coastguard Worker    pmulld          %1, %2
745*c0909341SAndroid Build Coastguard Worker %else
746*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_32
    ; use m1 as the scratch register under the name m15 on 32-bit builds
747*c0909341SAndroid Build Coastguard Worker   %define m15 m1
748*c0909341SAndroid Build Coastguard Worker  %endif
749*c0909341SAndroid Build Coastguard Worker    pmulhuw        m15, %1, %2            ; high 16 bits of each unsigned word product (reads %1 before it is overwritten)
750*c0909341SAndroid Build Coastguard Worker    pmullw          %1, %2                ; low 16 bits of each word product, in place
751*c0909341SAndroid Build Coastguard Worker    pslld          m15, 16                ; move hi(a_lo*b_lo) into the upper word of each dword, dropping the old upper word
752*c0909341SAndroid Build Coastguard Worker    paddd           %1, m15               ; combine low and high parts into the 32-bit result
753*c0909341SAndroid Build Coastguard Worker %endif
754*c0909341SAndroid Build Coastguard Worker%endmacro
755*c0909341SAndroid Build Coastguard Worker
756*c0909341SAndroid Build Coastguard Worker%macro CDEF_DIR 0
757*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
758*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
759*c0909341SAndroid Build Coastguard Worker    lea             r6, [strideq*3]
760*c0909341SAndroid Build Coastguard Worker    movq            m1, [srcq+strideq*0]
761*c0909341SAndroid Build Coastguard Worker    movhps          m1, [srcq+strideq*1]
762*c0909341SAndroid Build Coastguard Worker    movq            m3, [srcq+strideq*2]
763*c0909341SAndroid Build Coastguard Worker    movhps          m3, [srcq+r6       ]
764*c0909341SAndroid Build Coastguard Worker    lea           srcq, [srcq+strideq*4]
765*c0909341SAndroid Build Coastguard Worker    movq            m5, [srcq+strideq*0]
766*c0909341SAndroid Build Coastguard Worker    movhps          m5, [srcq+strideq*1]
767*c0909341SAndroid Build Coastguard Worker    movq            m7, [srcq+strideq*2]
768*c0909341SAndroid Build Coastguard Worker    movhps          m7, [srcq+r6       ]
769*c0909341SAndroid Build Coastguard Worker
770*c0909341SAndroid Build Coastguard Worker    pxor            m8, m8
771*c0909341SAndroid Build Coastguard Worker    psadbw          m9, m1, m8
772*c0909341SAndroid Build Coastguard Worker    psadbw          m2, m3, m8
773*c0909341SAndroid Build Coastguard Worker    psadbw          m4, m5, m8
774*c0909341SAndroid Build Coastguard Worker    psadbw          m6, m7, m8
775*c0909341SAndroid Build Coastguard Worker    packssdw        m9, m2
776*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m6
777*c0909341SAndroid Build Coastguard Worker    packssdw        m9, m4
778*c0909341SAndroid Build Coastguard Worker
779*c0909341SAndroid Build Coastguard Worker    punpcklbw       m0, m1, m8
780*c0909341SAndroid Build Coastguard Worker    punpckhbw       m1, m8
781*c0909341SAndroid Build Coastguard Worker    punpcklbw       m2, m3, m8
782*c0909341SAndroid Build Coastguard Worker    punpckhbw       m3, m8
783*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m8
784*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m8
785*c0909341SAndroid Build Coastguard Worker    punpcklbw       m6, m7, m8
786*c0909341SAndroid Build Coastguard Worker    punpckhbw       m7, m8
787*c0909341SAndroid Build Coastguard Workercglobal_label .main
788*c0909341SAndroid Build Coastguard Worker    mova            m8, [pw_128]
789*c0909341SAndroid Build Coastguard Worker    psubw           m0, m8
790*c0909341SAndroid Build Coastguard Worker    psubw           m1, m8
791*c0909341SAndroid Build Coastguard Worker    psubw           m2, m8
792*c0909341SAndroid Build Coastguard Worker    psubw           m3, m8
793*c0909341SAndroid Build Coastguard Worker    psubw           m4, m8
794*c0909341SAndroid Build Coastguard Worker    psubw           m5, m8
795*c0909341SAndroid Build Coastguard Worker    psubw           m6, m8
796*c0909341SAndroid Build Coastguard Worker    psubw           m7, m8
797*c0909341SAndroid Build Coastguard Worker    psllw           m8, 3
798*c0909341SAndroid Build Coastguard Worker    psubw           m9, m8                  ; partial_sum_hv[0]
799*c0909341SAndroid Build Coastguard Worker
800*c0909341SAndroid Build Coastguard Worker    paddw           m8, m0, m1
801*c0909341SAndroid Build Coastguard Worker    paddw          m10, m2, m3
802*c0909341SAndroid Build Coastguard Worker    paddw           m8, m4
803*c0909341SAndroid Build Coastguard Worker    paddw          m10, m5
804*c0909341SAndroid Build Coastguard Worker    paddw           m8, m6
805*c0909341SAndroid Build Coastguard Worker    paddw          m10, m7
806*c0909341SAndroid Build Coastguard Worker    paddw           m8, m10                 ; partial_sum_hv[1]
807*c0909341SAndroid Build Coastguard Worker
808*c0909341SAndroid Build Coastguard Worker    pmaddwd         m8, m8
809*c0909341SAndroid Build Coastguard Worker    pmaddwd         m9, m9
810*c0909341SAndroid Build Coastguard Worker    phaddd          m9, m8
811*c0909341SAndroid Build Coastguard Worker    SWAP            m8, m9
812*c0909341SAndroid Build Coastguard Worker    MULLD           m8, [div_table%+SUFFIX+48]
813*c0909341SAndroid Build Coastguard Worker
814*c0909341SAndroid Build Coastguard Worker    pslldq          m9, m1, 2
815*c0909341SAndroid Build Coastguard Worker    psrldq         m10, m1, 14
816*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m2, 4
817*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m2, 12
818*c0909341SAndroid Build Coastguard Worker    pslldq         m13, m3, 6
819*c0909341SAndroid Build Coastguard Worker    psrldq         m14, m3, 10
820*c0909341SAndroid Build Coastguard Worker    paddw           m9, m0
821*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
822*c0909341SAndroid Build Coastguard Worker    paddw          m11, m13
823*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
824*c0909341SAndroid Build Coastguard Worker    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
825*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m4, 8
826*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m4, 8
827*c0909341SAndroid Build Coastguard Worker    pslldq         m13, m5, 10
828*c0909341SAndroid Build Coastguard Worker    psrldq         m14, m5, 6
829*c0909341SAndroid Build Coastguard Worker    paddw           m9, m11
830*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
831*c0909341SAndroid Build Coastguard Worker    paddw           m9, m13
832*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
833*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m6, 12
834*c0909341SAndroid Build Coastguard Worker    psrldq         m12, m6, 4
835*c0909341SAndroid Build Coastguard Worker    pslldq         m13, m7, 14
836*c0909341SAndroid Build Coastguard Worker    psrldq         m14, m7, 2
837*c0909341SAndroid Build Coastguard Worker    paddw           m9, m11
838*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
839*c0909341SAndroid Build Coastguard Worker    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
840*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
841*c0909341SAndroid Build Coastguard Worker    pshufb         m10, [shufw_6543210x]
842*c0909341SAndroid Build Coastguard Worker    punpckhwd      m11, m9, m10
843*c0909341SAndroid Build Coastguard Worker    punpcklwd       m9, m10
844*c0909341SAndroid Build Coastguard Worker    pmaddwd        m11, m11
845*c0909341SAndroid Build Coastguard Worker    pmaddwd         m9, m9
846*c0909341SAndroid Build Coastguard Worker    MULLD          m11, [div_table%+SUFFIX+16]
847*c0909341SAndroid Build Coastguard Worker    MULLD           m9, [div_table%+SUFFIX+0]
848*c0909341SAndroid Build Coastguard Worker    paddd           m9, m11                 ; cost[0a-d]
849*c0909341SAndroid Build Coastguard Worker
850*c0909341SAndroid Build Coastguard Worker    pslldq         m10, m0, 14
851*c0909341SAndroid Build Coastguard Worker    psrldq         m11, m0, 2
852*c0909341SAndroid Build Coastguard Worker    pslldq         m12, m1, 12
853*c0909341SAndroid Build Coastguard Worker    psrldq         m13, m1, 4
854*c0909341SAndroid Build Coastguard Worker    pslldq         m14, m2, 10
855*c0909341SAndroid Build Coastguard Worker    psrldq         m15, m2, 6
856*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
857*c0909341SAndroid Build Coastguard Worker    paddw          m11, m13
858*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
859*c0909341SAndroid Build Coastguard Worker    paddw          m11, m15
860*c0909341SAndroid Build Coastguard Worker    pslldq         m12, m3, 8
861*c0909341SAndroid Build Coastguard Worker    psrldq         m13, m3, 8
862*c0909341SAndroid Build Coastguard Worker    pslldq         m14, m4, 6
863*c0909341SAndroid Build Coastguard Worker    psrldq         m15, m4, 10
864*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
865*c0909341SAndroid Build Coastguard Worker    paddw          m11, m13
866*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
867*c0909341SAndroid Build Coastguard Worker    paddw          m11, m15
868*c0909341SAndroid Build Coastguard Worker    pslldq         m12, m5, 4
869*c0909341SAndroid Build Coastguard Worker    psrldq         m13, m5, 12
870*c0909341SAndroid Build Coastguard Worker    pslldq         m14, m6, 2
871*c0909341SAndroid Build Coastguard Worker    psrldq         m15, m6, 14
872*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
873*c0909341SAndroid Build Coastguard Worker    paddw          m11, m13
874*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
875*c0909341SAndroid Build Coastguard Worker    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
876*c0909341SAndroid Build Coastguard Worker    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
877*c0909341SAndroid Build Coastguard Worker    pshufb         m11, [shufw_6543210x]
878*c0909341SAndroid Build Coastguard Worker    punpckhwd      m12, m10, m11
879*c0909341SAndroid Build Coastguard Worker    punpcklwd      m10, m11
880*c0909341SAndroid Build Coastguard Worker    pmaddwd        m12, m12
881*c0909341SAndroid Build Coastguard Worker    pmaddwd        m10, m10
882*c0909341SAndroid Build Coastguard Worker    MULLD          m12, [div_table%+SUFFIX+16]
883*c0909341SAndroid Build Coastguard Worker    MULLD          m10, [div_table%+SUFFIX+0]
884*c0909341SAndroid Build Coastguard Worker    paddd          m10, m12                 ; cost[4a-d]
885*c0909341SAndroid Build Coastguard Worker    phaddd          m9, m10                 ; cost[0a/b,4a/b]
886*c0909341SAndroid Build Coastguard Worker
887*c0909341SAndroid Build Coastguard Worker    paddw          m10, m0, m1
888*c0909341SAndroid Build Coastguard Worker    paddw          m11, m2, m3
889*c0909341SAndroid Build Coastguard Worker    paddw          m12, m4, m5
890*c0909341SAndroid Build Coastguard Worker    paddw          m13, m6, m7
891*c0909341SAndroid Build Coastguard Worker    phaddw          m0, m4
892*c0909341SAndroid Build Coastguard Worker    phaddw          m1, m5
893*c0909341SAndroid Build Coastguard Worker    phaddw          m2, m6
894*c0909341SAndroid Build Coastguard Worker    phaddw          m3, m7
895*c0909341SAndroid Build Coastguard Worker
896*c0909341SAndroid Build Coastguard Worker    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
897*c0909341SAndroid Build Coastguard Worker    pslldq          m4, m11, 2
898*c0909341SAndroid Build Coastguard Worker    psrldq          m5, m11, 14
899*c0909341SAndroid Build Coastguard Worker    pslldq          m6, m12, 4
900*c0909341SAndroid Build Coastguard Worker    psrldq          m7, m12, 12
901*c0909341SAndroid Build Coastguard Worker    pslldq         m14, m13, 6
902*c0909341SAndroid Build Coastguard Worker    psrldq         m15, m13, 10
903*c0909341SAndroid Build Coastguard Worker    paddw           m4, m10
904*c0909341SAndroid Build Coastguard Worker    paddw           m5, m7
905*c0909341SAndroid Build Coastguard Worker    paddw           m4, m6
906*c0909341SAndroid Build Coastguard Worker    paddw           m5, m15                 ; partial_sum_alt[3] right
907*c0909341SAndroid Build Coastguard Worker    paddw           m4, m14                 ; partial_sum_alt[3] left
908*c0909341SAndroid Build Coastguard Worker    pshuflw         m6, m5, q3012
909*c0909341SAndroid Build Coastguard Worker    punpckhwd       m5, m4
910*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m6
911*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
912*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
913*c0909341SAndroid Build Coastguard Worker    MULLD           m5, [div_table%+SUFFIX+48]
914*c0909341SAndroid Build Coastguard Worker    MULLD           m4, [div_table%+SUFFIX+32]
915*c0909341SAndroid Build Coastguard Worker    paddd           m4, m5                  ; cost[7a-d]
916*c0909341SAndroid Build Coastguard Worker
917*c0909341SAndroid Build Coastguard Worker    pslldq          m5, m10, 6
918*c0909341SAndroid Build Coastguard Worker    psrldq          m6, m10, 10
919*c0909341SAndroid Build Coastguard Worker    pslldq          m7, m11, 4
920*c0909341SAndroid Build Coastguard Worker    psrldq         m10, m11, 12
921*c0909341SAndroid Build Coastguard Worker    pslldq         m11, m12, 2
922*c0909341SAndroid Build Coastguard Worker    psrldq         m12, 14
923*c0909341SAndroid Build Coastguard Worker    paddw           m5, m7
924*c0909341SAndroid Build Coastguard Worker    paddw           m6, m10
925*c0909341SAndroid Build Coastguard Worker    paddw           m5, m11
926*c0909341SAndroid Build Coastguard Worker    paddw           m6, m12
927*c0909341SAndroid Build Coastguard Worker    paddw           m5, m13
928*c0909341SAndroid Build Coastguard Worker    pshuflw         m7, m6, q3012
929*c0909341SAndroid Build Coastguard Worker    punpckhwd       m6, m5
930*c0909341SAndroid Build Coastguard Worker    punpcklwd       m5, m7
931*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
932*c0909341SAndroid Build Coastguard Worker    pmaddwd         m5, m5
933*c0909341SAndroid Build Coastguard Worker    MULLD           m6, [div_table%+SUFFIX+48]
934*c0909341SAndroid Build Coastguard Worker    MULLD           m5, [div_table%+SUFFIX+32]
935*c0909341SAndroid Build Coastguard Worker    paddd           m5, m6                  ; cost[5a-d]
936*c0909341SAndroid Build Coastguard Worker
937*c0909341SAndroid Build Coastguard Worker    pslldq          m6, m1, 2
938*c0909341SAndroid Build Coastguard Worker    psrldq          m7, m1, 14
939*c0909341SAndroid Build Coastguard Worker    pslldq         m10, m2, 4
940*c0909341SAndroid Build Coastguard Worker    psrldq         m11, m2, 12
941*c0909341SAndroid Build Coastguard Worker    pslldq         m12, m3, 6
942*c0909341SAndroid Build Coastguard Worker    psrldq         m13, m3, 10
943*c0909341SAndroid Build Coastguard Worker    paddw           m6, m0
944*c0909341SAndroid Build Coastguard Worker    paddw           m7, m11
945*c0909341SAndroid Build Coastguard Worker    paddw           m6, m10
946*c0909341SAndroid Build Coastguard Worker    paddw           m7, m13                 ; partial_sum_alt[0] right
947*c0909341SAndroid Build Coastguard Worker    paddw           m6, m12                 ; partial_sum_alt[0] left
948*c0909341SAndroid Build Coastguard Worker    pshuflw        m10, m7, q3012
949*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m6
950*c0909341SAndroid Build Coastguard Worker    punpcklwd       m6, m10
951*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
952*c0909341SAndroid Build Coastguard Worker    pmaddwd         m6, m6
953*c0909341SAndroid Build Coastguard Worker    MULLD           m7, [div_table%+SUFFIX+48]
954*c0909341SAndroid Build Coastguard Worker    MULLD           m6, [div_table%+SUFFIX+32]
955*c0909341SAndroid Build Coastguard Worker    paddd           m6, m7                  ; cost[1a-d]
956*c0909341SAndroid Build Coastguard Worker
957*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m0, q1032
958*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m1, q1032
959*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m2, q1032
960*c0909341SAndroid Build Coastguard Worker    pshufd          m3, m3, q1032
961*c0909341SAndroid Build Coastguard Worker
962*c0909341SAndroid Build Coastguard Worker    pslldq         m10, m0, 6
963*c0909341SAndroid Build Coastguard Worker    psrldq         m11, m0, 10
964*c0909341SAndroid Build Coastguard Worker    pslldq         m12, m1, 4
965*c0909341SAndroid Build Coastguard Worker    psrldq         m13, m1, 12
966*c0909341SAndroid Build Coastguard Worker    pslldq         m14, m2, 2
967*c0909341SAndroid Build Coastguard Worker    psrldq          m2, 14
968*c0909341SAndroid Build Coastguard Worker    paddw          m10, m12
969*c0909341SAndroid Build Coastguard Worker    paddw          m11, m13
970*c0909341SAndroid Build Coastguard Worker    paddw          m10, m14
971*c0909341SAndroid Build Coastguard Worker    paddw          m11, m2
972*c0909341SAndroid Build Coastguard Worker    paddw          m10, m3
973*c0909341SAndroid Build Coastguard Worker    pshuflw        m12, m11, q3012
974*c0909341SAndroid Build Coastguard Worker    punpckhwd      m11, m10
975*c0909341SAndroid Build Coastguard Worker    punpcklwd      m10, m12
976*c0909341SAndroid Build Coastguard Worker    pmaddwd        m11, m11
977*c0909341SAndroid Build Coastguard Worker    pmaddwd        m10, m10
978*c0909341SAndroid Build Coastguard Worker    MULLD          m11, [div_table%+SUFFIX+48]
979*c0909341SAndroid Build Coastguard Worker    MULLD          m10, [div_table%+SUFFIX+32]
980*c0909341SAndroid Build Coastguard Worker    paddd          m10, m11                 ; cost[3a-d]
981*c0909341SAndroid Build Coastguard Worker
982*c0909341SAndroid Build Coastguard Worker    phaddd          m9, m8                  ; cost[0,4,2,6]
983*c0909341SAndroid Build Coastguard Worker    phaddd          m6, m10
984*c0909341SAndroid Build Coastguard Worker    phaddd          m5, m4
985*c0909341SAndroid Build Coastguard Worker    phaddd          m6, m5                  ; cost[1,3,5,7]
986*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m9, q3120
987*c0909341SAndroid Build Coastguard Worker
988*c0909341SAndroid Build Coastguard Worker    ; now find the best cost
989*c0909341SAndroid Build Coastguard Worker  %if cpuflag(sse4)
990*c0909341SAndroid Build Coastguard Worker    pmaxsd          m9, m6
991*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m9, q1032
992*c0909341SAndroid Build Coastguard Worker    pmaxsd          m0, m9
993*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m0, q2301
994*c0909341SAndroid Build Coastguard Worker    pmaxsd          m0, m1                  ; best cost
995*c0909341SAndroid Build Coastguard Worker  %else
996*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m0, m9, m6
997*c0909341SAndroid Build Coastguard Worker    pand            m9, m0
998*c0909341SAndroid Build Coastguard Worker    pandn           m0, m6
999*c0909341SAndroid Build Coastguard Worker    por             m9, m0
1000*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m9, q1032
1001*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m0, m9, m1
1002*c0909341SAndroid Build Coastguard Worker    pand            m9, m0
1003*c0909341SAndroid Build Coastguard Worker    pandn           m0, m1
1004*c0909341SAndroid Build Coastguard Worker    por             m9, m0
1005*c0909341SAndroid Build Coastguard Worker    pshufd          m1, m9, q2301
1006*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m0, m9, m1
1007*c0909341SAndroid Build Coastguard Worker    pand            m9, m0
1008*c0909341SAndroid Build Coastguard Worker    pandn           m0, m1
1009*c0909341SAndroid Build Coastguard Worker    por             m0, m9
1010*c0909341SAndroid Build Coastguard Worker  %endif
1011*c0909341SAndroid Build Coastguard Worker
1012*c0909341SAndroid Build Coastguard Worker    ; get direction and variance
1013*c0909341SAndroid Build Coastguard Worker    punpckhdq       m1, m4, m6
1014*c0909341SAndroid Build Coastguard Worker    punpckldq       m4, m6
1015*c0909341SAndroid Build Coastguard Worker    psubd           m2, m0, m1
1016*c0909341SAndroid Build Coastguard Worker    psubd           m3, m0, m4
1017*c0909341SAndroid Build Coastguard Worker%if WIN64
1018*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
1019*c0909341SAndroid Build Coastguard Worker    %define tmp rsp+stack_offset+8
1020*c0909341SAndroid Build Coastguard Worker%else
1021*c0909341SAndroid Build Coastguard Worker    %define tmp rsp-40
1022*c0909341SAndroid Build Coastguard Worker%endif
1023*c0909341SAndroid Build Coastguard Worker    mova    [tmp+0x00], m2                  ; emulate ymm in stack
1024*c0909341SAndroid Build Coastguard Worker    mova    [tmp+0x10], m3
1025*c0909341SAndroid Build Coastguard Worker    pcmpeqd         m1, m0                  ; compute best cost mask
1026*c0909341SAndroid Build Coastguard Worker    pcmpeqd         m4, m0
1027*c0909341SAndroid Build Coastguard Worker    packssdw        m4, m1
1028*c0909341SAndroid Build Coastguard Worker    pmovmskb       eax, m4                  ; get byte-idx from mask
1029*c0909341SAndroid Build Coastguard Worker    tzcnt          eax, eax
1030*c0909341SAndroid Build Coastguard Worker    mov            r1d, [tmp+rax*2]         ; get idx^4 complement from emulated ymm
1031*c0909341SAndroid Build Coastguard Worker    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
1032*c0909341SAndroid Build Coastguard Worker    shr            r1d, 10
1033*c0909341SAndroid Build Coastguard Worker    mov         [varq], r1d
1034*c0909341SAndroid Build Coastguard Worker %else
1035*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
1036*c0909341SAndroid Build Coastguard Worker%define base r2-shufw_6543210x
1037*c0909341SAndroid Build Coastguard Worker    LEA             r2, shufw_6543210x
1038*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
1039*c0909341SAndroid Build Coastguard Worker    lea       stride3q, [strideq*3]
1040*c0909341SAndroid Build Coastguard Worker    movq            m5, [srcq+strideq*0]
1041*c0909341SAndroid Build Coastguard Worker    movhps          m5, [srcq+strideq*1]
1042*c0909341SAndroid Build Coastguard Worker    movq            m7, [srcq+strideq*2]
1043*c0909341SAndroid Build Coastguard Worker    movhps          m7, [srcq+stride3q]
1044*c0909341SAndroid Build Coastguard Worker    mova            m1, [base+pw_128]
1045*c0909341SAndroid Build Coastguard Worker    psadbw          m2, m5, m0
1046*c0909341SAndroid Build Coastguard Worker    psadbw          m3, m7, m0
1047*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1048*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m0
1049*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m0
1050*c0909341SAndroid Build Coastguard Worker    punpcklbw       m6, m7, m0
1051*c0909341SAndroid Build Coastguard Worker    punpckhbw       m7, m0
1052*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1
1053*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1
1054*c0909341SAndroid Build Coastguard Worker    psubw           m6, m1
1055*c0909341SAndroid Build Coastguard Worker    psubw           m7, m1
1056*c0909341SAndroid Build Coastguard Worker
1057*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x00], m4
1058*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x10], m5
1059*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x20], m6
1060*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x50], m7
1061*c0909341SAndroid Build Coastguard Worker
1062*c0909341SAndroid Build Coastguard Worker    lea           srcq, [srcq+strideq*4]
1063*c0909341SAndroid Build Coastguard Worker    movq            m5, [srcq+strideq*0]
1064*c0909341SAndroid Build Coastguard Worker    movhps          m5, [srcq+strideq*1]
1065*c0909341SAndroid Build Coastguard Worker    movq            m7, [srcq+strideq*2]
1066*c0909341SAndroid Build Coastguard Worker    movhps          m7, [srcq+stride3q]
1067*c0909341SAndroid Build Coastguard Worker    psadbw          m3, m5, m0
1068*c0909341SAndroid Build Coastguard Worker    psadbw          m0, m7
1069*c0909341SAndroid Build Coastguard Worker    packssdw        m3, m0
1070*c0909341SAndroid Build Coastguard Worker    pxor            m0, m0
1071*c0909341SAndroid Build Coastguard Worker    punpcklbw       m4, m5, m0
1072*c0909341SAndroid Build Coastguard Worker    punpckhbw       m5, m0
1073*c0909341SAndroid Build Coastguard Worker    punpcklbw       m6, m7, m0
1074*c0909341SAndroid Build Coastguard Worker    punpckhbw       m7, m0
1075*c0909341SAndroid Build Coastguard Workercglobal_label .main
1076*c0909341SAndroid Build Coastguard Worker    psubw           m4, m1
1077*c0909341SAndroid Build Coastguard Worker    psubw           m5, m1
1078*c0909341SAndroid Build Coastguard Worker    psubw           m6, m1
1079*c0909341SAndroid Build Coastguard Worker    psubw           m7, m1
1080*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1081*c0909341SAndroid Build Coastguard Worker    psllw           m1, 3
1082*c0909341SAndroid Build Coastguard Worker    psubw           m2, m1                  ; partial_sum_hv[0]
1083*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1084*c0909341SAndroid Build Coastguard Worker
1085*c0909341SAndroid Build Coastguard Worker    mova            m3, [esp+0x50]
1086*c0909341SAndroid Build Coastguard Worker    mova            m0, [esp+0x00]
1087*c0909341SAndroid Build Coastguard Worker    paddw           m0, [esp+0x10]
1088*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3, [esp+0x20]
1089*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4
1090*c0909341SAndroid Build Coastguard Worker    paddw           m1, m5
1091*c0909341SAndroid Build Coastguard Worker    paddw           m0, m6
1092*c0909341SAndroid Build Coastguard Worker    paddw           m1, m7
1093*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1                  ; partial_sum_hv[1]
1094*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1095*c0909341SAndroid Build Coastguard Worker
1096*c0909341SAndroid Build Coastguard Worker    phaddd          m2, m0
1097*c0909341SAndroid Build Coastguard Worker    MULLD           m2, [base+div_table%+SUFFIX+48]
1098*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x30], m2
1099*c0909341SAndroid Build Coastguard Worker
1100*c0909341SAndroid Build Coastguard Worker    mova            m1, [esp+0x10]
1101*c0909341SAndroid Build Coastguard Worker    pslldq          m0, m1, 2
1102*c0909341SAndroid Build Coastguard Worker    psrldq          m1, 14
1103*c0909341SAndroid Build Coastguard Worker    paddw           m0, [esp+0x00]
1104*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m3, 6
1105*c0909341SAndroid Build Coastguard Worker    psrldq          m3, 10
1106*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1107*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1108*c0909341SAndroid Build Coastguard Worker    mova            m3, [esp+0x20]
1109*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m3, 4
1110*c0909341SAndroid Build Coastguard Worker    psrldq          m3, 12
1111*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
1112*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
1113*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m4, 8
1114*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m4, 8
1115*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1116*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1117*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m5, 10
1118*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m5, 6
1119*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1120*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1121*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m6, 12
1122*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m6, 4
1123*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1124*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1125*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m7, 14
1126*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m7, 2
1127*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
1128*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
1129*c0909341SAndroid Build Coastguard Worker    mova            m3, [esp+0x50]
1130*c0909341SAndroid Build Coastguard Worker    pshufb          m1, [base+shufw_6543210x]
1131*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m0, m1
1132*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1
1133*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1134*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1135*c0909341SAndroid Build Coastguard Worker    MULLD           m2, [base+div_table%+SUFFIX+16]
1136*c0909341SAndroid Build Coastguard Worker    MULLD           m0, [base+div_table%+SUFFIX+ 0]
1137*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                  ; cost[0a-d]
1138*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x40], m0
1139*c0909341SAndroid Build Coastguard Worker
1140*c0909341SAndroid Build Coastguard Worker    mova            m1, [esp+0x00]
1141*c0909341SAndroid Build Coastguard Worker    pslldq          m0, m1, 14
1142*c0909341SAndroid Build Coastguard Worker    psrldq          m1, 2
1143*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
1144*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m3, 8
1145*c0909341SAndroid Build Coastguard Worker    psrldq          m3, 8
1146*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1147*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1148*c0909341SAndroid Build Coastguard Worker    mova            m3, [esp+0x20]
1149*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m3, 10
1150*c0909341SAndroid Build Coastguard Worker    psrldq          m3, 6
1151*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1152*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1153*c0909341SAndroid Build Coastguard Worker    mova            m3, [esp+0x10]
1154*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m3, 12
1155*c0909341SAndroid Build Coastguard Worker    psrldq          m3, 4
1156*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1157*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1158*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m4, 6
1159*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m4, 10
1160*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1161*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1162*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m5, 4
1163*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m5, 12
1164*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1165*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3
1166*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m6, 2
1167*c0909341SAndroid Build Coastguard Worker    psrldq          m3, m6, 14
1168*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
1169*c0909341SAndroid Build Coastguard Worker    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
1170*c0909341SAndroid Build Coastguard Worker    mova            m3, [esp+0x50]
1171*c0909341SAndroid Build Coastguard Worker    pshufb          m1, [base+shufw_6543210x]
1172*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m0, m1
1173*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1
1174*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1175*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1176*c0909341SAndroid Build Coastguard Worker    MULLD           m2, [base+div_table%+SUFFIX+16]
1177*c0909341SAndroid Build Coastguard Worker    MULLD           m0, [base+div_table%+SUFFIX+ 0]
1178*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                  ; cost[4a-d]
1179*c0909341SAndroid Build Coastguard Worker    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
1180*c0909341SAndroid Build Coastguard Worker    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
1181*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x30], m1
1182*c0909341SAndroid Build Coastguard Worker
1183*c0909341SAndroid Build Coastguard Worker    phaddw          m0, [esp+0x00], m4
1184*c0909341SAndroid Build Coastguard Worker    phaddw          m1, [esp+0x10], m5
1185*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
1186*c0909341SAndroid Build Coastguard Worker    mova            m2, [esp+0x20]
1187*c0909341SAndroid Build Coastguard Worker    paddw           m5, m2, m3
1188*c0909341SAndroid Build Coastguard Worker    phaddw          m2, m6
1189*c0909341SAndroid Build Coastguard Worker    paddw           m6, m7
1190*c0909341SAndroid Build Coastguard Worker    phaddw          m3, m7
1191*c0909341SAndroid Build Coastguard Worker    mova            m7, [esp+0x00]
1192*c0909341SAndroid Build Coastguard Worker    paddw           m7, [esp+0x10]
1193*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x00], m0
1194*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x10], m1
1195*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x20], m2
1196*c0909341SAndroid Build Coastguard Worker
1197*c0909341SAndroid Build Coastguard Worker    pslldq          m1, m4, 4
1198*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m6, 6
1199*c0909341SAndroid Build Coastguard Worker    pslldq          m0, m5, 2
1200*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
1201*c0909341SAndroid Build Coastguard Worker    paddw           m0, m7
1202*c0909341SAndroid Build Coastguard Worker    psrldq          m2, m5, 14
1203*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1                  ; partial_sum_alt[3] left
1204*c0909341SAndroid Build Coastguard Worker    psrldq          m1, m4, 12
1205*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2
1206*c0909341SAndroid Build Coastguard Worker    psrldq          m2, m6, 10
1207*c0909341SAndroid Build Coastguard Worker    paddw           m1, m2                  ; partial_sum_alt[3] right
1208*c0909341SAndroid Build Coastguard Worker    pshuflw         m1, m1, q3012
1209*c0909341SAndroid Build Coastguard Worker    punpckhwd       m2, m0, m1
1210*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m1
1211*c0909341SAndroid Build Coastguard Worker    pmaddwd         m2, m2
1212*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1213*c0909341SAndroid Build Coastguard Worker    MULLD           m2, [base+div_table%+SUFFIX+48]
1214*c0909341SAndroid Build Coastguard Worker    MULLD           m0, [base+div_table%+SUFFIX+32]
1215*c0909341SAndroid Build Coastguard Worker    paddd           m0, m2                  ; cost[7a-d]
1216*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x40], m0
1217*c0909341SAndroid Build Coastguard Worker
1218*c0909341SAndroid Build Coastguard Worker    pslldq          m0, m7, 6
1219*c0909341SAndroid Build Coastguard Worker    psrldq          m7, 10
1220*c0909341SAndroid Build Coastguard Worker    pslldq          m1, m5, 4
1221*c0909341SAndroid Build Coastguard Worker    psrldq          m5, 12
1222*c0909341SAndroid Build Coastguard Worker    pslldq          m2, m4, 2
1223*c0909341SAndroid Build Coastguard Worker    psrldq          m4, 14
1224*c0909341SAndroid Build Coastguard Worker    paddw           m0, m6
1225*c0909341SAndroid Build Coastguard Worker    paddw           m7, m5
1226*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1227*c0909341SAndroid Build Coastguard Worker    paddw           m7, m4
1228*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1229*c0909341SAndroid Build Coastguard Worker    pshuflw         m2, m7, q3012
1230*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m0
1231*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2
1232*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
1233*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1234*c0909341SAndroid Build Coastguard Worker    MULLD           m7, [base+div_table%+SUFFIX+48]
1235*c0909341SAndroid Build Coastguard Worker    MULLD           m0, [base+div_table%+SUFFIX+32]
1236*c0909341SAndroid Build Coastguard Worker    paddd           m0, m7                  ; cost[5a-d]
1237*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x50], m0
1238*c0909341SAndroid Build Coastguard Worker
1239*c0909341SAndroid Build Coastguard Worker    mova            m7, [esp+0x10]
1240*c0909341SAndroid Build Coastguard Worker    mova            m2, [esp+0x20]
1241*c0909341SAndroid Build Coastguard Worker    pslldq          m0, m7, 2
1242*c0909341SAndroid Build Coastguard Worker    psrldq          m7, 14
1243*c0909341SAndroid Build Coastguard Worker    pslldq          m4, m2, 4
1244*c0909341SAndroid Build Coastguard Worker    psrldq          m2, 12
1245*c0909341SAndroid Build Coastguard Worker    pslldq          m5, m3, 6
1246*c0909341SAndroid Build Coastguard Worker    psrldq          m6, m3, 10
1247*c0909341SAndroid Build Coastguard Worker    paddw           m0, [esp+0x00]
1248*c0909341SAndroid Build Coastguard Worker    paddw           m7, m2
1249*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
1250*c0909341SAndroid Build Coastguard Worker    paddw           m7, m6                  ; partial_sum_alt[0] right
1251*c0909341SAndroid Build Coastguard Worker    paddw           m0, m4                  ; partial_sum_alt[0] left
1252*c0909341SAndroid Build Coastguard Worker    pshuflw         m2, m7, q3012
1253*c0909341SAndroid Build Coastguard Worker    punpckhwd       m7, m0
1254*c0909341SAndroid Build Coastguard Worker    punpcklwd       m0, m2
1255*c0909341SAndroid Build Coastguard Worker    pmaddwd         m7, m7
1256*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1257*c0909341SAndroid Build Coastguard Worker    MULLD           m7, [base+div_table%+SUFFIX+48]
1258*c0909341SAndroid Build Coastguard Worker    MULLD           m0, [base+div_table%+SUFFIX+32]
1259*c0909341SAndroid Build Coastguard Worker    paddd           m0, m7                  ; cost[1a-d]
1260*c0909341SAndroid Build Coastguard Worker    SWAP            m0, m4
1261*c0909341SAndroid Build Coastguard Worker
1262*c0909341SAndroid Build Coastguard Worker    pshufd          m0, [esp+0x00], q1032
1263*c0909341SAndroid Build Coastguard Worker    pshufd          m1, [esp+0x10], q1032
1264*c0909341SAndroid Build Coastguard Worker    pshufd          m2, [esp+0x20], q1032
1265*c0909341SAndroid Build Coastguard Worker    pshufd          m3, m3, q1032
1266*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x00], m4
1267*c0909341SAndroid Build Coastguard Worker
1268*c0909341SAndroid Build Coastguard Worker    pslldq          m4, m0, 6
1269*c0909341SAndroid Build Coastguard Worker    psrldq          m0, 10
1270*c0909341SAndroid Build Coastguard Worker    pslldq          m5, m1, 4
1271*c0909341SAndroid Build Coastguard Worker    psrldq          m1, 12
1272*c0909341SAndroid Build Coastguard Worker    pslldq          m6, m2, 2
1273*c0909341SAndroid Build Coastguard Worker    psrldq          m2, 14
1274*c0909341SAndroid Build Coastguard Worker    paddw           m4, m3
1275*c0909341SAndroid Build Coastguard Worker    paddw           m0, m1
1276*c0909341SAndroid Build Coastguard Worker    paddw           m5, m6
1277*c0909341SAndroid Build Coastguard Worker    paddw           m0, m2
1278*c0909341SAndroid Build Coastguard Worker    paddw           m4, m5
1279*c0909341SAndroid Build Coastguard Worker    pshuflw         m2, m0, q3012
1280*c0909341SAndroid Build Coastguard Worker    punpckhwd       m0, m4
1281*c0909341SAndroid Build Coastguard Worker    punpcklwd       m4, m2
1282*c0909341SAndroid Build Coastguard Worker    pmaddwd         m0, m0
1283*c0909341SAndroid Build Coastguard Worker    pmaddwd         m4, m4
1284*c0909341SAndroid Build Coastguard Worker    MULLD           m0, [base+div_table%+SUFFIX+48]
1285*c0909341SAndroid Build Coastguard Worker    MULLD           m4, [base+div_table%+SUFFIX+32]
1286*c0909341SAndroid Build Coastguard Worker    paddd           m4, m0                   ; cost[3a-d]
1287*c0909341SAndroid Build Coastguard Worker
1288*c0909341SAndroid Build Coastguard Worker    mova            m1, [esp+0x00]
1289*c0909341SAndroid Build Coastguard Worker    mova            m2, [esp+0x50]
1290*c0909341SAndroid Build Coastguard Worker    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
1291*c0909341SAndroid Build Coastguard Worker    phaddd          m1, m4
1292*c0909341SAndroid Build Coastguard Worker    phaddd          m2, [esp+0x40]          ; cost[1,3,5,7]
1293*c0909341SAndroid Build Coastguard Worker    phaddd          m1, m2
1294*c0909341SAndroid Build Coastguard Worker    pshufd          m2, m0, q3120
1295*c0909341SAndroid Build Coastguard Worker
1296*c0909341SAndroid Build Coastguard Worker    ; now find the best cost
1297*c0909341SAndroid Build Coastguard Worker  %if cpuflag(sse4)
1298*c0909341SAndroid Build Coastguard Worker    pmaxsd          m0, m1
1299*c0909341SAndroid Build Coastguard Worker    pshufd          m3, m0, q1032
1300*c0909341SAndroid Build Coastguard Worker    pmaxsd          m3, m0
1301*c0909341SAndroid Build Coastguard Worker    pshufd          m0, m3, q2301
1302*c0909341SAndroid Build Coastguard Worker    pmaxsd          m0, m3
1303*c0909341SAndroid Build Coastguard Worker  %else
1304*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m0, m1
1305*c0909341SAndroid Build Coastguard Worker    pand            m0, m3
1306*c0909341SAndroid Build Coastguard Worker    pandn           m3, m1
1307*c0909341SAndroid Build Coastguard Worker    por             m0, m3
1308*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m0, q1032
1309*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m0, m4
1310*c0909341SAndroid Build Coastguard Worker    pand            m0, m3
1311*c0909341SAndroid Build Coastguard Worker    pandn           m3, m4
1312*c0909341SAndroid Build Coastguard Worker    por             m0, m3
1313*c0909341SAndroid Build Coastguard Worker    pshufd          m4, m0, q2301
1314*c0909341SAndroid Build Coastguard Worker    pcmpgtd         m3, m0, m4
1315*c0909341SAndroid Build Coastguard Worker    pand            m0, m3
1316*c0909341SAndroid Build Coastguard Worker    pandn           m3, m4
1317*c0909341SAndroid Build Coastguard Worker    por             m0, m3
1318*c0909341SAndroid Build Coastguard Worker  %endif
1319*c0909341SAndroid Build Coastguard Worker
1320*c0909341SAndroid Build Coastguard Worker    ; get direction and variance
1321*c0909341SAndroid Build Coastguard Worker    mov           vard, varm
1322*c0909341SAndroid Build Coastguard Worker    punpckhdq       m3, m2, m1
1323*c0909341SAndroid Build Coastguard Worker    punpckldq       m2, m1
1324*c0909341SAndroid Build Coastguard Worker    psubd           m1, m0, m3
1325*c0909341SAndroid Build Coastguard Worker    psubd           m4, m0, m2
1326*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x00], m1                  ; emulate ymm in stack
1327*c0909341SAndroid Build Coastguard Worker    mova    [esp+0x10], m4
1328*c0909341SAndroid Build Coastguard Worker    pcmpeqd         m3, m0                  ; compute best cost mask
1329*c0909341SAndroid Build Coastguard Worker    pcmpeqd         m2, m0
1330*c0909341SAndroid Build Coastguard Worker    packssdw        m2, m3
1331*c0909341SAndroid Build Coastguard Worker    pmovmskb       eax, m2                  ; get byte-idx from mask
1332*c0909341SAndroid Build Coastguard Worker    tzcnt          eax, eax
1333*c0909341SAndroid Build Coastguard Worker    mov            r1d, [esp+eax*2]         ; get idx^4 complement from emulated ymm
1334*c0909341SAndroid Build Coastguard Worker    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
1335*c0909341SAndroid Build Coastguard Worker    shr            r1d, 10
1336*c0909341SAndroid Build Coastguard Worker    mov         [vard], r1d
1337*c0909341SAndroid Build Coastguard Worker %endif
1338*c0909341SAndroid Build Coastguard Worker
1339*c0909341SAndroid Build Coastguard Worker    RET
1340*c0909341SAndroid Build Coastguard Worker%endmacro
1341*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; Macro instantiations, one group per instruction-set level.
; The macro bodies branch on cpuflag(...) and reference SUFFIX-qualified
; tables, so the INIT_XMM line that opens each group is expected to select
; the variant being generated (INIT_XMM is defined outside this chunk).
; NOTE(review): the two CDEF_FILTER arguments look like block width and
; height -- confirm against the CDEF_FILTER definition.
; ---------------------------------------------------------------------------

; SSE4 group: three CDEF_FILTER sizes plus CDEF_DIR.
1342*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse4
1343*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 8, 8
1344*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 8
1345*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 4
1346*c0909341SAndroid Build Coastguard WorkerCDEF_DIR
1347*c0909341SAndroid Build Coastguard Worker
; SSSE3 group: same set of instantiations as the SSE4 group.
1348*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
1349*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 8, 8
1350*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 8
1351*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 4
1352*c0909341SAndroid Build Coastguard WorkerCDEF_DIR
1353*c0909341SAndroid Build Coastguard Worker
; SSE2 group: CDEF_FILTER only, no CDEF_DIR.
; NOTE(review): the macro body ending just above these instantiations
; (presumably CDEF_DIR) uses phaddd, an SSSE3 instruction, so leaving it out
; here looks deliberate -- confirm before adding an SSE2 CDEF_DIR.
1354*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse2
1355*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 8, 8
1356*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 8
1357*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 4
1357*c0909341SAndroid Build Coastguard WorkerCDEF_FILTER 4, 4
1358