xref: /aosp_15_r20/external/libdav1d/src/x86/cdef_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; Copyright © 2019, VideoLabs
4; All rights reserved.
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions are met:
8;
9; 1. Redistributions of source code must retain the above copyright notice, this
10;    list of conditions and the following disclaimer.
11;
12; 2. Redistributions in binary form must reproduce the above copyright notice,
13;    this list of conditions and the following disclaimer in the documentation
14;    and/or other materials provided with the distribution.
15;
16; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27%include "config.asm"
28%include "ext/x86/x86inc.asm"
29
30SECTION_RODATA 16
31
; Emit each macro argument 8 times as a byte, producing 8-byte runs
; (used below for the shift-emulation masks and the filter weights).
%macro DUP8 1-*
    %rep %0
        times 8 db %1
        %rotate 1
    %endrep
%endmacro
38
; Reciprocal tables for the cdef_dir cost scaling (approximate division).
; The sse4 table holds 32-bit factors for pmulld; the ssse3 table holds the
; same factors duplicated into both 16-bit halves of each dword so that the
; MULLD pmullw/pmulhuw emulation yields the exact low 32 bits of the product.
div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
                 dd 420, 210, 140, 105, 105, 105, 105, 105
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
                 dw 168, 168, 140, 140, 120, 120, 105, 105
                 dw 420, 420, 210, 210, 140, 140, 105, 105
                 dw 105, 105, 105, 105, 105, 105, 105, 105
; pshufb mask reordering the 8 words of an xmm register to 6,5,4,3,2,1,0,7
const shufw_6543210x, \
            db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
; pshufb mask interleaving the low and high 8 bytes (0,8,1,9,...)
shufb_lohi: db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_8:      times 8 dw 8
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_2048:   times 8 dw 2048
pw_0x7FFF: times 8 dw 0x7FFF
pw_0x8000: times 8 dw 0x8000
tap_table: ; masks for 8-bit shift emulation
           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
           ; weights
           DUP8 4, 2, 3, 3, 2, 1
           ; taps indices
           ; each byte is y*16+x into the px buffer; ACCUMULATE_TAP scales the
           ; offset by 2 (word-sized pixels), matching the 32-byte row pitch
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
           db  1 * 16 + 0,  2 * 16 + 0
           db  1 * 16 + 0,  2 * 16 - 1
           ; the last 6 are repeats of the first 6 so we don't need to & 7
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
74
75SECTION .text
76
; Emit "mov %1, %2" only when building for x86-32 (no-op on x86-64, where
; the value is already held in a register).
%macro movif32 2
 %if ARCH_X86_32
    mov     %1, %2
 %endif
%endmacro
82
; Zero-extend packed unsigned bytes to words.
; %1 = dst xmm, %2 = source (memory), %3 = 1 to widen only 4 bytes.
; The non-sse4 fallback requires m7 == 0 (callers zero it with pxor).
%macro PMOVZXBW 2-3 0 ; %3 = half
 %if cpuflag(sse4) && %3 == 0
    pmovzxbw        %1, %2
 %else
  %if %3 == 1
    movd            %1, %2              ; 4 bytes -> 4 words
  %else
    movq            %1, %2              ; 8 bytes -> 8 words
  %endif
    punpcklbw       %1, m7              ; interleave with zero
 %endif
%endmacro
95
; Broadcast the lowest byte of %1 into all 16 byte lanes.
; %2 must be an all-zero register (shuffle mask for the ssse3 path).
%macro PSHUFB_0 2
 %if cpuflag(ssse3)
    pshufb          %1, %2
 %else
    punpcklbw       %1, %1              ; duplicate low bytes
    pshuflw         %1, %1, q0000       ; broadcast word 0 across the low half
    punpcklqdq      %1, %1              ; mirror low qword into the high qword
 %endif
%endmacro
105
; Load a qword and duplicate it into both halves of %1
; (movddup fallback for pre-ssse3 targets).
%macro MOVDDUP 2
%if cpuflag(ssse3)
    movddup         %1, %2
%else
    movq            %1, %2
    punpcklqdq      %1, %1
%endif
%endmacro
114
; Accumulate the contribution of one tap pair (p0 at +off, p1 at -off) for
; the current tap index k into the running sum in m0.
;   %1 tap_offset: extra byte offset selecting which of the 3 tap pairs
;   %2 shift:      pri/sec shift amount (memory operand, low word only)
;   %3 shift_mask: AND mask emulating a per-byte right shift (tap_table masks)
;   %4 strength:   filter strength broadcast to all byte lanes
;   %5 mul_tap:    tap weights (byte pairs for pmaddubsw)
;   %6 w:          block width (4 or 8)
;   %7 minmax:     1 to also track running min (m8) / max (m7) for clamping
; Expects: m4 = px (center pixels), dirq/kq/stkq set up by the caller.
; Clobbers m3, m5, m6, m9 and offq.
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1
 %if %6 == 4
    movq            m5, [stkq+offq*2+32*0]      ; p0
    movhps          m5, [stkq+offq*2+32*1]
 %else
    movu            m5, [stkq+offq*2+32*0]      ; p0
 %endif
    neg           offq                          ; -off1
 %if %6 == 4
    movq            m6, [stkq+offq*2+32*0]      ; p1
    movhps          m6, [stkq+offq*2+32*1]
 %else
    movu            m6, [stkq+offq*2+32*0]      ; p1
 %endif
 %if %7
  %if cpuflag(sse4)
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5
    pminuw          m8, m5
    pmaxsw          m7, m6
    pminuw          m8, m6
  %else
    ; without pminuw, mask out-of-bounds lanes (== m14 marker) before the max
    pcmpeqw         m3, m14, m5
    pminsw          m8, m5     ; min after p0
    pandn           m3, m5
    pmaxsw          m7, m3     ; max after p0
    pcmpeqw         m3, m14, m6
    pminsw          m8, m6     ; min after p1
    pandn           m3, m6
    pmaxsw          m7, m3     ; max after p1
  %endif
 %endif

    ; accumulate sum[m13] over p0/p1
    psubw           m5, m4     ; diff_p0(p0 - px)
    psubw           m6, m4     ; diff_p1(p1 - px)
    packsswb        m5, m6     ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
    pabsb           m6, m5
    psignb          m3, %5, m5
 %else
    ; pre-ssse3: interleave manually, then compute |diff| and sign-apply taps
    movlhps         m6, m5
    punpckhbw       m6, m5
    pxor            m5, m5
    pcmpgtb         m5, m6
    paddb           m6, m5
    pxor            m6, m5
    paddb           m3, %5, m5
    pxor            m3, m5
 %endif
    pand            m9, %3, m6 ; emulate 8-bit shift
    psrlw           m9, %2
    psubusb         m5, %4, m9
    pminub          m5, m6     ; constrain(diff_p)
 %if cpuflag(ssse3)
    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
 %else
    ; pre-ssse3 pmaddubsw replacement: split odd/even bytes and multiply
    psrlw           m9, m5, 8
    psraw           m6, m3, 8
    psllw           m5, 8
    psllw           m3, 8
    pmullw          m9, m6
    pmulhw          m5, m3
    paddw           m5, m9
 %endif
    paddw           m0, m5
%endmacro
187
; Widen 4 rows of 8-bit pixels to 16-bit and store them into the
; intermediate buffer (32-byte row pitch).
;   %1 = dst (px buffer), %2 = src pointer, %3 = block width (4 or 8)
; The 8-wide path requires m7 == 0 for the punpck zero-extension.
%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
    PMOVZXBW        m0, [%2+strideq*0]
    PMOVZXBW        m1, [%2+strideq*1]
    PMOVZXBW        m2, [%2+strideq*2]
    PMOVZXBW        m3, [%2+stride3q]
    mova     [%1+32*0], m0
    mova     [%1+32*1], m1
    mova     [%1+32*2], m2
    mova     [%1+32*3], m3
 %else
    ; 8-wide: each load covers the row plus right-edge pixels, so both the
    ; low and high byte halves are widened and stored (32 bytes per row)
    movu            m0, [%2+strideq*0]
    movu            m1, [%2+strideq*1]
    movu            m2, [%2+strideq*2]
    movu            m3, [%2+stride3q]
    punpcklbw       m4, m0, m7
    punpckhbw       m0, m7
    mova  [%1+32*0+ 0], m4
    mova  [%1+32*0+16], m0
    punpcklbw       m4, m1, m7
    punpckhbw       m1, m7
    mova  [%1+32*1+ 0], m4
    mova  [%1+32*1+16], m1
    punpcklbw       m4, m2, m7
    punpckhbw       m2, m7
    mova  [%1+32*2+ 0], m4
    mova  [%1+32*2+16], m2
    punpcklbw       m4, m3, m7
    punpckhbw       m3, m7
    mova  [%1+32*3+ 0], m4
    mova  [%1+32*3+16], m3
 %endif
%endmacro
221
; Final rounding and store: computes (sum - (sum < 0) + 8) >> 4 (via
; pmulhrsw by 2048 on ssse3, add-8/shift otherwise), adds it to the
; unfiltered pixels in m4, optionally clamps to the [m8, m7] min/max
; range, and stores 2 rows (w=4) or 1 row (w=8), advancing stkq/dstq.
;   %1 = block width, %2 = 1 to apply the min/max clamp
%macro CDEF_FILTER_END 2 ; w, minmax
    pxor            m6, m6
    pcmpgtw         m6, m0              ; -1 where sum < 0
    paddw           m0, m6              ; sum -= (sum < 0): round toward zero
 %if cpuflag(ssse3)
    pmulhrsw        m0, m15             ; (sum * 2048 + 16384) >> 15
 %else
    paddw           m0, m15
    psraw           m0, 4
 %endif
    paddw           m4, m0              ; px + rounded adjustment
 %if %2
    pminsw          m4, m7
    pmaxsw          m4, m8
 %endif
    packuswb        m4, m4
 %if %1 == 4
    movd [dstq+strideq*0], m4
    psrlq           m4, 32
    movd [dstq+strideq*1], m4
    add           stkq, 32*2
    lea           dstq, [dstq+strideq*2]
 %else
    movq        [dstq], m4
    add           stkq, 32
    add           dstq, strideq
 %endif
%endmacro
250
; void cdef_filter_%1x%2_8bpc(pixel *dst, ptrdiff_t stride, left, top, bot,
;                             pri_strength, sec_strength, dir, damping, edges)
; Builds a 16-bit padded copy of the block (px buffer, 32-byte row pitch,
; 2-pixel border) where unavailable border pixels are set to an
; out-of-bounds marker, then applies the primary/secondary directional
; taps and writes the filtered block back to dst.
; Fix vs. previous revision: in .top_no_right (8-wide path) the second top
; row was loaded with "-%2" (the block HEIGHT) instead of "-%1" (the WIDTH).
; Harmless for the only 8-wide instantiation (8x8, where %1 == %2), but
; wrong for any 8-wide/non-8-tall block; now matches the row above.
%macro CDEF_FILTER 2 ; w, h
 %if ARCH_X86_64
cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
                                dst, stride, left, top, bot, pri, dst4, edge, \
                                stride3
  %define px rsp+3*16+2*32
  %define base 0
 %else
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
                                dst, stride, left, edge, stride3
    %define       topq  r2
    %define       botq  r2
    %define      dst4q  r2
    LEA             r5, tap_table
  %define px esp+7*16+2*32
  %define base r5-tap_table
 %endif
    mov          edged, r9m
 %if cpuflag(sse4)
   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
 %else
   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
 %endif
    mova            m6, OUT_OF_BOUNDS_MEM
    pxor            m7, m7                    ; zero, required by PMOVZXBW/LOAD_BODY

    ; prepare pixel buffers - body/right
 %if %2 == 8
    lea          dst4q, [dstq+strideq*4]
 %endif
    lea       stride3q, [strideq*3]
    test         edgeb, 2 ; have_right
    jz .no_right
    LOAD_BODY       px, dstq, %1
 %if %2 == 8
    LOAD_BODY  px+4*32, dst4q, %1
 %endif
    jmp .body_done
.no_right:
    ; no right edge: widen only the block itself and mark the 2 right
    ; border pixels of every row as out of bounds
    PMOVZXBW        m0, [dstq+strideq*0], %1 == 4
    PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
    PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
    PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
    mova     [px+32*0], m0
    mova     [px+32*1], m1
    mova     [px+32*2], m2
    mova     [px+32*3], m3
    movd [px+32*0+%1*2], m6
    movd [px+32*1+%1*2], m6
    movd [px+32*2+%1*2], m6
    movd [px+32*3+%1*2], m6
 %if %2 == 8
    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
    mova     [px+32*4], m0
    mova     [px+32*5], m1
    mova     [px+32*6], m2
    mova     [px+32*7], m3
    movd [px+32*4+%1*2], m6
    movd [px+32*5+%1*2], m6
    movd [px+32*6+%1*2], m6
    movd [px+32*7+%1*2], m6
 %endif
.body_done:

    ; top
    movifnidn     topq, r3mp
    test         edgeb, 4 ; have_top
    jz .no_top
    test         edgeb, 1 ; have_left
    jz .top_no_left
    test         edgeb, 2 ; have_right
    jz .top_no_right
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0-2]
    PMOVZXBW        m1, [topq+strideq*1-2]
 %else
    movu            m0, [topq+strideq*0-4]
    movu            m1, [topq+strideq*1-4]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movu  [px-32*2+8], m2
    movu  [px-32*1+8], m3
 %endif
    movu  [px-32*2-%1], m0
    movu  [px-32*1-%1], m1
    jmp .top_done
.top_no_right:
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0-%1]
    PMOVZXBW        m1, [topq+strideq*1-%1]
    movu   [px-32*2-8], m0
    movu   [px-32*1-8], m1
 %else
    movu            m0, [topq+strideq*0-%1]
    movu            m1, [topq+strideq*1-%1] ; offset by width (%1), not height
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova  [px-32*2-16], m0
    mova  [px-32*2+ 0], m2
    mova  [px-32*1-16], m1
    mova  [px-32*1+ 0], m3
 %endif
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.top_no_left:
    test         edgeb, 2 ; have_right
    jz .top_no_left_right
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0]
    PMOVZXBW        m1, [topq+strideq*1]
 %else
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movd  [px-32*2+16], m2
    movd  [px-32*1+16], m3
 %endif
    movd  [px-32*2- 4], m6
    movd  [px-32*1- 4], m6
    mova  [px-32*2+ 0], m0
    mova  [px-32*1+ 0], m1
    jmp .top_done
.top_no_left_right:
    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
    movd   [px-32*2-4], m6
    movd   [px-32*1-4], m6
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.no_top:
    ; mark both top border rows entirely out of bounds
    movu  [px-32*2- 4], m6
    movu  [px-32*1- 4], m6
 %if %1 == 8
    movq  [px-32*2+12], m6
    movq  [px-32*1+12], m6
 %endif
.top_done:

    ; left
    test         edgeb, 1 ; have_left
    jz .no_left
    movifnidn    leftq, leftmp
 %if %2 == 4
    movq            m0, [leftq]
 %else
    movu            m0, [leftq]
 %endif
 %if %2 == 4
    punpcklbw       m0, m7
 %else
    punpckhbw       m1, m0, m7
    punpcklbw       m0, m7
    movhlps         m3, m1
    movd   [px+32*4-4], m1
    movd   [px+32*6-4], m3
    psrlq           m1, 32
    psrlq           m3, 32
    movd   [px+32*5-4], m1
    movd   [px+32*7-4], m3
 %endif
    movhlps         m2, m0
    movd   [px+32*0-4], m0
    movd   [px+32*2-4], m2
    psrlq           m0, 32
    psrlq           m2, 32
    movd   [px+32*1-4], m0
    movd   [px+32*3-4], m2
    jmp .left_done
.no_left:
    movd   [px+32*0-4], m6
    movd   [px+32*1-4], m6
    movd   [px+32*2-4], m6
    movd   [px+32*3-4], m6
 %if %2 == 8
    movd   [px+32*4-4], m6
    movd   [px+32*5-4], m6
    movd   [px+32*6-4], m6
    movd   [px+32*7-4], m6
 %endif
.left_done:

    ; bottom
    movifnidn     botq, r4mp
    test         edgeb, 8 ; have_bottom
    jz .no_bottom
    test         edgeb, 1 ; have_left
    jz .bottom_no_left
    test         edgeb, 2 ; have_right
    jz .bottom_no_right
 %if %1 == 4
    PMOVZXBW        m0, [botq+strideq*0-(%1/2)]
    PMOVZXBW        m1, [botq+strideq*1-(%1/2)]
 %else
    movu            m0, [botq+strideq*0-4]
    movu            m1, [botq+strideq*1-4]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movu [px+32*(%2+0)+8], m2
    movu [px+32*(%2+1)+8], m3
 %endif
    movu [px+32*(%2+0)-%1], m0
    movu [px+32*(%2+1)-%1], m1
    jmp .bottom_done
.bottom_no_right:
 %if %1 == 4
    PMOVZXBW        m0, [botq+strideq*0-4]
    PMOVZXBW        m1, [botq+strideq*1-4]
    movu [px+32*(%2+0)-8], m0
    movu [px+32*(%2+1)-8], m1
 %else
    movu            m0, [botq+strideq*0-8]
    movu            m1, [botq+strideq*1-8]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova [px+32*(%2+0)-16], m0
    mova [px+32*(%2+0)+ 0], m2
    mova [px+32*(%2+1)-16], m1
    mova [px+32*(%2+1)+ 0], m3
    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
 %endif
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    jmp .bottom_done
.bottom_no_left:
    test         edgeb, 2 ; have_right
    jz .bottom_no_left_right
 %if %1 == 4
    PMOVZXBW        m0, [botq+strideq*0]
    PMOVZXBW        m1, [botq+strideq*1]
 %else
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova [px+32*(%2+0)+16], m2
    mova [px+32*(%2+1)+16], m3
 %endif
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.bottom_no_left_right:
    PMOVZXBW        m0, [botq+strideq*0], %1 == 4
    PMOVZXBW        m1, [botq+strideq*1], %1 == 4
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.no_bottom:
    ; mark both bottom border rows entirely out of bounds
    movu [px+32*(%2+0)- 4], m6
    movu [px+32*(%2+1)- 4], m6
 %if %1 == 8
    movq [px+32*(%2+0)+12], m6
    movq [px+32*(%2+1)+12], m6
 %endif
.bottom_done:

    ; actual filter
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
    mova           m13, [shufb_lohi]
 %if cpuflag(ssse3)
    mova           m15, [pw_2048]
 %else
    mova           m15, [pw_8]
 %endif
    mova           m14, m6
 %else
    ; x86-32: not enough registers, keep the constants in memory
    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
    %xdefine        m8  m1
    %xdefine        m9  m2
    %xdefine       m10  m0
    %xdefine       m13  [base+shufb_lohi]
    %xdefine       m14  OUT_OF_BOUNDS_MEM
 %if cpuflag(ssse3)
    %xdefine       m15  [base+pw_2048]
 %else
    %xdefine       m15  [base+pw_8]
 %endif
 %endif
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    mov       dampingd, r8m
    movif32 [esp+0x3C], r1d
    test          prid, prid
    jz .sec_only
    movd            m1, r5m
    bsr        pridmpd, prid
    test          secd, secd
    jz .pri_only
    ; both primary and secondary strengths: compute the clamped shifts
    ; pri_shift = max(0, damping - ulog2(pri)), sec_shift = damping - ulog2(sec)
    movd           m10, r6m
    tzcnt         secd, secd
    and           prid, 1
    sub        pridmpd, dampingd
    sub           secd, dampingd
    xor       dampingd, dampingd
    add           prid, prid
    neg        pridmpd
    cmovs      pridmpd, dampingd
    neg           secd
    PSHUFB_0        m1, m7
    PSHUFB_0       m10, m7
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
    lea           tapq, [tap_table]
    MOVDDUP        m11, [tapq+pridmpq*8] ; pri_shift_mask
    MOVDDUP        m12, [tapq+secq*8]    ; sec_shift_mask
    mov     [rsp+0x00], pridmpq          ; pri_shift
    mov     [rsp+0x10], secq             ; sec_shift
    DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
 %else
    MOVDDUP         m2, [tapq+pridmpq*8]
    MOVDDUP         m3, [tapq+secq*8]
    mov     [esp+0x04], dampingd         ; zero upper 32 bits of psrlw
    mov     [esp+0x34], dampingd         ; source operand in ACCUMULATE_TAP
    mov     [esp+0x00], pridmpd
    mov     [esp+0x30], secd
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
  %define         offq  dstq
  %define           kd  strided
  %define           kq  strideq
    mova    [esp+0x10], m2
    mova    [esp+0x40], m3
    mova    [esp+0x20], m1
    mova    [esp+0x50], m10
 %endif
    mov           dird, r7m
    lea           stkq, [px]
    lea           priq, [tapq+8*8+priq*8] ; pri_taps
    mov             hd, %1*%2/8
    lea           dirq, [tapq+dirq*2]
.v_loop:
    movif32 [esp+0x38], dstd
    mov             kd, 1
 %if %1 == 4
    movq            m4, [stkq+32*0]
    movhps          m4, [stkq+32*1]
 %else
    mova            m4, [stkq+32*0]       ; px
 %endif
    pxor            m0, m0                ; sum
    mova            m7, m4                ; max
    mova            m8, m4                ; min
.k_loop:
    MOVDDUP         m2, [priq+kq*8]
 %if ARCH_X86_64
    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
 %else
    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
 %endif
    dec             kd
    jge .k_loop
    movif32       dstq, [esp+0x38]
    movif32    strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 1
    dec             hd
    jg .v_loop
    RET

.pri_only:
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
    lea           tapq, [tap_table]
 %else
    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
 %endif
    ; pri_shift = max(0, damping - ulog2(pri))
    and           prid, 1
    xor          zerod, zerod
    sub       dampingd, pridmpd
    cmovs     dampingd, zerod
    add           prid, prid
    PSHUFB_0        m1, m7
    MOVDDUP         m7, [tapq+dampingq*8]
    mov     [rsp+0x00], dampingq
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
 %else
    mov     [rsp+0x04], zerod
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
 %endif
    mov           dird, r7m
    lea           stkq, [px]
    lea           priq, [tapq+8*8+priq*8]
    mov             hd, %1*%2/8
    lea           dirq, [tapq+dirq*2]
.pri_v_loop:
    movif32 [esp+0x38], dstd
    mov             kd, 1
 %if %1 == 4
    movq            m4, [stkq+32*0]
    movhps          m4, [stkq+32*1]
 %else
    mova            m4, [stkq+32*0]
 %endif
    pxor            m0, m0
.pri_k_loop:
    MOVDDUP         m2, [priq+kq*8]
    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
    dec             kd
    jge .pri_k_loop
    movif32       dstq, [esp+0x38]
    movif32    strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec             hd
    jg .pri_v_loop
    RET

.sec_only:
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
%else
    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
%endif
    ; sec_shift = damping - ulog2(sec)
    movd            m1, r6m
    tzcnt         secd, secd
    mov           dird, r7m
    xor          zerod, zerod
    sub       dampingd, secd
    cmovs     dampingd, zerod
    PSHUFB_0        m1, m7
 %if ARCH_X86_64
    lea           tapq, [tap_table]
 %else
    mov     [rsp+0x04], zerod
 %endif
    mov     [rsp+0x00], dampingq
    MOVDDUP         m7, [tapq+dampingq*8]
    lea           dirq, [tapq+dirq*2]
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
 %else
    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
 %endif
    lea           stkq, [px]
    mov             hd, %1*%2/8
.sec_v_loop:
    mov             kd, 1
 %if %1 == 4
    movq            m4, [stkq+32*0]
    movhps          m4, [stkq+32*1]
 %else
    mova            m4, [stkq+32*0]
 %endif
    pxor            m0, m0
.sec_k_loop:
    MOVDDUP         m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
 %if ARCH_X86_32
    MOVDDUP         m2, [tapq+12*8+kq*8]
 %endif
    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
    dec             kd
    jge .sec_k_loop
    movif32    strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec             hd
    jg .sec_v_loop
    RET
%endmacro
741
; Emulate pmulld (packed 32-bit multiply) on pre-sse4 targets.
; The emulation is only exact when %2 holds each factor duplicated in both
; 16-bit halves of its dword (the div_table_ssse3 layout above); then
; pmullw + (pmulhuw << 16) reconstructs the low 32 bits of the product.
; Clobbers m15 (aliased to m1 on x86-32, which lacks xmm8-15).
%macro MULLD 2
 %if cpuflag(sse4)
    pmulld          %1, %2
 %else
  %if ARCH_X86_32
   %define m15 m1
  %endif
    pmulhuw        m15, %1, %2
    pmullw          %1, %2
    pslld          m15, 16
    paddd           %1, m15
 %endif
%endmacro
755
756%macro CDEF_DIR 0
757 %if ARCH_X86_64
758cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
759    lea             r6, [strideq*3]
760    movq            m1, [srcq+strideq*0]
761    movhps          m1, [srcq+strideq*1]
762    movq            m3, [srcq+strideq*2]
763    movhps          m3, [srcq+r6       ]
764    lea           srcq, [srcq+strideq*4]
765    movq            m5, [srcq+strideq*0]
766    movhps          m5, [srcq+strideq*1]
767    movq            m7, [srcq+strideq*2]
768    movhps          m7, [srcq+r6       ]
769
770    pxor            m8, m8
771    psadbw          m9, m1, m8
772    psadbw          m2, m3, m8
773    psadbw          m4, m5, m8
774    psadbw          m6, m7, m8
775    packssdw        m9, m2
776    packssdw        m4, m6
777    packssdw        m9, m4
778
779    punpcklbw       m0, m1, m8
780    punpckhbw       m1, m8
781    punpcklbw       m2, m3, m8
782    punpckhbw       m3, m8
783    punpcklbw       m4, m5, m8
784    punpckhbw       m5, m8
785    punpcklbw       m6, m7, m8
786    punpckhbw       m7, m8
787cglobal_label .main
788    mova            m8, [pw_128]
789    psubw           m0, m8
790    psubw           m1, m8
791    psubw           m2, m8
792    psubw           m3, m8
793    psubw           m4, m8
794    psubw           m5, m8
795    psubw           m6, m8
796    psubw           m7, m8
797    psllw           m8, 3
798    psubw           m9, m8                  ; partial_sum_hv[0]
799
800    paddw           m8, m0, m1
801    paddw          m10, m2, m3
802    paddw           m8, m4
803    paddw          m10, m5
804    paddw           m8, m6
805    paddw          m10, m7
806    paddw           m8, m10                 ; partial_sum_hv[1]
807
808    pmaddwd         m8, m8
809    pmaddwd         m9, m9
810    phaddd          m9, m8
811    SWAP            m8, m9
812    MULLD           m8, [div_table%+SUFFIX+48]
813
814    pslldq          m9, m1, 2
815    psrldq         m10, m1, 14
816    pslldq         m11, m2, 4
817    psrldq         m12, m2, 12
818    pslldq         m13, m3, 6
819    psrldq         m14, m3, 10
820    paddw           m9, m0
821    paddw          m10, m12
822    paddw          m11, m13
823    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
824    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
825    pslldq         m11, m4, 8
826    psrldq         m12, m4, 8
827    pslldq         m13, m5, 10
828    psrldq         m14, m5, 6
829    paddw           m9, m11
830    paddw          m10, m12
831    paddw           m9, m13
832    paddw          m10, m14
833    pslldq         m11, m6, 12
834    psrldq         m12, m6, 4
835    pslldq         m13, m7, 14
836    psrldq         m14, m7, 2
837    paddw           m9, m11
838    paddw          m10, m12
839    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
840    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
841    pshufb         m10, [shufw_6543210x]
842    punpckhwd      m11, m9, m10
843    punpcklwd       m9, m10
844    pmaddwd        m11, m11
845    pmaddwd         m9, m9
846    MULLD          m11, [div_table%+SUFFIX+16]
847    MULLD           m9, [div_table%+SUFFIX+0]
848    paddd           m9, m11                 ; cost[0a-d]
849
850    pslldq         m10, m0, 14
851    psrldq         m11, m0, 2
852    pslldq         m12, m1, 12
853    psrldq         m13, m1, 4
854    pslldq         m14, m2, 10
855    psrldq         m15, m2, 6
856    paddw          m10, m12
857    paddw          m11, m13
858    paddw          m10, m14
859    paddw          m11, m15
860    pslldq         m12, m3, 8
861    psrldq         m13, m3, 8
862    pslldq         m14, m4, 6
863    psrldq         m15, m4, 10
864    paddw          m10, m12
865    paddw          m11, m13
866    paddw          m10, m14
867    paddw          m11, m15
868    pslldq         m12, m5, 4
869    psrldq         m13, m5, 12
870    pslldq         m14, m6, 2
871    psrldq         m15, m6, 14
872    paddw          m10, m12
873    paddw          m11, m13
874    paddw          m10, m14
875    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
876    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
877    pshufb         m11, [shufw_6543210x]
878    punpckhwd      m12, m10, m11
879    punpcklwd      m10, m11
880    pmaddwd        m12, m12
881    pmaddwd        m10, m10
882    MULLD          m12, [div_table%+SUFFIX+16]
883    MULLD          m10, [div_table%+SUFFIX+0]
884    paddd          m10, m12                 ; cost[4a-d]
885    phaddd          m9, m10                 ; cost[0a/b,4a/b]
886
887    paddw          m10, m0, m1
888    paddw          m11, m2, m3
889    paddw          m12, m4, m5
890    paddw          m13, m6, m7
891    phaddw          m0, m4
892    phaddw          m1, m5
893    phaddw          m2, m6
894    phaddw          m3, m7
895
896    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
897    pslldq          m4, m11, 2
898    psrldq          m5, m11, 14
899    pslldq          m6, m12, 4
900    psrldq          m7, m12, 12
901    pslldq         m14, m13, 6
902    psrldq         m15, m13, 10
903    paddw           m4, m10
904    paddw           m5, m7
905    paddw           m4, m6
906    paddw           m5, m15                 ; partial_sum_alt[3] right
907    paddw           m4, m14                 ; partial_sum_alt[3] left
908    pshuflw         m6, m5, q3012
909    punpckhwd       m5, m4
910    punpcklwd       m4, m6
911    pmaddwd         m5, m5
912    pmaddwd         m4, m4
913    MULLD           m5, [div_table%+SUFFIX+48]
914    MULLD           m4, [div_table%+SUFFIX+32]
915    paddd           m4, m5                  ; cost[7a-d]
916
917    pslldq          m5, m10, 6
918    psrldq          m6, m10, 10
919    pslldq          m7, m11, 4
920    psrldq         m10, m11, 12
921    pslldq         m11, m12, 2
922    psrldq         m12, 14
923    paddw           m5, m7
924    paddw           m6, m10
925    paddw           m5, m11
926    paddw           m6, m12
927    paddw           m5, m13
928    pshuflw         m7, m6, q3012
929    punpckhwd       m6, m5
930    punpcklwd       m5, m7
    ; NOTE(review): continuation of the x86-64 cdef_dir path — the function
    ; entry, the computation of the row partial sums in m0-m3, and the
    ; earlier costs held in m4/m8/m9 are above this hunk; confirm against
    ; the full file.
    ; finish cost[5]: square the interleaved partial sums and apply the
    ; per-element division weights from div_table
    pmaddwd         m6, m6
    pmaddwd         m5, m5
    MULLD           m6, [div_table%+SUFFIX+48]
    MULLD           m5, [div_table%+SUFFIX+32]
    paddd           m5, m6                  ; cost[5a-d]

    ; partial sums for the next alt direction: rows are staggered by
    ; 0/1/2/3 words (left half in m6, the words shifted out on the right
    ; collected in m7), then squared and weighted as above
    pslldq          m6, m1, 2
    psrldq          m7, m1, 14
    pslldq         m10, m2, 4
    psrldq         m11, m2, 12
    pslldq         m12, m3, 6
    psrldq         m13, m3, 10
    paddw           m6, m0
    paddw           m7, m11
    paddw           m6, m10
    paddw           m7, m13                 ; partial_sum_alt[3] right
    paddw           m6, m12                 ; partial_sum_alt[3] left
    pshuflw        m10, m7, q3012
    punpckhwd       m7, m6
    punpcklwd       m6, m10
    pmaddwd         m7, m7
    pmaddwd         m6, m6
    MULLD           m7, [div_table%+SUFFIX+48]
    MULLD           m6, [div_table%+SUFFIX+32]
    paddd           m6, m7                  ; cost[1a-d]

    ; mirror the rows (swap qword halves) to build the opposite-slope
    ; staggered sums for cost[3]
    pshufd          m0, m0, q1032
    pshufd          m1, m1, q1032
    pshufd          m2, m2, q1032
    pshufd          m3, m3, q1032

    pslldq         m10, m0, 6
    psrldq         m11, m0, 10
    pslldq         m12, m1, 4
    psrldq         m13, m1, 12
    pslldq         m14, m2, 2
    psrldq          m2, 14
    paddw          m10, m12
    paddw          m11, m13
    paddw          m10, m14
    paddw          m11, m2
    paddw          m10, m3
    pshuflw        m12, m11, q3012
    punpckhwd      m11, m10
    punpcklwd      m10, m12
    pmaddwd        m11, m11
    pmaddwd        m10, m10
    MULLD          m11, [div_table%+SUFFIX+48]
    MULLD          m10, [div_table%+SUFFIX+32]
    paddd          m10, m11                 ; cost[3a-d]

    ; gather the eight per-direction costs into two dword vectors:
    ; m9 = cost[0,4,2,6], m6 = cost[1,3,5,7]
    phaddd          m9, m8                  ; cost[0,4,2,6]
    phaddd          m6, m10
    phaddd          m5, m4
    phaddd          m6, m5                  ; cost[1,3,5,7]
    pshufd          m4, m9, q3120

    ; now find the best cost
  %if cpuflag(sse4)
    pmaxsd          m9, m6
    pshufd          m0, m9, q1032
    pmaxsd          m0, m9
    pshufd          m1, m0, q2301
    pmaxsd          m0, m1                  ; best cost
  %else
    ; pre-sse4 there is no pmaxsd: emulate the horizontal max with
    ; pcmpgtd masks and pand/pandn/por selects
    pcmpgtd         m0, m9, m6
    pand            m9, m0
    pandn           m0, m6
    por             m9, m0
    pshufd          m1, m9, q1032
    pcmpgtd         m0, m9, m1
    pand            m9, m0
    pandn           m0, m1
    por             m9, m0
    pshufd          m1, m9, q2301
    pcmpgtd         m0, m9, m1
    pand            m9, m0
    pandn           m0, m1
    por             m0, m9
  %endif

    ; get direction and variance
    ; subtract every cost from the best one: the winning direction's slot
    ; becomes zero, and the slot for dir^4 holds best - opposite
    punpckhdq       m1, m4, m6
    punpckldq       m4, m6
    psubd           m2, m0, m1
    psubd           m3, m0, m4
%if WIN64
    WIN64_RESTORE_XMM
    %define tmp rsp+stack_offset+8
%else
    ; non-WIN64: spill into the red zone below rsp (leaf function)
    %define tmp rsp-40
%endif
    mova    [tmp+0x00], m2                  ; emulate ymm in stack
    mova    [tmp+0x10], m3
    pcmpeqd         m1, m0                  ; compute best cost mask
    pcmpeqd         m4, m0
    packssdw        m4, m1
    pmovmskb       eax, m4                  ; get byte-idx from mask
    tzcnt          eax, eax
    mov            r1d, [tmp+rax*2]         ; get idx^4 complement from emulated ymm
    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
    shr            r1d, 10                  ; variance = (best - opposite cost) >> 10
    mov         [varq], r1d
1034 %else
; int cdef_dir_8bpc(const pixel *src, ptrdiff_t stride, unsigned *var)
; x86-32 variant: only 8 xmm registers are available, so the centered
; 8x8 input rows and intermediate cost vectors are spilled to the
; 96-byte stack area at esp+0x00..0x50 instead of living in m8-m15.
; Returns the direction (in eax, via the shared RET below) and writes
; the variance through *var.
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
%define base r2-shufw_6543210x
    LEA             r2, shufw_6543210x
    pxor            m0, m0
    lea       stride3q, [strideq*3]
    ; load rows 0-3, two 8-pixel rows per register
    movq            m5, [srcq+strideq*0]
    movhps          m5, [srcq+strideq*1]
    movq            m7, [srcq+strideq*2]
    movhps          m7, [srcq+stride3q]
    mova            m1, [base+pw_128]
    ; per-row byte sums (for the horizontal/vertical partial sums)
    psadbw          m2, m5, m0
    psadbw          m3, m7, m0
    packssdw        m2, m3
    ; widen to words and center around zero (pixel - 128)
    punpcklbw       m4, m5, m0
    punpckhbw       m5, m0
    punpcklbw       m6, m7, m0
    punpckhbw       m7, m0
    psubw           m4, m1
    psubw           m5, m1
    psubw           m6, m1
    psubw           m7, m1

    ; spill centered rows 0-3 (rows 4-7 will stay in m4-m7)
    mova    [esp+0x00], m4
    mova    [esp+0x10], m5
    mova    [esp+0x20], m6
    mova    [esp+0x50], m7

    ; load rows 4-7 the same way
    lea           srcq, [srcq+strideq*4]
    movq            m5, [srcq+strideq*0]
    movhps          m5, [srcq+strideq*1]
    movq            m7, [srcq+strideq*2]
    movhps          m7, [srcq+stride3q]
    psadbw          m3, m5, m0
    psadbw          m0, m7
    packssdw        m3, m0
    pxor            m0, m0
    punpcklbw       m4, m5, m0
    punpckhbw       m5, m0
    punpcklbw       m6, m7, m0
    punpckhbw       m7, m0
; secondary entry point — NOTE(review): external callers not visible in
; this hunk; confirm who jumps here (likely a high-bitdepth wrapper)
cglobal_label .main
    psubw           m4, m1
    psubw           m5, m1
    psubw           m6, m1
    psubw           m7, m1
    packssdw        m2, m3
    ; psadbw summed raw (uncentered) bytes: subtract 8*128 per row
    ; so the h/v partial sums match the centered values
    psllw           m1, 3
    psubw           m2, m1                  ; partial_sum_hv[0]
    pmaddwd         m2, m2

    ; partial_sum_hv[1]: sum of all eight centered rows
    mova            m3, [esp+0x50]
    mova            m0, [esp+0x00]
    paddw           m0, [esp+0x10]
    paddw           m1, m3, [esp+0x20]
    paddw           m0, m4
    paddw           m1, m5
    paddw           m0, m6
    paddw           m1, m7
    paddw           m0, m1                  ; partial_sum_hv[1]
    pmaddwd         m0, m0

    ; h/v costs -> [esp+0x30]
    phaddd          m2, m0
    MULLD           m2, [base+div_table%+SUFFIX+48]
    mova    [esp+0x30], m2

    ; partial_sum_diag[0]: stagger row n by n words and accumulate;
    ; m0 = elements 0-7, m1 = elements 8-14 (15th lane is zero)
    mova            m1, [esp+0x10]
    pslldq          m0, m1, 2
    psrldq          m1, 14
    paddw           m0, [esp+0x00]
    pslldq          m2, m3, 6
    psrldq          m3, 10
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x20]
    pslldq          m2, m3, 4
    psrldq          m3, 12
    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
    pslldq          m2, m4, 8
    psrldq          m3, m4, 8
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m5, 10
    psrldq          m3, m5, 6
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m6, 12
    psrldq          m3, m6, 4
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m7, 14
    psrldq          m3, m7, 2
    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
    mova            m3, [esp+0x50]
    ; reverse the upper elements so both halves use the same weights
    pshufb          m1, [base+shufw_6543210x]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+16]
    MULLD           m0, [base+div_table%+SUFFIX+ 0]
    paddd           m0, m2                  ; cost[0a-d]
    mova    [esp+0x40], m0

    ; partial_sum_diag[1]: same scheme with the opposite stagger
    mova            m1, [esp+0x00]
    pslldq          m0, m1, 14
    psrldq          m1, 2
    paddw           m0, m7
    pslldq          m2, m3, 8
    psrldq          m3, 8
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x20]
    pslldq          m2, m3, 10
    psrldq          m3, 6
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x10]
    pslldq          m2, m3, 12
    psrldq          m3, 4
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m4, 6
    psrldq          m3, m4, 10
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m5, 4
    psrldq          m3, m5, 12
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m6, 2
    psrldq          m3, m6, 14
    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
    mova            m3, [esp+0x50]
    pshufb          m1, [base+shufw_6543210x]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+16]
    MULLD           m0, [base+div_table%+SUFFIX+ 0]
    paddd           m0, m2                  ; cost[4a-d]
    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
    mova    [esp+0x30], m1

    ; build pairwise row sums for the four "alt" (knee-shaped) directions
    phaddw          m0, [esp+0x00], m4
    phaddw          m1, [esp+0x10], m5
    paddw           m4, m5
    mova            m2, [esp+0x20]
    paddw           m5, m2, m3
    phaddw          m2, m6
    paddw           m6, m7
    phaddw          m3, m7
    mova            m7, [esp+0x00]
    paddw           m7, [esp+0x10]
    mova    [esp+0x00], m0
    mova    [esp+0x10], m1
    mova    [esp+0x20], m2

    ; cost[7]: stagger the paired rows and square/weight as before
    pslldq          m1, m4, 4
    pslldq          m2, m6, 6
    pslldq          m0, m5, 2
    paddw           m1, m2
    paddw           m0, m7
    psrldq          m2, m5, 14
    paddw           m0, m1                  ; partial_sum_alt[3] left
    psrldq          m1, m4, 12
    paddw           m1, m2
    psrldq          m2, m6, 10
    paddw           m1, m2                  ; partial_sum_alt[3] right
    pshuflw         m1, m1, q3012
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m2                  ; cost[7a-d]
    mova    [esp+0x40], m0

    ; cost[5]: opposite stagger of the same paired rows
    pslldq          m0, m7, 6
    psrldq          m7, 10
    pslldq          m1, m5, 4
    psrldq          m5, 12
    pslldq          m2, m4, 2
    psrldq          m4, 14
    paddw           m0, m6
    paddw           m7, m5
    paddw           m0, m1
    paddw           m7, m4
    paddw           m0, m2
    pshuflw         m2, m7, q3012
    punpckhwd       m7, m0
    punpcklwd       m0, m2
    pmaddwd         m7, m7
    pmaddwd         m0, m0
    MULLD           m7, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m7                  ; cost[5a-d]
    ; row 3 at 0x50 was last read above — slot reused for cost[5]
    mova    [esp+0x50], m0

    ; cost[1] from the column-paired sums spilled at 0x00/0x10/0x20
    mova            m7, [esp+0x10]
    mova            m2, [esp+0x20]
    pslldq          m0, m7, 2
    psrldq          m7, 14
    pslldq          m4, m2, 4
    psrldq          m2, 12
    pslldq          m5, m3, 6
    psrldq          m6, m3, 10
    paddw           m0, [esp+0x00]
    paddw           m7, m2
    paddw           m4, m5
    paddw           m7, m6                  ; partial_sum_alt[3] right
    paddw           m0, m4                  ; partial_sum_alt[3] left
    pshuflw         m2, m7, q3012
    punpckhwd       m7, m0
    punpcklwd       m0, m2
    pmaddwd         m7, m7
    pmaddwd         m0, m0
    MULLD           m7, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m7                  ; cost[1a-d]
    SWAP            m0, m4

    ; mirror (swap qword halves) for the last direction, cost[3]
    pshufd          m0, [esp+0x00], q1032
    pshufd          m1, [esp+0x10], q1032
    pshufd          m2, [esp+0x20], q1032
    pshufd          m3, m3, q1032
    mova    [esp+0x00], m4                  ; stash cost[1a-d]

    pslldq          m4, m0, 6
    psrldq          m0, 10
    pslldq          m5, m1, 4
    psrldq          m1, 12
    pslldq          m6, m2, 2
    psrldq          m2, 14
    paddw           m4, m3
    paddw           m0, m1
    paddw           m5, m6
    paddw           m0, m2
    paddw           m4, m5
    pshuflw         m2, m0, q3012
    punpckhwd       m0, m4
    punpcklwd       m4, m2
    pmaddwd         m0, m0
    pmaddwd         m4, m4
    MULLD           m0, [base+div_table%+SUFFIX+48]
    MULLD           m4, [base+div_table%+SUFFIX+32]
    paddd           m4, m0                   ; cost[3a-d]

    ; gather: m0 = cost[0,4,2,6], m1 = cost[1,3,5,7]
    mova            m1, [esp+0x00]
    mova            m2, [esp+0x50]
    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
    phaddd          m1, m4
    phaddd          m2, [esp+0x40]          ; cost[1,3,5,7]
    phaddd          m1, m2
    pshufd          m2, m0, q3120

    ; now find the best cost
  %if cpuflag(sse4)
    pmaxsd          m0, m1
    pshufd          m3, m0, q1032
    pmaxsd          m3, m0
    pshufd          m0, m3, q2301
    pmaxsd          m0, m3
  %else
    ; pre-sse4: emulate the horizontal max with pcmpgtd selects
    pcmpgtd         m3, m0, m1
    pand            m0, m3
    pandn           m3, m1
    por             m0, m3
    pshufd          m4, m0, q1032
    pcmpgtd         m3, m0, m4
    pand            m0, m3
    pandn           m3, m4
    por             m0, m3
    pshufd          m4, m0, q2301
    pcmpgtd         m3, m0, m4
    pand            m0, m3
    pandn           m3, m4
    por             m0, m3
  %endif

    ; get direction and variance
    ; var pointer was not loaded at entry (cglobal loaded only 2 args):
    ; fetch it from the stack now
    mov           vard, varm
    punpckhdq       m3, m2, m1
    punpckldq       m2, m1
    psubd           m1, m0, m3
    psubd           m4, m0, m2
    mova    [esp+0x00], m1                  ; emulate ymm in stack
    mova    [esp+0x10], m4
    pcmpeqd         m3, m0                  ; compute best cost mask
    pcmpeqd         m2, m0
    packssdw        m2, m3
    pmovmskb       eax, m2                  ; get byte-idx from mask
    tzcnt          eax, eax
    mov            r1d, [esp+eax*2]         ; get idx^4 complement from emulated ymm
    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
    shr            r1d, 10                  ; variance = (best - opposite cost) >> 10
    mov         [vard], r1d
1337 %endif
1338
1339    RET
1340%endmacro
1341
; Instantiate the public entry points for each SIMD level. INIT_XMM
; (from x86inc.asm) selects xmm registers and makes cglobal append the
; ISA suffix to every symbol defined inside the macros below.
; NOTE(review): the CDEF_FILTER arguments are presumably block width and
; height — confirm against the macro definition above this hunk.
INIT_XMM sse4
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM ssse3
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

; No CDEF_DIR for plain sse2: the direction search relies on pshufb
; (an SSSE3 instruction; see the shufw_6543210x reversal above).
INIT_XMM sse2
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
1358