; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA

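; Per-direction tap offsets. Each row holds the two primary-tap byte offsets
; {off_k0, off_k1} for one direction, encoded as row*stride + column*2
; (2 bytes per pixel) into the padded px buffer. Rows 8-11 duplicate rows
; 0-3 so that the secondary-tap lookups at dir+2 and dir+4 below stay in
; range without masking.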
%macro DIR_TABLE 1 ; stride
    db  1 * %1 + 0,  2 * %1 + 0
    db  1 * %1 + 0,  2 * %1 - 2
    db -1 * %1 + 2, -2 * %1 + 4
    db  0 * %1 + 2, -1 * %1 + 4
    db  0 * %1 + 2,  0 * %1 + 4
    db  0 * %1 + 2,  1 * %1 + 4
    db  1 * %1 + 2,  2 * %1 + 4
    db  1 * %1 + 0,  2 * %1 + 2
    db  1 * %1 + 0,  2 * %1 + 0
    db  1 * %1 + 0,  2 * %1 - 2
    db -1 * %1 + 2, -2 * %1 + 4
    db  0 * %1 + 2, -1 * %1 + 4
%endmacro

dir_table4: DIR_TABLE 16
dir_table8: DIR_TABLE 32
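; Primary tap weights as broadcastable dword pairs: {4,4},{3,3} for tap k0
; followed by {2,2},{3,3} for tap k1; bit 2 of the primary strength selects
; between the (4,2) and (3,3) tap sets.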
pri_taps:   dw  4, 4, 3, 3, 2, 2, 3, 3

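; pmulhuw multipliers scaling pixels down to 8-bit precision for the
; direction search: 1<<14 (i.e. >>2) for 10 bpc, 1<<12 (i.e. >>4) for 12 bpc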
dir_shift:  times 2 dw 0x4000
            times 2 dw 0x1000

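; pw_2048 is the pmulhrsw rounding multiplier: (x*2048+16384)>>15 = (x+8)>>4.
; pw_m16384 marks unavailable border pixels; 0xc000 is above any valid
; sample for pminuw yet negative for pmaxsw, so padding never wins the
; min/max clip in .pri_sec.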
pw_2048:    times 2 dw 2048
pw_m16384:  times 2 dw -16384

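; the direction search is shared with the 8 bpc code (see cdef_dir_16bpc below)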
cextern cdef_dir_8bpc_avx2.main

SECTION .text

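; Filter body shared by the 4x4/4x8/8x8 entry points below, which have
; already copied the block into the padded px buffer and loaded r8 with the
; direction table (base). Dispatches to one of three tail loops depending on
; which of the primary and secondary strengths are nonzero; the shift amounts
; damping - log2(strength) are spilled to the pri_shift/sec_shift slots.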
%macro CDEF_FILTER 2 ; w, h
    DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    mov           dird, r7m
    vpbroadcastd    m8, [base+pw_2048]
    lea           dirq, [base+dir_table%1+dirq*2]
    test          prid, prid
    jz .sec_only
%if WIN64
    vpbroadcastw    m6, prim
    movaps  [rsp+16*0], xmm9
    movaps  [rsp+16*1], xmm10
%else
    movd           xm6, prid
    vpbroadcastw    m6, xm6
%endif
    lzcnt      pridmpd, prid
    rorx          tmpd, prid, 2
    cmp     dword r10m, 0xfff ; if (bpc == 12)
    cmove         prid, tmpd  ;     pri >>= 2
    mov           tmpd, r8m   ; damping
    and           prid, 4
    sub           tmpd, 31
    vpbroadcastd    m9, [base+pri_taps+priq+8*0]
    vpbroadcastd   m10, [base+pri_taps+priq+8*1]
    test          secd, secd
    jz .pri_only
%if WIN64
    movaps         r8m, xmm13
    vpbroadcastw   m13, secm
    movaps         r4m, xmm11
    movaps         r6m, xmm12
%else
    movd           xm0, secd
    vpbroadcastw   m13, xm0
%endif
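    ; shift = max(0, damping - log2(strength)): lzcnt gives 31-log2(x) and
    ; tmpd holds damping-31, so the adds below produce damping-log2(x),
    ; clamped to zero via cmovs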
    lzcnt         secd, secd
    xor           prid, prid
    add        pridmpd, tmpd
    cmovs      pridmpd, prid
    add           secd, tmpd
    lea           tmpq, [px]
    mov    [pri_shift], pridmpq
    mov    [sec_shift], secq
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps       xmm11, r4m
    movaps       xmm12, r6m
    movaps       xmm13, r8m
%endif
    jmp .pri_end
.pri_only:
    add        pridmpd, tmpd
    cmovs      pridmpd, secd
    lea           tmpq, [px]
    mov    [pri_shift], pridmpq
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.pri_end:
%if WIN64
    movaps        xmm9, [rsp+16*0]
    movaps       xmm10, [rsp+16*1]
%endif
.end:
    RET
.sec_only:
    mov           tmpd, r8m ; damping
%if WIN64
    vpbroadcastw    m6, secm
%else
    movd           xm6, secd
    vpbroadcastw    m6, xm6
%endif
    tzcnt         secd, secd
    sub           tmpd, secd
    mov    [sec_shift], tmpq
    lea           tmpq, [px]
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
%if %1 == %2
ALIGN function_align
.pri:
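    ; primary taps only: two taps sampled at +off and -off along the
    ; detected direction, weighted by pri_taps; each call filters one
    ; 16-pixel strip (four 4-wide rows, or two 8-wide rows)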
    movsx         offq, byte [dirq+4]    ; off_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]      ; 0 2 1 3
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+5]    ; off_k1
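    ; constrain(diff) = sign(diff)*min(|diff|, max(0, strength - (|diff|>>shift)));
    ; computed as pabsw/psrlw/psubusw (the unsigned saturation supplies the
    ; max(0, ...)), pminsw, then psignw to reapply the sign; the same
    ; five-instruction pattern recurs throughout this file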
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m0, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    psignw          m2, m3               ; constrain(diff_k0p1)
    pabsw           m3, m4               ; adiff_k1p0
    paddw           m0, m2               ; constrain(diff_k0)
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m0, m9               ; pri_tap_k0
    pmullw          m7, m10              ; pri_tap_k1
    paddw           m0, m7               ; sum
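    ; dst = px + ((8 + sum - (sum < 0)) >> 4): the psraw/paddw pair
    ; subtracts 1 from negative sums, and pmulhrsw by 2048 is a rounding >>4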
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    add           tmpq, 32*2
    paddw           m0, m1
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
ALIGN function_align
.sec:
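    ; secondary taps only: four taps, two each along the two cross
    ; directions on either side of the primary one, with weights 2 for k0
    ; (applied via the paddw m0, m0 doubling) and 1 for k1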
    movsx         offq, byte [dirq+8]    ; off1_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+0]    ; off2_k0
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+9]    ; off1_k1
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    psignw          m2, m3               ; constrain(diff_k0s1)
    pabsw           m3, m4               ; adiff_k0s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+1]    ; off2_k1
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k0s3)
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    paddw           m0, m7
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    psignw          m2, m3               ; constrain(diff_k1s1)
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k1s3)
    paddw           m0, m4               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    add           tmpq, 32*2
    paddw           m0, m1
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
ALIGN function_align
.pri_sec:
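    ; combined primary+secondary pass; additionally tracks the signed max
    ; (m11) and unsigned min (m12) of all sampled pixels so the filtered
    ; result can be clipped to the local pixel range, a pairing under which
    ; the -16384 border padding can never become the min or the max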
    movsx         offq, byte [dirq+8]    ; off2_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
%else
    mova           xm1, [dstq+strideq*0]
    vinserti128     m1, [dstq+strideq*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+0]    ; off3_k0
    pmaxsw         m11, m2, m3
    pminuw         m12, m2, m3
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m13, m5
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m13, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+9]    ; off2_k1
    psignw          m2, m3               ; constrain(diff_k0s1)
    pmaxsw         m11, m4
    pminuw         m12, m4
    pmaxsw         m11, m5
    pminuw         m12, m5
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    paddw           m0, m2
    pabsw           m3, m4               ; adiff_k0s2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m13, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m13, m3
    pminsw          m4, m2
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+1]    ; off3_k1
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k0s3)
    pmaxsw         m11, m2
    pminuw         m12, m2
    pmaxsw         m11, m3
    pminuw         m12, m3
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m13, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m13, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+4]    ; off1_k0
    paddw           m0, m7
    psignw          m2, m3               ; constrain(diff_k1s1)
    pmaxsw         m11, m4
    pminuw         m12, m4
    pmaxsw         m11, m5
    pminuw         m12, m5
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m13, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m13, m3
    pminsw          m4, m2
    paddw           m0, m7
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+5]    ; off1_k1
    psignw          m4, m5               ; constrain(diff_k1s3)
    pmaxsw         m11, m2
    pminuw         m12, m2
    pmaxsw         m11, m3
    pminuw         m12, m3
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    paddw           m0, m4
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m7, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    psignw          m2, m3               ; constrain(diff_k0p1)
    paddw           m7, m2               ; constrain(diff_k0)
    pmaxsw         m11, m4
    pminuw         m12, m4
    pmaxsw         m11, m5
    pminuw         m12, m5
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    pabsw           m3, m4               ; adiff_k1p0
    pmullw          m7, m9               ; pri_tap_k0
    paddw           m0, m7
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m7, m10              ; pri_tap_k1
    paddw           m0, m7               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    add           tmpq, 32*2
    pmaxsw         m11, m1
    pminuw         m12, m1
    paddw           m0, m1
    pminsw          m0, m11
    pmaxsw          m0, m12
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
%endif
%endmacro

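; The entry points below copy the source block into a stack buffer (px) with
; a two-pixel border on each side, substituting -16384 for pixels beyond any
; unavailable edge, then expand CDEF_FILTER for their block size.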
INIT_YMM avx2
cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
                                                pri, sec, edge
%if WIN64
    %define         px  rsp+16*6
    %define       offq  r8
    %define  pri_shift  rsp+16*2
    %define  sec_shift  rsp+16*3
%else
    %define         px  rsp+16*4
    %define       offq  r4
    %define  pri_shift  rsp+16*0
    %define  sec_shift  rsp+16*1
%endif
    %define       base  r8-dir_table4
    mov          edged, r9m
    lea             r8, [dir_table4]
    movu           xm0, [dstq+strideq*0]
    movu           xm1, [dstq+strideq*1]
    lea             r9, [strideq*3]
    movu           xm2, [dstq+strideq*2]
    movu           xm3, [dstq+r9       ]
    vpbroadcastd    m7, [base+pw_m16384]
    mova   [px+16*0+0], xm0
    mova   [px+16*1+0], xm1
    mova   [px+16*2+0], xm2
    mova   [px+16*3+0], xm3
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movu           xm0, [topq+strideq*0]
    movu           xm1, [topq+strideq*1]
    mova   [px-16*2+0], xm0
    mova   [px-16*1+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd           xm0, [topq+strideq*0-4]
    movd           xm1, [topq+strideq*1-4]
    movd   [px-16*2-4], xm0
    movd   [px-16*1-4], xm1
    jmp .top_done
.no_top:
    mova   [px-16*2+0], m7
.top_no_left:
    movd   [px-16*2-4], xm7
    movd   [px-16*1-4], xm7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movu           xm0, [botq+strideq*0]
    movu           xm1, [botq+strideq*1]
    mova   [px+16*4+0], xm0
    mova   [px+16*5+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd           xm0, [botq+strideq*0-4]
    movd           xm1, [botq+strideq*1-4]
    movd   [px+16*4-4], xm0
    movd   [px+16*5-4], xm1
    jmp .bottom_done
.no_bottom:
    mova   [px+16*4+0], m7
.bottom_no_left:
    movd   [px+16*4-4], xm7
    movd   [px+16*5-4], xm7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movd           xm0, [leftq+4*0]
    movd           xm1, [leftq+4*1]
    movd           xm2, [leftq+4*2]
    movd           xm3, [leftq+4*3]
    movd   [px+16*0-4], xm0
    movd   [px+16*1-4], xm1
    movd   [px+16*2-4], xm2
    movd   [px+16*3-4], xm3
    jmp .left_done
.no_left:
    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
    CDEF_FILTER      4, 4

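; Same setup as 4x4 with eight source rows; px keeps the 16-byte row pitch.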
cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
                                                pri, sec, edge
    mov          edged, r9m
    movu           xm0, [dstq+strideq*0]
    movu           xm1, [dstq+strideq*1]
    lea             r9, [strideq*3]
    movu           xm2, [dstq+strideq*2]
    movu           xm3, [dstq+r9       ]
    lea             r6, [dstq+strideq*4]
    movu           xm4, [r6  +strideq*0]
    movu           xm5, [r6  +strideq*1]
    movu           xm6, [r6  +strideq*2]
    movu           xm7, [r6  +r9       ]
    lea             r8, [dir_table4]
    mova   [px+16*0+0], xm0
    mova   [px+16*1+0], xm1
    mova   [px+16*2+0], xm2
    mova   [px+16*3+0], xm3
    mova   [px+16*4+0], xm4
    mova   [px+16*5+0], xm5
    mova   [px+16*6+0], xm6
    mova   [px+16*7+0], xm7
    vpbroadcastd    m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movu           xm0, [topq+strideq*0]
    movu           xm1, [topq+strideq*1]
    mova   [px-16*2+0], xm0
    mova   [px-16*1+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd           xm0, [topq+strideq*0-4]
    movd           xm1, [topq+strideq*1-4]
    movd   [px-16*2-4], xm0
    movd   [px-16*1-4], xm1
    jmp .top_done
.no_top:
    mova   [px-16*2+0], m7
.top_no_left:
    movd   [px-16*2-4], xm7
    movd   [px-16*1-4], xm7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movu           xm0, [botq+strideq*0]
    movu           xm1, [botq+strideq*1]
    mova   [px+16*8+0], xm0
    mova   [px+16*9+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd           xm0, [botq+strideq*0-4]
    movd           xm1, [botq+strideq*1-4]
    movd   [px+16*8-4], xm0
    movd   [px+16*9-4], xm1
    jmp .bottom_done
.no_bottom:
    mova   [px+16*8+0], m7
.bottom_no_left:
    movd   [px+16*8-4], xm7
    movd   [px+16*9-4], xm7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movd           xm0, [leftq+4*0]
    movd           xm1, [leftq+4*1]
    movd           xm2, [leftq+4*2]
    movd           xm3, [leftq+4*3]
    movd   [px+16*0-4], xm0
    movd   [px+16*1-4], xm1
    movd   [px+16*2-4], xm2
    movd   [px+16*3-4], xm3
    movd           xm0, [leftq+4*4]
    movd           xm1, [leftq+4*5]
    movd           xm2, [leftq+4*6]
    movd           xm3, [leftq+4*7]
    movd   [px+16*4-4], xm0
    movd   [px+16*5-4], xm1
    movd   [px+16*6-4], xm2
    movd   [px+16*7-4], xm3
    jmp .left_done
.no_left:
    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER      4, 8

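; 8x8 uses a 32-byte row pitch; the full-width ymm loads also pick up the
; two right-border pixels along with each block row when they are available.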
cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
                                               pri, sec, edge
%if WIN64
    %define         px  rsp+32*4
%else
    %define         px  rsp+32*3
%endif
    %define       base  r8-dir_table8
    mov          edged, r9m
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             r6, [dstq+strideq*2]
    movu            m2, [r6  +strideq*0]
    movu            m3, [r6  +strideq*1]
    lea             r6, [r6  +strideq*2]
    movu            m4, [r6  +strideq*0]
    movu            m5, [r6  +strideq*1]
    lea             r6, [r6  +strideq*2]
    movu            m6, [r6  +strideq*0]
    movu            m7, [r6  +strideq*1]
    lea             r8, [dir_table8]
    mova   [px+32*0+0], m0
    mova   [px+32*1+0], m1
    mova   [px+32*2+0], m2
    mova   [px+32*3+0], m3
    mova   [px+32*4+0], m4
    mova   [px+32*5+0], m5
    mova   [px+32*6+0], m6
    mova   [px+32*7+0], m7
    vpbroadcastd    m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd           xm0, [topq+strideq*0-4]
    movd           xm1, [topq+strideq*1-4]
    movd   [px-32*2-4], xm0
    movd   [px-32*1-4], xm1
    jmp .top_done
.no_top:
    mova   [px-32*2+0], m7
    mova   [px-32*1+0], m7
.top_no_left:
    movd   [px-32*2-4], xm7
    movd   [px-32*1-4], xm7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova   [px+32*8+0], m0
    mova   [px+32*9+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd           xm0, [botq+strideq*0-4]
    movd           xm1, [botq+strideq*1-4]
    movd   [px+32*8-4], xm0
    movd   [px+32*9-4], xm1
    jmp .bottom_done
.no_bottom:
    mova   [px+32*8+0], m7
    mova   [px+32*9+0], m7
.bottom_no_left:
    movd   [px+32*8-4], xm7
    movd   [px+32*9-4], xm7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movd           xm0, [leftq+4*0]
    movd           xm1, [leftq+4*1]
    movd           xm2, [leftq+4*2]
    movd           xm3, [leftq+4*3]
    movd   [px+32*0-4], xm0
    movd   [px+32*1-4], xm1
    movd   [px+32*2-4], xm2
    movd   [px+32*3-4], xm3
    movd           xm0, [leftq+4*4]
    movd           xm1, [leftq+4*5]
    movd           xm2, [leftq+4*6]
    movd           xm3, [leftq+4*7]
    movd   [px+32*4-4], xm0
    movd   [px+32*5-4], xm1
    movd   [px+32*6-4], xm2
    movd   [px+32*7-4], xm3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER      8, 8

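; Scale the 8x8 block down to 8-bit precision with pmulhuw (see dir_shift),
; then tail-call into the 8 bpc direction search.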
cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
    lea             r6, [dir_shift]
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    vpbroadcastd    m4, [r6+bdmaxq*4]
    lea             r6, [strideq*3]
    mova           xm0, [srcq+strideq*0]
    mova           xm1, [srcq+strideq*1]
    mova           xm2, [srcq+strideq*2]
    mova           xm3, [srcq+r6       ]
    lea           srcq, [srcq+strideq*4]
    vinserti128     m0, [srcq+r6       ], 1
    vinserti128     m1, [srcq+strideq*2], 1
    vinserti128     m2, [srcq+strideq*1], 1
    vinserti128     m3, [srcq+strideq*0], 1
    REPX {pmulhuw x, m4}, m0, m1, m2, m3
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main

%endif ; ARCH_X86_64