; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

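; CDEF (Constrained Directional Enhancement Filter) functions for high bit
; depth (10/12-bit), AVX-512 (Ice Lake / avx512icl) implementation
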
%if ARCH_X86_64

SECTION_RODATA 64

cdef_perm:     db  2, 18, 16, 18, 24, 19,  0, 19, 25, 20,  1, 20, 26, 21,  2, 21
               db  3, 26,  3, 26, 28, 27,  4, 27, 29, 28, -1, 28, 30, 29, -1, 29
               db  0, 34, 17, 34, 16, 35,  8, 35, 17, 36,  9, 36, 18, 37, 10, 37
               db  1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
end_perm4:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
edge_mask4:    dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
               dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
               dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
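; edge flag bits: 1 = left, 2 = right, 4 = top, 8 = bottom (CDEF_HAVE_* in the C code)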
pri_taps4:     dw 64, 32, 48, 48                 ; left-shifted by 4
cdef_dirs4:    dw  8, 16,  8, 15, -7,-14,  1, -6
               dw  1,  2,  1, 10,  9, 18,  8, 17
               dw  8, 16,  8, 15, -7,-14,  1, -6
deint_shuf:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
cdef_dirs8:    db 32, 64, 32, 62,-30,-60,  2,-28
               db  2,  4,  2, 36, 34, 68, 32, 66
               db 32, 64, 32, 62,-30,-60,  2,-28
pri_taps8:     dw  4,  4,  2,  2,  3,  3,  3,  3
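; primary tap pairs {4, 2} and {3, 3}, selected via 'and prid, 4' in the filter
; functions below; pri_taps4 above holds the same pairs left-shifted by 4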
sec_taps4:     dw 32, 16
pw_m16384:     times 2 dw -16384
pw_2048:       times 2 dw 2048
pd_268435568:  dd 268435568                      ; (1 << 28) + (7 << 4)
edge_mask8:    dw 0x2121, 0x2020, 0x0101

SECTION .text

%macro CONSTRAIN 7 ; dst, p, px, zero, thresh, shift, tmp
    psubw           %1, %2, %3
    pabsw           %1, %1
    vpcmpgtw        k1, %3, %2
    vpsrlvw         %7, %1, %6
    psubusw         %7, %5, %7
    pminsw          %1, %7
    vpsubw      %1{k1}, %4, %1
%endmacro
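; The macro above is the vector form of CDEF's scalar constrain() operation;
; as a rough C sketch (illustrative names, not part of this file):
;
;   int constrain(int diff, int threshold, int shift) {
;       int adiff = abs(diff);
;       int v = min(adiff, max(0, threshold - (adiff >> shift)));
;       return diff < 0 ? -v : v;
;   }
;
; k1 = (px > p) records the sign of diff = p - px; the final masked vpsubw
; negates the clamped value where diff was negative.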

; t0 t1 t2 t3 t4 t5 t6 t7   L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7   L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05   b0 b1 b2 b3 b4 b5 b6 b7
; L2 L3 10 11 12 13 14 15   B0 B1 B2 B3 B4 B5 B6 B7
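;
; legend: t/T = the two rows above the block, b/B = the two rows below,
; L* = pixels from the left-edge buffer, two-digit entries = block pixels
; (row, then column); the 4x8 diagram further down uses the same convention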

INIT_ZMM avx512icl
cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
                                         pri, sec, dir, damping, edge
%define base r6-cdef_dirs4
    lea             r6, [cdef_dirs4]
    movu           xm3, [dstq+strideq*0]
    vinserti32x4   ym3, [dstq+strideq*1], 1
    mova           xm2, [leftq]
    lea             r2, [dstq+strideq*2]
    vinserti32x4    m3, [r2+strideq*0], 2
    mova            m5, [base+cdef_perm]
    vinserti32x4    m3, [r2+strideq*1], 3
    vpermt2d        m2, m5, m3
    vinserti32x4    m1, m2, [topq+strideq*0-4], 0
    vinserti32x4    m1, [topq+strideq*1-4], 1
    mov            r3d, edgem
    movifnidn     prid, prim
    punpcklwd       m3, m3     ; px
    psrlw           m5, 8
    vpbroadcastd    m0, [base+pd_268435568]
    pxor           m12, m12
    cmp            r3d, 0x0f
    jne .mask_edges
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
.main:
    test          prid, prid
    jz .sec_only
    lzcnt          r4d, prid
    rorx           r3d, prid, 2
    vpbroadcastw   m13, prim
    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    cmove         prid, r3d    ;     pri >>= 2
    mov            r3d, dampingm
    and           prid, 4
    sub            r3d, 31
    vpbroadcastd   m15, [base+pri_taps4+priq]
    xor           prid, prid
    add            r4d, r3d
    cmovns        prid, r4d    ; pri_shift
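    ; i.e. pri_shift = max(0, damping - ulog2(pri))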
    mov            r4d, dirm
    vpbroadcastw   m14, prid
    mov            r5d, secm
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4]
    call .constrain
    test           r5d, r5d
    jz .end_no_clip
    lzcnt          r5d, r5d
    vpbroadcastw   m13, secm
    add            r3d, r5d
    pminuw          m6, m3, m8
    pmaxsw          m7, m3, m8
    pminuw          m6, m9
    pmaxsw          m7, m9
    call .constrain_sec
    pminuw          m6, m8
    pmaxsw          m7, m8
    pminuw          m6, m9
    pmaxsw          m7, m9
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call .constrain
    pminuw          m6, m8
    pmaxsw          m7, m8
    pminuw          m6, m9
    pmaxsw          m7, m9
    psrldq          m8, m6, 2
    vpshldd         m3, m0, 8
    psrldq          m9, m7, 2
    paddd           m0, m3
    pminuw          m6, m8
    psrldq          m0, 1
    pmaxsw          m7, m9
    pmaxsw          m0, m6
    pminsw          m0, m7
    vpmovdw        ym0, m0
    jmp .end
.sec_only:
    tzcnt          r5d, secm
    mov            r3d, dampingm
    vpbroadcastw   m13, secm
    mov            r4d, dirm
    sub            r3d, r5d    ; sec_shift
    call .constrain_sec
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call .constrain
.end_no_clip:
    mova           ym1, [base+end_perm4]
    vpshldd         m3, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    paddd           m0, m3     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    vpermb          m0, m1, m0
.end:
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4  xm0, ym0, 1
    movq   [r2+strideq*0], xm0
    movhps [r2+strideq*1], xm0
    RET
.mask_edges:
    vpbroadcastd    m6, [base+pw_m16384]
    test           r3b, 0x08
    jz .mask_edges_no_bottom  ; avoid buffer overread
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
    kmovw           k1, [base+edge_mask4-8+r3*2]
    jmp .mask_edges_main
.mask_edges_no_bottom:
    kmovw           k1, [base+edge_mask4+8+r3*2]
.mask_edges_main:
    or             r3d, 0x04
    vmovdqa32   m1{k1}, m6     ; edge pixels = -16384
    kmovw           k1, [base+edge_mask4-8+r3*2]
    vmovdqa32   m2{k1}, m6
    jmp .main
.constrain_sec:
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
    vpbroadcastw   m14, r3d
    vpbroadcastd   m15, [base+sec_taps4]
.constrain:
    paddw           m8, m5, m9
    vpermi2w        m8, m1, m2 ; k0p0 k1p0
    psubw           m9, m5, m9
    vpermi2w        m9, m1, m2 ; k0p1 k1p1
    CONSTRAIN      m10, m8, m3, m12, m13, m14, m11
    vpdpwssd        m0, m10, m15
    CONSTRAIN      m10, m9, m3, m12, m13, m14, m11
    vpdpwssd        m0, m10, m15
    ret

; t0 t1 t2 t3 t4 t5 t6 t7   L4 L5 20 21 22 23 24 25   Lc Ld 60 61 62 63 64 65
; T0 T1 T2 T3 T4 T5 T6 T7   L6 L7 30 31 32 33 34 35   Le Lf 70 71 72 73 74 75
; L0 L1 00 01 02 03 04 05   L8 L9 40 41 42 43 44 45   b0 b1 b2 b3 b4 b5 b6 b7
; L2 L3 10 11 12 13 14 15   La Lb 50 51 52 53 54 55   B0 B1 B2 B3 B4 B5 B6 B7

cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
                                         pri, sec, dir, damping, edge
    lea             r6, [cdef_dirs4]
    movu          xm18, [dstq+strideq*0]
    vinserti128   ym18, [dstq+strideq*1], 1
    mova           xm1, [leftq+16*0]
    mova           xm2, [leftq+16*1]
    lea             r2, [strideq*3]
    vinserti32x4   m18, [dstq+strideq*2], 2
    mova            m5, [base+cdef_perm]
    vinserti32x4   m18, [dstq+r2       ], 3
    vpermt2d        m1, m5, m18
    vinserti32x4    m0, m1, [topq+strideq*0-4], 0
    vinserti32x4    m0, [topq+strideq*1-4], 1
    lea             r3, [dstq+strideq*4]
    movu          xm19, [r3+strideq*0]
    vinserti128   ym19, [r3+strideq*1], 1
    vinserti32x4   m19, [r3+strideq*2], 2
    vinserti32x4   m19, [r3+r2       ], 3
    mov            r3d, edgem
    movifnidn     prid, prim
    vpermt2d        m2, m5, m19
    vpbroadcastd   m16, [base+pd_268435568]
    pxor           m12, m12
    punpcklwd      m18, m18    ; px (top)
    psrlw           m5, 8
    punpcklwd      m19, m19    ; px (bottom)
    mova           m17, m16
    vshufi32x4      m1, m2, q3210
    cmp            r3d, 0x0f
    jne .mask_edges
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
.main:
    test          prid, prid
    jz .sec_only
    lzcnt          r4d, prid
    rorx           r3d, prid, 2
    vpbroadcastw   m13, prim
    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    cmove         prid, r3d    ;     pri >>= 2
    mov            r3d, dampingm
    and           prid, 4
    sub            r3d, 31
    vpbroadcastd   m15, [base+pri_taps4+priq]
    xor           prid, prid
    add            r4d, r3d
    cmovns        prid, r4d    ; pri_shift
    mov            r4d, dirm
    vpbroadcastw   m14, prid
    mov            r5d, secm
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4]
    call .constrain
    test           r5d, r5d
    jz .end_no_clip
    lzcnt          r5d, r5d
    vpbroadcastw   m13, secm
    add            r3d, r5d
    pminuw          m3, m18, m6
    pmaxsw          m4, m18, m6
    pminuw         m20, m19, m7
    pmaxsw         m21, m19, m7
    pminuw          m3, m8
    pmaxsw          m4, m8
    pminuw         m20, m9
    pmaxsw         m21, m9
    call .constrain_sec
    pminuw          m3, m6
    pmaxsw          m4, m6
    pminuw         m20, m7
    pmaxsw         m21, m7
    pminuw          m3, m8
    pmaxsw          m4, m8
    pminuw         m20, m9
    pmaxsw         m21, m9
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call .constrain
    pminuw          m3, m6
    pmaxsw          m4, m6
    mov             r3, 0xcccccccccccccccc
    pminuw         m20, m7
    pmaxsw         m21, m7
    kmovq           k1, r3
    pminuw          m3, m8
    pmaxsw          m4, m8
    pminuw         m20, m9
    pmaxsw         m21, m9
    vbroadcasti32x4 m0, [base+deint_shuf]
    vpshldd         m6, m20, m3, 16
    vmovdqu8    m3{k1}, m20
    vpshldd        m18, m16, 8
    vpshldd         m7, m21, m4, 16
    vmovdqu8    m4{k1}, m21
    vpshldd        m19, m17, 8
    pminuw          m3, m6
    paddd          m16, m18
    pmaxsw          m4, m7
    paddd          m17, m19
    psrldq         m16, 1
    palignr    m16{k1}, m17, m17, 15
    lea             r6, [dstq+strideq*4]
    pmaxsw         m16, m3
    pminsw         m16, m4
    pshufb         m16, m0
    movq   [dstq+strideq*0], xm16
    movhps [r6  +strideq*0], xm16
    vextracti128  xm17, ym16, 1
    movq   [dstq+strideq*1], xm17
    movhps [r6  +strideq*1], xm17
    vextracti32x4  xm17, m16, 2
    movq   [dstq+strideq*2], xm17
    movhps [r6  +strideq*2], xm17
    vextracti32x4  xm16, m16, 3
    movq   [dstq+r2       ], xm16
    movhps [r6  +r2       ], xm16
    RET
.sec_only:
    mov            r4d, dirm
    tzcnt          r5d, secm
    mov            r3d, dampingm
    vpbroadcastw   m13, secm
    sub            r3d, r5d    ; sec_shift
    call .constrain_sec
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call .constrain
.end_no_clip:
    mova          ym20, [base+end_perm4]
    vpshldd        m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
    vpshldd        m19, m17, 8
    paddd          m16, m18    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddd          m17, m19
    vpermb         m16, m20, m16
    vpermb         m17, m20, m17
    movq   [dstq+strideq*0], xm16
    movhps [dstq+strideq*1], xm16
    vextracti128  xm16, ym16, 1
    movq   [dstq+strideq*2], xm16
    movhps [dstq+r2       ], xm16
    lea           dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm17
    movhps [dstq+strideq*1], xm17
    vextracti128  xm17, ym17, 1
    movq   [dstq+strideq*2], xm17
    movhps [dstq+r2       ], xm17
    RET
.mask_edges:
    vpbroadcastd    m6, [base+pw_m16384]
    test           r3b, 0x08
    jz .mask_edges_no_bottom   ; avoid buffer overread
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
    kmovw           k1, [base+edge_mask4-8+r3*2]
    jmp .mask_edges_main
.mask_edges_no_bottom:
    kmovw           k1, [base+edge_mask4+8+r3*2]
.mask_edges_main:
    mov            r4d, r3d
    or             r3d, 0x0c
    vmovdqa32   m0{k1}, m6     ; edge pixels = -16384
    kmovw           k1, [base+edge_mask4-8+r3*2]
    or             r4d, 0x04
    vmovdqa32   m1{k1}, m6
    kmovw           k1, [base+edge_mask4-8+r4*2]
    vmovdqa32   m2{k1}, m6
    jmp .main
.constrain_sec:
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
    vpbroadcastw   m14, r3d
    vpbroadcastd   m15, [base+sec_taps4]
.constrain:
    paddw           m7, m5, m9
    mova            m6, m0
    vpermt2w        m6, m7, m1 ; k0p0 k1p0 (top)
    psubw           m9, m5, m9
    mova            m8, m0
    vpermi2w        m7, m1, m2 ; k0p0 k1p0 (bottom)
    CONSTRAIN      m10, m6, m18, m12, m13, m14, m11
    vpermt2w        m8, m9, m1 ; k0p1 k1p1 (top)
    vpdpwssd       m16, m10, m15
    CONSTRAIN      m10, m7, m19, m12, m13, m14, m11
    vpermi2w        m9, m1, m2 ; k0p1 k1p1 (bottom)
    vpdpwssd       m17, m10, m15
    CONSTRAIN      m10, m8, m18, m12, m13, m14, m11
    vpdpwssd       m16, m10, m15
    CONSTRAIN      m10, m9, m19, m12, m13, m14, m11
    vpdpwssd       m17, m10, m15
    ret

cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
                                               pri, sec, dir, damping, edge
%define base r6-cdef_dirs8
    lea             r6, [cdef_dirs8]
    movu          ym17, [dstq+strideq*0]
    vinserti32x8   m17, [dstq+strideq*1], 1
    movq           xm4, [leftq+8*0]
    movq           xm5, [leftq+8*1]
    psrld           m2, [base+cdef_perm], 16
    movq           xm6, [leftq+8*2]
    movq           xm7, [leftq+8*3]
    lea             r2, [strideq*3]
    movu          ym16, [topq+strideq*0-4]
    vinserti32x8   m16, [topq+strideq*1-4], 1
    lea             r3, [dstq+strideq*4]
    movu          ym18, [dstq+strideq*2]
    vinserti32x8   m18, [dstq+r2       ], 1
    movu          ym19, [r3+strideq*0]
    vinserti32x8   m19, [r3+strideq*1], 1
    movu          ym20, [r3+strideq*2]
    vinserti32x8   m20, [r3+r2       ], 1
    vshufi32x4      m0, m17, m18, q2020 ; px (top)
    mov            r3d, edgem
    vshufi32x4      m1, m19, m20, q2020 ; px (bottom)
    movifnidn     prid, prim
    vpermt2d       m17, m2, m4
    vpermt2d       m18, m2, m5
    pxor           m12, m12
    vpermt2d       m19, m2, m6
    vpermt2d       m20, m2, m7
    cmp            r3d, 0x0f
    jne .mask_edges
    movu          ym21, [botq+strideq*0-4]
    vinserti32x8   m21, [botq+strideq*1-4], 1
.main:
    mova    [rsp+64*0], m16    ; top
    mova    [rsp+64*1], m17    ; 0 1
    mova    [rsp+64*2], m18    ; 2 3
    mova    [rsp+64*3], m19    ; 4 5
    mova    [rsp+64*4], m20    ; 6 7
    mova    [rsp+64*5], m21    ; bottom
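    ; the 12 padded source rows (2 above, 8 block, 2 below) are staged on the
    ; stack, two rows per register, so that .constrain can fetch the
    ; direction-offset taps with plain byte offsets from cdef_dirs8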
    test          prid, prid
    jz .sec_only
    lzcnt          r4d, prid
    rorx           r3d, prid, 2
    vpbroadcastw   m13, prim
    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    cmove         prid, r3d    ;     pri >>= 2
    mov            r3d, dampingm
    and           prid, 4
    sub            r3d, 31
    add            r4d, r3d    ; pri_shift
    vpbroadcastw   m14, r4d
    mov            r4d, dirm
    vpbroadcastd    m2, [base+pri_taps8+priq*2+0]
    vpbroadcastd    m3, [base+pri_taps8+priq*2+4]
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
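    ; clamp pri_shift (m14) to zero: max(0, damping - ulog2(pri))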
    pmaxsw         m14, m12
    call .constrain
    mov            r5d, secm
    pmullw         m16, m8, m2
    pmullw         m17, m9, m2
    test           r5d, r5d
    jnz .pri_sec
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    call .constrain
    pmullw          m8, m3
    pmullw          m9, m3
    jmp .end_no_clip
.pri_sec:
    lzcnt          r5d, r5d
    add            r3d, r5d    ; sec_shift
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    pminuw         m18, m0, m4
    pmaxsw         m19, m0, m4
    pminuw         m20, m1, m5
    pmaxsw         m21, m1, m5
    call .min_max_constrain2
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
    pmullw          m8, m3
    pmullw          m9, m3
    vpbroadcastw   m13, secm
    vpbroadcastw   m14, r3d
    paddw          m16, m8
    paddw          m17, m9
    call .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
    mova            m2, m8
    mova            m3, m9
    call .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
    paddw           m2, m8
    paddw           m3, m9
    call .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
    paddw           m2, m2
    paddw           m3, m3
    paddw          m16, m8
    paddw          m17, m9
    call .min_max_constrain
    vpbroadcastd   m10, [base+pw_2048]
    paddw          m16, m2
    paddw          m17, m3
    paddw          m16, m8
    paddw          m17, m9
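    ; round the weighted sum: (8 + sum - (sum < 0)) >> 4, done as psraw+paddw
    ; followed by pmulhrsw with 2048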
    psraw           m8, m16, 15
    psraw           m9, m17, 15
    paddw          m16, m8
    paddw          m17, m9
    pmulhrsw       m16, m10
    pmulhrsw       m17, m10
    pminuw         m18, m4
    pmaxsw         m19, m4
    pminuw         m20, m5
    pmaxsw         m21, m5
    pminuw         m18, m6
    pmaxsw         m19, m6
    pminuw         m20, m7
    pmaxsw         m21, m7
    paddw          m16, m0
    paddw          m17, m1
    pmaxsw         m16, m18
    pmaxsw         m17, m20
    pminsw         m16, m19
    pminsw         m17, m21
    jmp .end
.sec_only:
    tzcnt          r5d, secm
    mov            r4d, dirm
    mov            r3d, dampingm
    vpbroadcastw   m13, secm
    sub            r3d, r5d
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0]
    vpbroadcastw   m14, r3d
    call .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0]
    mova           m16, m8
    mova           m17, m9
    call .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1]
    paddw          m16, m8
    paddw          m17, m9
    call .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1]
    paddw          m16, m16
    paddw          m17, m17
    paddw          m16, m8
    paddw          m17, m9
    call .constrain
.end_no_clip:
    vpbroadcastd   m10, [base+pw_2048]
    paddw          m16, m8
    paddw          m17, m9
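    ; same rounding as above: (8 + sum - (sum < 0)) >> 4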
    psraw           m8, m16, 15
    psraw           m9, m17, 15
    paddw          m16, m8
    paddw          m17, m9
    pmulhrsw       m16, m10
    pmulhrsw       m17, m10
    paddw          m16, m0
    paddw          m17, m1
.end:
    mova          [dstq+strideq*0], xm16
    vextracti128  [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+r2       ], m16, 3
    lea           dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm17
    vextracti128  [dstq+strideq*1], ym17, 1
    vextracti32x4 [dstq+strideq*2], m17, 2
    vextracti32x4 [dstq+r2       ], m17, 3
    RET
.mask_edges:
    vpbroadcastd    m2, [base+pw_m16384]
    test           r3b, 0x08
    jz .mask_edges_no_bottom  ; avoid buffer overread
    movu          ym21, [botq+strideq*0-4]
    vinserti32x8   m21, [botq+strideq*1-4], 1
    jmp .mask_edges_top
.mask_edges_no_bottom:
    mova           m21, m2
.mask_edges_top:
    test           r3b, 0x04
    jnz .mask_edges_main
    mova           m16, m2
.mask_edges_main:
    and            r3d, 0x03
    cmp            r3d, 0x03
    je .main
    kmovw           k1, [base+edge_mask8+r3*2]
    vmovdqa32  m16{k1}, m2     ; edge pixels = -16384
    vmovdqa32  m17{k1}, m2
    vmovdqa32  m18{k1}, m2
    vmovdqa32  m19{k1}, m2
    vmovdqa32  m20{k1}, m2
    vmovdqa32  m21{k1}, m2
    jmp .main
ALIGN function_align
.min_max_constrain:
    pminuw         m18, m4
    pmaxsw         m19, m4
    pminuw         m20, m5
    pmaxsw         m21, m5
.min_max_constrain2:
    pminuw         m18, m6
    pmaxsw         m19, m6
    pminuw         m20, m7
    pmaxsw         m21, m7
.constrain:
    %define        tmp  rsp+gprsize+68
    movu            m4, [tmp+r5+64*0]
    vshufi32x4      m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
    movu            m5, [tmp+r5+64*2]
    vshufi32x4      m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
    neg             r5
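    ; the p1 taps are the mirrored offsets (-r5) along the same direction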
    movu            m6, [tmp+r5+64*0]
    vshufi32x4      m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
    movu            m7, [tmp+r5+64*2]
    vshufi32x4      m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
    CONSTRAIN       m8, m4, m0, m12, m13, m14, m15
    CONSTRAIN       m9, m5, m1, m12, m13, m14, m15
    CONSTRAIN      m10, m6, m0, m12, m13, m14, m15
    CONSTRAIN      m11, m7, m1, m12, m13, m14, m15
    paddw           m8, m10
    paddw           m9, m11
    ret

%endif ; ARCH_X86_64