xref: /aosp_15_r20/external/libdav1d/src/x86/cdef16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2022, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2022, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
32*c0909341SAndroid Build Coastguard Worker
; cdef_perm (64 bytes) serves two purposes in the 4x4/4x8 routines:
;  - loaded whole, the low byte of every dword is the vpermt2d index that
;    merges the left-edge pixel pairs with the dst rows (0-15 = left-pixel
;    register, 16-31 = dst-row register)
;  - after "psrlw 8" the odd bytes become per-word gather indices into the
;    two-register pixel table; each value is duplicated so that one output
;    dword covers both taps (k0, k1) of a single pixel
; The 8x8 entry point loads it through "psrld 16" instead.
33*c0909341SAndroid Build Coastguard Workercdef_perm:     db  2, 18, 16, 18, 24, 19,  0, 19, 25, 20,  1, 20, 26, 21,  2, 21
34*c0909341SAndroid Build Coastguard Worker               db  3, 26,  3, 26, 28, 27,  4, 27, 29, 28, -1, 28, 30, 29, -1, 29
35*c0909341SAndroid Build Coastguard Worker               db  0, 34, 17, 34, 16, 35,  8, 35, 17, 36,  9, 36, 18, 37, 10, 37
36*c0909341SAndroid Build Coastguard Worker               db  1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
; end_perm4: vpermb pattern picking bytes 1-2 of every dword (bits 8..23),
; i.e. the filtered pixel inside each 32-bit accumulator, packed into 32 bytes.
37*c0909341SAndroid Build Coastguard Workerend_perm4:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
38*c0909341SAndroid Build Coastguard Worker               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
; edge_mask4: 16-bit opmasks (one bit per dword of a pixel-table register),
; addressed as [edge_mask4 - 8 + edge*2], so entry 0 belongs to edge = 0100b.
; A set bit marks a dword that gets overwritten with pw_m16384.
39*c0909341SAndroid Build Coastguard Workeredge_mask4:    dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
40*c0909341SAndroid Build Coastguard Worker               dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
41*c0909341SAndroid Build Coastguard Worker               dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
; pri_taps4: two (k0, k1) tap pairs; the routines select one with a byte
; offset of 0 or 4 and broadcast it as a dword.
42*c0909341SAndroid Build Coastguard Workerpri_taps4:     dw 64, 32, 48, 48                 ; left-shifted by 4
; cdef_dirs4: one dword (two word offsets, k0 and k1) per direction, added
; to / subtracted from the cdef_perm word indices. Indexed with (dir+0)*4,
; (dir+2)*4 and (dir+4)*4; the third row repeats the first so that dir+4
; stays inside the table (assuming dir is 0..7 -- TODO confirm with caller).
43*c0909341SAndroid Build Coastguard Workercdef_dirs4:    dw  8, 16,  8, 15, -7,-14,  1, -6
44*c0909341SAndroid Build Coastguard Worker               dw  1,  2,  1, 10,  9, 18,  8, 17
45*c0909341SAndroid Build Coastguard Worker               dw  8, 16,  8, 15, -7,-14,  1, -6
; deint_shuf: pshufb pattern moving the even words of a 128-bit lane to its
; low 8 bytes and the odd words to its high 8 bytes (used by the 4x8 clip path).
46*c0909341SAndroid Build Coastguard Workerdeint_shuf:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
; cdef_dirs8 / pri_taps8: not referenced by the 4x4 and 4x8 routines;
; cdef_dirs8 is the rodata base label of the 8x8 entry point.
47*c0909341SAndroid Build Coastguard Workercdef_dirs8:    db 32, 64, 32, 62,-30,-60,  2,-28
48*c0909341SAndroid Build Coastguard Worker               db  2,  4,  2, 36, 34, 68, 32, 66
49*c0909341SAndroid Build Coastguard Worker               db 32, 64, 32, 62,-30,-60,  2,-28
50*c0909341SAndroid Build Coastguard Workerpri_taps8:     dw  4,  4,  2,  2,  3,  3,  3,  3
; sec_taps4: the single (k0, k1) tap pair broadcast by .constrain_sec
51*c0909341SAndroid Build Coastguard Workersec_taps4:     dw 32, 16
; pw_m16384: fill value written over unavailable edge pixels
52*c0909341SAndroid Build Coastguard Workerpw_m16384:     times 2 dw -16384
; pw_2048 / edge_mask8: not referenced by the 4x4 and 4x8 routines
; (presumably used by the 8x8 code -- verify there)
53*c0909341SAndroid Build Coastguard Workerpw_2048:       times 2 dw 2048
; pd_268435568: initial value of every 32-bit accumulator
54*c0909341SAndroid Build Coastguard Workerpd_268435568:  dd 268435568                      ; (1 << 28) + (7 << 4)
55*c0909341SAndroid Build Coastguard Workeredge_mask8:    dw 0x2121, 0x2020, 0x0101
56*c0909341SAndroid Build Coastguard Worker
57*c0909341SAndroid Build Coastguard WorkerSECTION .text
58*c0909341SAndroid Build Coastguard Worker
; CONSTRAIN dst, p, px, zero, thresh, shift, tmp
; For every 16-bit element:
;     a   = |p - px|
;     dst = min(a, max(0, thresh - (a >> shift)))
;     dst = -dst  where px > p (signed compare)
; "zero" must hold all-zero words, "shift" a per-word shift count.
; Clobbers k1 and tmp.
59*c0909341SAndroid Build Coastguard Worker%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp
60*c0909341SAndroid Build Coastguard Worker    psubw           %1, %2, %3
61*c0909341SAndroid Build Coastguard Worker    pabsw           %1, %1
; k1 = lanes where the difference was negative (px > p)
62*c0909341SAndroid Build Coastguard Worker    vpcmpgtw        k1, %3, %2
63*c0909341SAndroid Build Coastguard Worker    vpsrlvw         %7, %1, %6
; unsigned saturating subtract: the result bottoms out at 0
64*c0909341SAndroid Build Coastguard Worker    psubusw         %7, %5, %7
65*c0909341SAndroid Build Coastguard Worker    pminsw          %1, %7
; restore the sign: dst = 0 - dst in the k1 lanes only
66*c0909341SAndroid Build Coastguard Worker    vpsubw      %1{k1}, %4, %1
67*c0909341SAndroid Build Coastguard Worker%endmacro
68*c0909341SAndroid Build Coastguard Worker
69*c0909341SAndroid Build Coastguard Worker; t0 t1 t2 t3 t4 t5 t6 t7   L4 L5 20 21 22 23 24 25
70*c0909341SAndroid Build Coastguard Worker; T0 T1 T2 T3 T4 T5 T6 T7   L6 L7 30 31 32 33 34 35
71*c0909341SAndroid Build Coastguard Worker; L0 L1 00 01 02 03 04 05   b0 b1 b2 b3 b4 b5 b6 b7
72*c0909341SAndroid Build Coastguard Worker; L2 L3 10 11 12 13 14 15   B0 B1 B2 B3 B4 B5 B6 B7
73*c0909341SAndroid Build Coastguard Worker
74*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
; cdef_filter_4x4_16bpc(dst, stride, left, top, bot, pri, sec, dir, damping,
;                       edge), plus one more argument read as r10m and
; compared against 0xfff (see the "bpc == 12" note below).
;
; Filters one 4x4 block of 16-bit pixels in place at dst.
;
; Register roles:
;   m1:m2  128-word pixel table for the tap gathers, laid out as in the
;          diagram above: m1 = 2 top rows + rows 0-1, m2 = rows 2-3 + 2 bottom
;          rows; every row is 8 words starting 2 pixels left of the block
;   m3     px, each pixel duplicated into a word pair (one word per tap)
;   m5     per-word gather base indices (odd bytes of cdef_perm)
;   m0     32-bit accumulators, preloaded with pd_268435568
;   m12    zero      m13  strength (threshold)
;   m14    shift     m15  (k0, k1) tap pair
;   m6/m7  running min/max of px and all gathered taps (clip path only)
;   r2     dst + 2*stride            r6  rodata base
75*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
76*c0909341SAndroid Build Coastguard Worker                                         pri, sec, dir, damping, edge
; every table is addressed relative to r6 as [base+label]
77*c0909341SAndroid Build Coastguard Worker%define base r6-cdef_dirs4
78*c0909341SAndroid Build Coastguard Worker    lea             r6, [cdef_dirs4]
; m3 lane n = 16 bytes of dst row n
79*c0909341SAndroid Build Coastguard Worker    movu           xm3, [dstq+strideq*0]
80*c0909341SAndroid Build Coastguard Worker    vinserti32x4   ym3, [dstq+strideq*1], 1
81*c0909341SAndroid Build Coastguard Worker    mova           xm2, [leftq]
82*c0909341SAndroid Build Coastguard Worker    lea             r2, [dstq+strideq*2]
83*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m3, [r2+strideq*0], 2
84*c0909341SAndroid Build Coastguard Worker    mova            m5, [base+cdef_perm]
85*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m3, [r2+strideq*1], 3
; m2 = { L+row2, L+row3, L+row0, L+row1 }: left pixel pair followed by the
; first 6 pixels of the row
86*c0909341SAndroid Build Coastguard Worker    vpermt2d        m2, m5, m3
; m1 = m2 with lanes 0-1 replaced by the two top rows (from x = -2)
87*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m1, m2, [topq+strideq*0-4], 0
88*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m1, [topq+strideq*1-4], 1
89*c0909341SAndroid Build Coastguard Worker    mov            r3d, edgem
90*c0909341SAndroid Build Coastguard Worker    movifnidn     prid, prim
91*c0909341SAndroid Build Coastguard Worker    punpcklwd       m3, m3     ; px
; turn cdef_perm into word gather indices (keep the odd bytes)
92*c0909341SAndroid Build Coastguard Worker    psrlw           m5, 8
93*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m0, [base+pd_268435568]
94*c0909341SAndroid Build Coastguard Worker    pxor           m12, m12
; any edge flag missing -> patch the pixel table in .mask_edges
95*c0909341SAndroid Build Coastguard Worker    cmp            r3d, 0x0f
96*c0909341SAndroid Build Coastguard Worker    jne .mask_edges
; m2 lanes 2-3 = the two bottom rows (from x = -2)
97*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*0-4], 2
98*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*1-4], 3
99*c0909341SAndroid Build Coastguard Worker.main:
100*c0909341SAndroid Build Coastguard Worker    test          prid, prid
101*c0909341SAndroid Build Coastguard Worker    jz .sec_only
; pri_shift = max(0, damping - 31 + lzcnt(pri)); tap pair chosen by bit 2 of
; pri (bit 4 when r10m == 0xfff)
102*c0909341SAndroid Build Coastguard Worker    lzcnt          r4d, prid
103*c0909341SAndroid Build Coastguard Worker    rorx           r3d, prid, 2
104*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, prim
105*c0909341SAndroid Build Coastguard Worker    cmp     dword r10m, 0xfff  ; if (bpc == 12)
106*c0909341SAndroid Build Coastguard Worker    cmove         prid, r3d    ;     pri >>= 2
107*c0909341SAndroid Build Coastguard Worker    mov            r3d, dampingm
; byte offset 0 or 4 into pri_taps4
108*c0909341SAndroid Build Coastguard Worker    and           prid, 4
; r3d = damping - 31, reused below for sec_shift
109*c0909341SAndroid Build Coastguard Worker    sub            r3d, 31
110*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [base+pri_taps4+priq]
; default of 0 when the sum below is negative
111*c0909341SAndroid Build Coastguard Worker    xor           prid, prid
112*c0909341SAndroid Build Coastguard Worker    add            r4d, r3d
113*c0909341SAndroid Build Coastguard Worker    cmovns        prid, r4d    ; pri_shift
114*c0909341SAndroid Build Coastguard Worker    mov            r4d, dirm
115*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m14, prid
116*c0909341SAndroid Build Coastguard Worker    mov            r5d, secm
; primary taps use table entry dir+2
117*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4]
118*c0909341SAndroid Build Coastguard Worker    call .constrain
; sec == 0: primary only, the result is stored without min/max clipping
119*c0909341SAndroid Build Coastguard Worker    test           r5d, r5d
120*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
121*c0909341SAndroid Build Coastguard Worker    lzcnt          r5d, r5d
122*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, secm
; sec_shift = damping - 31 + lzcnt(sec)
123*c0909341SAndroid Build Coastguard Worker    add            r3d, r5d
; Track min (m6) and max (m7) over px and the taps just gathered (m8, m9).
; The min is unsigned and the max signed, so the -16384 edge fill can win
; neither of them.
124*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m3, m8
125*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m3, m8
126*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m9
127*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m9
; secondary taps, table entry dir+4
128*c0909341SAndroid Build Coastguard Worker    call .constrain_sec
129*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m8
130*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m8
131*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m9
132*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m9
; secondary taps, table entry dir+0 (m13-m15 still hold the sec parameters)
133*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
134*c0909341SAndroid Build Coastguard Worker    call .constrain
135*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m8
136*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m8
137*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m9
138*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m9
; Fold the k1 (odd) words of min/max into the k0 (even) words, interleaved
; with the same rounding as .end_no_clip; psrldq by 1 then drops bits 8..23
; of every accumulator into its low word, which is clamped to [min, max]
; and narrowed to 16 words in ym0.
139*c0909341SAndroid Build Coastguard Worker    psrldq          m8, m6, 2
140*c0909341SAndroid Build Coastguard Worker    vpshldd         m3, m0, 8
141*c0909341SAndroid Build Coastguard Worker    psrldq          m9, m7, 2
142*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3
143*c0909341SAndroid Build Coastguard Worker    pminuw          m6, m8
144*c0909341SAndroid Build Coastguard Worker    psrldq          m0, 1
145*c0909341SAndroid Build Coastguard Worker    pmaxsw          m7, m9
146*c0909341SAndroid Build Coastguard Worker    pmaxsw          m0, m6
147*c0909341SAndroid Build Coastguard Worker    pminsw          m0, m7
148*c0909341SAndroid Build Coastguard Worker    vpmovdw        ym0, m0
149*c0909341SAndroid Build Coastguard Worker    jmp .end
; pri == 0: secondary filter only.
; NOTE(review): tzcnt matches the lzcnt-based shift of the main path only
; when sec is a power of two -- presumably guaranteed by the caller.
150*c0909341SAndroid Build Coastguard Worker.sec_only:
151*c0909341SAndroid Build Coastguard Worker    tzcnt          r5d, secm
152*c0909341SAndroid Build Coastguard Worker    mov            r3d, dampingm
153*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, secm
154*c0909341SAndroid Build Coastguard Worker    mov            r4d, dirm
155*c0909341SAndroid Build Coastguard Worker    sub            r3d, r5d    ; sec_shift
156*c0909341SAndroid Build Coastguard Worker    call .constrain_sec
157*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
158*c0909341SAndroid Build Coastguard Worker    call .constrain
; Round and pack without clipping: vpermb extracts bytes 1-2 of each dword.
159*c0909341SAndroid Build Coastguard Worker.end_no_clip:
160*c0909341SAndroid Build Coastguard Worker    mova           ym1, [base+end_perm4]
161*c0909341SAndroid Build Coastguard Worker    vpshldd         m3, m0, 8  ; (px << 8) + ((sum > -8) << 4)
162*c0909341SAndroid Build Coastguard Worker    paddd           m0, m3     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
163*c0909341SAndroid Build Coastguard Worker    vpermb          m0, m1, m0
; ym0 = 16 result pixels, 4 per row; each store writes one 8-byte row
164*c0909341SAndroid Build Coastguard Worker.end:
165*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
166*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
167*c0909341SAndroid Build Coastguard Worker    vextracti32x4  xm0, ym0, 1
168*c0909341SAndroid Build Coastguard Worker    movq   [r2+strideq*0], xm0
169*c0909341SAndroid Build Coastguard Worker    movhps [r2+strideq*1], xm0
170*c0909341SAndroid Build Coastguard Worker    RET
; At least one edge is unavailable: overwrite the affected dwords of m1/m2
; with -16384. edge bit 3 (0x08) gates the bottom-row loads.
171*c0909341SAndroid Build Coastguard Worker.mask_edges:
172*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [base+pw_m16384]
173*c0909341SAndroid Build Coastguard Worker    test           r3b, 0x08
174*c0909341SAndroid Build Coastguard Worker    jz .mask_edges_no_bottom  ; avoid buffer overread
175*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*0-4], 2
176*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*1-4], 3
; mask for m1, table entry of the actual edge value (bit 3 is set here)
177*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4-8+r3*2]
178*c0909341SAndroid Build Coastguard Worker    jmp .mask_edges_main
179*c0909341SAndroid Build Coastguard Worker.mask_edges_no_bottom:
; mask for m1, looked up as if bit 3 were set (+8 bytes -8 bytes == edge|8);
; m1 contains no bottom rows
180*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4+8+r3*2]
181*c0909341SAndroid Build Coastguard Worker.mask_edges_main:
; mask for m2 is looked up with bit 2 forced on; m2 contains no top rows
182*c0909341SAndroid Build Coastguard Worker    or             r3d, 0x04
183*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m1{k1}, m6     ; edge pixels = -16384
184*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4-8+r3*2]
185*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m2{k1}, m6
186*c0909341SAndroid Build Coastguard Worker    jmp .main
; .constrain_sec: load the secondary parameters (direction entry dir+4,
; shift from r3d, sec_taps4), then fall through into .constrain.
187*c0909341SAndroid Build Coastguard Worker.constrain_sec:
188*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
189*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m14, r3d
190*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [base+sec_taps4]
; .constrain: in  m9 = broadcast (k0, k1) word offsets, m13/m14/m15 = strength,
;                 shift, taps
;             out m8 = pixels at +offset, m9 = pixels at -offset,
;                 m0 += taps * constrained differences (both sides)
;             clobbers m10, m11, k1
191*c0909341SAndroid Build Coastguard Worker.constrain:
192*c0909341SAndroid Build Coastguard Worker    paddw           m8, m5, m9
193*c0909341SAndroid Build Coastguard Worker    vpermi2w        m8, m1, m2 ; k0p0 k1p0
194*c0909341SAndroid Build Coastguard Worker    psubw           m9, m5, m9
195*c0909341SAndroid Build Coastguard Worker    vpermi2w        m9, m1, m2 ; k0p1 k1p1
196*c0909341SAndroid Build Coastguard Worker    CONSTRAIN      m10, m8, m3, m12, m13, m14, m11
; per dword: m0 += m10.lo * tap_k0 + m10.hi * tap_k1
197*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m10, m15
198*c0909341SAndroid Build Coastguard Worker    CONSTRAIN      m10, m9, m3, m12, m13, m14, m11
199*c0909341SAndroid Build Coastguard Worker    vpdpwssd        m0, m10, m15
200*c0909341SAndroid Build Coastguard Worker    ret
201*c0909341SAndroid Build Coastguard Worker
202*c0909341SAndroid Build Coastguard Worker; t0 t1 t2 t3 t4 t5 t6 t7   L4 L5 20 21 22 23 24 25   Lc Ld 60 61 62 63 64 65
203*c0909341SAndroid Build Coastguard Worker; T0 T1 T2 T3 T4 T5 T6 T7   L6 L7 30 31 32 33 34 35   Le Lf 70 71 72 73 74 75
204*c0909341SAndroid Build Coastguard Worker; L0 L1 00 01 02 03 04 05   L8 L9 40 41 42 43 44 45   b0 b1 b2 b3 b4 b5 b6 b7
205*c0909341SAndroid Build Coastguard Worker; L2 L3 10 11 12 13 14 15   La Lb 50 51 52 53 54 55   B0 B1 B2 B3 B4 B5 B6 B7
206*c0909341SAndroid Build Coastguard Worker
; cdef_filter_4x8_16bpc(dst, stride, left, top, bot, pri, sec, dir, damping,
;                       edge), plus one more argument read as r10m and
; compared against 0xfff (see the "bpc == 12" note below).
;
; Filters one 4-wide, 8-row block of 16-bit pixels in place at dst. The
; block is processed as a top half (rows 0-3) and a bottom half (rows 4-7).
; Relies on the "base" definition (r6-cdef_dirs4) still in effect from the
; 4x4 function.
;
; Register roles:
;   m0:m1:m2  pixel table as in the diagram above: m0 = 2 top rows + rows 0-1,
;             m1 = rows 2-5, m2 = rows 6-7 + 2 bottom rows; taps for the top
;             half are gathered from m0:m1, for the bottom half from m1:m2
;   m18/m19   px of the top/bottom half, every pixel duplicated into a word pair
;   m16/m17   32-bit accumulators of the top/bottom half
;   m5        per-word gather base indices (odd bytes of cdef_perm)
;   m12       zero      m13  strength (threshold)
;   m14       shift     m15  (k0, k1) tap pair
;   m3/m4     running min/max, top half      m20/m21  same, bottom half
;   r2        stride*3                       r6       rodata base
207*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
208*c0909341SAndroid Build Coastguard Worker                                         pri, sec, dir, damping, edge
209*c0909341SAndroid Build Coastguard Worker    lea             r6, [cdef_dirs4]
; m18 lane n = 16 bytes of dst row n (rows 0-3)
210*c0909341SAndroid Build Coastguard Worker    movu          xm18, [dstq+strideq*0]
211*c0909341SAndroid Build Coastguard Worker    vinserti128   ym18, [dstq+strideq*1], 1
; left pixel pairs of rows 0-3 (xm1) and rows 4-7 (xm2)
212*c0909341SAndroid Build Coastguard Worker    mova           xm1, [leftq+16*0]
213*c0909341SAndroid Build Coastguard Worker    mova           xm2, [leftq+16*1]
214*c0909341SAndroid Build Coastguard Worker    lea             r2, [strideq*3]
215*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m18, [dstq+strideq*2], 2
216*c0909341SAndroid Build Coastguard Worker    mova            m5, [base+cdef_perm]
217*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m18, [dstq+r2       ], 3
; m1 = { L+row2, L+row3, L+row0, L+row1 }
218*c0909341SAndroid Build Coastguard Worker    vpermt2d        m1, m5, m18
; m0 = m1 with lanes 0-1 replaced by the two top rows (from x = -2)
219*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m0, m1, [topq+strideq*0-4], 0
220*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m0, [topq+strideq*1-4], 1
; m19 lane n = 16 bytes of dst row 4+n; r3 is only a temporary row pointer
221*c0909341SAndroid Build Coastguard Worker    lea             r3, [dstq+strideq*4]
222*c0909341SAndroid Build Coastguard Worker    movu          xm19, [r3+strideq*0]
223*c0909341SAndroid Build Coastguard Worker    vinserti128   ym19, [r3+strideq*1], 1
224*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m19, [r3+strideq*2], 2
225*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m19, [r3+r2       ], 3
226*c0909341SAndroid Build Coastguard Worker    mov            r3d, edgem
227*c0909341SAndroid Build Coastguard Worker    movifnidn     prid, prim
; m2 = { L+row6, L+row7, L+row4, L+row5 }
228*c0909341SAndroid Build Coastguard Worker    vpermt2d        m2, m5, m19
229*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m16, [base+pd_268435568]
230*c0909341SAndroid Build Coastguard Worker    pxor           m12, m12
231*c0909341SAndroid Build Coastguard Worker    punpcklwd      m18, m18    ; px (top)
; turn cdef_perm into word gather indices (keep the odd bytes)
232*c0909341SAndroid Build Coastguard Worker    psrlw           m5, 8
233*c0909341SAndroid Build Coastguard Worker    punpcklwd      m19, m19    ; px (bottom)
234*c0909341SAndroid Build Coastguard Worker    mova           m17, m16
; m1 = { row2, row3 (own lanes 0-1), row4, row5 (lanes 2-3 of m2) }
235*c0909341SAndroid Build Coastguard Worker    vshufi32x4      m1, m2, q3210
; any edge flag missing -> patch the pixel table in .mask_edges
236*c0909341SAndroid Build Coastguard Worker    cmp            r3d, 0x0f
237*c0909341SAndroid Build Coastguard Worker    jne .mask_edges
; m2 lanes 2-3 = the two bottom rows (from x = -2)
238*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*0-4], 2
239*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*1-4], 3
240*c0909341SAndroid Build Coastguard Worker.main:
241*c0909341SAndroid Build Coastguard Worker    test          prid, prid
242*c0909341SAndroid Build Coastguard Worker    jz .sec_only
; pri_shift = max(0, damping - 31 + lzcnt(pri)); tap pair chosen by bit 2 of
; pri (bit 4 when r10m == 0xfff)
243*c0909341SAndroid Build Coastguard Worker    lzcnt          r4d, prid
244*c0909341SAndroid Build Coastguard Worker    rorx           r3d, prid, 2
245*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, prim
246*c0909341SAndroid Build Coastguard Worker    cmp     dword r10m, 0xfff  ; if (bpc == 12)
247*c0909341SAndroid Build Coastguard Worker    cmove         prid, r3d    ;     pri >>= 2
248*c0909341SAndroid Build Coastguard Worker    mov            r3d, dampingm
; byte offset 0 or 4 into pri_taps4
249*c0909341SAndroid Build Coastguard Worker    and           prid, 4
; r3d = damping - 31, reused below for sec_shift
250*c0909341SAndroid Build Coastguard Worker    sub            r3d, 31
251*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [base+pri_taps4+priq]
; default of 0 when the sum below is negative
252*c0909341SAndroid Build Coastguard Worker    xor           prid, prid
253*c0909341SAndroid Build Coastguard Worker    add            r4d, r3d
254*c0909341SAndroid Build Coastguard Worker    cmovns        prid, r4d    ; pri_shift
255*c0909341SAndroid Build Coastguard Worker    mov            r4d, dirm
256*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m14, prid
257*c0909341SAndroid Build Coastguard Worker    mov            r5d, secm
; primary taps use table entry dir+2
258*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4]
259*c0909341SAndroid Build Coastguard Worker    call .constrain
; sec == 0: primary only, the result is stored without min/max clipping
260*c0909341SAndroid Build Coastguard Worker    test           r5d, r5d
261*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
262*c0909341SAndroid Build Coastguard Worker    lzcnt          r5d, r5d
263*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, secm
; sec_shift = damping - 31 + lzcnt(sec)
264*c0909341SAndroid Build Coastguard Worker    add            r3d, r5d
; Min/max tracking. .constrain returns the gathered taps in m6/m8 (top half)
; and m7/m9 (bottom half). The min is unsigned and the max signed, so the
; -16384 edge fill can win neither of them.
265*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m18, m6
266*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m18, m6
267*c0909341SAndroid Build Coastguard Worker    pminuw         m20, m19, m7
268*c0909341SAndroid Build Coastguard Worker    pmaxsw         m21, m19, m7
269*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m8
270*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m8
271*c0909341SAndroid Build Coastguard Worker    pminuw         m20, m9
272*c0909341SAndroid Build Coastguard Worker    pmaxsw         m21, m9
; secondary taps, table entry dir+4
273*c0909341SAndroid Build Coastguard Worker    call .constrain_sec
274*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m6
275*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m6
276*c0909341SAndroid Build Coastguard Worker    pminuw         m20, m7
277*c0909341SAndroid Build Coastguard Worker    pmaxsw         m21, m7
278*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m8
279*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m8
280*c0909341SAndroid Build Coastguard Worker    pminuw         m20, m9
281*c0909341SAndroid Build Coastguard Worker    pmaxsw         m21, m9
; secondary taps, table entry dir+0 (m13-m15 still hold the sec parameters)
282*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
283*c0909341SAndroid Build Coastguard Worker    call .constrain
284*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m6
285*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m6
; k1 (set up over the next lines) = byte mask selecting the upper word of
; every dword
286*c0909341SAndroid Build Coastguard Worker    mov             r3, 0xcccccccccccccccc
287*c0909341SAndroid Build Coastguard Worker    pminuw         m20, m7
288*c0909341SAndroid Build Coastguard Worker    pmaxsw         m21, m7
289*c0909341SAndroid Build Coastguard Worker    kmovq           k1, r3
290*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m8
291*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m8
292*c0909341SAndroid Build Coastguard Worker    pminuw         m20, m9
293*c0909341SAndroid Build Coastguard Worker    pmaxsw         m21, m9
; Final reduction. In every dword the low word ends up describing the top
; half and the high word the bottom half:
;   m3/m4 = min/max with the (k0, k1) word pairs folded together
;   m16   = rounded results (bits 8..23 of the m16 and m17 accumulators)
; m0 is free to hold deint_shuf here; the pixel table is no longer needed.
294*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4 m0, [base+deint_shuf]
295*c0909341SAndroid Build Coastguard Worker    vpshldd         m6, m20, m3, 16
296*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m3{k1}, m20
297*c0909341SAndroid Build Coastguard Worker    vpshldd        m18, m16, 8
298*c0909341SAndroid Build Coastguard Worker    vpshldd         m7, m21, m4, 16
299*c0909341SAndroid Build Coastguard Worker    vmovdqu8    m4{k1}, m21
300*c0909341SAndroid Build Coastguard Worker    vpshldd        m19, m17, 8
301*c0909341SAndroid Build Coastguard Worker    pminuw          m3, m6
302*c0909341SAndroid Build Coastguard Worker    paddd          m16, m18
303*c0909341SAndroid Build Coastguard Worker    pmaxsw          m4, m7
304*c0909341SAndroid Build Coastguard Worker    paddd          m17, m19
; low word  <- bits 8..23 of the top accumulator
305*c0909341SAndroid Build Coastguard Worker    psrldq         m16, 1
; high word <- bits 8..23 of the bottom accumulator (byte-rotate by one)
306*c0909341SAndroid Build Coastguard Worker    palignr    m16{k1}, m17, m17, 15
; r6 is repurposed as dst + 4*stride: [base+...] must not be used below
307*c0909341SAndroid Build Coastguard Worker    lea             r6, [dstq+strideq*4]
; clamp to [min, max]
308*c0909341SAndroid Build Coastguard Worker    pmaxsw         m16, m3
309*c0909341SAndroid Build Coastguard Worker    pminsw         m16, m4
; per lane n: low 8 bytes = row n, high 8 bytes = row 4+n
310*c0909341SAndroid Build Coastguard Worker    pshufb         m16, m0
311*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm16
312*c0909341SAndroid Build Coastguard Worker    movhps [r6  +strideq*0], xm16
313*c0909341SAndroid Build Coastguard Worker    vextracti128  xm17, ym16, 1
314*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm17
315*c0909341SAndroid Build Coastguard Worker    movhps [r6  +strideq*1], xm17
316*c0909341SAndroid Build Coastguard Worker    vextracti32x4  xm17, m16, 2
317*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm17
318*c0909341SAndroid Build Coastguard Worker    movhps [r6  +strideq*2], xm17
319*c0909341SAndroid Build Coastguard Worker    vextracti32x4  xm16, m16, 3
320*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r2       ], xm16
321*c0909341SAndroid Build Coastguard Worker    movhps [r6  +r2       ], xm16
322*c0909341SAndroid Build Coastguard Worker    RET
; pri == 0: secondary filter only.
; NOTE(review): tzcnt matches the lzcnt-based shift of the main path only
; when sec is a power of two -- presumably guaranteed by the caller.
323*c0909341SAndroid Build Coastguard Worker.sec_only:
324*c0909341SAndroid Build Coastguard Worker    mov            r4d, dirm
325*c0909341SAndroid Build Coastguard Worker    tzcnt          r5d, secm
326*c0909341SAndroid Build Coastguard Worker    mov            r3d, dampingm
327*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m13, secm
328*c0909341SAndroid Build Coastguard Worker    sub            r3d, r5d    ; sec_shift
329*c0909341SAndroid Build Coastguard Worker    call .constrain_sec
330*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
331*c0909341SAndroid Build Coastguard Worker    call .constrain
; Round and pack without clipping: vpermb extracts bytes 1-2 of each dword,
; giving 4 rows of 4 pixels in the low 32 bytes of m16 (top) and m17 (bottom).
332*c0909341SAndroid Build Coastguard Worker.end_no_clip:
333*c0909341SAndroid Build Coastguard Worker    mova          ym20, [base+end_perm4]
334*c0909341SAndroid Build Coastguard Worker    vpshldd        m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
335*c0909341SAndroid Build Coastguard Worker    vpshldd        m19, m17, 8
336*c0909341SAndroid Build Coastguard Worker    paddd          m16, m18    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
337*c0909341SAndroid Build Coastguard Worker    paddd          m17, m19
338*c0909341SAndroid Build Coastguard Worker    vpermb         m16, m20, m16
339*c0909341SAndroid Build Coastguard Worker    vpermb         m17, m20, m17
; rows 0-3
340*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm16
341*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm16
342*c0909341SAndroid Build Coastguard Worker    vextracti128  xm16, ym16, 1
343*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm16
344*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm16
; rows 4-7
345*c0909341SAndroid Build Coastguard Worker    lea           dstq, [dstq+strideq*4]
346*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm17
347*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm17
348*c0909341SAndroid Build Coastguard Worker    vextracti128  xm17, ym17, 1
349*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm17
350*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm17
351*c0909341SAndroid Build Coastguard Worker    RET
; At least one edge is unavailable: overwrite the affected dwords of m0/m1/m2
; with -16384. edge bit 3 (0x08) gates the bottom-row loads.
352*c0909341SAndroid Build Coastguard Worker.mask_edges:
353*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m6, [base+pw_m16384]
354*c0909341SAndroid Build Coastguard Worker    test           r3b, 0x08
355*c0909341SAndroid Build Coastguard Worker    jz .mask_edges_no_bottom   ; avoid buffer overread
356*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*0-4], 2
357*c0909341SAndroid Build Coastguard Worker    vinserti32x4    m2, [botq+strideq*1-4], 3
; mask for m0, table entry of the actual edge value (bit 3 is set here)
358*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4-8+r3*2]
359*c0909341SAndroid Build Coastguard Worker    jmp .mask_edges_main
360*c0909341SAndroid Build Coastguard Worker.mask_edges_no_bottom:
; mask for m0, looked up as if bit 3 were set; m0 contains no bottom rows
361*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4+8+r3*2]
362*c0909341SAndroid Build Coastguard Worker.mask_edges_main:
; m1 (rows 2-5): lookup with bits 2 and 3 forced on, so only the columns
; are masked. m2: lookup with bit 2 forced on (r4d); m2 has no top rows.
363*c0909341SAndroid Build Coastguard Worker    mov            r4d, r3d
364*c0909341SAndroid Build Coastguard Worker    or             r3d, 0x0c
365*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m0{k1}, m6     ; edge pixels = -16384
366*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4-8+r3*2]
367*c0909341SAndroid Build Coastguard Worker    or             r4d, 0x04
368*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m1{k1}, m6
369*c0909341SAndroid Build Coastguard Worker    kmovw           k1, [base+edge_mask4-8+r4*2]
370*c0909341SAndroid Build Coastguard Worker    vmovdqa32   m2{k1}, m6
371*c0909341SAndroid Build Coastguard Worker    jmp .main
; .constrain_sec: load the secondary parameters (direction entry dir+4,
; shift from r3d, sec_taps4), then fall through into .constrain.
372*c0909341SAndroid Build Coastguard Worker.constrain_sec:
373*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
374*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   m14, r3d
375*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m15, [base+sec_taps4]
; .constrain: in  m9 = broadcast (k0, k1) word offsets, m13/m14/m15 = strength,
;                 shift, taps
;             out m6/m7 = pixels at +offset (top/bottom half),
;                 m8/m9 = pixels at -offset (top/bottom half),
;                 m16/m17 += taps * constrained differences
;             clobbers m10, m11, k1
; The gathers are interleaved with the CONSTRAIN/vpdpwssd chains.
376*c0909341SAndroid Build Coastguard Worker.constrain:
377*c0909341SAndroid Build Coastguard Worker    paddw           m7, m5, m9
; vpermt2w overwrites its first table operand, so gather into a copy of m0
378*c0909341SAndroid Build Coastguard Worker    mova            m6, m0
379*c0909341SAndroid Build Coastguard Worker    vpermt2w        m6, m7, m1 ; k0p0 k1p0 (top)
380*c0909341SAndroid Build Coastguard Worker    psubw           m9, m5, m9
381*c0909341SAndroid Build Coastguard Worker    mova            m8, m0
382*c0909341SAndroid Build Coastguard Worker    vpermi2w        m7, m1, m2 ; k0p0 k1p0 (bottom)
383*c0909341SAndroid Build Coastguard Worker    CONSTRAIN      m10, m6, m18, m12, m13, m14, m11
384*c0909341SAndroid Build Coastguard Worker    vpermt2w        m8, m9, m1 ; k0p1 k1p1 (top)
385*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m10, m15
386*c0909341SAndroid Build Coastguard Worker    CONSTRAIN      m10, m7, m19, m12, m13, m14, m11
387*c0909341SAndroid Build Coastguard Worker    vpermi2w        m9, m1, m2 ; k0p1 k1p1 (bottom)
388*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m10, m15
389*c0909341SAndroid Build Coastguard Worker    CONSTRAIN      m10, m8, m18, m12, m13, m14, m11
390*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m16, m10, m15
391*c0909341SAndroid Build Coastguard Worker    CONSTRAIN      m10, m9, m19, m12, m13, m14, m11
392*c0909341SAndroid Build Coastguard Worker    vpdpwssd       m17, m10, m15
393*c0909341SAndroid Build Coastguard Worker    ret
394*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; cdef_filter_8x8_16bpc
;
; Applies the CDEF primary and/or secondary filter to one 8x8 block of 16-bit
; pixels and writes the result back to dst.
;
; cglobal parameters: 5 arguments preloaded into registers, 7 GPRs,
; 22 vector registers (m0-m21), 64*6 bytes of stack.
; Named arguments: dst, stride, left, top, bot, pri, sec, dir, damping, edge.
; One further argument is read through r10m and compared against 0xfff to
; detect 12-bit content.
;
; Stack layout (written at .main, one 64-byte slot per pair of rows):
;   [rsp+64*0] top rows   [rsp+64*1] rows 0 1   [rsp+64*2] rows 2 3
;   [rsp+64*3] rows 4 5   [rsp+64*4] rows 6 7   [rsp+64*5] bottom rows
; top/bot are loaded starting 4 bytes (2 pixels) before the block column.
; NOTE(review): the block rows get their left pixels merged in through a
; permute built from cdef_perm (table not visible here); the +4 in .constrain's
; base address suggests they use the same 2-pixel left margin - confirm.
;
; Register roles:
;   m0, m1    px: rows 0-3 / rows 4-7 of the unfiltered block
;   m2, m3    primary tap weights; reused as secondary accumulators in .pri_sec
;   m4-m7     taps loaded by .constrain (+offset top/bottom, -offset top/bottom)
;   m8, m9    result of .constrain (top / bottom); m10, m11 are its scratch
;   m12       zero
;   m13, m14  broadcast strength and shift handed to CONSTRAIN
;   m15       last CONSTRAIN operand (role is defined by that macro)
;   m16, m17  filter sum, then final output (top / bottom)
;   m18-m21   row pairs before the spill; min/max trackers in .pri_sec
;   r2 = stride*3, r3 = edge flags then damping-derived shift, r4 = dir,
;   r5 = tap offset, r6 = base pointer for table addressing
;-----------------------------------------------------------------------------
cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
                                               pri, sec, dir, damping, edge
%define base r6-cdef_dirs8
    lea             r6, [cdef_dirs8]
    ; Load the eight block rows two per register (32 bytes per row), the four
    ; 8-byte groups of left-edge pixels, and the two rows above the block.
    movu          ym17, [dstq+strideq*0]
    vinserti32x8   m17, [dstq+strideq*1], 1
    movq           xm4, [leftq+8*0]
    movq           xm5, [leftq+8*1]
    psrld           m2, [base+cdef_perm], 16
    movq           xm6, [leftq+8*2]
    movq           xm7, [leftq+8*3]
    lea             r2, [strideq*3]
    movu          ym16, [topq+strideq*0-4]
    vinserti32x8   m16, [topq+strideq*1-4], 1
    lea             r3, [dstq+strideq*4]
    movu          ym18, [dstq+strideq*2]
    vinserti32x8   m18, [dstq+r2       ], 1
    movu          ym19, [r3+strideq*0]
    vinserti32x8   m19, [r3+strideq*1], 1
    movu          ym20, [r3+strideq*2]
    vinserti32x8   m20, [r3+r2       ], 1
    ; q2020 keeps 128-bit lanes 0 and 2 of each source, i.e. the first
    ; 8 pixels of every row.
    vshufi32x4      m0, m17, m18, q2020 ; px (top)
    mov            r3d, edgem
    vshufi32x4      m1, m19, m20, q2020 ; px (bottom)
    movifnidn     prid, prim
    ; Merge the left-edge pixels into the row registers using the dword
    ; indices derived from cdef_perm.
    vpermt2d       m17, m2, m4
    vpermt2d       m18, m2, m5
    pxor           m12, m12
    vpermt2d       m19, m2, m6
    vpermt2d       m20, m2, m7
    ; edge == 0x0f takes the fast path; anything else patches the missing
    ; neighbours in .mask_edges before rejoining at .main.
    cmp            r3d, 0x0f
    jne .mask_edges
    movu          ym21, [botq+strideq*0-4]
    vinserti32x8   m21, [botq+strideq*1-4], 1
.main:
    ; Spill the padded block so .constrain can fetch taps at arbitrary
    ; byte offsets with unaligned loads.
    mova    [rsp+64*0], m16    ; top
    mova    [rsp+64*1], m17    ; 0 1
    mova    [rsp+64*2], m18    ; 2 3
    mova    [rsp+64*3], m19    ; 4 5
    mova    [rsp+64*4], m20    ; 6 7
    mova    [rsp+64*5], m21    ; bottom
    test          prid, prid
    jz .sec_only
    lzcnt          r4d, prid
    rorx           r3d, prid, 2
    vpbroadcastw   m13, prim
    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    cmove         prid, r3d    ;     pri >>= 2
    mov            r3d, dampingm
    and           prid, 4      ; only bit 2 is kept; it selects the tap pair below
    sub            r3d, 31     ; r3d = damping - 31, reused for sec_shift
    add            r4d, r3d    ; pri_shift
    vpbroadcastw   m14, r4d
    mov            r4d, dirm
    vpbroadcastd    m2, [base+pri_taps8+priq*2+0]
    vpbroadcastd    m3, [base+pri_taps8+priq*2+4]
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
    pmaxsw         m14, m12    ; clamp pri_shift to >= 0
    call .constrain
    mov            r5d, secm
    pmullw         m16, m8, m2
    pmullw         m17, m9, m2
    test           r5d, r5d
    jnz .pri_sec
    ; Primary only: second primary tap, then the unclipped epilogue.
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    call .constrain
    pmullw          m8, m3
    pmullw          m9, m3
    jmp .end_no_clip
.pri_sec:
    lzcnt          r5d, r5d
    add            r3d, r5d    ; sec_shift
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    ; Start min/max tracking from px and the taps still held in m4/m5.
    ; Minimum uses an unsigned compare and maximum a signed one, so the
    ; -16384 edge filler loses both comparisons.
    pminuw         m18, m0, m4
    pmaxsw         m19, m0, m4
    pminuw         m20, m1, m5
    pmaxsw         m21, m1, m5
    call .min_max_constrain2
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
    pmullw          m8, m3
    pmullw          m9, m3
    vpbroadcastw   m13, secm   ; switch strength/shift to the secondary values
    vpbroadcastw   m14, r3d
    paddw          m16, m8
    paddw          m17, m9
    call .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
    mova            m2, m8
    mova            m3, m9
    call .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
    paddw           m2, m8
    paddw           m3, m9
    call .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
    paddw           m2, m2     ; the two k0 secondary taps are weighted by 2
    paddw           m3, m3
    paddw          m16, m8
    paddw          m17, m9
    call .min_max_constrain
    vpbroadcastd   m10, [base+pw_2048]
    paddw          m16, m2
    paddw          m17, m3
    paddw          m16, m8
    paddw          m17, m9
    ; Rounding: add -1 to negative sums, then pmulhrsw. With a multiplier of
    ; 2048 (as the name pw_2048 suggests) that is (sum - (sum < 0) + 8) >> 4.
    psraw           m8, m16, 15
    psraw           m9, m17, 15
    paddw          m16, m8
    paddw          m17, m9
    pmulhrsw       m16, m10
    pmulhrsw       m17, m10
    ; Fold the taps loaded by the last .constrain into min/max.
    pminuw         m18, m4
    pmaxsw         m19, m4
    pminuw         m20, m5
    pmaxsw         m21, m5
    pminuw         m18, m6
    pmaxsw         m19, m6
    pminuw         m20, m7
    pmaxsw         m21, m7
    paddw          m16, m0
    paddw          m17, m1
    ; Clamp the filtered pixels to the [min, max] range of px and its taps.
    pmaxsw         m16, m18
    pmaxsw         m17, m20
    pminsw         m16, m19
    pminsw         m17, m21
    jmp .end
.sec_only:
    ; No primary strength: the shift is damping - tzcnt(sec) and the result
    ; is not clipped.
    tzcnt          r5d, secm
    mov            r4d, dirm
    mov            r3d, dampingm
    vpbroadcastw   m13, secm
    sub            r3d, r5d
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0]
    vpbroadcastw   m14, r3d
    call .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0]
    mova           m16, m8
    mova           m17, m9
    call .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1]
    paddw          m16, m8
    paddw          m17, m9
    call .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1]
    paddw          m16, m16    ; first two taps are weighted by 2
    paddw          m17, m17
    paddw          m16, m8
    paddw          m17, m9
    call .constrain
.end_no_clip:
    ; Add the last tap result, round as in .pri_sec, add px; no min/max clamp.
    vpbroadcastd   m10, [base+pw_2048]
    paddw          m16, m8
    paddw          m17, m9
    psraw           m8, m16, 15
    psraw           m9, m17, 15
    paddw          m16, m8
    paddw          m17, m9
    pmulhrsw       m16, m10
    pmulhrsw       m17, m10
    paddw          m16, m0
    paddw          m17, m1
.end:
    ; Each 128-bit lane of m16/m17 is one output row of 8 pixels.
    mova          [dstq+strideq*0], xm16
    vextracti128  [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+r2       ], m16, 3
    lea           dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm17
    vextracti128  [dstq+strideq*1], ym17, 1
    vextracti32x4 [dstq+strideq*2], m17, 2
    vextracti32x4 [dstq+r2       ], m17, 3
    RET
.mask_edges:
    ; Slow path for edge != 0x0f. r3 still holds the edge flags; m2 becomes
    ; the filler value for unavailable pixels.
    vpbroadcastd    m2, [base+pw_m16384]
    test           r3b, 0x08
    jz .mask_edges_no_bottom  ; avoid buffer overread
    movu          ym21, [botq+strideq*0-4]
    vinserti32x8   m21, [botq+strideq*1-4], 1
    jmp .mask_edges_top
.mask_edges_no_bottom:
    mova           m21, m2
.mask_edges_top:
    test           r3b, 0x04
    jnz .mask_edges_main
    mova           m16, m2    ; bit 0x04 clear: replace the top rows with filler
.mask_edges_main:
    ; The low two edge bits pick a 16-bit dword mask from edge_mask8; both
    ; bits set means nothing is left to mask.
    ; NOTE(review): presumably these are the left/right flags - confirm
    ; against the edge_mask8 table.
    and            r3d, 0x03
    cmp            r3d, 0x03
    je .main
    kmovw           k1, [base+edge_mask8+r3*2]
    vmovdqa32  m16{k1}, m2     ; edge pixels = -16384
    vmovdqa32  m17{k1}, m2
    vmovdqa32  m18{k1}, m2
    vmovdqa32  m19{k1}, m2
    vmovdqa32  m20{k1}, m2
    vmovdqa32  m21{k1}, m2
    jmp .main
ALIGN function_align
; Local subroutines. The three entry points fall through into each other:
;   .min_max_constrain   update min/max with m4-m7, then constrain
;   .min_max_constrain2  update min/max with m6/m7 only, then constrain
;   .constrain           constrain only
; In:  r5 = signed byte displacement of the tap relative to px
;      m0/m1 = px, m12-m15 = CONSTRAIN operands
; Out: m8/m9 = sum of the CONSTRAIN results for the +r5 and -r5 taps
;      (top / bottom), m4-m7 = the taps that were loaded, r5 negated
; Clobbers: m10, m11, plus whatever CONSTRAIN does to its last operand (m15).
.min_max_constrain:
    pminuw         m18, m4
    pmaxsw         m19, m4
    pminuw         m20, m5
    pmaxsw         m21, m5
.min_max_constrain2:
    pminuw         m18, m6
    pmaxsw         m19, m6
    pminuw         m20, m7
    pmaxsw         m21, m7
.constrain:
    ; gprsize skips the return address pushed by call; 68 = 64 (skip the top
    ; slot) + 4 (offset inside the row-0 slot).
    %define        tmp  rsp+gprsize+68
    movu            m4, [tmp+r5+64*0]
    vshufi32x4      m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
    movu            m5, [tmp+r5+64*2]
    vshufi32x4      m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
    neg             r5         ; mirrored tap on the opposite side of px
    movu            m6, [tmp+r5+64*0]
    vshufi32x4      m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
    movu            m7, [tmp+r5+64*2]
    vshufi32x4      m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
    CONSTRAIN       m8, m4, m0, m12, m13, m14, m15
    CONSTRAIN       m9, m5, m1, m12, m13, m14, m15
    CONSTRAIN      m10, m6, m0, m12, m13, m14, m15
    CONSTRAIN      m11, m7, m1, m12, m13, m14, m15
    paddw           m8, m10
    paddw           m9, m11
    ret
621*c0909341SAndroid Build Coastguard Worker
622*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
623