xref: /aosp_15_r20/external/libdav1d/src/x86/cdef_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
; DUP4 b0, b1, ...
; Emits every byte argument four times in a row, so each argument fills one
; complete dword of the generated table. Accepts one or more arguments.
31*c0909341SAndroid Build Coastguard Worker%macro DUP4 1-*
; one iteration per argument passed to the macro
32*c0909341SAndroid Build Coastguard Worker    %rep %0
33*c0909341SAndroid Build Coastguard Worker        times 4 db %1
; shift the argument list so that %1 names the next argument
34*c0909341SAndroid Build Coastguard Worker        %rotate 1
35*c0909341SAndroid Build Coastguard Worker    %endrep
36*c0909341SAndroid Build Coastguard Worker%endmacro
37*c0909341SAndroid Build Coastguard Worker
; DIRS o0, o1, ..., o15
; Takes 16 lut byte offsets (two per direction, eight directions) and emits a
; table of 24 two-byte pairs (+offset, -offset), i.e. one dword per direction:
;   k0, -k0, k1, -k1
; Emission starts at the 13th argument (first offset of direction 6) and wraps
; around, so the table holds directions 6 7 0 1 2 3 4 5 6 7 0 1. With that
; padding the entries for dir-2, dir and dir+2 can be addressed as
; cdef_dirs+(dir+0)*4, +(dir+2)*4 and +(dir+4)*4 without any wrap-around logic.
38*c0909341SAndroid Build Coastguard Worker%macro DIRS 16 ; cdef_directions[]
39*c0909341SAndroid Build Coastguard Worker    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
40*c0909341SAndroid Build Coastguard Worker        ; masking away unused bits allows us to use a single vpaddd {1to16}
41*c0909341SAndroid Build Coastguard Worker        ; instruction instead of having to do vpbroadcastd + paddb
; (each byte is limited to 6 bits, so adding it to a px_idx byte cannot carry
; into the neighbouring byte of the dword; vpermb only uses the low 6 index
; bits of each byte when reading the 64-byte lut)
42*c0909341SAndroid Build Coastguard Worker        db %13 & 0x3f, -%13 & 0x3f
43*c0909341SAndroid Build Coastguard Worker        %rotate 1
44*c0909341SAndroid Build Coastguard Worker    %endrep
45*c0909341SAndroid Build Coastguard Worker%endmacro
46*c0909341SAndroid Build Coastguard Worker
; Read-only constants, 64-byte aligned. The ORDER and SIZE of several tables
; matter, see the notes on lut_perm_4x8a/b and on edge_mask..gf_shr below.
47*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
48*c0909341SAndroid Build Coastguard Worker
; vpermi2b indices that assemble the 8x8 lut of the 4x4 kernel from its two
; source registers: 0-63 select bytes of the first source, 64-127 bytes of the
; second. Each db line below produces two 8-byte lut rows.
49*c0909341SAndroid Build Coastguard Workerlut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
50*c0909341SAndroid Build Coastguard Worker               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
51*c0909341SAndroid Build Coastguard Worker               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
52*c0909341SAndroid Build Coastguard Worker               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
; vpermi2b indices for the two luts of the 4x8 kernel. lut_perm_4x8a is only
; 32 bytes long: the 64-byte load from it deliberately runs into the first 32
; bytes of lut_perm_4x8b, which are shared between "lut top" and "lut bottom".
; lut_perm_4x8b therefore starts at a 32-byte offset and is loaded with movu.
; Do not insert anything between these two tables.
53*c0909341SAndroid Build Coastguard Workerlut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
54*c0909341SAndroid Build Coastguard Worker               db 96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
55*c0909341SAndroid Build Coastguard Workerlut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
56*c0909341SAndroid Build Coastguard Worker              db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
57*c0909341SAndroid Build Coastguard Worker              db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
58*c0909341SAndroid Build Coastguard Worker               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
; Row numbers 0-7; the 4x8 kernel multiplies them by stride to obtain the
; per-row offsets used by its gather load and scatter store.
59*c0909341SAndroid Build Coastguard Workerpd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
; Permutation tables that are not referenced in the part of the file reviewed
; here (presumably used by an 8x8 kernel further down -- confirm there).
60*c0909341SAndroid Build Coastguard Workerlut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
61*c0909341SAndroid Build Coastguard Worker               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
62*c0909341SAndroid Build Coastguard Workerlut_perm_8x8b: db 12, 13,  0,  1,  2,  3,  4,  5, 14, 15, 16, 17, 18, 19, 20, 21
63*c0909341SAndroid Build Coastguard Worker               db  2,  3,  4,  5,  6,  7,  8,  9, 18, 19, 20, 21, 22, 23, 24, 25
64*c0909341SAndroid Build Coastguard Worker               db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
65*c0909341SAndroid Build Coastguard Worker               db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
; vpermb indices that pull the result bytes out of the accumulator registers.
; The first row (1, 5, 9, ...) selects byte 1 of every dword, the second row
; byte 3 of every dword.
66*c0909341SAndroid Build Coastguard Workerend_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
67*c0909341SAndroid Build Coastguard Worker               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
; Not referenced in the part of the file reviewed here.
68*c0909341SAndroid Build Coastguard Workerend_perm_clip: db  0,  4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
69*c0909341SAndroid Build Coastguard Worker               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
70*c0909341SAndroid Build Coastguard Worker               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
71*c0909341SAndroid Build Coastguard Worker               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
; One 64-bit validity mask per value of the 4-bit edge argument, consumed by
; vpshufbitqmb: bit (row*8 + col) is set when that lut byte may be read.
; Rows 2-5 / columns 2-5 (the block itself) are always valid. Going by the
; table contents, edge bit 0 adds columns 0-1 (left), bit 1 adds columns 6-7
; (right), bit 2 adds rows 0-1 (top) and bit 3 adds rows 6-7 (bottom).
;
; Layout dependency: the kernels compute "edge_mask + damping*8" and rely on
; gf_shr being located exactly 30 qwords after edge_mask
; (edge_mask 128 bytes + px_idx 64 bytes + cdef_dirs 48 bytes = 240 = 30*8).
; Do not resize or reorder edge_mask, px_idx, cdef_dirs and gf_shr.
72*c0909341SAndroid Build Coastguard Workeredge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
73*c0909341SAndroid Build Coastguard Worker               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
74*c0909341SAndroid Build Coastguard Worker               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
75*c0909341SAndroid Build Coastguard Worker               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
76*c0909341SAndroid Build Coastguard Worker               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
77*c0909341SAndroid Build Coastguard Worker               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
78*c0909341SAndroid Build Coastguard Worker               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
79*c0909341SAndroid Build Coastguard Worker               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
; lut byte index (row*8 + col) of each of the 16 pixels of a 4x4 block, which
; occupies rows 2-5 / columns 2-5 of the lut. DUP4 repeats every index four
; times so that one dword holds four copies, one per tap handled in parallel.
80*c0909341SAndroid Build Coastguard Workerpx_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
; Tap offsets per direction, expressed as lut byte distances (one lut row is
; 8 bytes). See the DIRS macro for the generated layout.
81*c0909341SAndroid Build Coastguard Workercdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
; Bit matrices for gf2p8affineqb that shift every byte right by the amount
; given in the trailing comments. The first entry is duplicated, so both
; index 0 and index 1 yield a shift of 0.
82*c0909341SAndroid Build Coastguard Workergf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
83*c0909341SAndroid Build Coastguard Worker               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
84*c0909341SAndroid Build Coastguard Worker               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
85*c0909341SAndroid Build Coastguard Worker               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
; Tap weights, one dword (k0, k0, k1, k1) per set. pri_tap holds two sets,
; selected by (pri & 1).
86*c0909341SAndroid Build Coastguard Workerpri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
87*c0909341SAndroid Build Coastguard Workersec_tap:       db 32, 32, 16, 16
; Initial value of every accumulator dword: (1 << 28) + (7 << 4).
88*c0909341SAndroid Build Coastguard Workerpd_268435568:  dd 268435568
89*c0909341SAndroid Build Coastguard Worker
90*c0909341SAndroid Build Coastguard WorkerSECTION .text
91*c0909341SAndroid Build Coastguard Worker
; Scratch register t0 for cdef_filter_4x4_8bpc (it receives the dir argument):
; x86inc register r4 on WIN64, r8 on other targets.
; NOTE(review): r4 is also botq in that function; t0 is only written after the
; bottom rows have been loaded, so the aliasing looks intentional -- presumably
; chosen to avoid using a ninth GPR on WIN64. Confirm against x86inc.asm.
92*c0909341SAndroid Build Coastguard Worker%if WIN64
93*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4
94*c0909341SAndroid Build Coastguard Worker%else
95*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8
96*c0909341SAndroid Build Coastguard Worker%endif
97*c0909341SAndroid Build Coastguard Worker
98*c0909341SAndroid Build Coastguard Worker; lut:
99*c0909341SAndroid Build Coastguard Worker; t0 t1 t2 t3 t4 t5 t6 t7
100*c0909341SAndroid Build Coastguard Worker; T0 T1 T2 T3 T4 T5 T6 T7
101*c0909341SAndroid Build Coastguard Worker; L0 L1 00 01 02 03 04 05
102*c0909341SAndroid Build Coastguard Worker; L2 L3 10 11 12 13 14 15
103*c0909341SAndroid Build Coastguard Worker; L4 L5 20 21 22 23 24 25
104*c0909341SAndroid Build Coastguard Worker; L6 L7 30 31 32 33 34 35
105*c0909341SAndroid Build Coastguard Worker; b0 b1 b2 b3 b4 b5 b6 b7
106*c0909341SAndroid Build Coastguard Worker; B0 B1 B2 B3 B4 B5 B6 B7
107*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; cdef_filter_4x4_8bpc(dst, stride, left, top, bot,
;                      pri, sec, dir, damping, edge)
;
; Filters one 4x4 block of 8-bit pixels in place at dst (AVX-512 ICL).
; The block plus its 2-pixel border is first collected into a 64-byte lut
; (8 rows of 8 bytes, layout documented in the "lut:" comment above); every
; tap is then fetched from the lut with vpermb. Each pixel owns one dword
; lane, whose four bytes process four taps at once.
;
; x86inc declaration: first 5 arguments preloaded into registers, 8 GPRs and
; 13 vector registers used.
;
; Register roles once .main has run:
;   m0  per-pixel dword accumulator, seeded with (1 << 28) + (7 << 4)
;   m3  px_idx (lut index of each pixel, 4 copies per dword)
;   m5  lut
;   m6  px, the unfiltered pixels (4 copies per dword)
;   m7  zero
;   r2  stride*3 (overwrites leftq after the left column has been loaded)
;   r3  edge_mask + damping*8, i.e. a biased pointer into gf_shr
;   r6  edge flags; later lzcnt(pri); later sec (secd is r6d)
;   r7  address of edge_mask, base for all constant accesses
;   t0  dir
;
; Local entry points:
;   .sec / .mask_edges_sec  subroutines (reached with call) that add the
;                           secondary taps to m0 and leave the tap pixels in
;                           m2/m3; the mask_edges variant honours edge flags
;   .end_clip               used when both primary and secondary ran
;   .end_no_clip            used when only one of them ran
;-----------------------------------------------------------------------------
108*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
109*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
110*c0909341SAndroid Build Coastguard Worker                                        pri, sec, dir, damping, edge
111*c0909341SAndroid Build Coastguard Worker%define base r7-edge_mask
; Collect the source bytes into m0 (permute indices 0-63) and m1 (indices
; 64-127), one 128-bit lane at a time; scalar setup is interleaved.
;   m0: lane0 = dst rows 0,1   lane1 = left   lane2 = dst row 3  lane3 = bot row 1
;   m1: lane0 = top rows 0,1   lane1 = dst row 2   lane2 = bot row 0
112*c0909341SAndroid Build Coastguard Worker    movq         xmm0, [dstq+strideq*0]
113*c0909341SAndroid Build Coastguard Worker    movhps       xmm0, [dstq+strideq*1]
114*c0909341SAndroid Build Coastguard Worker    lea            r7, [edge_mask]
; top rows are read starting 2 bytes to the left of the block
115*c0909341SAndroid Build Coastguard Worker    movq         xmm1, [topq+strideq*0-2]
116*c0909341SAndroid Build Coastguard Worker    movhps       xmm1, [topq+strideq*1-2]
117*c0909341SAndroid Build Coastguard Worker    mov           r6d, edgem
118*c0909341SAndroid Build Coastguard Worker    vinserti32x4  ym0, ymm0, [leftq], 1
; leftq (r2) is dead from here on and is reused for stride*3
119*c0909341SAndroid Build Coastguard Worker    lea            r2, [strideq*3]
120*c0909341SAndroid Build Coastguard Worker    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
121*c0909341SAndroid Build Coastguard Worker    mova           m5, [base+lut_perm_4x4]
122*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m0, [dstq+r2], 2
; edge bit 0x08 guards the bottom rows: when it is clear they are not read
123*c0909341SAndroid Build Coastguard Worker    test          r6b, 0x08      ; avoid buffer overread
124*c0909341SAndroid Build Coastguard Worker    jz .main
; loaded from bot-4; lut_perm_4x4 picks bytes 2-9 of each lane, i.e. bot-2..bot+5
125*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m1, [botq+strideq*0-4], 2
126*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m0, [botq+strideq*1-4], 3
127*c0909341SAndroid Build Coastguard Worker.main:
; prid is loaded only if pri did not arrive in a register (movifnidn)
128*c0909341SAndroid Build Coastguard Worker    movifnidn    prid, prim
129*c0909341SAndroid Build Coastguard Worker    mov           t0d, dirm
130*c0909341SAndroid Build Coastguard Worker    mova           m3, [base+px_idx]
131*c0909341SAndroid Build Coastguard Worker    mov           r3d, dampingm
132*c0909341SAndroid Build Coastguard Worker    vpermi2b       m5, m0, m1    ; lut
133*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
134*c0909341SAndroid Build Coastguard Worker    pxor           m7, m7
; relies on gf_shr being placed 30 qwords after edge_mask in the data section
135*c0909341SAndroid Build Coastguard Worker    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
136*c0909341SAndroid Build Coastguard Worker    vpermb         m6, m3, m5    ; px
; edge == 0x0f means all four neighbours exist: take the unmasked fast path
137*c0909341SAndroid Build Coastguard Worker    cmp           r6d, 0x0f
138*c0909341SAndroid Build Coastguard Worker    jne .mask_edges              ; mask edges only if required
139*c0909341SAndroid Build Coastguard Worker    test         prid, prid
140*c0909341SAndroid Build Coastguard Worker    jz .sec_only
; lut indices of the primary taps = px_idx + (k0, -k0, k1, -k1) for direction dir
141*c0909341SAndroid Build Coastguard Worker    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
142*c0909341SAndroid Build Coastguard Worker    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
; CDEF_FILTER_4x4_PRI: accumulate the primary taps into m0.
;   in:       m1 = tap pixels, m6 = px, m7 = 0, m0 = accumulator,
;             prid = primary strength, r3 = biased gf_shr pointer
;   out:      m0 += sum of constrained differences times signed pri_tap;
;             secd = sec (reloaded from its argument slot)
;   clobbers: m2, m4, m9, m10, k1, r6, prid (reduced to pri & 1)
143*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER_4x4_PRI 0
144*c0909341SAndroid Build Coastguard Worker    vpcmpub        k1, m6, m1, 6 ; px > pN
145*c0909341SAndroid Build Coastguard Worker    psubb          m2, m1, m6
; lzcnt(pri) selects the shift matrix relative to the damping-biased pointer r3
146*c0909341SAndroid Build Coastguard Worker    lzcnt         r6d, prid
147*c0909341SAndroid Build Coastguard Worker    vpsubb     m2{k1}, m6, m1    ; abs(diff)
148*c0909341SAndroid Build Coastguard Worker    vpbroadcastb   m4, prid
; the low bit of pri picks one of the two pri_tap weight sets
149*c0909341SAndroid Build Coastguard Worker    and          prid, 1
150*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
; r6 is free again after the line above and now receives sec (secd is r6d)
151*c0909341SAndroid Build Coastguard Worker    movifnidn    secd, secm
152*c0909341SAndroid Build Coastguard Worker    vpbroadcastd  m10, [base+pri_tap+priq*4]
153*c0909341SAndroid Build Coastguard Worker    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
154*c0909341SAndroid Build Coastguard Worker    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift))
; constrained magnitude = min(abs(diff), the value computed above)
155*c0909341SAndroid Build Coastguard Worker    pminub         m2, m4
; unsigned magnitudes times signed weights, four taps summed into each dword
156*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m2, m10   ; sum
157*c0909341SAndroid Build Coastguard Worker%endmacro
158*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_4x4_PRI
159*c0909341SAndroid Build Coastguard Worker    test         secd, secd
160*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
161*c0909341SAndroid Build Coastguard Worker    call .sec
; Primary and secondary both ran: the result is additionally clamped to the
; [min, max] range of px (m6) and the tap pixels (m1 primary, m2/m3 secondary).
162*c0909341SAndroid Build Coastguard Worker.end_clip:
; byte-wise min in m4 and max in m2 over px and the three tap registers
163*c0909341SAndroid Build Coastguard Worker    pminub         m4, m6, m1
164*c0909341SAndroid Build Coastguard Worker    pmaxub         m1, m6
165*c0909341SAndroid Build Coastguard Worker    pminub         m5, m2, m3
166*c0909341SAndroid Build Coastguard Worker    pmaxub         m2, m3
167*c0909341SAndroid Build Coastguard Worker    pminub         m4, m5
168*c0909341SAndroid Build Coastguard Worker    pmaxub         m2, m1
; The byte shifts by 2 and then by 1 fold the four bytes of every dword
; together, so byte 1 of each dword ends up with the min (m1) / max (m2) over
; all taps of that pixel. The rounding/clipping of the sum is interleaved.
169*c0909341SAndroid Build Coastguard Worker    psrldq         m1, m4, 2
170*c0909341SAndroid Build Coastguard Worker    psrldq         m3, m2, 2
171*c0909341SAndroid Build Coastguard Worker    pminub         m1, m4
; k1 = accumulator words that are negative (signed less-than against zero)
172*c0909341SAndroid Build Coastguard Worker    vpcmpw         k1, m0, m7, 1
; m6 = (m6 << 8) | (m0 >> 24) per dword: px moves up one byte and the top
; byte of the accumulator is shifted in below it
173*c0909341SAndroid Build Coastguard Worker    vpshldd        m6, m0, 8
174*c0909341SAndroid Build Coastguard Worker    pmaxub         m2, m3
175*c0909341SAndroid Build Coastguard Worker    pslldq         m3, m1, 1
; m7 = -m0 (word-wise), used for the saturating subtraction below
176*c0909341SAndroid Build Coastguard Worker    psubw          m7, m0
177*c0909341SAndroid Build Coastguard Worker    paddusw        m0, m6     ; clip >0xff
178*c0909341SAndroid Build Coastguard Worker    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
179*c0909341SAndroid Build Coastguard Worker    pslldq         m4, m2, 1
180*c0909341SAndroid Build Coastguard Worker    pminub         m1, m3
181*c0909341SAndroid Build Coastguard Worker    pmaxub         m2, m4
; final clamp: max with the tap minimum, then min with the tap maximum
182*c0909341SAndroid Build Coastguard Worker    pmaxub         m0, m1
183*c0909341SAndroid Build Coastguard Worker    pminub         m0, m2
184*c0909341SAndroid Build Coastguard Worker    jmp .end
; pri == 0 on the unmasked path: run the secondary taps only
185*c0909341SAndroid Build Coastguard Worker.sec_only:
186*c0909341SAndroid Build Coastguard Worker    movifnidn    secd, secm
187*c0909341SAndroid Build Coastguard Worker    call .sec
; Only one of primary/secondary ran: plain wrapping add, no min/max clamp.
188*c0909341SAndroid Build Coastguard Worker.end_no_clip:
189*c0909341SAndroid Build Coastguard Worker    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
190*c0909341SAndroid Build Coastguard Worker    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
; Common store: pick byte 1 of each of the 16 dwords and write four rows of
; four pixels. Only the first 16 bytes of end_perm are needed (xmm load).
191*c0909341SAndroid Build Coastguard Worker.end:
192*c0909341SAndroid Build Coastguard Worker    mova          xm1, [base+end_perm]
193*c0909341SAndroid Build Coastguard Worker    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
194*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
195*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
196*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
197*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 3
198*c0909341SAndroid Build Coastguard Worker    RET
; pri == 0 on the edge-masked path: run the masked secondary taps only
199*c0909341SAndroid Build Coastguard Worker.mask_edges_sec_only:
200*c0909341SAndroid Build Coastguard Worker    movifnidn    secd, secm
201*c0909341SAndroid Build Coastguard Worker    call .mask_edges_sec
202*c0909341SAndroid Build Coastguard Worker    jmp .end_no_clip
203*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Slow path, taken when at least one edge flag is clear. Taps whose lut index
; is marked invalid in edge_mask[edge] are replaced by px itself, which makes
; their difference zero.
204*c0909341SAndroid Build Coastguard Worker.mask_edges:
; r6 still holds the edge flags here; m8 = validity bitmask for this edge value
205*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m8, [base+edge_mask+r6*8]
206*c0909341SAndroid Build Coastguard Worker    test         prid, prid
207*c0909341SAndroid Build Coastguard Worker    jz .mask_edges_sec_only
208*c0909341SAndroid Build Coastguard Worker    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
209*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m8, m2 ; index in-range
; start from px, then overwrite only the in-range taps with lut values
210*c0909341SAndroid Build Coastguard Worker    mova           m1, m6
211*c0909341SAndroid Build Coastguard Worker    vpermb     m1{k1}, m2, m5
212*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_4x4_PRI
213*c0909341SAndroid Build Coastguard Worker    test         secd, secd
214*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
215*c0909341SAndroid Build Coastguard Worker    call .mask_edges_sec
216*c0909341SAndroid Build Coastguard Worker    jmp .end_clip
; Subroutine: edge-masked fetch of the secondary taps (m2 for dir + 2, m3 for
; dir - 2), then joins the shared arithmetic at .sec_main.
; Expects m8 = edge validity mask. Clobbers m4, m9, k1 before the jump.
217*c0909341SAndroid Build Coastguard Worker.mask_edges_sec:
218*c0909341SAndroid Build Coastguard Worker    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
219*c0909341SAndroid Build Coastguard Worker    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
220*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m8, m4
221*c0909341SAndroid Build Coastguard Worker    mova           m2, m6
222*c0909341SAndroid Build Coastguard Worker    vpermb     m2{k1}, m4, m5
223*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m8, m9
224*c0909341SAndroid Build Coastguard Worker    mova           m3, m6
225*c0909341SAndroid Build Coastguard Worker    vpermb     m3{k1}, m9, m5
226*c0909341SAndroid Build Coastguard Worker    jmp .sec_main
227*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Subroutine: unmasked fetch of the secondary taps followed by the secondary
; accumulation.
;   in:       m3 = px_idx, m5 = lut, m6 = px, m7 = 0, m0 = accumulator,
;             secd = secondary strength, r3 = biased gf_shr pointer, t0 = dir
;   out:      m0 += secondary contribution; m2, m3 = secondary tap pixels
;   clobbers: m4, m5 (the lut is destroyed), m8-m12, k1, k2, secd
228*c0909341SAndroid Build Coastguard Worker.sec:
229*c0909341SAndroid Build Coastguard Worker    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
230*c0909341SAndroid Build Coastguard Worker    vpaddd         m3,     [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
231*c0909341SAndroid Build Coastguard Worker    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
232*c0909341SAndroid Build Coastguard Worker    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
; Shared secondary arithmetic. Same scheme as the primary macro, applied to
; two tap registers with the instructions interleaved: m4/m8/k1 belong to m2,
; m5/m9/k2 belong to m3.
233*c0909341SAndroid Build Coastguard Worker.sec_main:
234*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m8, [base+sec_tap]
; k1 = px > tap (m2)
235*c0909341SAndroid Build Coastguard Worker    vpcmpub        k1, m6, m2, 6
236*c0909341SAndroid Build Coastguard Worker    psubb          m4, m2, m6
; the strength must be broadcast before secd is overwritten by its lzcnt
237*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  m12, secd
238*c0909341SAndroid Build Coastguard Worker    lzcnt        secd, secd
; m4 = abs(diff) for the dir + 2 taps
239*c0909341SAndroid Build Coastguard Worker    vpsubb     m4{k1}, m6, m2
; k2 = px > tap (m3)
240*c0909341SAndroid Build Coastguard Worker    vpcmpub        k2, m6, m3, 6
; m11 = shift matrix selected by damping and lzcnt(sec)
241*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m11, [r3+secq*8]
; m10 = abs(diff) >> shift
242*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m10, m4, m11, 0
243*c0909341SAndroid Build Coastguard Worker    psubb          m5, m3, m6
; m9 keeps an unsigned copy of sec_tap for the second tap register
244*c0909341SAndroid Build Coastguard Worker    mova           m9, m8
; negate the weights where px > tap (same apply_sign as in the primary macro)
245*c0909341SAndroid Build Coastguard Worker    vpsubb     m8{k1}, m7, m8
; m10 = max(0, sec_strength - (abs(diff) >> shift)), saturating subtract
246*c0909341SAndroid Build Coastguard Worker    psubusb       m10, m12, m10
; m5 = abs(diff) for the dir - 2 taps
247*c0909341SAndroid Build Coastguard Worker    vpsubb     m5{k2}, m6, m3
248*c0909341SAndroid Build Coastguard Worker    pminub         m4, m10
; accumulate the dir + 2 taps
249*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m4, m8
250*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m11, m5, m11, 0
251*c0909341SAndroid Build Coastguard Worker    vpsubb     m9{k2}, m7, m9
252*c0909341SAndroid Build Coastguard Worker    psubusb       m12, m11
253*c0909341SAndroid Build Coastguard Worker    pminub         m5, m12
; accumulate the dir - 2 taps
254*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m5, m9
255*c0909341SAndroid Build Coastguard Worker    ret
256*c0909341SAndroid Build Coastguard Worker
257*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 2, 7
258*c0909341SAndroid Build Coastguard Worker
259*c0909341SAndroid Build Coastguard Worker;         lut top                lut bottom
260*c0909341SAndroid Build Coastguard Worker; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
261*c0909341SAndroid Build Coastguard Worker; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
262*c0909341SAndroid Build Coastguard Worker; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45
263*c0909341SAndroid Build Coastguard Worker; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55
264*c0909341SAndroid Build Coastguard Worker; L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
265*c0909341SAndroid Build Coastguard Worker; L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
266*c0909341SAndroid Build Coastguard Worker; L8 L9 40 41 42 43 44 45  b0 b1 b2 b3 b4 b5 b6 b7
267*c0909341SAndroid Build Coastguard Worker; La Lb 50 51 52 53 54 55  B0 B1 B2 B3 B4 B5 B6 B7
268*c0909341SAndroid Build Coastguard Worker
269*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
270*c0909341SAndroid Build Coastguard Worker                                        pri, sec, dir, damping, edge
271*c0909341SAndroid Build Coastguard Worker%define base r8-edge_mask
272*c0909341SAndroid Build Coastguard Worker    vpbroadcastd ym21, strided
273*c0909341SAndroid Build Coastguard Worker    mov           r6d, edgem
274*c0909341SAndroid Build Coastguard Worker    lea            r8, [edge_mask]
275*c0909341SAndroid Build Coastguard Worker    movq          xm1, [topq+strideq*0-2]
276*c0909341SAndroid Build Coastguard Worker    pmulld       ym21, [base+pd_01234567]
277*c0909341SAndroid Build Coastguard Worker    kxnorb         k1, k1, k1
278*c0909341SAndroid Build Coastguard Worker    movq          xm2, [topq+strideq*1-2]
279*c0909341SAndroid Build Coastguard Worker    vpgatherdq m0{k1}, [dstq+ym21]  ; +0+1 +2+3 +4+5 +6+7
280*c0909341SAndroid Build Coastguard Worker    mova          m14, [base+lut_perm_4x8a]
281*c0909341SAndroid Build Coastguard Worker    movu          m15, [base+lut_perm_4x8b]
282*c0909341SAndroid Build Coastguard Worker    test          r6b, 0x08         ; avoid buffer overread
283*c0909341SAndroid Build Coastguard Worker    jz .main
284*c0909341SAndroid Build Coastguard Worker    vinserti32x4  ym1, [botq+strideq*0-2], 1
285*c0909341SAndroid Build Coastguard Worker    vinserti32x4  ym2, [botq+strideq*1-2], 1
286*c0909341SAndroid Build Coastguard Worker.main:
287*c0909341SAndroid Build Coastguard Worker    punpcklqdq    ym1, ym2
288*c0909341SAndroid Build Coastguard Worker    vinserti32x4   m1, [leftq], 2   ; -2-1 +8+9 left ____
289*c0909341SAndroid Build Coastguard Worker    movifnidn    prid, prim
290*c0909341SAndroid Build Coastguard Worker    mov           t0d, dirm
291*c0909341SAndroid Build Coastguard Worker    mova          m16, [base+px_idx]
292*c0909341SAndroid Build Coastguard Worker    mov           r3d, dampingm
293*c0909341SAndroid Build Coastguard Worker    vpermi2b      m14, m0, m1    ; lut top
294*c0909341SAndroid Build Coastguard Worker    vpermi2b      m15, m0, m1    ; lut bottom
295*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
296*c0909341SAndroid Build Coastguard Worker    pxor          m20, m20
297*c0909341SAndroid Build Coastguard Worker    lea            r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
298*c0909341SAndroid Build Coastguard Worker    vpermb         m2, m16, m14  ; pxt
299*c0909341SAndroid Build Coastguard Worker    vpermb         m3, m16, m15  ; pxb
300*c0909341SAndroid Build Coastguard Worker    mova           m1, m0
301*c0909341SAndroid Build Coastguard Worker    cmp           r6b, 0x0f
302*c0909341SAndroid Build Coastguard Worker    jne .mask_edges              ; mask edges only if required
303*c0909341SAndroid Build Coastguard Worker    test         prid, prid
304*c0909341SAndroid Build Coastguard Worker    jz .sec_only
305*c0909341SAndroid Build Coastguard Worker    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
306*c0909341SAndroid Build Coastguard Worker    vpermb         m4, m6, m14   ; pNt k0p0 k0p1 k1p0 k1p1
307*c0909341SAndroid Build Coastguard Worker    vpermb         m5, m6, m15   ; pNb
308*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER_4x8_PRI 0
309*c0909341SAndroid Build Coastguard Worker    vpcmpub        k1, m2, m4, 6 ; pxt > pNt
310*c0909341SAndroid Build Coastguard Worker    vpcmpub        k2, m3, m5, 6 ; pxb > pNb
311*c0909341SAndroid Build Coastguard Worker    psubb          m6, m4, m2
312*c0909341SAndroid Build Coastguard Worker    psubb          m7, m5, m3
313*c0909341SAndroid Build Coastguard Worker    lzcnt         r6d, prid
314*c0909341SAndroid Build Coastguard Worker    vpsubb     m6{k1}, m2, m4    ; abs(diff_top)
315*c0909341SAndroid Build Coastguard Worker    vpsubb     m7{k2}, m3, m5    ; abs(diff_bottom)
316*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  m13, prid
317*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   m9, [r3+r6*8]
318*c0909341SAndroid Build Coastguard Worker    and          prid, 1
319*c0909341SAndroid Build Coastguard Worker    vpbroadcastd  m11, [base+pri_tap+priq*4]
320*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
321*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
322*c0909341SAndroid Build Coastguard Worker    mova          m10, m11
323*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, secm
324*c0909341SAndroid Build Coastguard Worker    vpsubb    m10{k1}, m20, m11  ; apply_sign(pri_tap_top)
325*c0909341SAndroid Build Coastguard Worker    vpsubb    m11{k2}, m20, m11  ; apply_sign(pri_tap_bottom)
326*c0909341SAndroid Build Coastguard Worker    psubusb       m12, m13, m8   ; imax(0, pri_strength - (abs(dt) >> shift)))
327*c0909341SAndroid Build Coastguard Worker    psubusb       m13, m13, m9   ; imax(0, pri_strength - (abs(db) >> shift)))
328*c0909341SAndroid Build Coastguard Worker    pminub         m6, m12
329*c0909341SAndroid Build Coastguard Worker    pminub         m7, m13
330*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m6, m10   ; sum top
331*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m1, m7, m11   ; sum bottom
332*c0909341SAndroid Build Coastguard Worker%endmacro
333*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_4x8_PRI
334*c0909341SAndroid Build Coastguard Worker    test          t1d, t1d       ; sec
335*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
336*c0909341SAndroid Build Coastguard Worker    call .sec
337*c0909341SAndroid Build Coastguard Worker.end_clip:
338*c0909341SAndroid Build Coastguard Worker    pminub        m10, m4, m2
339*c0909341SAndroid Build Coastguard Worker    pminub        m12, m6, m8
340*c0909341SAndroid Build Coastguard Worker    pminub        m11, m5, m3
341*c0909341SAndroid Build Coastguard Worker    pminub        m13, m7, m9
342*c0909341SAndroid Build Coastguard Worker    pmaxub         m4, m2
343*c0909341SAndroid Build Coastguard Worker    pmaxub         m6, m8
344*c0909341SAndroid Build Coastguard Worker    pmaxub         m5, m3
345*c0909341SAndroid Build Coastguard Worker    pmaxub         m7, m9
346*c0909341SAndroid Build Coastguard Worker    pminub        m10, m12
347*c0909341SAndroid Build Coastguard Worker    pminub        m11, m13
348*c0909341SAndroid Build Coastguard Worker    pmaxub         m4, m6
349*c0909341SAndroid Build Coastguard Worker    pmaxub         m5, m7
350*c0909341SAndroid Build Coastguard Worker    mov           r2d, 0xAAAAAAAA
351*c0909341SAndroid Build Coastguard Worker    kmovd          k1, r2d
352*c0909341SAndroid Build Coastguard Worker    kxnorb         k2, k2, k2       ;   hw   lw
353*c0909341SAndroid Build Coastguard Worker    vpshrdd       m12, m0, m1, 16   ;  m1lw m0hw
354*c0909341SAndroid Build Coastguard Worker    vpshrdd        m6, m10, m11, 16 ; m11lw m10hw
355*c0909341SAndroid Build Coastguard Worker    vpshrdd        m8, m4, m5, 16   ;  m5lw m4hw
356*c0909341SAndroid Build Coastguard Worker    vpblendmw  m7{k1}, m10, m11     ; m11hw m10lw
357*c0909341SAndroid Build Coastguard Worker    vpblendmw  m9{k1}, m4, m5       ;  m5hw m4lw
358*c0909341SAndroid Build Coastguard Worker    vpblendmw  m4{k1}, m0, m12      ;  m1lw m0lw
359*c0909341SAndroid Build Coastguard Worker    vpblendmw  m5{k1}, m12, m1      ;  m1hw m0hw
360*c0909341SAndroid Build Coastguard Worker    vpshrdd        m2, m3, 16
361*c0909341SAndroid Build Coastguard Worker    pminub         m6, m7
362*c0909341SAndroid Build Coastguard Worker    pmaxub         m8, m9
363*c0909341SAndroid Build Coastguard Worker    mova         ym14, [base+end_perm]
364*c0909341SAndroid Build Coastguard Worker    vpcmpw         k1, m4, m20, 1
365*c0909341SAndroid Build Coastguard Worker    vpshldw        m2, m5, 8
366*c0909341SAndroid Build Coastguard Worker    pslldq         m7, m6, 1
367*c0909341SAndroid Build Coastguard Worker    pslldq         m9, m8, 1
368*c0909341SAndroid Build Coastguard Worker    psubw          m5, m20, m4
369*c0909341SAndroid Build Coastguard Worker    paddusw        m0, m4, m2 ; clip >0xff
370*c0909341SAndroid Build Coastguard Worker    pminub         m6, m7
371*c0909341SAndroid Build Coastguard Worker    pmaxub         m8, m9
372*c0909341SAndroid Build Coastguard Worker    psubusw    m0{k1}, m2, m5 ; clip <0x00
373*c0909341SAndroid Build Coastguard Worker    pmaxub         m0, m6
374*c0909341SAndroid Build Coastguard Worker    pminub         m0, m8
375*c0909341SAndroid Build Coastguard Worker    vpermb         m0, m14, m0
376*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+ym21]{k2}, ym0
377*c0909341SAndroid Build Coastguard Worker    RET
378*c0909341SAndroid Build Coastguard Worker.sec_only:
379*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, secm
380*c0909341SAndroid Build Coastguard Worker    call .sec
381*c0909341SAndroid Build Coastguard Worker.end_no_clip:
382*c0909341SAndroid Build Coastguard Worker    mova          ym4, [base+end_perm]
383*c0909341SAndroid Build Coastguard Worker    kxnorb         k1, k1, k1
384*c0909341SAndroid Build Coastguard Worker    vpshldd        m2, m0, 8  ; (px << 8) + ((sum > -8) << 4)
385*c0909341SAndroid Build Coastguard Worker    vpshldd        m3, m1, 8
386*c0909341SAndroid Build Coastguard Worker    paddw          m0, m2     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
387*c0909341SAndroid Build Coastguard Worker    paddw          m1, m3
388*c0909341SAndroid Build Coastguard Worker    pslld          m0, 16
389*c0909341SAndroid Build Coastguard Worker    vpshrdd        m0, m1, 16
390*c0909341SAndroid Build Coastguard Worker    vpermb         m0, m4, m0 ; output in bits 8-15 of each word
391*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+ym21]{k1}, ym0
392*c0909341SAndroid Build Coastguard Worker    RET
393*c0909341SAndroid Build Coastguard Worker.mask_edges_sec_only:
394*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, secm
395*c0909341SAndroid Build Coastguard Worker    call .mask_edges_sec
396*c0909341SAndroid Build Coastguard Worker    jmp .end_no_clip
397*c0909341SAndroid Build Coastguard WorkerALIGN function_align
398*c0909341SAndroid Build Coastguard Worker.mask_edges:
399*c0909341SAndroid Build Coastguard Worker    mov           t1d, r6d
400*c0909341SAndroid Build Coastguard Worker    or            r6d, 8 ; top 4x4 has bottom
401*c0909341SAndroid Build Coastguard Worker    or            t1d, 4 ; bottom 4x4 has top
402*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m17, [base+edge_mask+r6*8]
403*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m18, [base+edge_mask+t1*8]
404*c0909341SAndroid Build Coastguard Worker    test         prid, prid
405*c0909341SAndroid Build Coastguard Worker    jz .mask_edges_sec_only
406*c0909341SAndroid Build Coastguard Worker    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
407*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m17, m6 ; index in-range
408*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k2, m18, m6
409*c0909341SAndroid Build Coastguard Worker    mova           m4, m2
410*c0909341SAndroid Build Coastguard Worker    mova           m5, m3
411*c0909341SAndroid Build Coastguard Worker    vpermb     m4{k1}, m6, m14
412*c0909341SAndroid Build Coastguard Worker    vpermb     m5{k2}, m6, m15
413*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_4x8_PRI
414*c0909341SAndroid Build Coastguard Worker    test          t1d, t1d
415*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
416*c0909341SAndroid Build Coastguard Worker    call .mask_edges_sec
417*c0909341SAndroid Build Coastguard Worker    jmp .end_clip
418*c0909341SAndroid Build Coastguard Worker.mask_edges_sec:
    ; Edge-aware fetch of the secondary-tap neighbours, then tail-jump into
    ; .sec_main for the filtering itself. Neighbour lanes whose index is not
    ; flagged in the edge masks (m17/m18) keep the value copied from m2/m3,
    ; so the difference computed against m2/m3 in .sec_main is zero for them.
    ; Writes m6-m11 and k1-k4.
419*c0909341SAndroid Build Coastguard Worker    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; indices for dir + 2 (same expression as .sec)
420*c0909341SAndroid Build Coastguard Worker    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; indices for dir - 2
421*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m17, m10 ; per-byte: bit of the m17 mask selected by the index -> in-range flag
422*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k2, m18, m10
423*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k3, m17, m11
424*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k4, m18, m11
425*c0909341SAndroid Build Coastguard Worker    mova           m6, m2 ; default for masked-off lanes
426*c0909341SAndroid Build Coastguard Worker    mova           m7, m3
427*c0909341SAndroid Build Coastguard Worker    mova           m8, m2
428*c0909341SAndroid Build Coastguard Worker    mova           m9, m3
429*c0909341SAndroid Build Coastguard Worker    vpermb     m6{k1}, m10, m14 ; merge-masked lookup: only flagged lanes are replaced
430*c0909341SAndroid Build Coastguard Worker    vpermb     m7{k2}, m10, m15
431*c0909341SAndroid Build Coastguard Worker    vpermb     m8{k3}, m11, m14
432*c0909341SAndroid Build Coastguard Worker    vpermb     m9{k4}, m11, m15
433*c0909341SAndroid Build Coastguard Worker    jmp .sec_main ; tail-jump; .sec_main's ret returns to our caller
434*c0909341SAndroid Build Coastguard WorkerALIGN function_align
435*c0909341SAndroid Build Coastguard Worker.sec:
    ; Secondary-tap pass. Entered by `call`; returns with a plain `ret`.
    ; .sec fetches the neighbours for the two secondary directions without
    ; edge masking; .sec_main (also reached from .mask_edges_sec) constrains
    ; the differences against m2/m3 and accumulates them into m0/m1.
    ; In:  t1d = secondary strength, r3 = base used to load the shift matrix,
    ;      m2/m3 = values the neighbours are differenced against,
    ;      m14/m15 = byte tables the neighbours are permuted out of.
    ; NOTE(review): m20 is used as the minuend when negating the taps, so it
    ; is presumably all-zero on entry; it is set outside this chunk - confirm.
    ; Clobbers: m6-m19 (including the m14-m17 inputs), k1-k4, t1d.
436*c0909341SAndroid Build Coastguard Worker    vpaddd         m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
437*c0909341SAndroid Build Coastguard Worker    vpaddd         m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
438*c0909341SAndroid Build Coastguard Worker    vpermb         m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
439*c0909341SAndroid Build Coastguard Worker    vpermb         m7, m8, m15 ; pNb
440*c0909341SAndroid Build Coastguard Worker    vpermb         m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
441*c0909341SAndroid Build Coastguard Worker    vpermb         m9, m9, m15 ; pNb
442*c0909341SAndroid Build Coastguard Worker.sec_main:
443*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  m18, t1d         ; sec_strength in every byte
444*c0909341SAndroid Build Coastguard Worker    lzcnt         t1d, t1d         ; strength -> index of the shift matrix
445*c0909341SAndroid Build Coastguard Worker    vpcmpub        k1, m2, m6, 6   ; predicate 6 = unsigned greater-than
446*c0909341SAndroid Build Coastguard Worker    vpcmpub        k2, m3, m7, 6
447*c0909341SAndroid Build Coastguard Worker    vpcmpub        k3, m2, m8, 6
448*c0909341SAndroid Build Coastguard Worker    vpcmpub        k4, m3, m9, 6
449*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m17, [r3+t1*8]   ; 8x8 bit matrix used below as ">> shift"
450*c0909341SAndroid Build Coastguard Worker    psubb         m10, m6, m2      ; neighbour - m2 (valid where neighbour >= m2)
451*c0909341SAndroid Build Coastguard Worker    psubb         m11, m7, m3
452*c0909341SAndroid Build Coastguard Worker    psubb         m12, m8, m2
453*c0909341SAndroid Build Coastguard Worker    psubb         m13, m9, m3
454*c0909341SAndroid Build Coastguard Worker    vpsubb    m10{k1}, m2, m6      ; abs(dt0)
455*c0909341SAndroid Build Coastguard Worker    vpsubb    m11{k2}, m3, m7      ; abs(db0)
456*c0909341SAndroid Build Coastguard Worker    vpsubb    m12{k3}, m2, m8      ; abs(dt1)
457*c0909341SAndroid Build Coastguard Worker    vpsubb    m13{k4}, m3, m9      ; abs(db1)
458*c0909341SAndroid Build Coastguard Worker    vpbroadcastd  m19, [base+sec_tap]
459*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
460*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
461*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
462*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
463*c0909341SAndroid Build Coastguard Worker    psubusb       m14, m18, m14    ; imax(0, sec_strength - (abs(dt0) >> shift))
464*c0909341SAndroid Build Coastguard Worker    psubusb       m15, m18, m15    ; imax(0, sec_strength - (abs(db0) >> shift))
465*c0909341SAndroid Build Coastguard Worker    psubusb       m16, m18, m16    ; imax(0, sec_strength - (abs(dt1) >> shift))
466*c0909341SAndroid Build Coastguard Worker    psubusb       m17, m18, m17    ; imax(0, sec_strength - (abs(db1) >> shift))
467*c0909341SAndroid Build Coastguard Worker    pminub        m10, m14         ; constrained magnitude = min(abs(diff), imax(...))
468*c0909341SAndroid Build Coastguard Worker    pminub        m11, m15
469*c0909341SAndroid Build Coastguard Worker    pminub        m12, m16
470*c0909341SAndroid Build Coastguard Worker    pminub        m13, m17
471*c0909341SAndroid Build Coastguard Worker    mova          m14, m19         ; one copy of the taps per accumulate below
472*c0909341SAndroid Build Coastguard Worker    mova          m15, m19
473*c0909341SAndroid Build Coastguard Worker    mova          m16, m19
474*c0909341SAndroid Build Coastguard Worker    vpsubb    m14{k1}, m20, m19    ; apply_sign(sec_tap_top_0)
475*c0909341SAndroid Build Coastguard Worker    vpsubb    m15{k2}, m20, m19    ; apply_sign(sec_tap_bottom_0)
476*c0909341SAndroid Build Coastguard Worker    vpsubb    m16{k3}, m20, m19    ; apply_sign(sec_tap_top_1)
477*c0909341SAndroid Build Coastguard Worker    vpsubb    m19{k4}, m20, m19    ; apply_sign(sec_tap_bottom_1)
478*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m10, m14    ; m0 += per-dword sum of (unsigned magnitude * signed tap)
479*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m1, m11, m15
480*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m12, m16
481*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m1, m13, m19
482*c0909341SAndroid Build Coastguard Worker    ret
483*c0909341SAndroid Build Coastguard Worker
484*c0909341SAndroid Build Coastguard Worker;         lut tl                   lut tr
485*c0909341SAndroid Build Coastguard Worker; t0 t1 t2 t3 t4 t5 t6 t7  t4 t5 t6 t7 t8 t9 ta tb
486*c0909341SAndroid Build Coastguard Worker; T0 T1 T2 T3 T4 T5 T6 T7  T4 T5 T6 T7 T8 T9 Ta Tb
487*c0909341SAndroid Build Coastguard Worker; L0 L1 00 01 02 03 04 05  02 03 04 05 06 07 08 09
488*c0909341SAndroid Build Coastguard Worker; L2 L3 10 11 12 13 14 15  12 13 14 15 16 17 18 19
489*c0909341SAndroid Build Coastguard Worker; L4 L5 20 21 22 23 24 25  22 23 24 25 26 27 28 29
490*c0909341SAndroid Build Coastguard Worker; L6 L7 30 31 32 33 34 35  32 33 34 35 36 37 38 39
491*c0909341SAndroid Build Coastguard Worker; L8 L9 40 41 42 43 44 45  42 43 44 45 46 47 48 49
492*c0909341SAndroid Build Coastguard Worker; La Lb 50 51 52 53 54 55  52 53 54 55 56 57 58 59
493*c0909341SAndroid Build Coastguard Worker;         lut bl                   lut br
494*c0909341SAndroid Build Coastguard Worker; L4 L5 20 21 22 23 24 25  22 23 24 25 26 27 28 29
495*c0909341SAndroid Build Coastguard Worker; L6 L7 30 31 32 33 34 35  32 33 34 35 36 37 38 39
496*c0909341SAndroid Build Coastguard Worker; L8 L9 40 41 42 43 44 45  42 43 44 45 46 47 48 49
497*c0909341SAndroid Build Coastguard Worker; La Lb 50 51 52 53 54 55  52 53 54 55 56 57 58 59
498*c0909341SAndroid Build Coastguard Worker; Lc Ld 60 61 62 63 64 65  62 63 64 65 66 67 68 69
499*c0909341SAndroid Build Coastguard Worker; Le Lf 70 71 72 73 74 75  72 73 74 75 76 77 78 79
500*c0909341SAndroid Build Coastguard Worker; b0 b1 b2 b3 b4 b5 b6 b7  b4 b5 b6 b7 b8 b9 ba bb
501*c0909341SAndroid Build Coastguard Worker; B0 B1 B2 B3 B4 B5 B6 B7  B4 B5 B6 B7 B8 B9 Ba Bb
502*c0909341SAndroid Build Coastguard Worker
503*c0909341SAndroid Build Coastguard Workercglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
504*c0909341SAndroid Build Coastguard Worker                                               pri, sec, dir, damping, edge
505*c0909341SAndroid Build Coastguard Worker%define base r8-edge_mask
506*c0909341SAndroid Build Coastguard Worker    movu         xm16, [dstq+strideq*0]
507*c0909341SAndroid Build Coastguard Worker    pinsrd       xm16, [leftq+4*0], 3
508*c0909341SAndroid Build Coastguard Worker    mov           r6d, edgem
509*c0909341SAndroid Build Coastguard Worker    vinserti128  ym16, [dstq+strideq*1], 1
510*c0909341SAndroid Build Coastguard Worker    lea           r10, [dstq+strideq*4]
511*c0909341SAndroid Build Coastguard Worker    movu         xm17, [dstq+strideq*2]
512*c0909341SAndroid Build Coastguard Worker    vinserti32x4  m16, [topq+strideq*0-2], 2
513*c0909341SAndroid Build Coastguard Worker    lea            r9, [strideq*3]
514*c0909341SAndroid Build Coastguard Worker    pinsrd       xm17, [leftq+4*1], 3
515*c0909341SAndroid Build Coastguard Worker    vinserti32x4  m16, [topq+strideq*1-2], 3 ; 0 1 t T
516*c0909341SAndroid Build Coastguard Worker    lea            r8, [edge_mask]
517*c0909341SAndroid Build Coastguard Worker    vinserti128  ym17, [dstq+r9       ], 1
518*c0909341SAndroid Build Coastguard Worker    vpbroadcastd ym18, [leftq+4*2]
519*c0909341SAndroid Build Coastguard Worker    vpblendd     ym17, ym18, 0x80
520*c0909341SAndroid Build Coastguard Worker    movu         xm18, [r10 +strideq*2]
521*c0909341SAndroid Build Coastguard Worker    vinserti32x4  m17, [r10 +strideq*0], 2
522*c0909341SAndroid Build Coastguard Worker    pinsrd       xm18, [leftq+4*3], 3
523*c0909341SAndroid Build Coastguard Worker    vinserti32x4  m17, [r10 +strideq*1], 3   ; 2 3 4 5
524*c0909341SAndroid Build Coastguard Worker    vinserti128  ym18, [r10 +r9       ], 1
525*c0909341SAndroid Build Coastguard Worker    test          r6b, 0x08       ; avoid buffer overread
526*c0909341SAndroid Build Coastguard Worker    jz .main
527*c0909341SAndroid Build Coastguard Worker    vinserti32x4  m18, [botq+strideq*0-2], 2
528*c0909341SAndroid Build Coastguard Worker    vinserti32x4  m18, [botq+strideq*1-2], 3 ; 6 7 b B
529*c0909341SAndroid Build Coastguard Worker.main:
530*c0909341SAndroid Build Coastguard Worker    mova           m0, [base+lut_perm_8x8a]
531*c0909341SAndroid Build Coastguard Worker    movu           m1, [base+lut_perm_8x8b]
532*c0909341SAndroid Build Coastguard Worker    mova          m30, [base+px_idx]
533*c0909341SAndroid Build Coastguard Worker    vpermb        m16, m0, m16
534*c0909341SAndroid Build Coastguard Worker    movifnidn    prid, prim
535*c0909341SAndroid Build Coastguard Worker    vpermb        m17, m1, m17
536*c0909341SAndroid Build Coastguard Worker    mov           t0d, dirm
537*c0909341SAndroid Build Coastguard Worker    vpermb        m18, m0, m18
538*c0909341SAndroid Build Coastguard Worker    mov           r3d, dampingm
539*c0909341SAndroid Build Coastguard Worker    vshufi32x4    m12, m16, m17, q2020 ; lut tl
540*c0909341SAndroid Build Coastguard Worker    vshufi32x4    m13, m16, m17, q3131 ; lut tr
541*c0909341SAndroid Build Coastguard Worker    vshufi32x4    m14, m17, m18, q0220 ; lut bl
542*c0909341SAndroid Build Coastguard Worker    vshufi32x4    m15, m17, m18, q1331 ; lut br
543*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
544*c0909341SAndroid Build Coastguard Worker    pxor          m31, m31
545*c0909341SAndroid Build Coastguard Worker    lea            r3, [r8+r3*8]  ; gf_shr + (damping - 30) * 8
546*c0909341SAndroid Build Coastguard Worker    vpermb         m4, m30, m12   ; pxtl
547*c0909341SAndroid Build Coastguard Worker    mova           m1, m0
548*c0909341SAndroid Build Coastguard Worker    vpermb         m5, m30, m13   ; pxtr
549*c0909341SAndroid Build Coastguard Worker    mova           m2, m0
550*c0909341SAndroid Build Coastguard Worker    vpermb         m6, m30, m14   ; pxbl
551*c0909341SAndroid Build Coastguard Worker    mova           m3, m0
552*c0909341SAndroid Build Coastguard Worker    vpermb         m7, m30, m15   ; pxbr
553*c0909341SAndroid Build Coastguard Worker    cmp           r6b, 0x0f
554*c0909341SAndroid Build Coastguard Worker    jne .mask_edges               ; mask edges only if required
555*c0909341SAndroid Build Coastguard Worker    test         prid, prid
556*c0909341SAndroid Build Coastguard Worker    jz .sec_only
557*c0909341SAndroid Build Coastguard Worker    vpaddd        m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
558*c0909341SAndroid Build Coastguard Worker    vpermb         m8, m11, m12   ; pNtl k0p0 k0p1 k1p0 k1p1
559*c0909341SAndroid Build Coastguard Worker    vpermb         m9, m11, m13   ; pNtr
560*c0909341SAndroid Build Coastguard Worker    vpermb        m10, m11, m14   ; pNbl
561*c0909341SAndroid Build Coastguard Worker    vpermb        m11, m11, m15   ; pNbr
562*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER_8x8_PRI 0
    ; Primary-tap pass for the four quadrants (tl, tr, bl, br) of the 8x8 block.
    ; In:  m4-m7   = centre pixels per quadrant, m8-m11 = primary neighbours,
    ;      m0-m3   = running sums, m31 = zero, prid = primary strength,
    ;      r3      = base for the shift-matrix load.
    ; Out: m0-m3 updated; k1-k4 = "px > neighbour" masks; t1d = secondary
    ;      strength (loaded from secm when not already in a register).
    ; Clobbers: m16-m29, r6d; prid is reduced to its low bit.
563*c0909341SAndroid Build Coastguard Worker    vpcmpub        k1, m4, m8, 6  ; pxtl > pNtl
564*c0909341SAndroid Build Coastguard Worker    vpcmpub        k2, m5, m9, 6  ; pxtr > pNtr
565*c0909341SAndroid Build Coastguard Worker    vpcmpub        k3, m6, m10, 6 ; pxbl > pNbl
566*c0909341SAndroid Build Coastguard Worker    vpcmpub        k4, m7, m11, 6 ; pxbr > pNbr
567*c0909341SAndroid Build Coastguard Worker    psubb         m16, m8, m4     ; pN - px (valid where pN >= px)
568*c0909341SAndroid Build Coastguard Worker    psubb         m17, m9, m5
569*c0909341SAndroid Build Coastguard Worker    psubb         m18, m10, m6
570*c0909341SAndroid Build Coastguard Worker    psubb         m19, m11, m7
571*c0909341SAndroid Build Coastguard Worker    lzcnt         r6d, prid       ; strength -> index of the shift matrix
572*c0909341SAndroid Build Coastguard Worker    vpsubb    m16{k1}, m4, m8     ; abs(diff_tl)
573*c0909341SAndroid Build Coastguard Worker    vpsubb    m17{k2}, m5, m9     ; abs(diff_tr)
574*c0909341SAndroid Build Coastguard Worker    vpsubb    m18{k3}, m6, m10    ; abs(diff_bl)
575*c0909341SAndroid Build Coastguard Worker    vpsubb    m19{k4}, m7, m11    ; abs(diff_br)
576*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m28, [r3+r6*8]  ; 8x8 bit matrix used below as ">> shift"
577*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  m29, prid       ; pri_strength in every byte
578*c0909341SAndroid Build Coastguard Worker    and          prid, 1          ; low bit of the strength picks the tap set
579*c0909341SAndroid Build Coastguard Worker    vpbroadcastd  m27, [base+pri_tap+priq*4]
580*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
581*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
582*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
583*c0909341SAndroid Build Coastguard Worker    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
584*c0909341SAndroid Build Coastguard Worker    mova          m24, m27
585*c0909341SAndroid Build Coastguard Worker    mova          m25, m27
586*c0909341SAndroid Build Coastguard Worker    mova          m26, m27
587*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, secm       ; fetched here; tested right after each expansion
588*c0909341SAndroid Build Coastguard Worker    vpsubb    m24{k1}, m31, m27   ; apply_sign(pri_tap_tl)
589*c0909341SAndroid Build Coastguard Worker    vpsubb    m25{k2}, m31, m27   ; apply_sign(pri_tap_tr)
590*c0909341SAndroid Build Coastguard Worker    vpsubb    m26{k3}, m31, m27   ; apply_sign(pri_tap_bl)
591*c0909341SAndroid Build Coastguard Worker    vpsubb    m27{k4}, m31, m27   ; apply_sign(pri_tap_br)
592*c0909341SAndroid Build Coastguard Worker    psubusb       m20, m29, m20   ; imax(0, pri_strength - (abs(dtl) >> shift))
593*c0909341SAndroid Build Coastguard Worker    psubusb       m21, m29, m21   ; imax(0, pri_strength - (abs(dtr) >> shift))
594*c0909341SAndroid Build Coastguard Worker    psubusb       m22, m29, m22   ; imax(0, pri_strength - (abs(dbl) >> shift))
595*c0909341SAndroid Build Coastguard Worker    psubusb       m23, m29, m23   ; imax(0, pri_strength - (abs(dbr) >> shift))
596*c0909341SAndroid Build Coastguard Worker    pminub        m16, m20        ; constrained magnitude = min(abs(diff), imax(...))
597*c0909341SAndroid Build Coastguard Worker    pminub        m17, m21
598*c0909341SAndroid Build Coastguard Worker    pminub        m18, m22
599*c0909341SAndroid Build Coastguard Worker    pminub        m19, m23
600*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m16, m24   ; sum tl
601*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m1, m17, m25   ; sum tr
602*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m2, m18, m26   ; sum bl
603*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m3, m19, m27   ; sum br
604*c0909341SAndroid Build Coastguard Worker%endmacro
605*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_8x8_PRI
606*c0909341SAndroid Build Coastguard Worker    test          t1d, t1d        ; sec
607*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
608*c0909341SAndroid Build Coastguard Worker    call .sec
609*c0909341SAndroid Build Coastguard Worker.end_clip:
610*c0909341SAndroid Build Coastguard Worker    pminub        m20, m8, m4
611*c0909341SAndroid Build Coastguard Worker    pminub        m24, m12, m16
612*c0909341SAndroid Build Coastguard Worker    pminub        m21, m9, m5
613*c0909341SAndroid Build Coastguard Worker    pminub        m25, m13, m17
614*c0909341SAndroid Build Coastguard Worker    pminub        m22, m10, m6
615*c0909341SAndroid Build Coastguard Worker    pminub        m26, m14, m18
616*c0909341SAndroid Build Coastguard Worker    pminub        m23, m11, m7
617*c0909341SAndroid Build Coastguard Worker    pminub        m27, m15, m19
618*c0909341SAndroid Build Coastguard Worker    pmaxub         m8, m4
619*c0909341SAndroid Build Coastguard Worker    pmaxub        m12, m16
620*c0909341SAndroid Build Coastguard Worker    pmaxub         m9, m5
621*c0909341SAndroid Build Coastguard Worker    pmaxub        m13, m17
622*c0909341SAndroid Build Coastguard Worker    pmaxub        m10, m6
623*c0909341SAndroid Build Coastguard Worker    pmaxub        m14, m18
624*c0909341SAndroid Build Coastguard Worker    pmaxub        m11, m7
625*c0909341SAndroid Build Coastguard Worker    pmaxub        m15, m19
626*c0909341SAndroid Build Coastguard Worker    pminub        m20, m24
627*c0909341SAndroid Build Coastguard Worker    pminub        m21, m25
628*c0909341SAndroid Build Coastguard Worker    pminub        m22, m26
629*c0909341SAndroid Build Coastguard Worker    pminub        m23, m27
630*c0909341SAndroid Build Coastguard Worker    pmaxub         m8, m12
631*c0909341SAndroid Build Coastguard Worker    pmaxub         m9, m13
632*c0909341SAndroid Build Coastguard Worker    pmaxub        m10, m14
633*c0909341SAndroid Build Coastguard Worker    pmaxub        m11, m15
634*c0909341SAndroid Build Coastguard Worker    mov           r2d, 0xAAAAAAAA
635*c0909341SAndroid Build Coastguard Worker    kmovd          k1, r2d
636*c0909341SAndroid Build Coastguard Worker    vpshrdd       m24,  m0,  m1, 16
637*c0909341SAndroid Build Coastguard Worker    vpshrdd       m25,  m2,  m3, 16
638*c0909341SAndroid Build Coastguard Worker    vpshrdd       m12, m20, m21, 16
639*c0909341SAndroid Build Coastguard Worker    vpshrdd       m14, m22, m23, 16
640*c0909341SAndroid Build Coastguard Worker    vpshrdd       m16,  m8,  m9, 16
641*c0909341SAndroid Build Coastguard Worker    vpshrdd       m18, m10, m11, 16
642*c0909341SAndroid Build Coastguard Worker    vpblendmw m13{k1}, m20, m21
643*c0909341SAndroid Build Coastguard Worker    vpblendmw m15{k1}, m22, m23
644*c0909341SAndroid Build Coastguard Worker    vpblendmw m17{k1},  m8, m9
645*c0909341SAndroid Build Coastguard Worker    vpblendmw m19{k1}, m10, m11
646*c0909341SAndroid Build Coastguard Worker    vpblendmw m20{k1},  m0, m24
647*c0909341SAndroid Build Coastguard Worker    vpblendmw m21{k1}, m24, m1
648*c0909341SAndroid Build Coastguard Worker    vpblendmw m22{k1},  m2, m25
649*c0909341SAndroid Build Coastguard Worker    vpblendmw m23{k1}, m25, m3
650*c0909341SAndroid Build Coastguard Worker    vpshrdd        m4, m5, 16
651*c0909341SAndroid Build Coastguard Worker    vpshrdd        m6, m7, 16
652*c0909341SAndroid Build Coastguard Worker    pminub        m12, m13
653*c0909341SAndroid Build Coastguard Worker    pminub        m14, m15
654*c0909341SAndroid Build Coastguard Worker    pmaxub        m16, m17
655*c0909341SAndroid Build Coastguard Worker    pmaxub        m18, m19
656*c0909341SAndroid Build Coastguard Worker    mova           m8, [base+end_perm_clip]
657*c0909341SAndroid Build Coastguard Worker    vpcmpw         k2, m20, m31, 1
658*c0909341SAndroid Build Coastguard Worker    vpcmpw         k3, m22, m31, 1
659*c0909341SAndroid Build Coastguard Worker    vpshldw        m4, m21, 8
660*c0909341SAndroid Build Coastguard Worker    vpshldw        m6, m23, 8
661*c0909341SAndroid Build Coastguard Worker    kunpckdq       k1, k1, k1
662*c0909341SAndroid Build Coastguard Worker    kxnorb         k4, k4, k4
663*c0909341SAndroid Build Coastguard Worker    vpshrdw       m11, m12, m14, 8
664*c0909341SAndroid Build Coastguard Worker    vpshrdw       m15, m16, m18, 8
665*c0909341SAndroid Build Coastguard Worker    vpblendmb m13{k1}, m12, m14
666*c0909341SAndroid Build Coastguard Worker    vpblendmb m17{k1}, m16, m18
667*c0909341SAndroid Build Coastguard Worker    psubw         m21, m31, m20
668*c0909341SAndroid Build Coastguard Worker    psubw         m23, m31, m22
669*c0909341SAndroid Build Coastguard Worker    paddusw        m0, m20, m4  ; clip >0xff
670*c0909341SAndroid Build Coastguard Worker    paddusw        m1, m22, m6
671*c0909341SAndroid Build Coastguard Worker    pminub        m11, m13
672*c0909341SAndroid Build Coastguard Worker    pmaxub        m15, m17
673*c0909341SAndroid Build Coastguard Worker    psubusw    m0{k2}, m4, m21  ; clip <0x00
674*c0909341SAndroid Build Coastguard Worker    psubusw    m1{k3}, m6, m23
675*c0909341SAndroid Build Coastguard Worker    psrlw          m0, 8
676*c0909341SAndroid Build Coastguard Worker    vmovdqu8   m0{k1}, m1
677*c0909341SAndroid Build Coastguard Worker    pmaxub         m0, m11
678*c0909341SAndroid Build Coastguard Worker    pminub         m0, m15
679*c0909341SAndroid Build Coastguard Worker    vpermb         m0, m8, m0
680*c0909341SAndroid Build Coastguard Worker    vextracti32x4 xm1, m0, 1
681*c0909341SAndroid Build Coastguard Worker    vextracti32x4 xm2, m0, 2
682*c0909341SAndroid Build Coastguard Worker    vextracti32x4 xm3, m0, 3
683*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
684*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
685*c0909341SAndroid Build Coastguard Worker    movq   [r10 +strideq*0], xm2
686*c0909341SAndroid Build Coastguard Worker    movq   [r10 +strideq*2], xm3
687*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
688*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r9       ], xm1
689*c0909341SAndroid Build Coastguard Worker    movhps [r10 +strideq*1], xm2
690*c0909341SAndroid Build Coastguard Worker    movhps [r10 +r9       ], xm3
691*c0909341SAndroid Build Coastguard Worker    RET
692*c0909341SAndroid Build Coastguard Worker.sec_only:
693*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, secm
694*c0909341SAndroid Build Coastguard Worker    call .sec
695*c0909341SAndroid Build Coastguard Worker.end_no_clip:
696*c0909341SAndroid Build Coastguard Worker    mova          xm8, [base+end_perm]
697*c0909341SAndroid Build Coastguard Worker    kxnorb         k1, k1, k1
698*c0909341SAndroid Build Coastguard Worker    vpshldd        m4, m0, 8  ; (px << 8) + ((sum > -8) << 4)
699*c0909341SAndroid Build Coastguard Worker    vpshldd        m5, m1, 8
700*c0909341SAndroid Build Coastguard Worker    vpshldd        m6, m2, 8
701*c0909341SAndroid Build Coastguard Worker    vpshldd        m7, m3, 8
702*c0909341SAndroid Build Coastguard Worker    paddw          m0, m4     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
703*c0909341SAndroid Build Coastguard Worker    paddw          m1, m5
704*c0909341SAndroid Build Coastguard Worker    paddw          m2, m6
705*c0909341SAndroid Build Coastguard Worker    paddw          m3, m7
706*c0909341SAndroid Build Coastguard Worker    vpermb         m0, m8, m0
707*c0909341SAndroid Build Coastguard Worker    vpermb         m1, m8, m1
708*c0909341SAndroid Build Coastguard Worker    vpermb         m2, m8, m2
709*c0909341SAndroid Build Coastguard Worker    vpermb         m3, m8, m3
710*c0909341SAndroid Build Coastguard Worker    punpckldq      m4, m0, m1
711*c0909341SAndroid Build Coastguard Worker    punpckhdq      m0, m1
712*c0909341SAndroid Build Coastguard Worker    punpckldq      m5, m2, m3
713*c0909341SAndroid Build Coastguard Worker    punpckhdq      m2, m3
714*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm4
715*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
716*c0909341SAndroid Build Coastguard Worker    movq   [r10 +strideq*0], xm5
717*c0909341SAndroid Build Coastguard Worker    movq   [r10 +strideq*2], xm2
718*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm4
719*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r9       ], xm0
720*c0909341SAndroid Build Coastguard Worker    movhps [r10 +strideq*1], xm5
721*c0909341SAndroid Build Coastguard Worker    movhps [r10 +r9       ], xm2
722*c0909341SAndroid Build Coastguard Worker    RET
723*c0909341SAndroid Build Coastguard Worker.mask_edges_sec_only:
724*c0909341SAndroid Build Coastguard Worker    movifnidn     t1d, secm
725*c0909341SAndroid Build Coastguard Worker    call .mask_edges_sec
726*c0909341SAndroid Build Coastguard Worker    jmp .end_no_clip
727*c0909341SAndroid Build Coastguard WorkerALIGN function_align
728*c0909341SAndroid Build Coastguard Worker.mask_edges:
729*c0909341SAndroid Build Coastguard Worker    mov           t0d, r6d
730*c0909341SAndroid Build Coastguard Worker    mov           t1d, r6d
731*c0909341SAndroid Build Coastguard Worker    or            t0d, 0xA ; top-left 4x4 has bottom and right
732*c0909341SAndroid Build Coastguard Worker    or            t1d, 0x9 ; top-right 4x4 has bottom and left
733*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m26, [base+edge_mask+t0*8]
734*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m27, [base+edge_mask+t1*8]
735*c0909341SAndroid Build Coastguard Worker    mov           t1d, r6d
736*c0909341SAndroid Build Coastguard Worker    or            r6d, 0x6 ; bottom-left 4x4 has top and right
737*c0909341SAndroid Build Coastguard Worker    or            t1d, 0x5 ; bottom-right 4x4 has top and left
738*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m28, [base+edge_mask+r6*8]
739*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m29, [base+edge_mask+t1*8]
740*c0909341SAndroid Build Coastguard Worker    mov           t0d, dirm
741*c0909341SAndroid Build Coastguard Worker    test         prid, prid
742*c0909341SAndroid Build Coastguard Worker    jz .mask_edges_sec_only
743*c0909341SAndroid Build Coastguard Worker    vpaddd        m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
744*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m26, m20 ; index in-range
745*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k2, m27, m20
746*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k3, m28, m20
747*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k4, m29, m20
748*c0909341SAndroid Build Coastguard Worker    mova           m8, m4
749*c0909341SAndroid Build Coastguard Worker    mova           m9, m5
750*c0909341SAndroid Build Coastguard Worker    mova          m10, m6
751*c0909341SAndroid Build Coastguard Worker    mova          m11, m7
752*c0909341SAndroid Build Coastguard Worker    vpermb     m8{k1}, m20, m12
753*c0909341SAndroid Build Coastguard Worker    vpermb     m9{k2}, m20, m13
754*c0909341SAndroid Build Coastguard Worker    vpermb    m10{k3}, m20, m14
755*c0909341SAndroid Build Coastguard Worker    vpermb    m11{k4}, m20, m15
756*c0909341SAndroid Build Coastguard Worker    mova   [rsp+0x00], m26
757*c0909341SAndroid Build Coastguard Worker    mova   [rsp+0x40], m27
758*c0909341SAndroid Build Coastguard Worker    mova   [rsp+0x80], m28
759*c0909341SAndroid Build Coastguard Worker    mova   [rsp+0xC0], m29
760*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_8x8_PRI
761*c0909341SAndroid Build Coastguard Worker    test          t1d, t1d
762*c0909341SAndroid Build Coastguard Worker    jz .end_no_clip
763*c0909341SAndroid Build Coastguard Worker    mova          m26, [rsp+0x00]
764*c0909341SAndroid Build Coastguard Worker    mova          m27, [rsp+0x40]
765*c0909341SAndroid Build Coastguard Worker    mova          m28, [rsp+0x80]
766*c0909341SAndroid Build Coastguard Worker    mova          m29, [rsp+0xC0]
767*c0909341SAndroid Build Coastguard Worker    call .mask_edges_sec
768*c0909341SAndroid Build Coastguard Worker    jmp .end_clip
769*c0909341SAndroid Build Coastguard Worker.mask_edges_sec:
    ; Edge-aware fetch of the secondary-tap neighbours for all four quadrants,
    ; then tail-jump into .sec_main. m26-m29 are the per-quadrant edge masks
    ; (tl, tr, bl, br); lanes not flagged in them are replaced by the centre
    ; pixel (m4-m7) instead of a looked-up neighbour.
    ; Out: m16-m19 = dir + 2 neighbours, m12-m15 = dir - 2 neighbours.
    ; The LUTs in m12-m15 are overwritten here. Also writes m20, m21, k1-k4.
770*c0909341SAndroid Build Coastguard Worker    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; indices for dir + 2 (same expression as .sec)
771*c0909341SAndroid Build Coastguard Worker    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; indices for dir - 2
772*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m26, m20 ; per-byte: mask bit selected by the index -> in-range flag
773*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k2, m27, m20
774*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k3, m28, m20
775*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k4, m29, m20
776*c0909341SAndroid Build Coastguard Worker    mova          m16, m4       ; default for masked-off lanes: the centre pixel
777*c0909341SAndroid Build Coastguard Worker    mova          m17, m5
778*c0909341SAndroid Build Coastguard Worker    mova          m18, m6
779*c0909341SAndroid Build Coastguard Worker    mova          m19, m7
780*c0909341SAndroid Build Coastguard Worker    vpermb    m16{k1}, m20, m12 ; merge-masked lookup: only flagged lanes are replaced
781*c0909341SAndroid Build Coastguard Worker    vpermb    m17{k2}, m20, m13
782*c0909341SAndroid Build Coastguard Worker    vpermb    m18{k3}, m20, m14
783*c0909341SAndroid Build Coastguard Worker    vpermb    m19{k4}, m20, m15
784*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k1, m26, m21 ; k1-k4 recomputed for the dir - 2 indices
785*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k2, m27, m21
786*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k3, m28, m21
787*c0909341SAndroid Build Coastguard Worker    vpshufbitqmb   k4, m29, m21
788*c0909341SAndroid Build Coastguard Worker    vpermb        m12, m21, m12 ; last use of the LUTs: look up in place, unmasked
789*c0909341SAndroid Build Coastguard Worker    vpermb        m13, m21, m13
790*c0909341SAndroid Build Coastguard Worker    vpermb        m14, m21, m14
791*c0909341SAndroid Build Coastguard Worker    vpermb        m15, m21, m15
792*c0909341SAndroid Build Coastguard Worker    vpblendmb m12{k1}, m4, m12  ; flagged lane ? looked-up neighbour : centre pixel
793*c0909341SAndroid Build Coastguard Worker    vpblendmb m13{k2}, m5, m13
794*c0909341SAndroid Build Coastguard Worker    vpblendmb m14{k3}, m6, m14
795*c0909341SAndroid Build Coastguard Worker    vpblendmb m15{k4}, m7, m15
796*c0909341SAndroid Build Coastguard Worker    jmp .sec_main
797*c0909341SAndroid Build Coastguard WorkerALIGN function_align ; function_align is defined outside this chunk
798*c0909341SAndroid Build Coastguard Worker.sec: ; unmasked secondary-tap gather: builds two index vectors, permutes m12-m15 with each, then falls through into .sec_main
799*c0909341SAndroid Build Coastguard Worker    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 (m20 = m30 + broadcast dword cdef_dirs[t0+4])
800*c0909341SAndroid Build Coastguard Worker    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 (m21 = m30 + broadcast dword cdef_dirs[t0+0])
801*c0909341SAndroid Build Coastguard Worker    vpermb        m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 (m16[i] = m12[m20[i]])
802*c0909341SAndroid Build Coastguard Worker    vpermb        m17, m20, m13 ; pNtr
803*c0909341SAndroid Build Coastguard Worker    vpermb        m18, m20, m14 ; pNbl
804*c0909341SAndroid Build Coastguard Worker    vpermb        m19, m20, m15 ; pNbr
805*c0909341SAndroid Build Coastguard Worker    vpermb        m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 (in-place: m12 is overwritten with its own gather)
806*c0909341SAndroid Build Coastguard Worker    vpermb        m13, m21, m13 ; pNtr
807*c0909341SAndroid Build Coastguard Worker    vpermb        m14, m21, m14 ; pNbl
808*c0909341SAndroid Build Coastguard Worker    vpermb        m15, m21, m15 ; pNbr
809*c0909341SAndroid Build Coastguard Worker.sec_main: ; shared tail of .sec/.mask_edges_sec: accumulates both gathered tap sets into m0-m3, then returns; clobbers k1-k4, m20-m30 and t1d
810*c0909341SAndroid Build Coastguard Worker%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants (%1-%4: gathered vectors paired with m4-m7; %5, default 0: nonzero = also load m28/m29/m30)
811*c0909341SAndroid Build Coastguard Worker    vpcmpub        k1, m4, %1, 6 ; predicate 6 = NLE: k1[i] = (m4[i] > %1[i]) as unsigned bytes
812*c0909341SAndroid Build Coastguard Worker    vpcmpub        k2, m5, %2, 6
813*c0909341SAndroid Build Coastguard Worker    vpcmpub        k3, m6, %3, 6
814*c0909341SAndroid Build Coastguard Worker    vpcmpub        k4, m7, %4, 6
815*c0909341SAndroid Build Coastguard Worker    psubb         m20, %1, m4 ; m20 = %1 - m4 (wrapping); correct magnitude wherever %1 >= m4
816*c0909341SAndroid Build Coastguard Worker    psubb         m21, %2, m5
817*c0909341SAndroid Build Coastguard Worker    psubb         m22, %3, m6
818*c0909341SAndroid Build Coastguard Worker    psubb         m23, %4, m7
819*c0909341SAndroid Build Coastguard Worker%if %5
820*c0909341SAndroid Build Coastguard Worker    vpbroadcastb  m28, t1d ; m28 = low byte of t1d in every lane; NOTE(review): presumably the secondary strength - set outside this chunk, confirm
821*c0909341SAndroid Build Coastguard Worker    lzcnt         t1d, t1d ; t1d = count of leading zero bits of t1d (destroys the original value)
822*c0909341SAndroid Build Coastguard Worker    vpbroadcastq  m29, [r3+t1*8] ; m29 = qword number t1 of the table at r3, used below as the gf2p8affineqb bit matrix
823*c0909341SAndroid Build Coastguard Worker%endif
824*c0909341SAndroid Build Coastguard Worker    vpsubb    m20{k1}, m4, %1 ; where m4 > %1, overwrite with m4 - %1, so m20 = |%1 - m4|
825*c0909341SAndroid Build Coastguard Worker    vpsubb    m21{k2}, m5, %2
826*c0909341SAndroid Build Coastguard Worker    vpsubb    m22{k3}, m6, %3
827*c0909341SAndroid Build Coastguard Worker    vpsubb    m23{k4}, m7, %4
828*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m24, m20, m29, 0 ; each byte of m20 is transformed by the 8x8 bit matrix in m29 (imm8 0 = no added constant); NOTE(review): presumably a per-byte right shift - the table is not visible here
829*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m25, m21, m29, 0
830*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m26, m22, m29, 0
831*c0909341SAndroid Build Coastguard Worker    gf2p8affineqb m27, m23, m29, 0
832*c0909341SAndroid Build Coastguard Worker%if %5
833*c0909341SAndroid Build Coastguard Worker    vpbroadcastd  m30, [base+sec_tap] ; m30 = dword at sec_tap in every dword lane; overwrites the m30 that .sec/.mask_edges_sec added to their indices
834*c0909341SAndroid Build Coastguard Worker%endif
835*c0909341SAndroid Build Coastguard Worker    psubusb       m24, m28, m24 ; m24 = m28 - m24 with unsigned saturation (floors at 0)
836*c0909341SAndroid Build Coastguard Worker    psubusb       m25, m28, m25
837*c0909341SAndroid Build Coastguard Worker    psubusb       m26, m28, m26
838*c0909341SAndroid Build Coastguard Worker    psubusb       m27, m28, m27
839*c0909341SAndroid Build Coastguard Worker    pminub        m20, m24 ; m20 = min(|diff|, saturated limit from the line group above)
840*c0909341SAndroid Build Coastguard Worker    pminub        m21, m25
841*c0909341SAndroid Build Coastguard Worker    pminub        m22, m26
842*c0909341SAndroid Build Coastguard Worker    pminub        m23, m27
843*c0909341SAndroid Build Coastguard Worker    mova          m24, m30 ; start each weight vector as a copy of the tap bytes in m30
844*c0909341SAndroid Build Coastguard Worker    mova          m25, m30
845*c0909341SAndroid Build Coastguard Worker    mova          m26, m30
846*c0909341SAndroid Build Coastguard Worker    mova          m27, m30
847*c0909341SAndroid Build Coastguard Worker    vpsubb    m24{k1}, m31, m30 ; where m4 > %1 the weight becomes m31 - m30; NOTE(review): m31 is set outside this chunk - if it is zero this negates the tap, confirm
848*c0909341SAndroid Build Coastguard Worker    vpsubb    m25{k2}, m31, m30
849*c0909341SAndroid Build Coastguard Worker    vpsubb    m26{k3}, m31, m30
850*c0909341SAndroid Build Coastguard Worker    vpsubb    m27{k4}, m31, m30
851*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m0, m20, m24 ; each dword of m0 += sum of 4 products (unsigned byte of m20 * signed byte of m24)
852*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m1, m21, m25
853*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m2, m22, m26
854*c0909341SAndroid Build Coastguard Worker    vpdpbusd       m3, m23, m27
855*c0909341SAndroid Build Coastguard Worker%endmacro
856*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 ; first tap set (m16-m19); %5 = 1 loads m28/m29/m30 and consumes t1d
857*c0909341SAndroid Build Coastguard Worker    CDEF_FILTER_8x8_SEC m12, m13, m14, m15 ; second tap set (m12-m15); reuses m28/m29/m30 from the first expansion
858*c0909341SAndroid Build Coastguard Worker    ret ; for .mask_edges_sec this returns to the visible call site; NOTE(review): .sec is presumably also entered via call - its callers are outside this chunk
859*c0909341SAndroid Build Coastguard Worker
860*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
861