; xref: /aosp_15_r20/external/libdav1d/src/x86/ipred16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2022-2024, VideoLAN and dav1d authors
; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Build configuration and the x86inc macro layer (cglobal, INIT_ZMM, RET,
; mova/movu, REPX, ... used throughout this file come from there).
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Everything below is only assembled for 64-bit x86 targets.
; NOTE(review): the matching %endif is not visible in this part of the file.
%if ARCH_X86_64

; ---------------------------------------------------------------------------
; Read-only constant tables.
; SECTION_RODATA is an x86inc macro; 64 is passed as its argument (presumably
; the section alignment -- confirm against x86inc.asm).
;
; Layout matters here: several constants are reached through aliases that
; point *into* other tables (see the pw_2/pw_3/pw_7/pw_0to31 defines that
; follow this section), and one run of dwords below is deliberately
; interleaved. Do not reorder or insert entries without updating those users.
; ---------------------------------------------------------------------------
SECTION_RODATA 64

; Byte-shuffle control. The predictors in this file load it with movsldup
; (duplicating the even dwords) and use the result as a pshufb mask.
ipred_shuf:    db 14, 15, 14, 15,  0,  1,  2,  3,  6,  7,  6,  7,  0,  1,  2,  3
               db 10, 11, 10, 11,  8,  9, 10, 11,  2,  3,  2,  3,  8,  9, 10, 11
               db 12, 13, 12, 13,  4,  5,  6,  7,  4,  5,  4,  5,  4,  5,  6,  7
               db  8,  9,  8,  9, 12, 13, 14, 15,  0,  1,  0,  1, 12, 13, 14, 15
smooth_perm:   db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51
               db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
               db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
               db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
; pw_31to0 must end with the word 0 directly in front of pw_1to32:
; pw_0to31 is defined as (pw_1to32-2) and relies on that adjacency.
pw_31to0:      dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
               dw 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
pw_1to32:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
               dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
z_upsample:    dw  0, -1,  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6
               dw  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
; z_xpos_mul doubles as the storage for pw_2, pw_3 and pw_7 (word pairs at
; byte offsets 8, 16 and 48 respectively).
z_xpos_mul:    dw  1,  1,  1,  1,  2,  2,  1,  1,  3,  3,  2,  2,  4,  4,  2,  2
               dw  5,  5,  3,  3,  6,  6,  3,  3,  7,  7,  4,  4,  8,  8,  4,  4
z_ypos_mul:    dw  0,  0,  0,  0,  1,  1,  0,  0,  2,  2,  1,  1,  3,  3,  1,  1
               dw  4,  4,  2,  2,  5,  5,  2,  2,  6,  6,  3,  3,  7,  7,  3,  3
z_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_xpos_off1a:  dw  30720,  30784,  30848,  30912,  30976,  31040,  31104,  31168
z_xpos_off1b:  dw  30720,  30848,  30976,  31104,  31232,  31360,  31488,  31616
filter_permA:  times 4 db  6,  7,  8,  9, 14, 15,  4,  5
               times 4 db 10, 11, 12, 13,  2,  3, -1, -1
filter_permB:  times 4 db 22, 23, 24, 25, 30, 31,  6,  7
               times 4 db 26, 27, 28, 29, 14, 15, -1, -1
; Interleaved run: viewed as qwords starting at filter_permC, the low dwords
; are 8, 10, 1, 11, 0, 9 (the "dq" view in the comment below). The high dword
; of each qword is reused to hold pw_1, filter_rnd, filter_shift and pd_65536,
; so these lines must stay in exactly this order.
filter_permC:          dd  8 ; dq  8, 10,  1, 11,  0,  9
pw_1:          times 2 dw  1
                       dd 10
filter_rnd:            dd 32
                       dd  1
                       dd  8
                       dd 11
filter_shift:  times 2 dw  6
                       dd  0
               times 2 dw  4
                       dd  9
pd_65536:              dd 65536
pal_unpack:    db  0,  8,  4, 12, 32, 40, 36, 44
               db 16, 24, 20, 28, 48, 56, 52, 60
z_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 79, 79, 79
z_filter_k:    dw  8,  8,  6,  6,  4,  4
               dw  4,  4,  5,  5,  4,  4
               dw  0,  0,  0,  0,  2,  2
; Broadcastable 32-bit constants: each entry is one dword holding the named
; value replicated as bytes (pb_*) or words (pw_*).
pb_90:         times 4 db 90
pw_15:         times 2 dw 15
pw_16:         times 2 dw 16
pw_17:         times 2 dw 17
pw_24:         times 2 dw 24
pw_31:         times 2 dw 31
pw_32:         times 2 dw 32
pw_63:         times 2 dw 63
pw_64:         times 2 dw 64
pw_512:        times 2 dw 512
pw_2048:       times 2 dw 2048
pw_31806:      times 2 dw 31806
pw_32640:      times 2 dw 32640
pw_32672:      times 2 dw 32672
pw_32704:      times 2 dw 32704
pw_32735:      times 2 dw 32735
pw_32736:      times 2 dw 32736

; Constants that alias existing table storage instead of occupying new bytes.
; Offsets are in bytes; z_xpos_mul is a table of words.
%define pw_2 (z_xpos_mul+4* 2)    ; words 4-5 of z_xpos_mul   = 2, 2
%define pw_3 (z_xpos_mul+4* 4)    ; words 8-9 of z_xpos_mul   = 3, 3
%define pw_7 (z_xpos_mul+4*12)    ; words 24-25 of z_xpos_mul = 7, 7
; Starts one word early, on the trailing 0 of pw_31to0, giving 0, 1, ..., 31.
%define pw_0to31 (pw_1to32-2)

; JMP_TABLE name, suffix, label...
;
; Emits one dword per label (arguments 3 and up). Each dword is the distance
; from the biased table base (%%table - 2*4) to the local label .<label> of the
; function mangle(private_prefix_<name>_<suffix>).
;
; The symbol <name>_<suffix>_table is defined as that same biased base, i.e.
; two entries *before* the first emitted dword. The callers in this file index
; the table with tzcnt(width); the first label they pass is w4 (tzcnt = 2), so
; index 2 lands on the first real entry and no "sub 2" is needed at run time.
; A caller adds the loaded dword back onto the table symbol's address to obtain
; the jump target.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1                  ; advance so %3 names the next label
    %endrep
%endmacro

; Per-function width dispatch tables. Each table holds entries for the local
; labels .w4, .w8, .w16, .w32 and .w64 of the named avx512icl function.
JMP_TABLE ipred_paeth_16bpc,      avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,     avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc,   avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc,         avx512icl, w4, w8, w16, w32, w64

; Tables defined outside this file (cextern is the x86inc wrapper for
; declaring an external, prefix-mangled symbol).
cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
cextern dr_intra_derivative
cextern filter_intra_taps

; Code starts here.
SECTION .text

; PAETH top, signed_ldiff, ldiff
;
; Per-word Paeth selection among left, top and topleft.
;
; Macro arguments are vector register *numbers*:
;   %1  top
;   %2  top - topleft            (signed)
;   %3  |top - topleft|          (this equals |base - left|, hence "ldiff")
; Fixed register inputs:
;   m2  left
;   m3  topleft
; Output:
;   m0  selected predictor
; Clobbers: m1, k1, k2.
;
; With base = left + top - topleft, the result is:
;   left                                 if ldiff <= min(tdiff, tldiff)
;   top                                  else if tdiff <= tldiff
;   topleft                              otherwise
; where tdiff = |base - top| and tldiff = |base - topleft|.
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw               m0, m%2, m2 ; base = left + top - topleft
    psubw               m1, m0, m3  ; tldiff
    psubw               m0, m%1     ; tdiff
    pabsw               m1, m1
    pabsw               m0, m0
    pcmpgtw             k1, m0, m1  ; k1 = tdiff > tldiff
    pminsw              m0, m1      ; min(tdiff, tldiff)
    pcmpgtw             k2, m%3, m0 ; k2 = ldiff > min(tdiff, tldiff)
    vpblendmw       m0{k1}, m%1, m3 ; k1 ? topleft : top
    vpblendmw       m0{k2}, m2, m0  ; k2 ? (top/topleft pick) : left
%endmacro

;-----------------------------------------------------------------------------
; ipred_paeth_16bpc(dst, stride, tl, w, h)      [AVX-512, zmm registers]
;
; Paeth intra prediction on 16-bit pixels.
;   dst    output pointer, advanced by `stride` bytes per row
;   tl     pointer to the top-left edge pixel; the top row is read from
;          tl+2 upwards, the left column from decreasing addresses below tl
;   w      block width, dispatched via tzcnt(w) through the jump table
;          (entries exist for 4, 8, 16, 32 and 64)
;   h      block height (row counter)
;
; cglobal fields 3, 7, 10: per the usual x86inc convention these are the
; number of arguments loaded into registers, GPRs used and vector registers
; used (the code below touches m0-m9).
;
; Register roles inside the loops:
;   m2 left (per row), m3 topleft, m4 top, m5 top-topleft, m6 |top-topleft|
;   m7 pshufb control for spreading left pixels (w4/w8/w16), or the second
;      half of the top row (w64, together with m8/m9)
;-----------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
%define base r6-ipred_paeth_16bpc_avx512icl_table
    lea                 r6, [ipred_paeth_16bpc_avx512icl_table]
    tzcnt               wd, wm      ; log2(w) is the table index
    movifnidn           hd, hm
    movsxd              wq, [r6+wq*4]
    vpbroadcastw        m3, [tlq]   ; topleft
    add                 wq, r6      ; table entries are relative to the table symbol
    jmp                 wq
; ---- w == 4: 8 rows per iteration, one row per 64-bit half-lane ----
.w4:
    vpbroadcastq        m4, [tlq+2] ; top
    movsldup            m7, [base+ipred_shuf] ; must read via r6 before r6 is reused below
    lea                 r6, [strideq*3]       ; r6 now holds stride*3
    psubw               m5, m4, m3
    pabsw               m6, m5
.w4_loop:
    sub                tlq, 16      ; step down 8 left pixels
    vbroadcasti32x4     m2, [tlq]
    pshufb              m2, m7      ; left
    PAETH                4, 5, 6
    vextracti32x4      xm1, m0, 2
    vextracti32x4      xm8, ym0, 1
    vextracti32x4      xm9, m0, 3
    ; first four rows come from the low qword of lanes 0, 2, 1, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm8
    movq   [dstq+r6       ], xm9
    sub                 hd, 8
    jl .w4_end                      ; h was 4: only the first four rows are valid
    lea               dstq, [dstq+strideq*4]
    ; next four rows come from the high qword of the same lanes
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm8
    movhps [dstq+r6       ], xm9
    lea               dstq, [dstq+strideq*4]
    jg .w4_loop                     ; flags still from `sub hd, 8` (lea/movhps leave them alone)
.w4_end:
    RET
; ---- w == 8: 4 rows per iteration, one row per 128-bit lane ----
.w8:
    vbroadcasti32x4     m4, [tlq+2] ; top
    movsldup            m7, [base+ipred_shuf] ; must read via r6 before r6 is reused below
    lea                 r6, [strideq*3]       ; r6 now holds stride*3
    psubw               m5, m4, m3
    pabsw               m6, m5
.w8_loop:
    sub                tlq, 8       ; step down 4 left pixels
    vpbroadcastq        m2, [tlq]
    pshufb              m2, m7      ; left
    PAETH                4, 5, 6
    ; rows 0..3 are stored from lanes 0, 2, 1, 3
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+r6       ], m0, 3
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w8_loop
    RET
; ---- w == 16: 2 rows per iteration, one row per 256-bit half ----
.w16:
    vbroadcasti32x8     m4, [tlq+2] ; top
    movsldup            m7, [base+ipred_shuf]
    psubw               m5, m4, m3
    pabsw               m6, m5
.w16_loop:
    sub                tlq, 4       ; step down 2 left pixels
    vpbroadcastd        m2, [tlq]
    pshufb              m2, m7      ; left
    PAETH                4, 5, 6
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w16_loop
    RET
; ---- w == 32: one full zmm row per iteration ----
.w32:
    movu                m4, [tlq+2] ; top
    psubw               m5, m4, m3
    pabsw               m6, m5
.w32_loop:
    sub                tlq, 2       ; step down 1 left pixel
    vpbroadcastw        m2, [tlq]   ; left
    PAETH                4, 5, 6
    mova            [dstq], m0
    add               dstq, strideq
    dec                 hd
    jg .w32_loop
    RET
; ---- w == 64: one row per iteration, computed as two zmm halves ----
.w64:
    movu                m4, [tlq+ 2] ; top, pixels  0..31
    movu                m7, [tlq+66] ; top, pixels 32..63
    psubw               m5, m4, m3
    psubw               m8, m7, m3
    pabsw               m6, m5
    pabsw               m9, m8
.w64_loop:
    sub                tlq, 2       ; step down 1 left pixel
    vpbroadcastw        m2, [tlq]   ; left
    PAETH                4, 5, 6
    mova       [dstq+64*0], m0
    PAETH                7, 8, 9
    mova       [dstq+64*1], m0
    add               dstq, strideq
    dec                 hd
    jg .w64_loop
    RET

;-----------------------------------------------------------------------------
; ipred_smooth_v_16bpc(dst, stride, tl, w, h)   [weights, stride3 are locals]
;
; Vertical smooth intra prediction on 16-bit pixels. Every output row y is
;   bottom + pmulhrsw(weight[y], top - bottom)
; where `top` is the row at tl+2, `bottom` is the single pixel h entries below
; tl, and weight[] comes from smooth_weights_1d_16bpc.
;
; Register roles:
;   r6        base pointer ($$) for position-independent table addressing
;   hq        negative row counter, runs from -h up to 0
;   weightsq  smooth_weights_1d_16bpc + h*4 bytes; indexed as
;             [weightsq+hq*2], so the first access is at word offset h of the
;             table and advances one word per row
;   m6        bottom (broadcast); m5 (and m4 for w64) top - bottom
;   m4        pshufb control that spreads per-row weights (w4/w8/w16)
;
; `base` stays defined as r6-$$ after this function and is reused by the code
; that follows it in the file.
;-----------------------------------------------------------------------------
cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
%define base r6-$$
    lea                  r6, [$$]
    tzcnt                wd, wm         ; log2(w) is the table index
    mov                  hd, hm         ; 32-bit move zero-extends into hq
    movsxd               wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg                  hq             ; count rows upwards from -h
    vpbroadcastw         m6, [tlq+hq*2] ; bottom
    lea                  wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
    lea            stride3q, [strideq*3]
    jmp                  wq
; ---- w == 4: 8 rows per iteration, one row per 64-bit half-lane ----
.w4:
    vpbroadcastq         m5, [tlq+2]    ; top
    movsldup             m4, [ipred_shuf]
    psubw                m5, m6         ; top - bottom
.w4_loop:
    vbroadcasti32x4      m3, [weightsq+hq*2] ; 8 row weights
    pshufb               m3, m4
    pmulhrsw             m3, m5
    paddw                m3, m6
    vextracti32x4       xm0, m3, 3
    vextracti32x4       xm1, ym3, 1
    vextracti32x4       xm2, m3, 2
    ; first four rows come from the high qword of lanes 3, 1, 2, 0
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    add                  hq, 8
    jg .end                             ; h was 4: only the first four rows are valid
    lea                dstq, [dstq+strideq*4]
    ; next four rows come from the low qword of the same lanes
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    jl .w4_loop                         ; flags still from `add hq, 8` (lea/movq leave them alone)
.end:
    RET
; ---- w == 8: 4 rows per iteration, one row per 128-bit lane ----
.w8:
    vbroadcasti32x4      m5, [tlq+2]    ; top
    movsldup             m4, [ipred_shuf]
    psubw                m5, m6         ; top - bottom
.w8_loop:
    vpbroadcastq         m0, [weightsq+hq*2] ; 4 row weights
    pshufb               m0, m4
    pmulhrsw             m0, m5
    paddw                m0, m6
    ; rows 0..3 are stored from lanes 3, 1, 2, 0
    vextracti32x4 [dstq+strideq*0], m0, 3
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    mova          [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
; ---- w == 16: 4 rows per iteration, two registers of two rows each ----
.w16:
    vbroadcasti32x8      m5, [tlq+2]    ; top
    movsldup             m4, [ipred_shuf]
    psubw                m5, m6         ; top - bottom
.w16_loop:
    vpbroadcastd         m0, [weightsq+hq*2+0] ; weights for rows 0-1
    vpbroadcastd         m1, [weightsq+hq*2+4] ; weights for rows 2-3
    pshufb               m0, m4
    pshufb               m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    paddw                m0, m6
    paddw                m1, m6
    ; within each register the upper 256 bits hold the earlier row
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova          [dstq+strideq*1], ym0
    vextracti32x8 [dstq+strideq*2], m1, 1
    mova          [dstq+stride3q ], ym1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w16_loop
    RET
; ---- w == 32: 4 rows per iteration, one zmm per row ----
.w32:
    movu                 m5, [tlq+2]    ; top
    psubw                m5, m6         ; top - bottom
.w32_loop:
    vpbroadcastw         m0, [weightsq+hq*2+0]
    vpbroadcastw         m1, [weightsq+hq*2+2]
    vpbroadcastw         m2, [weightsq+hq*2+4]
    vpbroadcastw         m3, [weightsq+hq*2+6]
    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX   {paddw    x, m6}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w32_loop
    RET
; ---- w == 64: 2 rows per iteration, two zmm halves per row ----
.w64:
    movu                 m4, [tlq+ 2]   ; top, pixels  0..31
    movu                 m5, [tlq+66]   ; top, pixels 32..63
    psubw                m4, m6         ; top - bottom
    psubw                m5, m6
.w64_loop:
    vpbroadcastw         m1, [weightsq+hq*2+0] ; weight for row 0
    vpbroadcastw         m3, [weightsq+hq*2+2] ; weight for row 1
    pmulhrsw             m0, m4, m1     ; row 0, left half (m1 still holds the weight)
    pmulhrsw             m1, m5         ; row 0, right half (overwrites the weight)
    pmulhrsw             m2, m4, m3     ; row 1, left half
    pmulhrsw             m3, m5         ; row 1, right half
    REPX      {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    mova [dstq+strideq*1+64*0], m2
    mova [dstq+strideq*1+64*1], m3
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w64_loop
    RET

367*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
368*c0909341SAndroid Build Coastguard Worker    lea                  r6, [$$]
369*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
370*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
371*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+wq*2] ; right
372*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
373*c0909341SAndroid Build Coastguard Worker    add                  hd, hd
374*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
375*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
376*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
377*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
378*c0909341SAndroid Build Coastguard Worker    jmp                  wq
379*c0909341SAndroid Build Coastguard Worker.w4:
380*c0909341SAndroid Build Coastguard Worker    movsldup             m4, [base+ipred_shuf]
381*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [base+smooth_weights_1d_16bpc+4*2]
382*c0909341SAndroid Build Coastguard Worker.w4_loop:
383*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [tlq+hq-16] ; left
384*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
385*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6          ; left - right
386*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
387*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
388*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m0, 2
389*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
390*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m0, 3
391*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
392*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
393*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
394*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm3
395*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8*2
396*c0909341SAndroid Build Coastguard Worker    jl .end
397*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
398*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
399*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
400*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
401*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm3
402*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
403*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
404*c0909341SAndroid Build Coastguard Worker.end:
405*c0909341SAndroid Build Coastguard Worker    RET
406*c0909341SAndroid Build Coastguard Worker.w8:
407*c0909341SAndroid Build Coastguard Worker    movsldup             m4, [base+ipred_shuf]
408*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [base+smooth_weights_1d_16bpc+8*2]
409*c0909341SAndroid Build Coastguard Worker.w8_loop:
410*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [tlq+hq-8] ; left
411*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
412*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6         ; left - right
413*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
414*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
415*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
416*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], m0, 2
417*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], ym0, 1
418*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
419*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
420*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4*2
421*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
422*c0909341SAndroid Build Coastguard Worker    RET
423*c0909341SAndroid Build Coastguard Worker.w16:
424*c0909341SAndroid Build Coastguard Worker    movsldup             m4, [base+ipred_shuf]
425*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m5, [base+smooth_weights_1d_16bpc+16*2]
426*c0909341SAndroid Build Coastguard Worker.w16_loop:
427*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-4]
428*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [tlq+hq-8]
429*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
430*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
431*c0909341SAndroid Build Coastguard Worker    psubw                m0, m6
432*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
433*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
434*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
435*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
436*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
437*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
438*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
439*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
440*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
441*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
442*c0909341SAndroid Build Coastguard Worker    sub                  hq, 4*2
443*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
444*c0909341SAndroid Build Coastguard Worker    RET
445*c0909341SAndroid Build Coastguard Worker.w32:
446*c0909341SAndroid Build Coastguard Worker    movu                 m5, [base+smooth_weights_1d_16bpc+32*2]
447*c0909341SAndroid Build Coastguard Worker.w32_loop:
448*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [tlq+hq-8]
449*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3
450*c0909341SAndroid Build Coastguard Worker    psubw                m3, m6
451*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q3333
452*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q2222
453*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1111
454*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q0000
455*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
456*c0909341SAndroid Build Coastguard Worker    REPX   {paddw    x, m6}, m0, m1, m2, m3
457*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
458*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
459*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
460*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
461*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
462*c0909341SAndroid Build Coastguard Worker    sub                  hq, 4*2
463*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
464*c0909341SAndroid Build Coastguard Worker    RET
465*c0909341SAndroid Build Coastguard Worker.w64:
466*c0909341SAndroid Build Coastguard Worker    movu                 m4, [base+smooth_weights_1d_16bpc+64*2]
467*c0909341SAndroid Build Coastguard Worker    movu                 m5, [base+smooth_weights_1d_16bpc+64*3]
468*c0909341SAndroid Build Coastguard Worker.w64_loop:
469*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [tlq+hq-2]
470*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [tlq+hq-4]
471*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6
472*c0909341SAndroid Build Coastguard Worker    psubw                m3, m6
473*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4, m1
474*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
475*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, m3
476*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
477*c0909341SAndroid Build Coastguard Worker    REPX      {paddw x, m6}, m0, m1, m2, m3
478*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*0], m0
479*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+64*1], m1
480*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*0], m2
481*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+64*1], m3
482*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
483*c0909341SAndroid Build Coastguard Worker    sub                  hq, 2*2
484*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
485*c0909341SAndroid Build Coastguard Worker    RET
486*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; ipred_smooth_16bpc: 2-D "smooth" intra prediction, 16-bit pixels.
;
; Named operands (x86inc cglobal: 3 args preloaded, 7 GPRs, 16 vector regs):
;   dst, stride - output block and its row pitch in bytes
;   tl          - edge pixel array: top row is read from tl+2 upwards,
;                 left column from tl-2 downwards, right = tl[w],
;                 bottom = tl[-h] (all 16-bit elements)
;   w, h        - block size; tzcnt(w) indexes the jump table that selects
;                 one of the .w4/.w8/.w16/.w32/.w64 paths
;
; Every output pixel is accumulated as one dword from two word-pair dot
; products:
;   (top, bottom) . vertical weight pair     (pmaddwd)
; + (left, right) . horizontal weight pair   (vpdpwssd)
; The dword sums are then narrowed with the smooth_perm byte permute and
; halved with rounding via pavgw against zero.
;
; Register roles after the common setup:
;   m13 = right, m0 = bottom (consumed when building the top vectors),
;   m14 = smooth_perm, m15 = 0,
;   k1  = 0x55555555 (even word lanes),
;   k2  = 0x3333... (bytes 0-1 of every dword, i.e. its low word),
;   hq  = h*2 (remaining rows in bytes of left-edge data),
;   tlq = tl - h*2, so [tlq+hq] always addresses tl[0] at loop entry.
; NOTE(review): `base` is defined outside this chunk; presumably it is
; relative to r6 loaded from $$ below -- confirm.
;-----------------------------------------------------------------------
487*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
488*c0909341SAndroid Build Coastguard Worker    lea                 r6, [$$]
489*c0909341SAndroid Build Coastguard Worker    mov                 wd, wm
490*c0909341SAndroid Build Coastguard Worker    movifnidn           hd, hm
491*c0909341SAndroid Build Coastguard Worker    vpbroadcastw       m13, [tlq+wq*2]   ; right
; w becomes log2(w) and then the jump-table entry for this width
492*c0909341SAndroid Build Coastguard Worker    tzcnt               wd, wd
; h is kept doubled from here on (2 bytes per edge pixel)
493*c0909341SAndroid Build Coastguard Worker    add                 hd, hd
494*c0909341SAndroid Build Coastguard Worker    movsxd              wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
495*c0909341SAndroid Build Coastguard Worker    mov                r5d, 0x55555555
; tlq now points at tl[-h], the bottom-left edge pixel
496*c0909341SAndroid Build Coastguard Worker    sub                tlq, hq
497*c0909341SAndroid Build Coastguard Worker    mova               m14, [base+smooth_perm]
498*c0909341SAndroid Build Coastguard Worker    kmovd               k1, r5d
499*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m0, [tlq]        ; bottom
500*c0909341SAndroid Build Coastguard Worker    mov                 r5, 0x3333333333333333
501*c0909341SAndroid Build Coastguard Worker    pxor               m15, m15
502*c0909341SAndroid Build Coastguard Worker    lea                 wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
503*c0909341SAndroid Build Coastguard Worker    kmovq               k2, r5
; vertical weights start h*4 bytes into the table (hq is already h*2);
; each row consumes one 4-byte weight pair
504*c0909341SAndroid Build Coastguard Worker    lea         v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
505*c0909341SAndroid Build Coastguard Worker    jmp                 wq
; ---- w == 4: four rows per iteration, one row per 128-bit lane ----
; NOTE(review): ipred_shuf contents are not visible here; m3/m4 are its
; odd/even dword duplicates and are assumed to spread per-row weights and
; left pixels across the lanes -- confirm against the table definition.
505*c0909341SAndroid Build Coastguard Worker.w4:
506*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m5, [tlq+hq+2]
507*c0909341SAndroid Build Coastguard Worker    movshdup            m3, [base+ipred_shuf]
508*c0909341SAndroid Build Coastguard Worker    movsldup            m4, [base+ipred_shuf]
; horizontal weight pairs for this width live at offset w*4 in the table
509*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m6, [base+smooth_weights_2d_16bpc+4*4]
510*c0909341SAndroid Build Coastguard Worker    lea           stride3q, [strideq*3]
511*c0909341SAndroid Build Coastguard Worker    punpcklwd           m5, m0           ; top, bottom
512*c0909341SAndroid Build Coastguard Worker.w4_loop:
; 16 bytes of vertical weights = pairs for the next four rows
513*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m0, [v_weightsq]
; the four left-edge pixels belonging to those rows
514*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m2, [tlq+hq-8]
515*c0909341SAndroid Build Coastguard Worker    mova                m1, m13
516*c0909341SAndroid Build Coastguard Worker    pshufb              m0, m3
517*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m5
; k2 merge: low word of each dword <- shuffled left, high word stays right
518*c0909341SAndroid Build Coastguard Worker    pshufb          m1{k2}, m2, m4       ; left, right
519*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m0, m1, m6
; narrow the dword sums to words, then (x + 1) >> 1 via average with zero
520*c0909341SAndroid Build Coastguard Worker    vpermb              m0, m14, m0
521*c0909341SAndroid Build Coastguard Worker    pavgw              ym0, ym15
522*c0909341SAndroid Build Coastguard Worker    vextracti32x4      xm1, ym0, 1
; after the permute, rows 0/2 sit in xm0 (low/high qword), rows 1/3 in xm1
523*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
524*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
525*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
526*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
527*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*4]
528*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 4*4
; four rows done; h is counted in bytes (2 per row)
529*c0909341SAndroid Build Coastguard Worker    sub                 hd, 4*2
530*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
531*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 8: four rows per iteration, two accumulators (m0, m1) ----
532*c0909341SAndroid Build Coastguard Worker.w8:
533*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4    ym5, [tlq+hq+2]
534*c0909341SAndroid Build Coastguard Worker    movshdup            m6, [base+ipred_shuf]
535*c0909341SAndroid Build Coastguard Worker    movsldup            m7, [base+ipred_shuf]
; widen top pixels to dwords so the odd word of each dword is free
536*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m5, ym5
537*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8     m8, [base+smooth_weights_2d_16bpc+8*4]
538*c0909341SAndroid Build Coastguard Worker    lea           stride3q, [strideq*3]
; k1 keeps top in the even words and takes bottom (m0) in the odd words
539*c0909341SAndroid Build Coastguard Worker    vpblendmw       m5{k1}, m0, m5       ; top, bottom
540*c0909341SAndroid Build Coastguard Worker.w8_loop:
; two vertical weight pairs per accumulator, two left pixels per accumulator
541*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m0, [v_weightsq+0]
542*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m1, [v_weightsq+8]
543*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m3, [tlq+hq-4]
544*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m4, [tlq+hq-8]
545*c0909341SAndroid Build Coastguard Worker    pshufb              m0, m6
546*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m5
547*c0909341SAndroid Build Coastguard Worker    pshufb              m1, m6
548*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m5
549*c0909341SAndroid Build Coastguard Worker    mova                m2, m13
550*c0909341SAndroid Build Coastguard Worker    pshufb          m2{k2}, m3, m7       ; left, right
551*c0909341SAndroid Build Coastguard Worker    mova                m3, m13
552*c0909341SAndroid Build Coastguard Worker    pshufb          m3{k2}, m4, m7
553*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m0, m2, m8
554*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m1, m3, m8
555*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 4*4
; two-source byte permute packs both dword accumulators into one zmm of words
556*c0909341SAndroid Build Coastguard Worker    vpermt2b            m0, m14, m1
557*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m15
; one 16-byte row per 128-bit lane
558*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
559*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
560*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
561*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
562*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*4]
563*c0909341SAndroid Build Coastguard Worker    sub                 hd, 4*2
564*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
565*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 16: two rows per iteration, one accumulator per row ----
566*c0909341SAndroid Build Coastguard Worker.w16:
567*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m5, [tlq+hq+2]
568*c0909341SAndroid Build Coastguard Worker    mova                m6, [base+smooth_weights_2d_16bpc+16*4]
569*c0909341SAndroid Build Coastguard Worker    vpblendmw       m5{k1}, m0, m5       ; top, bottom
570*c0909341SAndroid Build Coastguard Worker.w16_loop:
; one vertical weight pair per row, broadcast to every column
571*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m0, [v_weightsq+0]
572*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m1, [v_weightsq+4]
573*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m5
574*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m5
; start from right in every word, then overwrite even words with this
; row's left pixel under k1
575*c0909341SAndroid Build Coastguard Worker    mova                m2, m13
576*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m2{k1}, [tlq+hq-2] ; left, right
577*c0909341SAndroid Build Coastguard Worker    mova                m3, m13
578*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m3{k1}, [tlq+hq-4]
579*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m0, m2, m6
580*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m1, m3, m6
581*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 2*4
582*c0909341SAndroid Build Coastguard Worker    vpermt2b            m0, m14, m1
583*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m15
; low 256 bits = first row, high 256 bits = second row
584*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
585*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
586*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*2]
587*c0909341SAndroid Build Coastguard Worker    sub                 hq, 2*2
588*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
589*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 32: two rows per iteration, two 16-column halves per row ----
590*c0909341SAndroid Build Coastguard Worker.w32:
; m5/m6 = top columns 0-15 / 16-31, m7/m8 = matching horizontal weights
591*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m5, [tlq+hq+ 2]
592*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m6, [tlq+hq+34]
593*c0909341SAndroid Build Coastguard Worker    mova                m7, [base+smooth_weights_2d_16bpc+32*4]
594*c0909341SAndroid Build Coastguard Worker    mova                m8, [base+smooth_weights_2d_16bpc+32*6]
595*c0909341SAndroid Build Coastguard Worker    vpblendmw       m5{k1}, m0, m5       ; top, bottom
596*c0909341SAndroid Build Coastguard Worker    vpblendmw       m6{k1}, m0, m6
597*c0909341SAndroid Build Coastguard Worker.w32_loop:
598*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m2, [v_weightsq+0]
599*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m3, [v_weightsq+4]
; m0/m2 = first row (left/right half), m1/m3 = second row
600*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m5, m2
601*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m6
602*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m5, m3
603*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m6
; m4 is rebuilt per row: right everywhere, left merged into even words
604*c0909341SAndroid Build Coastguard Worker    mova                m4, m13
605*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
606*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m0, m4, m7
607*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m2, m4, m8
608*c0909341SAndroid Build Coastguard Worker    mova                m4, m13
609*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m4{k1}, [tlq+hq-4]
610*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m1, m4, m7
611*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m3, m4, m8
612*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 2*4
; pack each row's two dword halves into a single 32-pixel zmm
613*c0909341SAndroid Build Coastguard Worker    vpermt2b            m0, m14, m2
614*c0909341SAndroid Build Coastguard Worker    vpermt2b            m1, m14, m3
615*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m15
616*c0909341SAndroid Build Coastguard Worker    pavgw               m1, m15
617*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq*0], m0
618*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq*1], m1
619*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*2]
620*c0909341SAndroid Build Coastguard Worker    sub                 hq, 2*2
621*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
622*c0909341SAndroid Build Coastguard Worker    RET
; ---- w == 64: one row per iteration, four 16-column quarters ----
623*c0909341SAndroid Build Coastguard Worker.w64:
; m5..m8 = top columns 0-15, 16-31, 32-47, 48-63
624*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m5, [tlq+hq+ 2]
625*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m6, [tlq+hq+34]
626*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m7, [tlq+hq+66]
627*c0909341SAndroid Build Coastguard Worker    pmovzxwd            m8, [tlq+hq+98]
; m9..m12 = horizontal weight pairs for the same four column groups
628*c0909341SAndroid Build Coastguard Worker    mova                m9, [base+smooth_weights_2d_16bpc+64*4]
629*c0909341SAndroid Build Coastguard Worker    vpblendmw       m5{k1}, m0, m5       ; top, bottom
630*c0909341SAndroid Build Coastguard Worker    mova               m10, [base+smooth_weights_2d_16bpc+64*5]
631*c0909341SAndroid Build Coastguard Worker    vpblendmw       m6{k1}, m0, m6
632*c0909341SAndroid Build Coastguard Worker    mova               m11, [base+smooth_weights_2d_16bpc+64*6]
633*c0909341SAndroid Build Coastguard Worker    vpblendmw       m7{k1}, m0, m7
634*c0909341SAndroid Build Coastguard Worker    mova               m12, [base+smooth_weights_2d_16bpc+64*7]
635*c0909341SAndroid Build Coastguard Worker    vpblendmw       m8{k1}, m0, m8
636*c0909341SAndroid Build Coastguard Worker.w64_loop:
637*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m3, [v_weightsq]
638*c0909341SAndroid Build Coastguard Worker    mova                m4, m13
639*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
; accumulators are paired (m0,m2) and (m1,m3) to match the packing below;
; m3 is consumed last because it also holds the broadcast weight pair
640*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m5, m3
641*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m6, m3
642*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m7, m3
643*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m8
644*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m0, m4, m9
645*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m2, m4, m10
646*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m1, m4, m11
647*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m3, m4, m12
648*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 1*4
649*c0909341SAndroid Build Coastguard Worker    vpermt2b            m0, m14, m2
650*c0909341SAndroid Build Coastguard Worker    vpermt2b            m1, m14, m3
651*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m15
652*c0909341SAndroid Build Coastguard Worker    pavgw               m1, m15
; one 128-byte row: columns 0-31 then 32-63
653*c0909341SAndroid Build Coastguard Worker    mova       [dstq+64*0], m0
654*c0909341SAndroid Build Coastguard Worker    mova       [dstq+64*1], m1
655*c0909341SAndroid Build Coastguard Worker    add               dstq, strideq
656*c0909341SAndroid Build Coastguard Worker    sub                 hd, 1*2
657*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
658*c0909341SAndroid Build Coastguard Worker    RET
659*c0909341SAndroid Build Coastguard Worker
660*c0909341SAndroid Build Coastguard Worker
; Pick the general-purpose register that backs the `t0` temporary used by
; the code that follows (e.g. `lea t0, [dr_intra_derivative]`): register
; index 4 when assembling for WIN64, index 8 on every other target.
; NOTE(review): the choice presumably avoids a register that would need
; saving or already holds an argument under each ABI -- confirm against
; the x86inc register tables.
661*c0909341SAndroid Build Coastguard Worker%if WIN64
662*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4
663*c0909341SAndroid Build Coastguard Worker%else
664*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8
665*c0909341SAndroid Build Coastguard Worker%endif
666*c0909341SAndroid Build Coastguard Worker
667*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
668*c0909341SAndroid Build Coastguard Worker%define base r7-z_filter_t0
669*c0909341SAndroid Build Coastguard Worker    lea                  r7, [z_filter_t0]
670*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
671*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
672*c0909341SAndroid Build Coastguard Worker    lea                  t0, [dr_intra_derivative]
673*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4]
674*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2
675*c0909341SAndroid Build Coastguard Worker    mov                 dxd, angled
676*c0909341SAndroid Build Coastguard Worker    and                 dxd, 0x7e
677*c0909341SAndroid Build Coastguard Worker    add              angled, 165 ; ~90
678*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [t0+dxq]
679*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z1_16bpc_avx512icl_table+wq]
680*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
681*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x4ff ; d = 90 - angle
682*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_31806]
683*c0909341SAndroid Build Coastguard Worker    jmp                  wq
684*c0909341SAndroid Build Coastguard Worker.w4:
685*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, [tlq+14]
686*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m5, [tlq], 0
687*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
688*c0909341SAndroid Build Coastguard Worker    jae .w4_no_upsample
689*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-1024]
690*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 7
691*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
692*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
693*c0909341SAndroid Build Coastguard Worker    call .upsample_top
694*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+z_xpos_off1b]
695*c0909341SAndroid Build Coastguard Worker    jmp .w4_main2
696*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
697*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
698*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; !enable_intra_edge_filter
699*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
700*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, r3d
701*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm1, angled
702*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
703*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, xm0, [base+z_filter_wh]
704*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, xm1, [base+z_filter_t0+angleq*8]
705*c0909341SAndroid Build Coastguard Worker    kmovw               r5d, k1
706*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
707*c0909341SAndroid Build Coastguard Worker    jz .w4_main
708*c0909341SAndroid Build Coastguard Worker    call .w16_filter
709*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 9
710*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
711*c0909341SAndroid Build Coastguard Worker    cmovne              r3d, r2d
712*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, r3d
713*c0909341SAndroid Build Coastguard Worker    pminuw               m6, [base+pw_0to31]
714*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m6, m5
715*c0909341SAndroid Build Coastguard Worker.w4_main:
716*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+z_xpos_off1a]
717*c0909341SAndroid Build Coastguard Worker.w4_main2:
718*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [base+z_xpos_mul]
719*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dxd
720*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
721*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4
722*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m5, m5, q3321
723*c0909341SAndroid Build Coastguard Worker    psllw                m4, 3       ; dx*8
724*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m0      ; xpos
725*c0909341SAndroid Build Coastguard Worker    palignr              m6, m5, 2   ; top+1
726*c0909341SAndroid Build Coastguard Worker.w4_loop:
727*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 6   ; base_x
728*c0909341SAndroid Build Coastguard Worker    pand                 m2, m15, m3 ; frac
729*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m5  ; top[base_x]
730*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m6  ; top[base_x+1]
731*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
732*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
733*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
734*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
735*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
736*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
737*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
738*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
739*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm1
740*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
741*c0909341SAndroid Build Coastguard Worker    jl .w4_end
742*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, m0, 2
743*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m4      ; xpos += dx
744*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
745*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm0, m0, 3
746*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1
747*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1
748*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
749*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm0
750*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
751*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
752*c0909341SAndroid Build Coastguard Worker.w4_end:
753*c0909341SAndroid Build Coastguard Worker    RET
754*c0909341SAndroid Build Coastguard Worker.upsample_top:
755*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m5, [tlq-16], 3
756*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_upsample]
757*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pd_65536]
758*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
759*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m3, m5
760*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
761*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m3, m5
762*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
763*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m3, m5
764*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
765*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m3, m5
766*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, r9m     ; pixel_max
767*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2      ; b+c
768*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3      ; a+d
769*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m0
770*c0909341SAndroid Build Coastguard Worker    psraw                m0, 3
771*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
772*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
773*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m2
774*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m2
775*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m0
776*c0909341SAndroid Build Coastguard Worker    ret
777*c0909341SAndroid Build Coastguard Worker.w8:
; .w8: 8-pixel-wide path. Prepares the top edge in m5 (either passed through
; .upsample_top, or index-clamped and optionally smoothed by .w16_filter),
; then interpolates four 8-pixel rows per .w8_loop iteration. r3d holds a
; limit in xpos units (base_x = xpos >> 6) that is counted down by dx; once
; it is used up the remaining rows are filled with the top 128-bit lane of m5.
; NOTE(review): m15, base, dx and the angle encoding are set up before this
; chunk, and pw_0to31 is taken to hold the words 0..31 (per its name) - confirm.
778*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+216]
779*c0909341SAndroid Build Coastguard Worker    movu                ym5, [tlq]  ; 16 edge words starting at tlq
780*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb  ; low byte := h; bits above it still hold angle+216
781*c0909341SAndroid Build Coastguard Worker    movu                m10, [base+pw_0to31]  ; word index vector
782*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8  ; unsigned: any bit above the low byte, or h > 8, takes the branch
783*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
784*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
785*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, r3d  ; h+7 in every word
786*c0909341SAndroid Build Coastguard Worker    add                 r3d, r3d  ; r3d = 2*(h+7), limit for this path
787*c0909341SAndroid Build Coastguard Worker    pminuw               m6, m10  ; indices clamped to h+7
788*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m6, m5  ; m5[i] = m5[m6[i]]
789*c0909341SAndroid Build Coastguard Worker    call .upsample_top  ; helper defined above this chunk
790*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [base+z_xpos_off1b]  ; xpos offset table used after .upsample_top
791*c0909341SAndroid Build Coastguard Worker    jmp .w8_main2
792*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
793*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
794*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym0, r3d  ; low byte of h+7, matched against z_filter_wh below
795*c0909341SAndroid Build Coastguard Worker    and                 r3d, 7
796*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8 ; imin(h+7, 15)
797*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, r3d
798*c0909341SAndroid Build Coastguard Worker    pminuw               m6, m10  ; indices clamped to r3d
799*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m6, m5  ; m5[i] = m5[m6[i]]
800*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400  ; NOTE(review): presumably the "edge filter disabled" flag - confirm in the prologue
801*c0909341SAndroid Build Coastguard Worker    jnz .w8_main  ; bit set: skip the filter decision entirely
802*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, angled  ; low byte of angle in every byte lane
803*c0909341SAndroid Build Coastguard Worker    shr              angled, 8  ; remaining bits select the z_filter_t0 entry
804*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, ym0, [base+z_filter_wh]  ; k1 = z_filter_wh bytes equal to the h+7 byte
805*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_t0+angleq*8]  ; 16 bytes at z_filter_t0 + angle*8
806*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, ym1, ym0  ; keep k1 bits where angle byte > table byte (signed)
807*c0909341SAndroid Build Coastguard Worker    kmovd               r5d, k1
808*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
809*c0909341SAndroid Build Coastguard Worker    jz .w8_main  ; no bits set: leave the edge unfiltered
810*c0909341SAndroid Build Coastguard Worker    call .w16_filter  ; smooths m5; kernel chosen there from popcnt(r5d)
811*c0909341SAndroid Build Coastguard Worker    cmp                  hd, r3d
812*c0909341SAndroid Build Coastguard Worker    jl .w8_filter_end  ; h < r3d: keep the index clamp already in m6
813*c0909341SAndroid Build Coastguard Worker    pminud               m6, m10, [base+pw_17] {1to16}  ; otherwise clamp the index vector against pw_17
814*c0909341SAndroid Build Coastguard Worker    add                 r3d, 2  ; and raise the limit in r3d by 2
815*c0909341SAndroid Build Coastguard Worker.w8_filter_end:
816*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m6, m5  ; re-apply the index clamp to the filtered edge
817*c0909341SAndroid Build Coastguard Worker.w8_main:
818*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [base+z_xpos_off1a]  ; xpos offset table for the non-upsampled edge
819*c0909341SAndroid Build Coastguard Worker.w8_main2:
820*c0909341SAndroid Build Coastguard Worker    movshdup             m3, [base+z_xpos_mul]  ; odd dwords of z_xpos_mul, each duplicated
821*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dxd  ; dx in every word
822*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6  ; limit scaled to xpos units
823*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]  ; offset of the fourth row
824*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m4  ; per-lane multiples of dx
825*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m5, m5, q3321  ; lanes rotated so palignr below can cross 128-bit lanes
826*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd
827*c0909341SAndroid Build Coastguard Worker    psllw                m4, 2       ; dx*4
828*c0909341SAndroid Build Coastguard Worker    shl                 dxd, 2  ; scalar dx*4, same step as m4
829*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m0      ; xpos
830*c0909341SAndroid Build Coastguard Worker    palignr              m6, m5, 2   ; top+1
831*c0909341SAndroid Build Coastguard Worker.w8_loop:
832*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 6   ; base_x
833*c0909341SAndroid Build Coastguard Worker    pand                 m2, m15, m3 ; frac
834*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m5  ; top[base_x]
835*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m6  ; top[base_x+1]
836*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; frac << 9: pmulhrsw then yields (diff*frac + 32) >> 6 (assuming it fits a signed word)
837*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0  ; diff = top[base_x+1] - top[base_x]
838*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
839*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1  ; interpolated pixels
840*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0  ; rows 0..3 = 128-bit lanes 0..3 of m0
841*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
842*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
843*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2       ], m0, 3
844*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
845*c0909341SAndroid Build Coastguard Worker    jz .w8_end  ; all rows written
846*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m4      ; xpos += dx*4 (four rows per iteration)
847*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
848*c0909341SAndroid Build Coastguard Worker    sub                 r3d, dxd  ; count the limit down by dx*4
849*c0909341SAndroid Build Coastguard Worker    jg .w8_loop  ; keep interpolating while the limit is still positive
850*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm5, m5, 3  ; fill value: top 128-bit lane of the edge vector
851*c0909341SAndroid Build Coastguard Worker.w8_end_loop:
; Fill the remaining rows with xm5, four rows at a time.
852*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm5
853*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm5
854*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm5
855*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2       ], xm5
856*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
857*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
858*c0909341SAndroid Build Coastguard Worker    jg .w8_end_loop
859*c0909341SAndroid Build Coastguard Worker.w8_end:
860*c0909341SAndroid Build Coastguard Worker    RET
861*c0909341SAndroid Build Coastguard Worker.w16_filter:
; .w16_filter: local subroutine (returns with plain ret), called from .w8 and
; .w16. Applies a symmetric 5-tap smoothing to the edge words in m5, in 16-bit
; arithmetic:
;   s     = c0*e[i] + c1*(e[i-1] + e[i+1]) + c2*(e[i-2] + e[i+2])
;   m5[i] = ((s >> 3) + 1) >> 1
; c0/c1/c2 are read from z_filter_k at entry popcnt(r5d)-1, 12 bytes apart.
; In:  m5 = edge words, m6 = words that follow m5, r5d = mask from the caller,
;      tlq (the word at tlq-2 stands in for both e[-1] and e[-2])
; Out: m5 = filtered edge
; Writes m0-m3, m7-m9 and r5d.
862*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [tlq-2]  ; word just before tlq in every lane
863*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d  ; number of mask bits set by the caller
864*c0909341SAndroid Build Coastguard Worker    valignq              m3, m6, m5, 2  ; m5 moved down 16 bytes, start of m6 shifted in on top
865*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k+(r5-1)*4+12*0]  ; c0 (centre tap)
866*c0909341SAndroid Build Coastguard Worker    valignq              m1, m5, m1, 6  ; m5 moved up 16 bytes, broadcast word shifted in at the bottom
867*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k+(r5-1)*4+12*1]  ; c1 (taps at +-1)
868*c0909341SAndroid Build Coastguard Worker    palignr              m2, m3, m5, 2  ; e[i+1]
869*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k+(r5-1)*4+12*2]  ; c2 (taps at +-2)
870*c0909341SAndroid Build Coastguard Worker    palignr              m0, m5, m1, 14  ; e[i-1]
871*c0909341SAndroid Build Coastguard Worker    pmullw               m7, m5  ; c0*e[i]
872*c0909341SAndroid Build Coastguard Worker    palignr              m3, m5, 4  ; e[i+2]
873*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2  ; e[i-1] + e[i+1]
874*c0909341SAndroid Build Coastguard Worker    palignr              m5, m1, 12  ; e[i-2]
875*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m8  ; c1*(e[i-1] + e[i+1])
876*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3  ; e[i-2] + e[i+2]
877*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m9  ; c2*(e[i-2] + e[i+2])
878*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1  ; zero, so pavgw below is a rounded halving
879*c0909341SAndroid Build Coastguard Worker    paddw                m0, m7
880*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0  ; s
881*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 3
882*c0909341SAndroid Build Coastguard Worker    pavgw                m5, m1  ; (x + 0 + 1) >> 1
883*c0909341SAndroid Build Coastguard Worker    ret
884*c0909341SAndroid Build Coastguard Worker.w16:
; .w16: 16-pixel-wide path. Builds an index-clamped 32-word top edge in m5 and
; a padding vector in m6, optionally smooths the edge with .w16_filter, spills
; both vectors to a 64-byte-aligned stack buffer, and interpolates two
; 16-pixel rows per .w16_loop iteration with unaligned loads from that buffer.
; r3d is the largest edge index, later scaled by 64 to compare against xpos.
; NOTE(review): m15, base, dx and the angle encoding are set up before this
; chunk, and pw_0to31 is taken to hold the words 0..31 (per its name) - confirm.
885*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
886*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym0, r3d  ; low byte of h+15, matched against z_filter_wh below
887*c0909341SAndroid Build Coastguard Worker    and                 r3d, 15
888*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; imin(h+15, 31)
889*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, r3d  ; largest index in every word
890*c0909341SAndroid Build Coastguard Worker    pminuw              m10, m11, [base+pw_0to31]  ; indices clamped to r3d
891*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+r3*2]  ; padding: edge word r3 in every lane
892*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m10, [tlq]  ; m5[i] = edge[m10[i]]
893*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400  ; NOTE(review): presumably the "edge filter disabled" flag - confirm in the prologue
894*c0909341SAndroid Build Coastguard Worker    jnz .w16_main  ; bit set: skip the filter decision entirely
895*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, angled  ; low byte of angle in every byte lane
896*c0909341SAndroid Build Coastguard Worker    shr              angled, 8  ; remaining bits select the z_filter_t0 entry
897*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, ym0, [base+z_filter_wh]  ; k1 = z_filter_wh bytes equal to the h+15 byte
898*c0909341SAndroid Build Coastguard Worker    mova                xm0, [base+z_filter_t0+angleq*8]  ; 16 bytes at z_filter_t0 + angle*8
899*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, ym1, ym0  ; keep k1 bits where angle byte > table byte (signed)
900*c0909341SAndroid Build Coastguard Worker    kmovd               r5d, k1
901*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
902*c0909341SAndroid Build Coastguard Worker    jz .w16_main  ; no bits set: leave the edge unfiltered
903*c0909341SAndroid Build Coastguard Worker    call .w16_filter  ; smooths m5; kernel chosen there from popcnt(r5d)
904*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
905*c0909341SAndroid Build Coastguard Worker    jg .w16_filter_h32  ; h > 16 takes the separate padding computation below
906*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m11, m5  ; padding: filtered word r3 in every lane
907*c0909341SAndroid Build Coastguard Worker    vpermw               m5, m10, m5  ; re-apply the index clamp to the filtered edge
908*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
909*c0909341SAndroid Build Coastguard Worker.w16_filter_h32:
; Compute (e[30] + 7*e[31] + 4) >> 3 from the unfiltered edge in memory and
; place it in word 0 of m6; the other words of m6 are left unchanged.
910*c0909341SAndroid Build Coastguard Worker    movzx               r3d, word [tlq+62]  ; e[31]
911*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq+60]  ; e[30]
912*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4]  ; e[30] + 8*e[31] + 4
913*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d  ; e[30] + 7*e[31] + 4
914*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 1  ; mask with only bit 0 set
915*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
916*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r3d
917*c0909341SAndroid Build Coastguard Worker    movd                xm0, r2d
918*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32  ; r3d = 33, the largest index on this path
919*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m6{k1}, m0  ; merge-masked: only word 0 of m6 is replaced
920*c0909341SAndroid Build Coastguard Worker.w16_main:
921*c0909341SAndroid Build Coastguard Worker    rorx                r2d, dxd, 23  ; rotate right by 23 == left by 9: low word = dx << 9
922*c0909341SAndroid Build Coastguard Worker    mov                  r7, rsp  ; r7 keeps the incoming stack pointer
923*c0909341SAndroid Build Coastguard Worker    and                 rsp, ~63  ; 64-byte align for the mova spills below
924*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, r2d  ; dx << 9 (mod 2^16) in every word
925*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64*2  ; 128-byte edge buffer
926*c0909341SAndroid Build Coastguard Worker    mov                 r2d, dxd  ; xpos of the first row
927*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3, m3  ; step for two rows
928*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*0], m5  ; buffer words 0..31: edge
929*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, ym4, 1  ; upper half (second row) is one dx ahead of the lower half
930*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*1], m6  ; buffer words 32..63: padding
931*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6  ; largest index scaled to xpos units
932*c0909341SAndroid Build Coastguard Worker.w16_loop:
933*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r2+dxq]  ; xpos of the second row
934*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6  ; base_x of the first row
935*c0909341SAndroid Build Coastguard Worker    movu                ym0, [rsp+r2*2]  ; row 0: edge[base_x]
936*c0909341SAndroid Build Coastguard Worker    movu                ym1, [rsp+r2*2+2]  ; row 0: edge[base_x+1]
937*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r5+dxq]  ; xpos of the first row of the next iteration
938*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6  ; base_x of the second row
939*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [rsp+r5*2], 1  ; row 1: edge[base_x]
940*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [rsp+r5*2+2], 1  ; row 1: edge[base_x+1]
941*c0909341SAndroid Build Coastguard Worker    pand                 m2, m15, m3 ; frac << 9
942*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0  ; diff = edge[base_x+1] - edge[base_x]
943*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2  ; (diff*frac + 32) >> 6 when m2 = frac << 9
944*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1  ; interpolated pixels
945*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0  ; row 0 = low 256 bits
946*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1  ; row 1 = high 256 bits
947*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
948*c0909341SAndroid Build Coastguard Worker    jz .w16_end  ; all rows written
949*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4  ; advance both rows' fraction by two dx steps
950*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
951*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r3d
952*c0909341SAndroid Build Coastguard Worker    jl .w16_loop  ; keep interpolating while the next xpos is below the limit (signed)
953*c0909341SAndroid Build Coastguard Worker    punpckhqdq          ym6, ym6  ; fill value from the high qword of each lane, so word 0 (replaced on the h32 path) is not used
954*c0909341SAndroid Build Coastguard Worker.w16_end_loop:
; Fill the remaining rows with ym6, two rows at a time.
955*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], ym6
956*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], ym6
957*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
958*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
959*c0909341SAndroid Build Coastguard Worker    jg .w16_end_loop
960*c0909341SAndroid Build Coastguard Worker.w16_end:
961*c0909341SAndroid Build Coastguard Worker    mov                 rsp, r7  ; restore the stack pointer saved at .w16_main
962*c0909341SAndroid Build Coastguard Worker    RET
963*c0909341SAndroid Build Coastguard Worker.w32:
; .w32: 32-pixel-wide path. Keeps the top edge as m7 (words 0..31), m8 (words
; 32..63, index-clamped) and m9 (padding). Unless bit 0x400 of angle is set,
; both edge vectors are smoothed inline with
;   (e[i-1] + e[i] + e[i+1] + ((e[i-2] + e[i+2] + 4) >> 1)) >> 2
; The vectors are then spilled to a 64-byte-aligned stack buffer and two
; 32-pixel rows are interpolated per .w32_loop iteration.
; NOTE(review): m15, base, dx and the angle encoding are set up before this
; chunk, and pw_0to31 is taken to hold the words 0..31 (per its name) - confirm.
964*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+31]
965*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq+64*0]  ; edge words 0..31
966*c0909341SAndroid Build Coastguard Worker    and                 r3d, 31
967*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, r3d  ; (h+31) & 31: largest index within the second vector
968*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32 ; imin(h+31, 63)
969*c0909341SAndroid Build Coastguard Worker    pminuw              m10, m11, [base+pw_0to31]  ; indices clamped to m11
970*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [tlq+r3*2]  ; padding: edge word r3 in every lane
971*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m10, [tlq+64*1]  ; m8[i] = edge[32 + m10[i]]
972*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400  ; NOTE(review): presumably the "edge filter disabled" flag - confirm in the prologue
973*c0909341SAndroid Build Coastguard Worker    jnz .w32_main  ; bit set: use the edge unfiltered
974*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_3]  ; rounding term added before pavgw
975*c0909341SAndroid Build Coastguard Worker    mov                 r5d, ~1
976*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-2]  ; first vector: e[i-1]
977*c0909341SAndroid Build Coastguard Worker    kmovd                k1, r5d  ; k1 = every word except word 0
978*c0909341SAndroid Build Coastguard Worker    valignq              m2, m8, m7, 6  ; m8 moved up 16 bytes, end of m7 shifted in at the bottom
979*c0909341SAndroid Build Coastguard Worker    paddw                m7, m3  ; e[i] + e[i-1]
980*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m3{k1}, [tlq-4]  ; e[i-2] for i >= 1; word 0 keeps the word at tlq-2
981*c0909341SAndroid Build Coastguard Worker    valignq              m4, m9, m8, 2  ; m8 moved down 16 bytes, start of m9 shifted in on top
982*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5  ; e[i-2] + 3
983*c0909341SAndroid Build Coastguard Worker    paddw                m7, [tlq+2]  ; + e[i+1]
984*c0909341SAndroid Build Coastguard Worker    palignr              m1, m8, m2, 14  ; second vector: e[i-1]
985*c0909341SAndroid Build Coastguard Worker    pavgw                m3, [tlq+4]  ; (e[i-2] + e[i+2] + 4) >> 1
986*c0909341SAndroid Build Coastguard Worker    palignr              m2, m8, m2, 12  ; second vector: e[i-2]
987*c0909341SAndroid Build Coastguard Worker    paddw                m7, m3
988*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m8, 2  ; second vector: e[i+1]
989*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 2  ; first vector filtered
990*c0909341SAndroid Build Coastguard Worker    palignr              m4, m8, 4  ; second vector: e[i+2]
991*c0909341SAndroid Build Coastguard Worker    paddw                m8, m1  ; e[i] + e[i-1]
992*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5  ; e[i-2] + 3
993*c0909341SAndroid Build Coastguard Worker    paddw                m8, m3  ; + e[i+1]
994*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m4  ; (e[i-2] + e[i+2] + 4) >> 1
995*c0909341SAndroid Build Coastguard Worker    paddw                m8, m2
996*c0909341SAndroid Build Coastguard Worker    psrlw                m8, 2  ; second vector filtered
997*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
998*c0909341SAndroid Build Coastguard Worker    je .w32_filter_h64  ; h == 64 takes the separate padding computation below
999*c0909341SAndroid Build Coastguard Worker    vpermw               m9, m11, m8  ; padding: filtered word m11 in every lane
1000*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m10, m8  ; re-apply the index clamp to the filtered vector
1001*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
1002*c0909341SAndroid Build Coastguard Worker.w32_filter_h64:
; Compute (e[62] + 7*e[63] + 4) >> 3 from the unfiltered edge in memory and
; place it in word 0 of m9; the other words of m9 are left unchanged.
1003*c0909341SAndroid Build Coastguard Worker    movzx               r3d, word [tlq+126]  ; e[63]
1004*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq+124]  ; e[62]
1005*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4]  ; e[62] + 8*e[63] + 4
1006*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d  ; e[62] + 7*e[63] + 4
1007*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 65  ; largest index on this path
1008*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
1009*c0909341SAndroid Build Coastguard Worker    movd                xm0, r2d
1010*c0909341SAndroid Build Coastguard Worker    vpblendmw        m9{k1}, m0, m9  ; k1 has bit 0 clear: word 0 comes from m0, the rest from m9
1011*c0909341SAndroid Build Coastguard Worker.w32_main:
1012*c0909341SAndroid Build Coastguard Worker    rorx                r2d, dxd, 23  ; rotate right by 23 == left by 9: low word = dx << 9
1013*c0909341SAndroid Build Coastguard Worker    mov                  r7, rsp  ; r7 keeps the incoming stack pointer
1014*c0909341SAndroid Build Coastguard Worker    and                 rsp, ~63  ; 64-byte align for the mova spills below
1015*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, r2d  ; dx << 9 (mod 2^16) in every word
1016*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64*4  ; 256-byte edge buffer
1017*c0909341SAndroid Build Coastguard Worker    mov                 r2d, dxd  ; xpos of the first row
1018*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*0], m7  ; buffer words 0..31
1019*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6  ; largest index scaled to xpos units
1020*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*1], m8  ; buffer words 32..63
1021*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5  ; per-row fraction step
1022*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*2], m9  ; buffer words 64..95: padding (word 0 may be the h64 value)
1023*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m9, m9  ; high qword of each lane duplicated, so word 0 is no longer used
1024*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*3], ym9  ; 32 more bytes of padding
1025*c0909341SAndroid Build Coastguard Worker.w32_loop:
1026*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r2+dxq]  ; xpos of the second row
1027*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6  ; base_x of the first row
1028*c0909341SAndroid Build Coastguard Worker    movu                 m0, [rsp+r2*2]  ; row 0: edge[base_x]
1029*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r2*2+2]  ; row 0: edge[base_x+1]
1030*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r5+dxq]  ; xpos of the first row of the next iteration
1031*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6  ; base_x of the second row
1032*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r5*2]  ; row 1: edge[base_x]
1033*c0909341SAndroid Build Coastguard Worker    movu                 m3, [rsp+r5*2+2]  ; row 1: edge[base_x+1]
1034*c0909341SAndroid Build Coastguard Worker    pand                 m4, m15, m5  ; row 0 fraction, masked by m15
1035*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6  ; advance to row 1
1036*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0  ; row 0 diff
1037*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
1038*c0909341SAndroid Build Coastguard Worker    pand                 m4, m15, m5  ; row 1 fraction, masked by m15
1039*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1  ; row 1 diff
1040*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
1041*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2  ; row 0 pixels
1042*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3  ; row 1 pixels
1043*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
1044*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
1045*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1046*c0909341SAndroid Build Coastguard Worker    jz .w32_end  ; all rows written
1047*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6  ; advance to the next iteration's row 0
1048*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1049*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r3d
1050*c0909341SAndroid Build Coastguard Worker    jl .w32_loop  ; keep interpolating while the next xpos is below the limit (signed)
1051*c0909341SAndroid Build Coastguard Worker.w32_end_loop:
; Fill the remaining rows with m9, two rows at a time.
1052*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m9
1053*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m9
1054*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1055*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1056*c0909341SAndroid Build Coastguard Worker    jg .w32_end_loop
1057*c0909341SAndroid Build Coastguard Worker.w32_end:
1058*c0909341SAndroid Build Coastguard Worker    mov                 rsp, r7  ; restore the stack pointer saved at .w32_main
1059*c0909341SAndroid Build Coastguard Worker    RET
1060*c0909341SAndroid Build Coastguard Worker.w64_filter96:
; .w64_filter96: local subroutine (returns with plain ret), called from both
; .w64 variants. Smooths the first two 32-word edge vectors and starts the
; third, using
;   f(i) = e[i-1] + e[i] + e[i+1] + ((e[i-2] + e[i+2] + 4) >> 1)
; In:  m7, m8, m9, m10 = four consecutive 32-word edge vectors, tlq
; Out: m7 = f >> 2 for words 0..31  (neighbours read from memory around tlq)
;      m8 = f >> 2 for words 32..63 (neighbours read from memory around tlq+64)
;      m0 = f for the m9 vector, NOT yet shifted right by 2 (the caller does it)
;      m4 = pw_3 and k1 = ~1 are left set; .w64_h64 reuses m4.
; Writes m0-m4, m7, m8, r5d and k1; m9 and m10 are only read.
1061*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pw_3]  ; rounding term added before pavgw
1062*c0909341SAndroid Build Coastguard Worker    mov                 r5d, ~1
1063*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq-2]  ; first vector: e[i-1]
1064*c0909341SAndroid Build Coastguard Worker    kmovd                k1, r5d  ; k1 = every word except word 0
1065*c0909341SAndroid Build Coastguard Worker    paddw                m7, m0  ; e[i] + e[i-1]
1066*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m0{k1}, [tlq-4]  ; e[i-2] for i >= 1; word 0 keeps the word at tlq-2
1067*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4  ; e[i-2] + 3
1068*c0909341SAndroid Build Coastguard Worker    paddw                m7, [tlq+2]  ; + e[i+1]
1069*c0909341SAndroid Build Coastguard Worker    pavgw                m0, [tlq+4]  ; (e[i-2] + e[i+2] + 4) >> 1
1070*c0909341SAndroid Build Coastguard Worker    valignq              m1, m9, m8, 6  ; m9 moved up 16 bytes, end of the still-unfiltered m8 shifted in
1071*c0909341SAndroid Build Coastguard Worker    paddw                m8, [tlq+62]  ; second vector: e[i] + e[i-1]
1072*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4, [tlq+60]  ; second vector: e[i-2] + 3
1073*c0909341SAndroid Build Coastguard Worker    valignq              m3, m10, m9, 2  ; m9 moved down 16 bytes, start of m10 shifted in on top
1074*c0909341SAndroid Build Coastguard Worker    paddw                m8, [tlq+66]  ; + e[i+1]
1075*c0909341SAndroid Build Coastguard Worker    pavgw                m2, [tlq+68]  ; (e[i-2] + e[i+2] + 4) >> 1
1076*c0909341SAndroid Build Coastguard Worker    paddw                m7, m0
1077*c0909341SAndroid Build Coastguard Worker    palignr              m0, m9, m1, 14  ; third vector: e[i-1]
1078*c0909341SAndroid Build Coastguard Worker    paddw                m8, m2
1079*c0909341SAndroid Build Coastguard Worker    palignr              m1, m9, m1, 12  ; third vector: e[i-2]
1080*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 2  ; first vector filtered
1081*c0909341SAndroid Build Coastguard Worker    palignr              m2, m3, m9, 2  ; third vector: e[i+1]
1082*c0909341SAndroid Build Coastguard Worker    psrlw                m8, 2  ; second vector filtered
1083*c0909341SAndroid Build Coastguard Worker    palignr              m3, m9, 4  ; third vector: e[i+2]
1084*c0909341SAndroid Build Coastguard Worker    paddw                m0, m9  ; e[i-1] + e[i]
1085*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4  ; e[i-2] + 3
1086*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2  ; + e[i+1]
1087*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m3  ; (e[i-2] + e[i+2] + 4) >> 1
1088*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1  ; m0 = f for the third vector, final >> 2 left to the caller
1089*c0909341SAndroid Build Coastguard Worker    ret
1090*c0909341SAndroid Build Coastguard Worker.w64:
; .w64: 64-pixel-wide path. Holds the top edge in m7..m10 with m11 as padding,
; optionally smooths it (via .w64_filter96, plus an inline pass over m10 when
; h == 64), spills six vectors to a 64-byte-aligned stack buffer and
; interpolates one 64-pixel row per .w64_loop iteration. Positions are tracked
; relative to the largest edge index, so xpos in r2 is negative while the row
; still reads real edge data.
; NOTE(review): m15, base, dx and the angle encoding are set up before this
; chunk, and pw_0to31 is taken to hold the words 0..31 (per its name) - confirm.
1091*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq+64*0]  ; edge words 0..31
1092*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
1093*c0909341SAndroid Build Coastguard Worker    movu                 m8, [tlq+64*1]  ; edge words 32..63
1094*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, [tlq+r3*2+128]  ; padding: edge word 64 + (h-1) in every lane
1095*c0909341SAndroid Build Coastguard Worker    movu                 m9, [tlq+64*2]  ; edge words 64..95
1096*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1097*c0909341SAndroid Build Coastguard Worker    je .w64_h64
1098*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m13, r3d  ; h-1 in every word
1099*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64  ; r3d = (h-1) | 64, the largest edge index
1100*c0909341SAndroid Build Coastguard Worker    pminuw              m12, m13, [base+pw_0to31]  ; indices clamped to h-1
1101*c0909341SAndroid Build Coastguard Worker    mova                m10, m11  ; fourth vector is all padding on this path
1102*c0909341SAndroid Build Coastguard Worker    vpermw               m9, m12, m9  ; m9[i] = m9[m12[i]]
1103*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400  ; NOTE(review): presumably the "edge filter disabled" flag - confirm in the prologue
1104*c0909341SAndroid Build Coastguard Worker    jnz .w64_main  ; bit set: use the edge unfiltered
1105*c0909341SAndroid Build Coastguard Worker    call .w64_filter96  ; filters m7/m8, returns the unshifted sum for m9 in m0
1106*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2  ; finish the third vector
1107*c0909341SAndroid Build Coastguard Worker    vpermw               m9, m12, m0  ; filtered third vector with the index clamp re-applied
1108*c0909341SAndroid Build Coastguard Worker    vpermw              m10, m13, m0  ; padding: the filtered word selected by m13 in every lane
1109*c0909341SAndroid Build Coastguard Worker    mova                m11, m10  ; same padding for the tail of the buffer and the fill rows
1110*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
1111*c0909341SAndroid Build Coastguard Worker.w64_h64:
1112*c0909341SAndroid Build Coastguard Worker    movu                m10, [tlq+64*3]  ; edge words 96..127
1113*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64  ; r3d = 63 | 64 = 127, the largest edge index
1114*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400  ; NOTE(review): presumably the "edge filter disabled" flag - confirm in the prologue
1115*c0909341SAndroid Build Coastguard Worker    jnz .w64_main  ; bit set: use the edge unfiltered
1116*c0909341SAndroid Build Coastguard Worker    call .w64_filter96  ; filters m7/m8, returns the unshifted sum for m9 in m0, leaves pw_3 in m4
1117*c0909341SAndroid Build Coastguard Worker    valignq              m1, m10, m9, 6  ; m10 moved up 16 bytes, end of the unfiltered m9 shifted in
1118*c0909341SAndroid Build Coastguard Worker    valignq              m3, m11, m10, 2  ; m10 moved down 16 bytes, padding from m11 shifted in on top
1119*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pw_63]  ; index vector for the final vpermw below
1120*c0909341SAndroid Build Coastguard Worker    psrlw                m9, m0, 2  ; finish the third vector
1121*c0909341SAndroid Build Coastguard Worker    palignr              m0, m10, m1, 14  ; fourth vector: e[i-1]
1122*c0909341SAndroid Build Coastguard Worker    palignr              m1, m10, m1, 12  ; fourth vector: e[i-2]
1123*c0909341SAndroid Build Coastguard Worker    palignr              m2, m3, m10, 2  ; fourth vector: e[i+1]
1124*c0909341SAndroid Build Coastguard Worker    palignr              m3, m10, 4  ; fourth vector: e[i+2]
1125*c0909341SAndroid Build Coastguard Worker    paddw               m10, m0  ; e[i] + e[i-1]
1126*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4  ; e[i-2] + 3 (m4 = pw_3 from .w64_filter96)
1127*c0909341SAndroid Build Coastguard Worker    paddw               m10, m2  ; + e[i+1]
1128*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m3  ; (e[i-2] + e[i+2] + 4) >> 1
1129*c0909341SAndroid Build Coastguard Worker    paddw               m10, m1
1130*c0909341SAndroid Build Coastguard Worker    psrlw               m10, 2  ; fourth vector filtered
1131*c0909341SAndroid Build Coastguard Worker    vpermw              m11, m11, m10  ; padding: the filtered m10 word selected by pw_63 in every lane (presumably the last word)
1132*c0909341SAndroid Build Coastguard Worker.w64_main:
1133*c0909341SAndroid Build Coastguard Worker    rorx                r2d, dxd, 23  ; rotate right by 23 == left by 9: low word = dx << 9
1134*c0909341SAndroid Build Coastguard Worker    mov                  r7, rsp  ; r7 keeps the incoming stack pointer
1135*c0909341SAndroid Build Coastguard Worker    and                 rsp, ~63  ; 64-byte align for the mova spills below
1136*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, r2d  ; dx << 9 (mod 2^16) in every word
1137*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64*6  ; 384-byte edge buffer
1138*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*0], m7
1139*c0909341SAndroid Build Coastguard Worker    mov                 r2d, dxd  ; xpos of the first row (zero-extended)
1140*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*1], m8
1141*c0909341SAndroid Build Coastguard Worker    lea                  r5, [rsp+r3*2]  ; r5 -> buffer word at the largest edge index
1142*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*2], m9
1143*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6  ; largest index scaled to xpos units
1144*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*3], m10
1145*c0909341SAndroid Build Coastguard Worker    sub                  r2, r3  ; xpos relative to the limit: negative while below it
1146*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*4], m11  ; two vectors of padding behind the edge
1147*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5  ; per-row fraction step
1148*c0909341SAndroid Build Coastguard Worker    mova         [rsp+64*5], m11
1149*c0909341SAndroid Build Coastguard Worker.w64_loop:
1150*c0909341SAndroid Build Coastguard Worker    mov                  r3, r2
1151*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6  ; signed base_x relative to r5
1152*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r5+r3*2+64*0]  ; left half: edge[base_x]
1153*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r5+r3*2+64*0+2]  ; left half: edge[base_x+1]
1154*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r5+r3*2+64*1]  ; right half: edge[base_x+32]
1155*c0909341SAndroid Build Coastguard Worker    movu                 m3, [r5+r3*2+64*1+2]  ; right half: edge[base_x+33]
1156*c0909341SAndroid Build Coastguard Worker    pand                 m4, m15, m5  ; row fraction, masked by m15 (shared by both halves)
1157*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0  ; left diff
1158*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
1159*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1  ; right diff
1160*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
1161*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2  ; left 32 pixels
1162*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3  ; right 32 pixels
1163*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
1164*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
1165*c0909341SAndroid Build Coastguard Worker    dec                  hd
1166*c0909341SAndroid Build Coastguard Worker    jz .w64_end  ; all rows written
1167*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6  ; advance the fraction by one dx step
1168*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1169*c0909341SAndroid Build Coastguard Worker    add                  r2, dxq  ; xpos += dx
1170*c0909341SAndroid Build Coastguard Worker    jl .w64_loop  ; keep interpolating while the relative xpos is still negative
1171*c0909341SAndroid Build Coastguard Worker.w64_end_loop:
; Fill the remaining rows with m11, one row at a time.
1172*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m11
1173*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m11
1174*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1175*c0909341SAndroid Build Coastguard Worker    dec                  hd
1176*c0909341SAndroid Build Coastguard Worker    jg .w64_end_loop
1177*c0909341SAndroid Build Coastguard Worker.w64_end:
1178*c0909341SAndroid Build Coastguard Worker    mov                 rsp, r7  ; restore the stack pointer saved at .w64_main
1179*c0909341SAndroid Build Coastguard Worker    RET
1180*c0909341SAndroid Build Coastguard Worker
1181*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy
; Entry and width dispatch for ipred_z2_16bpc. Looks up dy and dx in
; dr_intra_derivative, preloads edge data and constants into m4, m7, m12, m14
; and m15, then jumps through ipred_z2_16bpc_avx512icl_table using the
; trailing-zero count of w as the index.
; NOTE(review): only this prologue and the start of .w4 are visible in this
; chunk; how the preloaded registers are consumed is not shown here.
1182*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; table index = number of trailing zero bits of w
1183*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1184*c0909341SAndroid Build Coastguard Worker    lea                 dxq, [dr_intra_derivative-90]  ; table address biased by -90
1185*c0909341SAndroid Build Coastguard Worker    movzx               dyd, angleb  ; low byte of angle
1186*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400  ; flip bit 0x400; .w4 below tests it as !enable_intra_edge_filter
1187*c0909341SAndroid Build Coastguard Worker    mov                  r7, dxq  ; r7 = biased table address, used for the dy lookup
1188*c0909341SAndroid Build Coastguard Worker    sub                 dxq, dyq  ; dxq = biased table address - angle
1189*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1190*c0909341SAndroid Build Coastguard Worker    and                 dyd, ~1  ; force an even byte offset: entries are read as words
1191*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m12, [tlq]  ; the word at tlq in every lane
1192*c0909341SAndroid Build Coastguard Worker    and                 dxq, ~1  ; force an even address for the word load below
1193*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [r7+dyq]  ; angle - 90
1194*c0909341SAndroid Build Coastguard Worker    lea                  r7, [z_filter_t0]  ; r7 re-pointed at z_filter_t0 (presumably the anchor for base+... addressing; the define is outside this chunk)
1195*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [dxq+270] ; 180 - angle
1196*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+pw_31to0]  ; index vector for the vpermw below
1197*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4]  ; signed 32-bit table entry
1198*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+2]  ; 32 words starting just after tlq
1199*c0909341SAndroid Build Coastguard Worker    neg                 dyd  ; dy is kept negated from here on
1200*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m0, [tlq-64*1]  ; the 32 words below tlq, reordered by pw_31to0 (reversed, going by the name)
1201*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z2_16bpc_avx512icl_table+wq]  ; entry is an offset from the table start
1202*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [base+pw_31806]
1203*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_1]
1204*c0909341SAndroid Build Coastguard Worker    jmp                  wq  ; dispatch to the width-specific label
; ---------------------------------------------------------------------------
; .w4 -- 4-pixel-wide path, entered through the `jmp wq` dispatch above
; (presumably the w == 4 entry of the jump table).
; State inherited from the code above this label:
;   m4  = words loaded from [tlq+2]
;   m7  = words of [tlq-64] permuted by pw_31to0
;   m12 = word at [tlq] broadcast, m14 = pw_31806, m15 = pw_1
;   dxd / dyd = looked-up step values (dyd already negated)
; Decides between upsampling, filtering or leaving each edge untouched,
; then runs .w4_loop, which produces 8 rows of 4 pixels per iteration.
; ---------------------------------------------------------------------------
1205*c0909341SAndroid Build Coastguard Worker.w4:
; xm3 = the 4 words at [tlq]; m8 = first 4 words of pw_1to32 in every qword.
1206*c0909341SAndroid Build Coastguard Worker    movq                xm3, [tlq]
1207*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [base+pw_1to32]
1208*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1209*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; !enable_intra_edge_filter
; (h+2)<<6 is used as a bit mask against the biased angle value; any common
; set bit selects the non-upsampled path (condition spelled out on the jnz).
1210*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1211*c0909341SAndroid Build Coastguard Worker    add              angled, 1022
1212*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1213*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1214*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
; Upsample the top edge. xm0 = low words 1,2,3,3 of xm4, i.e. the tap one
; position past xm4 with the last word repeated. r3d = h+3 is the value
; .upsample_above matches against z_filter_wh.
1215*c0909341SAndroid Build Coastguard Worker    pshuflw             xm0, xm4, q3321
1216*c0909341SAndroid Build Coastguard Worker    sub              angled, 1075 ; angle - 53
1217*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1218*c0909341SAndroid Build Coastguard Worker    call .upsample_above
; Interleave the interpolated words (xm3) with the original ones (xm4), then
; rebuild xm3 as xm4 delayed by one word with the last word of xm12 in front.
1219*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xm3, xm4
1220*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm4, xm12, 14
1221*c0909341SAndroid Build Coastguard Worker    jmp .w4_main
; Left edge is upsampled: m1 (added to base_y in .w4_main2) is doubled.
; This entry skips the m15 shift done in .w4_main.
1222*c0909341SAndroid Build Coastguard Worker.w4_upsample_left:
1223*c0909341SAndroid Build Coastguard Worker    call .upsample_left
1224*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_xpos_mul]
1225*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
1226*c0909341SAndroid Build Coastguard Worker    jmp .w4_main2
; Top edge is filtered instead of upsampled. ym0 = pw_3 is the index clamp
; and r3d = h+3 the z_filter_wh match value consumed by .filter_above2.
1227*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above:
1228*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1229*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+pw_3]
1230*c0909341SAndroid Build Coastguard Worker    sub              angled, 1112 ; angle - 90
1231*c0909341SAndroid Build Coastguard Worker    call .filter_above2
; Same mask test as above, now for the left edge. xm3 is rebuilt from the
; (possibly filtered) xm4 in between.
1232*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1233*c0909341SAndroid Build Coastguard Worker    add              angled, 973 ; angle + 883
1234*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm4, xm12, 14
1235*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1236*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
; The jump is taken when the test result is zero, i.e. when the condition
; below (written in the same terms as the jnz comment above) is false.
1237*c0909341SAndroid Build Coastguard Worker    jz .w4_upsample_left ; !(angle <= 140 || h > 8 || (is_sm && h == 8))
; .filter_left16 reuses ym10, ym11 and k2 left behind by .filter_above2.
1238*c0909341SAndroid Build Coastguard Worker    call .filter_left16
1239*c0909341SAndroid Build Coastguard Worker.w4_main:
; m1 = even dwords of z_xpos_mul duplicated (table contents are defined
; outside this view); m15 = pw_1 << 3 = 8, the base_y step per iteration.
1240*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_xpos_mul]
1241*c0909341SAndroid Build Coastguard Worker    psllw               m15, 3
1242*c0909341SAndroid Build Coastguard Worker.w4_main2:
; Per-word position setup:
;   m10 = (m8 << 6) - z_xpos_mul * dx            (xpos)
;   m13 = dy * (1,2,3,4 repeated)                (ypos)
;   m12 = (ypos >> 6) + m1                       (base_y)
;   m13 = (ypos << 9) & m14                      (frac_y << 9)
;   m5  = m7 delayed by one word, word 0 taken from the high word of the
;         dword at [tlq-2]
;   m11 = dx << 3 (xpos step per iteration), r5 = stride * 3
1243*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+pw_1to32]
1244*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, dxd
1245*c0909341SAndroid Build Coastguard Worker    movsldup             m2, [base+z_xpos_mul]
1246*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m13, dyd
1247*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [tlq-2]
1248*c0909341SAndroid Build Coastguard Worker    psllw               m10, m8, 6
1249*c0909341SAndroid Build Coastguard Worker    valignq              m5, m7, m5, 6
1250*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m11
1251*c0909341SAndroid Build Coastguard Worker    psubw               m10, m2       ; xpos
1252*c0909341SAndroid Build Coastguard Worker    pmullw              m13, m0       ; ypos
1253*c0909341SAndroid Build Coastguard Worker    palignr              m5, m7, m5, 14
1254*c0909341SAndroid Build Coastguard Worker    psrlw               m12, m13, 6
1255*c0909341SAndroid Build Coastguard Worker    psllw               m13, 9
1256*c0909341SAndroid Build Coastguard Worker    paddw               m12, m1       ; base_y
1257*c0909341SAndroid Build Coastguard Worker    pand                m13, m14      ; frac_y << 9
1258*c0909341SAndroid Build Coastguard Worker    psllw               m11, 3
1259*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3]
; Main loop. k1 holds the sign bit of every xpos word; where it is set, the
; two samples and the fraction are replaced by the left-edge versions.
; Output = a + pmulhrsw(b - a, frac << 9).
1260*c0909341SAndroid Build Coastguard Worker.w4_loop:
1261*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m10, 6   ; base_x
1262*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m10 ; frac
1263*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m3   ; top[base_x]
1264*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m4   ; top[base_x+1]
1265*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m10      ; base_x < 0
1266*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
1267*c0909341SAndroid Build Coastguard Worker    vpermw           m0{k1}, m12, m5  ; left[base_y]
1268*c0909341SAndroid Build Coastguard Worker    vpermw           m1{k1}, m12, m7  ; left[base_y+1]
1269*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m2{k1}, m13
1270*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1271*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1272*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
; Rows 0-3 come from the low 256 bits of m0, 8 bytes per row.
1273*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1274*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1275*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
1276*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
1277*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r5       ], xm1
; h == 4 leaves the loop here with hd negative.
1278*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1279*c0909341SAndroid Build Coastguard Worker    jl .w4_end
; Rows 4-7 come from the high 256 bits; positions advance in between.
1280*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym0, m0, 1
1281*c0909341SAndroid Build Coastguard Worker    psubw               m10, m11      ; base_x -= dx
1282*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1283*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15      ; base_y++
1284*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1285*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1286*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
1287*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
1288*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r5       ], xm1
1289*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
; Flags are still those of `sub hd, 8`: only vector instructions and lea
; were executed since.
1290*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
1291*c0909341SAndroid Build Coastguard Worker.w4_end:
1292*c0909341SAndroid Build Coastguard Worker    RET
; ---------------------------------------------------------------------------
; .upsample_above (w4/w8) -- interpolate the in-between samples of the top
; edge.
; In:   xm3 = words from [tlq]              (tap b)
;       xm4 = words from [tlq+2]            (tap c)
;       xm0 = caller-built tap d (xm4 advanced by one word, end repeated)
;       m12 = word at [tlq] broadcast
;       r3d = value matched against z_filter_wh
;       angled = biased angle value prepared by the caller
; Out:  xm3 = min(pixel_max, (max(0, (b+c) + (((b+c)-(a+d)) >> 3)) + 1) >> 1)
;       dxd and m8 doubled, ym9 = first 16 words of pw_1to32
;       k2 = mask consumed by .filter_left16b
; Clobbers xm0-xm2, xm11, angled, flags.
; There is no ret here: control tail-jumps to .filter_left16b, whose ret
; returns to the caller of .upsample_above.
; ---------------------------------------------------------------------------
1293*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8
1294*c0909341SAndroid Build Coastguard Worker    mova                ym9, [base+pw_1to32]
; xm1 = tap a: the last two words of xm12 followed by xm4[0..5].
1295*c0909341SAndroid Build Coastguard Worker    palignr             xm1, xm4, xm12, 12
1296*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm4  ; b+c
1297*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
1298*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1  ; a+d
1299*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm1, r9m  ; pixel_max
1300*c0909341SAndroid Build Coastguard Worker    vpbroadcastb       xm11, r3d
1301*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm3, xm0
1302*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm2, angled
1303*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 3
; The bits of angled above bit 7 index the 8-byte rows of z_filter_t0.
1304*c0909341SAndroid Build Coastguard Worker    shr              angled, 8
1305*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm0
1306*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
; k2 = bytes of z_filter_wh equal to the low byte of r3d ...
1307*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k2, xm11, [base+z_filter_wh]
1308*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm3, xm0
1309*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
; pavgw against zero computes (x + 1) >> 1.
1310*c0909341SAndroid Build Coastguard Worker    pavgw               xm3, xm0
; ... restricted to those where the low byte of angled exceeds the
; z_filter_t0 entry (signed byte compare).
1311*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k2{k2}, xm2, [base+z_filter_t0+angleq*8]
1312*c0909341SAndroid Build Coastguard Worker    pminsw              xm3, xm1
1313*c0909341SAndroid Build Coastguard Worker    paddw                m8, m8
1314*c0909341SAndroid Build Coastguard Worker    jmp .filter_left16b
; ---------------------------------------------------------------------------
; .upsample_left (h4/h8) -- interpolate the in-between samples of the left
; edge and interleave them with the originals.
; In:   xm7 = left-edge words (tap c), m12 = word at [tlq] broadcast,
;       xm9 = index words (pw_1to32 as loaded by .filter_above2 or
;             .upsample_above in the visible callers), hq = block height
; Out:  ym7 = 16 words: interpolated and original samples alternating
;       dyd doubled
; Clobbers xm0-xm2, r3d, flags.
; Interpolated value:
;   min(pixel_max, (max(0, (b+c) + (((b+c)-(a+d)) >> 3)) + 1) >> 1)
; ---------------------------------------------------------------------------
1315*c0909341SAndroid Build Coastguard Worker.upsample_left: ; h4/h8
1316*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
; xm2 = tap b: last word of xm12 followed by xm7[0..6].
1317*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm7, xm12, 14
1318*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm0, r3d
; xm1 = tap a: last two words of xm12 followed by xm7[0..5].
1319*c0909341SAndroid Build Coastguard Worker    palignr             xm1, xm7, xm12, 12
; Index vector clamped to h-1 so tap d never reads past the last row.
1320*c0909341SAndroid Build Coastguard Worker    pminuw              xm0, xm9
1321*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm7 ; b+c
; xm0 = tap d, gathered from xm7 through the clamped indices.
1322*c0909341SAndroid Build Coastguard Worker    vpermw              xm0, xm0, xm7
1323*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
1324*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1 ; a+d
1325*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm1, r9m ; pixel_max
1326*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm2, xm0
1327*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 3
1328*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm0
1329*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
; Clamp below at 0, round-halve via pavgw with zero, clamp above at pixel_max.
1330*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm2, xm0
1331*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm0
1332*c0909341SAndroid Build Coastguard Worker    pminsw              xm2, xm1
; Interleave: interpolated word first, original word second; the high half
; goes into the upper 128 bits of ym7.
1333*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm2, xm7
1334*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm7, xm2, xm7
1335*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym7, xm0, 1
1336*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .filter_above / .filter_above2 -- 5-tap smoothing of the top edge in ym4.
; .filter_above first subtracts 90 from angled; .filter_above2 expects the
; caller to have done the equivalent adjustment already.
; In:   ym4 = words from [tlq+2]     (tap  0)
;       m3  = words from [tlq]       (tap -1)
;       ym0 = broadcast maximum index (pw_3 / pw_7 / pw_15 in visible callers)
;       r3d = value matched against z_filter_wh
;       angled = adjusted angle; bits above bit 7 select the z_filter_t0 row
; Out:  ym4 = filtered words where the word index is < max_w (r7m)
;       m9  = pw_1to32; ym10 = angle byte, xm11 = z_filter_t0 row and k2 =
;       z_filter_wh match mask, all three reused by .filter_left16
;       m13 = pw_0to31 (filtering path only)
; Clobbers ym0-ym2, m3, ym5, ym6, k1, r3d, flags.
; If no threshold is exceeded, returns early with ym4 untouched.
; ---------------------------------------------------------------------------
1337*c0909341SAndroid Build Coastguard Worker.filter_above:
1338*c0909341SAndroid Build Coastguard Worker    sub              angled, 90
1339*c0909341SAndroid Build Coastguard Worker.filter_above2:
1340*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, r3d
1341*c0909341SAndroid Build Coastguard Worker    vpbroadcastb       ym10, angled
1342*c0909341SAndroid Build Coastguard Worker    mov                 r3d, angled
1343*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 8
; k2 = z_filter_wh bytes equal to the low byte of the r3d input; k1 = those
; of them whose z_filter_t0 threshold is below the angle byte.
1344*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k2, ym1, [base+z_filter_wh]
1345*c0909341SAndroid Build Coastguard Worker    mova               xm11, [base+z_filter_t0+r3*8]
1346*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k2}, ym10, ym11
1347*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_1to32]
1348*c0909341SAndroid Build Coastguard Worker    kmovd               r3d, k1
1349*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
1350*c0909341SAndroid Build Coastguard Worker    jz .filter_end
; ym0 = indices i+1 clamped to the caller's maximum.
1351*c0909341SAndroid Build Coastguard Worker    pminuw              ym0, ym9
; The number of set mask bits selects the z_filter_k entry
; (presumably the filter strength -- confirm against the table definition).
1352*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
1353*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym6, r7m      ; max_w
; k1 = 0xfffe: every word except word 0.
1354*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1
; Three coefficient pairs sit 12 bytes apart in z_filter_k: tap 0, taps +-1
; and taps +-2.
1355*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym5, [base+z_filter_k+(r3-1)*4+12*0]
1356*c0909341SAndroid Build Coastguard Worker    kaddw                k1, k1, k1   ; ~1
1357*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym13, [base+z_filter_k+(r3-1)*4+12*1]
1358*c0909341SAndroid Build Coastguard Worker    vpermw              ym2, ym0, ym4 ; +1
1359*c0909341SAndroid Build Coastguard Worker    pmullw              ym5, ym4
1360*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2, ym3
; Words 1.. of m3 are reloaded from [tlq-2]; word 0 keeps its old value.
1361*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m3{k1}, [tlq-2]  ; -2
1362*c0909341SAndroid Build Coastguard Worker    vpermw              ym2, ym0, ym2 ; +2
1363*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+z_filter_k+(r3-1)*4+12*2]
1364*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym13
1365*c0909341SAndroid Build Coastguard Worker    movu                m13, [base+pw_0to31]
1366*c0909341SAndroid Build Coastguard Worker    paddw               ym2, ym3
; max_w narrowed from dwords to saturated words for the compare below.
1367*c0909341SAndroid Build Coastguard Worker    packssdw            ym6, ym6
1368*c0909341SAndroid Build Coastguard Worker    pmullw              ym2, ym0
1369*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym5
; k1 = word index < max_w.
1370*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, ym6, ym13
1371*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
1372*c0909341SAndroid Build Coastguard Worker    pxor                ym2, ym2
; Result = ((sum >> 3) + 1) >> 1, written only to the selected words.
1373*c0909341SAndroid Build Coastguard Worker    psrlw               ym1, 3
1374*c0909341SAndroid Build Coastguard Worker    pavgw           ym4{k1}, ym1, ym2
; Shared return point, also the no-filter exit of .filter_left16b.
1375*c0909341SAndroid Build Coastguard Worker.filter_end:
1376*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .filter_left16 / .filter_left16b -- 5-tap smoothing of up to 16 left-edge
; words in ym7.
; .filter_left16 derives the mask k2 from state left by .filter_above2:
;   ym10 = angle byte, ym11 = z_filter_t0 row, k2 = z_filter_wh match mask.
; .filter_left16b is entered with k2 already final (tail-jump target of
; .upsample_above).
; In:   ym7 = left-edge words, m12 = word at [tlq] broadcast,
;       ym9 = index words i+1, hq = block height
; Out:  ym7 = filtered words where the word index is < max_h (r8m)
; Clobbers ym0-ym2, ym5, ym6, ym9, ym10, k1, r3d, r5d, flags.
; If k2 is empty, leaves through .filter_end with ym7 untouched.
; ---------------------------------------------------------------------------
1377*c0909341SAndroid Build Coastguard Worker.filter_left16:
; Byte-wise 90 - ym10, then keep the k2 bits whose threshold is below it.
1378*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym1, [base+pb_90]
1379*c0909341SAndroid Build Coastguard Worker    psubb               ym1, ym10
1380*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k2{k2}, ym1, ym11
1381*c0909341SAndroid Build Coastguard Worker.filter_left16b:
1382*c0909341SAndroid Build Coastguard Worker    kmovd               r3d, k2
1383*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
1384*c0909341SAndroid Build Coastguard Worker    jz .filter_end
1385*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq-1]
; ym0 = [low lane of m12 | low lane of m7]: the "previous lane" operand that
; lets the per-lane palignr below shift across the 128-bit boundary.
1386*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, ym12, xm7, 1
1387*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        ym1, r5d
; The number of set mask bits selects the z_filter_k entry.
1388*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
1389*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym6, r8m          ; max_h
; Indices clamped to h-1.
1390*c0909341SAndroid Build Coastguard Worker    pminuw              ym9, ym1
1391*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym5, [base+z_filter_k+(r3-1)*4+12*0]
1392*c0909341SAndroid Build Coastguard Worker    vpermw              ym2, ym9, ym7     ; +1
1393*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym10, [base+z_filter_k+(r3-1)*4+12*1]
1394*c0909341SAndroid Build Coastguard Worker    palignr             ym1, ym7, ym0, 14 ; -1
1395*c0909341SAndroid Build Coastguard Worker    pmullw              ym5, ym7
1396*c0909341SAndroid Build Coastguard Worker    palignr             ym0, ym7, ym0, 12 ; -2
1397*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
1398*c0909341SAndroid Build Coastguard Worker    vpermw              ym2, ym9, ym2     ; +2
; ym9 is reused for the outer-tap coefficient; the indices are gone after this.
1399*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym9, [base+z_filter_k+(r3-1)*4+12*2]
1400*c0909341SAndroid Build Coastguard Worker    pmullw              ym1, ym10
1401*c0909341SAndroid Build Coastguard Worker    paddw               ym2, ym0
1402*c0909341SAndroid Build Coastguard Worker    packssdw            ym6, ym6
1403*c0909341SAndroid Build Coastguard Worker    pmullw              ym2, ym9
1404*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym5
; k1 = word index < max_h.
1405*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, ym6, [base+pw_0to31]
1406*c0909341SAndroid Build Coastguard Worker    paddw               ym1, ym2
1407*c0909341SAndroid Build Coastguard Worker    pxor                ym2, ym2
; Result = ((sum >> 3) + 1) >> 1, written only to the selected words.
1408*c0909341SAndroid Build Coastguard Worker    psrlw               ym1, 3
1409*c0909341SAndroid Build Coastguard Worker    pavgw           ym7{k1}, ym1, ym2
1410*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .filter_left -- left-edge smoothing dispatcher.
;   h < 32  : handled by .filter_left16
;   h == 32 : .filter_left32 filters the 32 words in m7
;   h == 64 : .filter_left64 additionally filters the 32 words in m8
; The 32/64 paths use one fixed kernel; per word:
;   (p[-1] + p[0] + p[+1] + ((p[-2] + 3 + p[+2] + 1) >> 1)) >> 2
; In:   m7 (and m8 for h == 64) = left-edge words, m12 = word at [tlq]
;       broadcast, hq = block height.
;       NOTE(review): m9 is expected to hold pw_1to32 and m13 pw_0to31, both
;       loaded by the filtering path of .filter_above2 -- confirm that every
;       caller reaching the 32/64 paths goes through that path first.
; Out:  m7 / m8 filtered where the word index is < max_h (r8m).
; Clobbers m0-m2, m5, m6, m10, m11, m13, k1, r3d, flags.
; ---------------------------------------------------------------------------
1411*c0909341SAndroid Build Coastguard Worker.filter_left:
1412*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1413*c0909341SAndroid Build Coastguard Worker    jl .filter_left16
1414*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_3]
; Dword-wise unsigned minimum against a broadcast pw_31 pair. With
; m9 = 1..32 this presumably only turns the final (31,32) pair into (31,31).
1415*c0909341SAndroid Build Coastguard Worker    pminud               m0, m9, [base+pw_31] {1to16}
; Secondary entry point; presumably used by callers outside this view that
; set up m0, m5 and m13 themselves -- TODO confirm.
1416*c0909341SAndroid Build Coastguard Worker.filter_left32:
1417*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, r8m         ; max_h
; m2 = m7 moved up by one 128-bit lane, lane 0 filled from m12, so that the
; per-lane palignr below can look backwards across lane boundaries.
1418*c0909341SAndroid Build Coastguard Worker    valignq              m2, m7, m12, 6
1419*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m6
1420*c0909341SAndroid Build Coastguard Worker    palignr              m1, m7, m2, 14  ; -1
1421*c0909341SAndroid Build Coastguard Worker    paddw                m1, m7
1422*c0909341SAndroid Build Coastguard Worker    palignr              m2, m7, m2, 12  ; -2
; k1 = m13 word < max_h.
1423*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m6, m13
; The +3 carried in m5 is added to the -2 tap ahead of the pavgw.
1424*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
1425*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1426*c0909341SAndroid Build Coastguard Worker    je .filter_left64
; 32-row tail: forward taps are gathered through indices clamped to h-1.
1427*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-1]
1428*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m10, r3d
1429*c0909341SAndroid Build Coastguard Worker    pminuw               m0, m10
1430*c0909341SAndroid Build Coastguard Worker    vpermw              m10, m0, m7      ; +1
1431*c0909341SAndroid Build Coastguard Worker    paddw                m1, m10
1432*c0909341SAndroid Build Coastguard Worker    vpermw              m10, m0, m10     ; +2
1433*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m10
1434*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1435*c0909341SAndroid Build Coastguard Worker    vpsrlw           m7{k1}, m1, 2
1436*c0909341SAndroid Build Coastguard Worker    ret
1437*c0909341SAndroid Build Coastguard Worker.filter_left64:
; m10 = m7 moved down by one lane with the first lane of m8 appended, so the
; forward taps of m7 can reach into m8.
1438*c0909341SAndroid Build Coastguard Worker    valignq             m10, m8, m7, 2
; Every word of m13 is advanced by 32 for the compare of the second half.
1439*c0909341SAndroid Build Coastguard Worker    vpaddd              m13, [base+pw_32] {1to16}
1440*c0909341SAndroid Build Coastguard Worker    palignr             m11, m10, m7, 2  ; +1
1441*c0909341SAndroid Build Coastguard Worker    paddw                m1, m11
1442*c0909341SAndroid Build Coastguard Worker    palignr             m11, m10, m7, 4  ; +2
; m10 = m8 moved up by one lane, lane 0 filled from the last lane of m7, for
; the backward taps of the second half.
1443*c0909341SAndroid Build Coastguard Worker    valignq             m10, m8, m7, 6
1444*c0909341SAndroid Build Coastguard Worker    pavgw               m11, m2
1445*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m0, m8      ; 32+1
1446*c0909341SAndroid Build Coastguard Worker    paddw                m1, m11
; First half done; k1 still holds the compare against the unadvanced m13.
1447*c0909341SAndroid Build Coastguard Worker    vpsrlw           m7{k1}, m1, 2
1448*c0909341SAndroid Build Coastguard Worker    palignr              m1, m8, m10, 14 ; 32-1
1449*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
1450*c0909341SAndroid Build Coastguard Worker    palignr             m10, m8, m10, 12 ; 32-2
1451*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1452*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m0, m2      ; 32+2
1453*c0909341SAndroid Build Coastguard Worker    paddw               m10, m5
; k1 = advanced m13 word < max_h.
1454*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m6, m13
1455*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m10
1456*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1457*c0909341SAndroid Build Coastguard Worker    vpsrlw           m8{k1}, m1, 2
1458*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .w8 -- 8-pixel-wide path, entered through the `jmp wq` dispatch above
; (presumably the w == 8 entry of the jump table).
; Inherits m4, m7, m12, m14, m15, dxd and dyd from the code above the
; dispatch. Decides per edge between upsampling, filtering or neither, then
; produces 4 rows of 8 pixels per iteration. Once the scalar tracker r2d
; drops below the threshold in r3d, the remaining rows are produced by
; .w8_leftonly_loop from the left edge alone.
; ---------------------------------------------------------------------------
1459*c0909341SAndroid Build Coastguard Worker.w8:
; xm3 = the 8 words at [tlq]; m8 = words 1..8 of pw_1to32 in every lane.
1460*c0909341SAndroid Build Coastguard Worker    mova                xm3, [tlq]
1461*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [base+pw_1to32]
; Bit 10 of angled set: skip all edge processing (see the matching test in
; the .w4 path, commented !enable_intra_edge_filter).
1462*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1463*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
; r3d = angle + 126 with its low byte replaced by h. The unsigned compare
; with 8 fails as soon as any bit above the low byte is set or h > 8.
1464*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+126]
1465*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
1466*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1467*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
; Upsample the top edge. xm0 = xm4 advanced by one word, with the high-half
; words rearranged to 0,1,2,2 so that the last valid word is repeated.
1468*c0909341SAndroid Build Coastguard Worker    psrldq              xm0, xm4, 2
1469*c0909341SAndroid Build Coastguard Worker    sub              angled, 53
1470*c0909341SAndroid Build Coastguard Worker    pshufhw             xm0, xm0, q2210
1471*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
1472*c0909341SAndroid Build Coastguard Worker    call .upsample_above
; Interleave interpolated (xm3) and original (xm4) words into ym4, then
; rebuild ym3 as ym4 delayed by one word with the last word of xm12 in front.
1473*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm3, xm4
1474*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm4, xm3, xm4
1475*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, ym12, xm0, 1
1476*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, ym0, xm4, 1
1477*c0909341SAndroid Build Coastguard Worker    palignr             ym3, ym4, ym3, 14
1478*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
; Left edge is upsampled: m1 (added to base_y) is doubled and m15 becomes
; pw_1 << 3 instead of the << 2 used in .w8_main.
1479*c0909341SAndroid Build Coastguard Worker.w8_upsample_left:
1480*c0909341SAndroid Build Coastguard Worker    call .upsample_left
1481*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_xpos_mul]
1482*c0909341SAndroid Build Coastguard Worker    psllw               m15, 3
1483*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1
1484*c0909341SAndroid Build Coastguard Worker    jmp .w8_main2
; Top edge is filtered instead of upsampled. ym0 = pw_7 is the index clamp
; and r3d = h+7 the z_filter_wh match value consumed by .filter_above.
1485*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above:
1486*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
1487*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+pw_7]
1488*c0909341SAndroid Build Coastguard Worker    call .filter_above
; .filter_above has already subtracted 90 from angled, so angleq-51 is the
; original value minus 141. Same low-byte trick as above.
1489*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-51]
1490*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
1491*c0909341SAndroid Build Coastguard Worker    palignr             xm3, xm4, xm12, 14
1492*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1493*c0909341SAndroid Build Coastguard Worker    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
1494*c0909341SAndroid Build Coastguard Worker    call .filter_left
1495*c0909341SAndroid Build Coastguard Worker.w8_main:
; m1 = odd dwords of z_xpos_mul duplicated (table contents are defined
; outside this view); m15 <<= 2 gives the base_y step for 4 rows.
1496*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_xpos_mul]
1497*c0909341SAndroid Build Coastguard Worker    psllw               m15, 2
1498*c0909341SAndroid Build Coastguard Worker.w8_main2:
; Per-word position setup:
;   m10 = (m8 << 6) - z_xpos_mul * dx            (xpos)
;   m13 = dy * (1..8 repeated)                   (ypos)
;   m12 = (ypos >> 6) + m1                       (base_y)
;   m13 = (ypos << 9) & m14                      (frac_y << 9)
;   m5  = m7 delayed by one word, word 0 taken from m12
; Scalar bookkeeping:
;   r2d = 1<<6, decremented by 4*dx per iteration
;   r3d = dx - (8<<6), computed before dxd is scaled by 4
1499*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [base+pw_1to32]
1500*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, dxd
1501*c0909341SAndroid Build Coastguard Worker    movshdup             m2, [base+z_xpos_mul]
1502*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m13, dyd
1503*c0909341SAndroid Build Coastguard Worker    psllw               m10, m8, 6
1504*c0909341SAndroid Build Coastguard Worker    valignq              m5, m7, m12, 6
1505*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m11
1506*c0909341SAndroid Build Coastguard Worker    psubw               m10, m2       ; xpos
1507*c0909341SAndroid Build Coastguard Worker    pmullw              m13, m0       ; ypos
1508*c0909341SAndroid Build Coastguard Worker    palignr              m5, m7, m5, 14
1509*c0909341SAndroid Build Coastguard Worker    psrlw               m12, m13, 6
1510*c0909341SAndroid Build Coastguard Worker    psllw               m13, 9
1511*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 1<<6
1512*c0909341SAndroid Build Coastguard Worker    paddw               m12, m1       ; base_y
1513*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [dxq-(8<<6)] ; left-only threshold
1514*c0909341SAndroid Build Coastguard Worker    pand                m13, m14      ; frac_y << 9
1515*c0909341SAndroid Build Coastguard Worker    shl                 dxd, 2
1516*c0909341SAndroid Build Coastguard Worker    psllw               m11, 2
1517*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3]
; Mixed loop: top-edge lookup for every word, plus a masked left-edge
; override once r2d has gone negative.
1518*c0909341SAndroid Build Coastguard Worker.w8_loop:
1519*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m10, 6
1520*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m10
1521*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m3
1522*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m4
1523*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
; While r2d stays >= 0 the left-edge override is skipped
; (presumably no xpos word of this group can be negative yet).
1524*c0909341SAndroid Build Coastguard Worker    sub                 r2d, dxd
1525*c0909341SAndroid Build Coastguard Worker    jge .w8_toponly
; k1 = sign bit of each xpos word; those words take the left-edge samples
; and frac_y instead.
1526*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m10
1527*c0909341SAndroid Build Coastguard Worker    vpermw           m0{k1}, m12, m5
1528*c0909341SAndroid Build Coastguard Worker    vpermw           m1{k1}, m12, m7
1529*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m2{k1}, m13
1530*c0909341SAndroid Build Coastguard Worker.w8_toponly:
; Output = a + pmulhrsw(b - a, frac << 9); one 128-bit lane per row.
1531*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1532*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1533*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1534*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
1535*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
1536*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
1537*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r5       ], m0, 3
1538*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1539*c0909341SAndroid Build Coastguard Worker    jz .w8_end
1540*c0909341SAndroid Build Coastguard Worker    psubw               m10, m11      ; base_x -= dx
1541*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1542*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15      ; base_y++
; Stay in the mixed loop while r2d >= r3d; otherwise fall through to the
; left-only loop.
1543*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r3d
1544*c0909341SAndroid Build Coastguard Worker    jge .w8_loop
; Left-only loop: every word is interpolated between m5[base_y] and
; m7[base_y] with the constant frac_y in m13.
1545*c0909341SAndroid Build Coastguard Worker.w8_leftonly_loop:
1546*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m12, m5
1547*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m12, m7
1548*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1549*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
1550*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15
1551*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1552*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
1553*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
1554*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
1555*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r5       ], m0, 3
1556*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1557*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1558*c0909341SAndroid Build Coastguard Worker    jg .w8_leftonly_loop
1559*c0909341SAndroid Build Coastguard Worker.w8_end:
1560*c0909341SAndroid Build Coastguard Worker    RET
1561*c0909341SAndroid Build Coastguard Worker.w16:
1562*c0909341SAndroid Build Coastguard Worker    mova                ym3, [tlq]
1563*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m0, [tlq-64*2]
1564*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1565*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
1566*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
1567*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [base+pw_15]
1568*c0909341SAndroid Build Coastguard Worker    call .filter_above
1569*c0909341SAndroid Build Coastguard Worker    call .filter_left
1570*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, ym12, xm4, 1
1571*c0909341SAndroid Build Coastguard Worker    palignr             ym3, ym4, ym3, 14
1572*c0909341SAndroid Build Coastguard Worker.w16_main:
1573*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [base+pw_1to32]
1574*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, dxd
1575*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m13, dyd
1576*c0909341SAndroid Build Coastguard Worker    kxnorw               k2, k2, k2
1577*c0909341SAndroid Build Coastguard Worker    psllw               m10, m0, 6
1578*c0909341SAndroid Build Coastguard Worker    valignq              m5, m7, m12, 6
1579*c0909341SAndroid Build Coastguard Worker    psubw               m10, m11      ; xpos
1580*c0909341SAndroid Build Coastguard Worker    valignq              m6, m8, m7, 6
1581*c0909341SAndroid Build Coastguard Worker    pmullw              m13, m0       ; ypos
1582*c0909341SAndroid Build Coastguard Worker    knotd                k1, k2
1583*c0909341SAndroid Build Coastguard Worker    palignr              m5, m7, m5, 14
1584*c0909341SAndroid Build Coastguard Worker    palignr              m6, m8, m6, 14
1585*c0909341SAndroid Build Coastguard Worker    vpsubw          m10{k1}, m11
1586*c0909341SAndroid Build Coastguard Worker    psrlw               m12, m13, 6
1587*c0909341SAndroid Build Coastguard Worker    psllw               m13, 9
1588*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 1<<6
1589*c0909341SAndroid Build Coastguard Worker    vpsubw          m12{k2}, m15      ; base_y
1590*c0909341SAndroid Build Coastguard Worker    pand                m13, m14      ; frac_y << 9
1591*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [dxq-(16<<6)]
1592*c0909341SAndroid Build Coastguard Worker    paddw               m11, m11
1593*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1594*c0909341SAndroid Build Coastguard Worker    paddw               m15, m15
; 16-wide main loop. Every pass produces two output rows: the low 256 bits of
; m0 go to the first row and the high 256 bits to the second.
; NOTE(review): m3/m4 (tables indexed by base_x), m5-m8 (tables indexed by
; base_y) and the starting values of m10/m12/m13 are prepared above this
; excerpt -- presumably the top and left edge pixels; confirm there.
; r2d is decreased by dxd on every pass. While it stays >= 0 only the base_x
; interpolation is used; once it drops below r3d the code falls through to
; .w16_leftonly_loop, which reads only the base_y tables.
1595*c0909341SAndroid Build Coastguard Worker.w16_loop:
1596*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m10, 6   ; base_x >> 6: word index for the permutes
1597*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m10 ; fractional bits of base_x (mask in m14)
1598*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m3   ; m0 = m3[index]
1599*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m4   ; m1 = m4[index]
1600*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9        ; frac_x << 9, scaled for pmulhrsw
1601*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1602*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2       ; rounded (m1 - m0) * frac_x
1603*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15      ; base_y++
1604*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1       ; interpolated value from the base_x tables
1605*c0909341SAndroid Build Coastguard Worker    sub                 r2d, dxd      ; dxd was doubled before the loop (two rows per pass)
1606*c0909341SAndroid Build Coastguard Worker    jge .w16_toponly
; Some lanes need the base_y tables: vpermt2w overwrites its first operand, so
; the tables are copied to scratch registers before each two-table lookup.
1607*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5
1608*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m12, m6  ; m1 = {m5,m6}[base_y]
1609*c0909341SAndroid Build Coastguard Worker    mova                 m2, m7
1610*c0909341SAndroid Build Coastguard Worker    vpermt2w             m2, m12, m8  ; m2 = {m7,m8}[base_y]
1611*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m10      ; k1 = lanes whose base_x word has its sign bit set
1612*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1613*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13      ; rounded (m2 - m1) * frac_y
1614*c0909341SAndroid Build Coastguard Worker    vpaddw           m0{k1}, m1, m2   ; merge-masked: only k1 lanes take the base_y result
1615*c0909341SAndroid Build Coastguard Worker.w16_toponly:
1616*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
1617*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
1618*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1619*c0909341SAndroid Build Coastguard Worker    jz .w16_end
1620*c0909341SAndroid Build Coastguard Worker    psubw               m10, m11      ; base_x -= dx
1621*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1622*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r3d      ; r3d = dx-(16<<6), computed before the loop
1623*c0909341SAndroid Build Coastguard Worker    jge .w16_loop
; Transition to the left-only loop: m5 stops being a table and instead carries
; the previous lookup result from here on.
1624*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15
1625*c0909341SAndroid Build Coastguard Worker    vpermt2w             m5, m12, m6
1626*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
1627*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m12, m8
1628*c0909341SAndroid Build Coastguard Worker    jmp .w16_leftonly_loop_start
1629*c0909341SAndroid Build Coastguard Worker.w16_leftonly_loop:
1630*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
1631*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m12, m8  ; m1 = {m7,m8}[base_y]
1632*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m1, q1032 ; m5 = {upper 256 bits of m5, lower 256 bits of m1}
1633*c0909341SAndroid Build Coastguard Worker.w16_leftonly_loop_start:
1634*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m5
1635*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13      ; rounded (m1 - m5) * frac_y
1636*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15
1637*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
1638*c0909341SAndroid Build Coastguard Worker    mova                 m5, m1       ; keep this lookup for the next pass
1639*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
1640*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
1641*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1642*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1643*c0909341SAndroid Build Coastguard Worker    jg .w16_leftonly_loop
1644*c0909341SAndroid Build Coastguard Worker.w16_end:
1645*c0909341SAndroid Build Coastguard Worker    RET
; 32-wide entry point. Unless bit 0x400 of angled is set, the 32 words in m4
; are smoothed with the neighbouring samples labelled -2..+2 below and
; .filter_left32 is called; the shared kernel .w32_main1 then runs with a
; 128-byte stack scratch area.
; NOTE(review): m4, m0, m7 and m12 are set up above this excerpt and
; .filter_left32 is defined outside it -- m4 is presumably the top edge
; starting at tlq+2; confirm there.
1646*c0909341SAndroid Build Coastguard Worker.w32:
1647*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq]
1648*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m0, [tlq-64*2] ; words at tlq-128 reordered by the indices in m0
1649*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_1to32]
1650*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1651*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
; Dword min against a broadcast word pair; assuming pw_1to32 holds 1..32 this
; yields the word indices min(i+1, 31), because only the last pair exceeds 31.
1652*c0909341SAndroid Build Coastguard Worker    pminud               m0, m9, [base+pw_31] {1to16}
1653*c0909341SAndroid Build Coastguard Worker    mov                 r3d, ~1
1654*c0909341SAndroid Build Coastguard Worker    kmovd                k1, r3d     ; every lane except lane 0
1655*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_3]
1656*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, r6m     ; max_w
1657*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m0, m4  ; +1
1658*c0909341SAndroid Build Coastguard Worker    movu                m13, [base+pw_0to31]
1659*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4, m3
1660*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m3{k1}, [tlq-2] ; -2
1661*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m6      ; max_w narrowed from dwords to words
1662*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1663*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m0, m2  ; +2
1664*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
1665*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m6, m13 ; k1 = lanes where max_w > lane index
1666*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
1667*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1668*c0909341SAndroid Build Coastguard Worker    psrlw            m4{k1}, m1, 2   ; only the k1 lanes receive the filtered value
1669*c0909341SAndroid Build Coastguard Worker    call .filter_left32
1670*c0909341SAndroid Build Coastguard Worker.w32_main:
1671*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64*2    ; scratch written and read by .w32_main1
1672*c0909341SAndroid Build Coastguard Worker    call .w32_main1
1673*c0909341SAndroid Build Coastguard Worker    add                 rsp, 64*2
1674*c0909341SAndroid Build Coastguard Worker    RET
; Shared 32-column kernel, entered with `call` and left with a plain `ret`.
;   .w32_main1: spills m4 to the stack, derives xpos/ypos from m9 and builds
;               m5/m6 as copies of m7/m8 shifted by one word.
;   .w32_main2: second entry used by .w64, which supplies its own r2d, m10 and
;               m9 for the right-hand 32 columns.
; The row counter is copied to r8d so hd survives for a second pass.
; All [rsp+...] offsets here include the return address pushed by `call`.
; Clobbers m3 (becomes xpos) and m4 (becomes the broadcast dy).
1675*c0909341SAndroid Build Coastguard Worker.w32_main1:
1676*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, dxd
1677*c0909341SAndroid Build Coastguard Worker    movu           [rsp+64], m4      ; spill so the loop can load at a variable offset
1678*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
1679*c0909341SAndroid Build Coastguard Worker    movd           [rsp+60], xm12    ; low dword of xm12 goes right below the spilled m4
; valignq + palignr together shift by one word across the whole register:
; m5[i] = m7[i-1] with m5[0] = last word of m12, and m6[i] = m8[i-1] with
; m6[0] = last word of m7.
1680*c0909341SAndroid Build Coastguard Worker    valignq              m5, m7, m12, 6
1681*c0909341SAndroid Build Coastguard Worker    psllw                m3, m9, 6    ; xpos
1682*c0909341SAndroid Build Coastguard Worker    valignq              m6, m8, m7, 6
1683*c0909341SAndroid Build Coastguard Worker    pmullw               m9, m4       ; ypos
1684*c0909341SAndroid Build Coastguard Worker    palignr              m5, m7, m5, 14
1685*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 33<<6
1686*c0909341SAndroid Build Coastguard Worker    palignr              m6, m8, m6, 14
1687*c0909341SAndroid Build Coastguard Worker    mova                m10, m3
1688*c0909341SAndroid Build Coastguard Worker.w32_main2:
1689*c0909341SAndroid Build Coastguard Worker    psllw               m13, m9, 9
1690*c0909341SAndroid Build Coastguard Worker    sub                 r2d, dxd
1691*c0909341SAndroid Build Coastguard Worker    psrlw               m12, m9, 6    ; base_y
1692*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hd       ; private row counter, hd stays intact
1693*c0909341SAndroid Build Coastguard Worker    pand                m13, m14      ; frac_y << 9
1694*c0909341SAndroid Build Coastguard Worker.w32_loop:
1695*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r2d
1696*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; integer part of the x position in r2d
1697*c0909341SAndroid Build Coastguard Worker    psubw               m10, m11      ; base_x -= dx
1698*c0909341SAndroid Build Coastguard Worker    movu                 m0, [rsp+r3*2-2] ; 32 words from the stack copy, one word earlier than m1
1699*c0909341SAndroid Build Coastguard Worker    pand                 m2, m10, m14 ; frac_x
1700*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r3*2]
1701*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
1702*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1703*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2       ; rounded (m1 - m0) * frac_x
1704*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15      ; base_y++
1705*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1706*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, 32<<6
1707*c0909341SAndroid Build Coastguard Worker    jge .w32_toponly
; Position below 32<<6: blend in the lookups from the m5..m8 tables for the
; lanes selected by the sign bit of base_x.
1708*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5
1709*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m12, m6  ; m1 = {m5,m6}[base_y]
1710*c0909341SAndroid Build Coastguard Worker    mova                 m2, m7
1711*c0909341SAndroid Build Coastguard Worker    vpermt2w             m2, m12, m8  ; m2 = {m7,m8}[base_y]
1712*c0909341SAndroid Build Coastguard Worker    vpmovw2m             k1, m10      ; k1 = lanes whose base_x word has its sign bit set
1713*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1714*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13
1715*c0909341SAndroid Build Coastguard Worker    vpaddw           m0{k1}, m1, m2   ; merge-masked: other lanes keep the result above
1716*c0909341SAndroid Build Coastguard Worker.w32_toponly:
1717*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1718*c0909341SAndroid Build Coastguard Worker    dec                 r8d
1719*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1720*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1721*c0909341SAndroid Build Coastguard Worker    sub                 r2d, dxd
1722*c0909341SAndroid Build Coastguard Worker    jge .w32_loop
; r2d went negative: the remaining rows use only the m5..m8 tables. m2 carries
; the previous lookup so each row needs a single new permute.
1723*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15
1724*c0909341SAndroid Build Coastguard Worker    mova                 m2, m5
1725*c0909341SAndroid Build Coastguard Worker    vpermt2w             m2, m12, m6
1726*c0909341SAndroid Build Coastguard Worker.w32_leftonly_loop:
1727*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
1728*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m12, m8
1729*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m2
1730*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
1731*c0909341SAndroid Build Coastguard Worker    paddw               m12, m15
1732*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1733*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
1734*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1735*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1736*c0909341SAndroid Build Coastguard Worker    dec                 r8d
1737*c0909341SAndroid Build Coastguard Worker    jg .w32_leftonly_loop
1738*c0909341SAndroid Build Coastguard Worker.w32_end:
1739*c0909341SAndroid Build Coastguard Worker    ret
; 64-wide entry point. The block is produced as two 32-column passes of the
; shared kernel: .w32_main1 for the left half, then .w32_main2 for the right
; half with dstq, r2d, xpos and ypos advanced by 32 columns.
; Unless bit 0x400 of angled is set, both 32-word halves (m4 and m3) are
; smoothed first, each limited to the lanes below max_w.
; NOTE(review): m4, m0, m7 and m12 are set up above this excerpt and
; .filter_left32 is defined outside it; confirm there.
1740*c0909341SAndroid Build Coastguard Worker.w64:
1741*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+66]
1742*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m0, [tlq-64*2]
1743*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_1to32]
1744*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1745*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
1746*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq]        ; -1
1747*c0909341SAndroid Build Coastguard Worker    mov                 r3d, ~1
1748*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_3]
1749*c0909341SAndroid Build Coastguard Worker    kmovd                k1, r3d          ; every lane except lane 0
1750*c0909341SAndroid Build Coastguard Worker    movu                m13, [base+pw_0to31]
1751*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, r6m          ; max_w
1752*c0909341SAndroid Build Coastguard Worker    pminud               m0, m9, [base+pw_31] {1to16}
1753*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4, m2
1754*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m2{k1}, [tlq-2]      ; -2
1755*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m6
1756*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq+4]      ; +1
1757*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
1758*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m6, m13      ; lanes of the first half below max_w
1759*c0909341SAndroid Build Coastguard Worker    pavgw                m2, [tlq+6]      ; +2
1760*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1761*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m0, m3       ; 32+1
1762*c0909341SAndroid Build Coastguard Worker    psrlw            m4{k1}, m1, 2
1763*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3, [tlq+64] ; 32-1
1764*c0909341SAndroid Build Coastguard Worker    vpaddd              m11, m13, [base+pw_32] {1to16} ; lane indices offset by 32
1765*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1766*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m0, m2       ; 32+2
1767*c0909341SAndroid Build Coastguard Worker    paddw               m10, m5, [tlq+62] ; 32-2
1768*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m6, m11      ; lanes of the second half below max_w
1769*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m10
1770*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1771*c0909341SAndroid Build Coastguard Worker    psrlw            m3{k1}, m1, 2
1772*c0909341SAndroid Build Coastguard Worker    call .filter_left32
1773*c0909341SAndroid Build Coastguard Worker.w64_main:
1774*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64*3
; .w32_main1 overwrites m3 with xpos, so the second half is stored first. The
; -gprsize places it right after the m4 spill that .w32_main1 writes at its
; own [rsp+64], i.e. after the return address has been pushed.
1775*c0909341SAndroid Build Coastguard Worker    movu [rsp+64*2-gprsize], m3
1776*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq         ; remember the start of the first row
1777*c0909341SAndroid Build Coastguard Worker    call .w32_main1
1778*c0909341SAndroid Build Coastguard Worker    psllw                m4, 5            ; m4 is the broadcast dy here, so this is dy*32
1779*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 65<<6
1780*c0909341SAndroid Build Coastguard Worker    vpaddd              m10, m3, [base+pw_2048] {1to16} ; xpos
1781*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+64]      ; 64 bytes = 32 pixels to the right
1782*c0909341SAndroid Build Coastguard Worker    paddw                m9, m4 ; ypos
1783*c0909341SAndroid Build Coastguard Worker    call .w32_main2
1784*c0909341SAndroid Build Coastguard Worker    add                 rsp, 64*3
1785*c0909341SAndroid Build Coastguard Worker    RET
1786*c0909341SAndroid Build Coastguard Worker
; ipred_z3_16bpc entry: common setup followed by a jump to the handler for the
; block width (.w4, .w8, .w16, ...).
; cglobal arguments: 3 parameters loaded into registers (dst, stride, tl),
; 8 general-purpose registers, 16 vector registers.
; State at the dispatch jump:
;   angled = (angle - 180) ^ 0x400
;   dyd    = 16-bit entry read from the dr_intra_derivative table
;   m0     = pw_31to0, m14 = pw_31806 broadcast, m15 = pw_1 broadcast
; NOTE(review): `base` and `t0` are defined outside this excerpt; base is
; presumably relative to the r7 pointer loaded here -- confirm.
1787*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
1788*c0909341SAndroid Build Coastguard Worker    lea                  r7, [z_filter_t0]
1789*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm           ; trailing-zero count of the width argument = table index
1790*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1791*c0909341SAndroid Build Coastguard Worker    lea                  t0, [dr_intra_derivative+45*2-1]
1792*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4] ; signed 32-bit handler offset
1793*c0909341SAndroid Build Coastguard Worker    sub              angled, 180
1794*c0909341SAndroid Build Coastguard Worker    mov                 dyd, angled
1795*c0909341SAndroid Build Coastguard Worker    neg                 dyd
1796*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400        ; flips the bit later tested as !enable_intra_edge_filter
1797*c0909341SAndroid Build Coastguard Worker    or                  dyq, ~0x7e        ; keep bits 1-6, set all others: a negative offset from t0
1798*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+pw_31to0]
1799*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [t0+dyq]
1800*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z3_16bpc_avx512icl_table+wq] ; offset is relative to the table itself
1801*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
; NOTE(review): 31806 = 0x7C3E, so the one constant in m14 masks both
; (pos & 0x3e) and ((pos << 9) & 0x7c00) -- assuming pw_31806 holds that value.
1802*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [base+pw_31806]
1803*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_1]
1804*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; 4-wide handler. Each 128-bit lane of a vector holds two 4-pixel rows, so one
; pass of .w4_loop writes 8 rows (4 when h == 4) that are r2 bytes apart;
; dstq then advances by a single stride for the next pass.
; The edge is gathered from [tlq-64] into m6 with the indices in m7. It is then
; either upsampled (.upsample), filtered (.filter32), or used unchanged when
; bit 0x400 of angled is set.
; On entry m0 = pw_31to0, m14 = fraction mask, m15 = pw_1 (loaded at function
; entry).
1805*c0909341SAndroid Build Coastguard Worker.w4:
1806*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1807*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 31 ; 32 - (h + imin(w, h))
1808*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, r3d
1809*c0909341SAndroid Build Coastguard Worker    pmaxuw               m7, m0           ; per-lane indices, never below the value in r3d
1810*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m7, [tlq-64*1]
1811*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1812*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
1813*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
1814*c0909341SAndroid Build Coastguard Worker    jae .w4_filter
1815*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-1024]
1816*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 7
1817*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
1818*c0909341SAndroid Build Coastguard Worker    jg .w4_filter ; h > 8 || (h == 8 && is_sm)
1819*c0909341SAndroid Build Coastguard Worker    call .upsample
1820*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_ypos_mul]
1821*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1           ; doubled multipliers on the upsampled path
1822*c0909341SAndroid Build Coastguard Worker    jmp .w4_main2
1823*c0909341SAndroid Build Coastguard Worker.w4_filter:
1824*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]       ; argument consumed by .filter32
1825*c0909341SAndroid Build Coastguard Worker    call .filter32
1826*c0909341SAndroid Build Coastguard Worker.w4_main:
1827*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z_ypos_mul]
1828*c0909341SAndroid Build Coastguard Worker.w4_main2:
1829*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [base+pw_1to32] ; first four words repeated in every qword
1830*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
1831*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq+4]
1832*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
1833*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m0      ; ypos
1834*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, r2d
1835*c0909341SAndroid Build Coastguard Worker    imul                 r2, strideq ; stride * imax(height / 8, 1)
1836*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m0
1837*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2*3]
; NOTE(review): 32736 = 0x7FE0. With this bias the saturating adds below stop
; at 0x7FFF, whose low five bits are 31, presumably so the vpermw index clamps
; at the last table entry -- confirm against the constant's definition.
1838*c0909341SAndroid Build Coastguard Worker    paddd                m1, [base+pw_32736] {1to16}
1839*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m4, 6
1840*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
1841*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1      ; base+0
1842*c0909341SAndroid Build Coastguard Worker    vpandd               m4, m14     ; frac << 9
1843*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m2, m6  ; left[base+0]
1844*c0909341SAndroid Build Coastguard Worker.w4_loop:
1845*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m15     ; base+1
1846*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m2, m6  ; left[base+1]
1847*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
1848*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4      ; rounded (left[base+1] - left[base+0]) * frac
1849*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
1850*c0909341SAndroid Build Coastguard Worker    movq        [dstq+r2*0], xm0
1851*c0909341SAndroid Build Coastguard Worker    movhps      [dstq+r2*1], xm0
1852*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, ym0, 1
1853*c0909341SAndroid Build Coastguard Worker    movq        [dstq+r2*2], xm3
1854*c0909341SAndroid Build Coastguard Worker    movhps      [dstq+r3  ], xm3
1855*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1856*c0909341SAndroid Build Coastguard Worker    jl .w4_end                       ; h == 4: the four rows above were all of them
1857*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+r2*4]
1858*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym0, m0, 1   ; upper 256 bits hold the other four rows
1859*c0909341SAndroid Build Coastguard Worker    mova                 m3, m1      ; left[base+1] becomes left[base+0] of the next pass
1860*c0909341SAndroid Build Coastguard Worker    movq          [r5+r2*0], xm0
1861*c0909341SAndroid Build Coastguard Worker    movhps        [r5+r2*1], xm0
1862*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
1863*c0909341SAndroid Build Coastguard Worker    movq          [r5+r2*2], xm1
1864*c0909341SAndroid Build Coastguard Worker    movhps        [r5+r3  ], xm1
1865*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1866*c0909341SAndroid Build Coastguard Worker    test                 hd, hd
1867*c0909341SAndroid Build Coastguard Worker    jnz .w4_loop
1868*c0909341SAndroid Build Coastguard Worker.w4_end:
1869*c0909341SAndroid Build Coastguard Worker    RET
; Edge upsampling helper, reached via `call` from .w4 and .w8.
; In:   m6 = edge pixels, tlq, dyd, r9m = pixel_max
; Out:  m6 = upsampled edge, dyd doubled
; Uses: m0-m4
; Four taps a, b, c, d are gathered with permute indices from z_upsample. The
; result is min(pixel_max, avg(max(((b+c) - (a+d) >> 3) + (b+c), 0), 0)),
; where avg(x, 0) = (x + 1) >> 1.
; NOTE(review): pd_65536 presumably holds dwords of 0x10000, so each paddw
; below bumps only the odd word of every index pair -- confirm against the
; table layout.
1870*c0909341SAndroid Build Coastguard Worker.upsample:
1871*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [tlq-14], 3 ; 16 bytes from tlq-14 replace the top 128-bit lane
1872*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_upsample]
1873*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pd_65536]
1874*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
1875*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m3, m6  ; a
1876*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1877*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m3, m6  ; b
1878*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1879*c0909341SAndroid Build Coastguard Worker    vpermw               m2, m3, m6  ; c
1880*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1881*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m3, m6  ; d
1882*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, r9m     ; pixel_max
1883*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2      ; b+c
1884*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3      ; a+d
1885*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m0  ; (b+c) - (a+d)
1886*c0909341SAndroid Build Coastguard Worker    psraw                m0, 3       ; arithmetic shift, the difference may be negative
1887*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1888*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1889*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m2      ; clamp at zero
1890*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m2      ; (x + 1) >> 1
1891*c0909341SAndroid Build Coastguard Worker    pminsw               m6, m0      ; clamp at pixel_max
1892*c0909341SAndroid Build Coastguard Worker    ret
; 8-wide handler (h == 32 is diverted to .w8_h32). Each 128-bit lane holds one
; 8-pixel row, so every loop pass writes four rows that are r2 bytes apart and
; then moves dstq down by one stride.
; m6 receives the gathered edge and m7 a broadcast of the edge word at index
; r3d. This block ends with the upsampled path; the other cases continue at
; .w8_filter / .w8_main.
; On entry m0 = pw_31to0 (loaded at function entry).
1893*c0909341SAndroid Build Coastguard Worker.w8:
1894*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tlq-64*1]
1895*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1896*c0909341SAndroid Build Coastguard Worker    je .w8_h32
1897*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
1898*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1899*c0909341SAndroid Build Coastguard Worker    cmove               r3d, hd           ; r3d = 4 when h == 4, otherwise 8
1900*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r3+hq-1]
1901*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 31 ; 32 - (h + imin(w, h))
1902*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, r3d
1903*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m1, m6       ; every lane = the edge word at index r3d
1904*c0909341SAndroid Build Coastguard Worker    pmaxuw               m1, m0           ; per-lane indices, never below r3d
1905*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m1, m6
1906*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1907*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
1908*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+216]
1909*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb           ; low byte replaced by h so a single compare covers all conditions
1910*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1911*c0909341SAndroid Build Coastguard Worker    ja .w8_filter ; is_sm || d >= 40 || h > 8
1912*c0909341SAndroid Build Coastguard Worker    call .upsample
1913*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_ypos_mul]
1914*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1           ; doubled multipliers on the upsampled path
1915*c0909341SAndroid Build Coastguard Worker    call .w8_main_setup
; The upsampled edge lives entirely in m6, so single-table vpermw lookups are
; enough; the index advances by two per pass.
1916*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop:
1917*c0909341SAndroid Build Coastguard Worker    vpermw               m3, m2, m6  ; left[base+0]
1918*c0909341SAndroid Build Coastguard Worker    paddw                m2, m15     ; base+1
1919*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m2, m6  ; left[base+1]
1920*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
1921*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4      ; rounded (left[base+1] - left[base+0]) * frac
1922*c0909341SAndroid Build Coastguard Worker    paddw                m2, m15     ; base+2
1923*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
1924*c0909341SAndroid Build Coastguard Worker    mova                 m3, m1
1925*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r2*0], xm0
1926*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2*1], ym0, 1
1927*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2*2], m0, 2
1928*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r3  ], m0, 3
1929*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1930*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1931*c0909341SAndroid Build Coastguard Worker    jg .w8_upsample_loop
1932*c0909341SAndroid Build Coastguard Worker    RET
; Index/fraction setup shared by the 8-wide loops, reached via `call`.
; In:   m1 = per-lane row multipliers (from z_ypos_mul), dyd, hd, strideq,
;       m14 = fraction mask
; Out:  r2 = stride * (h / 4), r3 = 3 * r2,
;       m2 = base+0 indices, m4 = frac << 9
; Uses: m0, m1
; NOTE(review): 32704 = 0x7FC0. With this bias the saturating add stops at
; 0x7FFF, whose low six bits are 63, presumably so a two-table vpermt2w index
; clamps at the last entry -- confirm against the constant's definition.
1933*c0909341SAndroid Build Coastguard Worker.w8_main_setup:
1934*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [base+pw_1to32] ; first eight words repeated in every 128-bit lane
1935*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
1936*c0909341SAndroid Build Coastguard Worker    rorx                r2d, hd, 2  ; rotate by 2 = h / 4 as long as the low two bits of h are clear
1937*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m0      ; ypos
1938*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, r2d
1939*c0909341SAndroid Build Coastguard Worker    imul                 r2, strideq ; stride * height / 4
1940*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2*3]
1941*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m0      ; 0 1 2 3
1942*c0909341SAndroid Build Coastguard Worker    paddd                m1, [base+pw_32704] {1to16}
1943*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m4, 6
1944*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
1945*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1      ; base+0
1946*c0909341SAndroid Build Coastguard Worker    vpandd               m4, m14     ; frac << 9
1947*c0909341SAndroid Build Coastguard Worker    ret
; 8x32 case: the edge spans two 64-byte blocks. m6 is the block at tlq-64
; reordered with the indices in m0 (pw_31to0); m7 is the block at tlq-128
; reordered with the same indices limited to a minimum of 24.
; When edge filtering is enabled, .filter64 (defined outside this excerpt) runs
; and m7 is rebuilt so that lanes above 7 repeat word 7.
1948*c0909341SAndroid Build Coastguard Worker.w8_h32:
; Dword max against a broadcast word pair: assuming pw_31to0 holds 31..0 this
; gives the word indices max(31-i, 24).
1949*c0909341SAndroid Build Coastguard Worker    pmaxud               m7, m0, [base+pw_24] {1to16}
1950*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m0, m6
1951*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m7, [tlq-64*2]
1952*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1953*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
1954*c0909341SAndroid Build Coastguard Worker    call .filter64
1955*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pw_7]
1956*c0909341SAndroid Build Coastguard Worker    pminuw               m0, [base+pw_0to31] ; indices min(lane, 7)
1957*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m0, m7
1958*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
; 8-wide main path (non-upsampled). .w8_filter first runs .filter32 with
; r3d = h + 7; .w8_main is the common entry once m6/m7 hold the final edge.
; Lookups use the two-register table {m6, m7}. vpermt2w overwrites its first
; operand, so m6 is copied to a scratch register before each lookup.
; Each pass writes four rows r2 bytes apart, then moves dstq down one stride.
1959*c0909341SAndroid Build Coastguard Worker.w8_filter:
1960*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
1961*c0909341SAndroid Build Coastguard Worker    call .filter32
1962*c0909341SAndroid Build Coastguard Worker.w8_main:
1963*c0909341SAndroid Build Coastguard Worker    movshdup             m1, [base+z_ypos_mul]
1964*c0909341SAndroid Build Coastguard Worker    call .w8_main_setup
1965*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
1966*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m7  ; left[base+0]
1967*c0909341SAndroid Build Coastguard Worker.w8_loop:
1968*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m15     ; base+1
1969*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
1970*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m2, m7  ; left[base+1]
1971*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
1972*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4      ; rounded (left[base+1] - left[base+0]) * frac
1973*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
1974*c0909341SAndroid Build Coastguard Worker    mova                 m3, m1      ; left[base+1] becomes left[base+0] of the next pass
1975*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r2*0], xm0
1976*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2*1], ym0, 1
1977*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2*2], m0, 2
1978*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r3  ], m0, 3
1979*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1980*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1981*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
1982*c0909341SAndroid Build Coastguard Worker    RET
; Edge filter for edges that fit in one register, reached via `call`.
; In:   r3d = selector byte (callers pass h+3 or h+7), angled,
;       m6 = edge, m7 = continuation of the edge, tlq
; Out:  m6/m7 replaced by the filtered edge when a filter applies; untouched
;       when the strength comes out as zero
; Uses: angled (shifted right by 8), r5d, k1, m1-m5, m8-m10
; The strength is the number of z_filter_wh bytes that equal the selector and
; whose z_filter_t0 threshold is below the low byte of the angle. It picks
; three tap weights from z_filter_k, and the symmetric 5-tap sum is then
; scaled by >> 3 followed by a rounding >> 1.
; NOTE(review): the layouts of z_filter_wh / z_filter_t0 / z_filter_k are
; defined outside this excerpt.
1983*c0909341SAndroid Build Coastguard Worker.filter32:
1984*c0909341SAndroid Build Coastguard Worker    vpbroadcastb       ym10, r3d
1985*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        ym1, angled      ; low byte of the angle
1986*c0909341SAndroid Build Coastguard Worker    shr              angled, 8           ; remaining bits select the threshold row
1987*c0909341SAndroid Build Coastguard Worker    vpcmpeqb             k1, ym10, [base+z_filter_wh]
1988*c0909341SAndroid Build Coastguard Worker    mova                xm2, [base+z_filter_t0+angleq*8]
1989*c0909341SAndroid Build Coastguard Worker    vpcmpgtb         k1{k1}, ym1, ym2    ; keep only matches whose threshold is exceeded
1990*c0909341SAndroid Build Coastguard Worker    kmovd               r5d, k1
1991*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1992*c0909341SAndroid Build Coastguard Worker    jz .filter32_end                     ; strength 0: leave the edge as it is
1993*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [tlq]
1994*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d         ; filter strength
1995*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+z_filter_k+(r5-1)*4+12*0] ; weight of the centre tap
1996*c0909341SAndroid Build Coastguard Worker    valignq              m2, m6, m2, 6   ; m6 preceded by copies of the word at tlq
1997*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k+(r5-1)*4+12*1] ; weight of the +-1 taps
1998*c0909341SAndroid Build Coastguard Worker    valignq              m4, m7, m6, 2   ; m6 followed by the start of m7
1999*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k+(r5-1)*4+12*2] ; weight of the +-2 taps
2000*c0909341SAndroid Build Coastguard Worker    palignr              m1, m6, m2, 14  ; edge[i-1]
2001*c0909341SAndroid Build Coastguard Worker    pmullw               m5, m6
2002*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m6, 2   ; edge[i+1]
2003*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2004*c0909341SAndroid Build Coastguard Worker    palignr              m2, m6, m2, 12  ; edge[i-2]
2005*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m8
2006*c0909341SAndroid Build Coastguard Worker    palignr              m4, m6, 4       ; edge[i+2]
2007*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
2008*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m9
2009*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, ym10        ; selector widened from bytes to words
2010*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
2011*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
2012*c0909341SAndroid Build Coastguard Worker    pminuw               m1, m10, [base+pw_0to31] ; indices min(selector, lane)
2013*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
2014*c0909341SAndroid Build Coastguard Worker    psrlw                m5, 3
2015*c0909341SAndroid Build Coastguard Worker    pavgw                m6, m5          ; m6 was zeroed, so this is (sum + 1) >> 1
2016*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m10, m6     ; every lane = the filtered word picked by the selector
2017*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m1, m6      ; filtered edge, lanes past the selector repeat that word
2018*c0909341SAndroid Build Coastguard Worker.filter32_end:
2019*c0909341SAndroid Build Coastguard Worker    ret
; .w16 -- 16-pixel-wide path of the directional predictor whose entry point
; lies above this view (the existing comments call the lookups "left[base+N]").
; Handles h < 32 (.w16_h16) and h == 32 inline; h > 32 branches to .w16_h64.
; The edge pixels are loaded from the 64-byte blocks directly below tlq,
; reordered with vpermw, optionally smoothed, and then used as a 64-entry
; word table {m6, m7}.
; NOTE(review): m0 (permute indices on entry), m14 (mask producing
; "frac << 9") and m15 (per-row base increment) are set up before this view;
; presumably m0 reverses the word order of each load -- confirm against the
; function prologue.
2020*c0909341SAndroid Build Coastguard Worker.w16:
2021*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tlq-64*1]
2022*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
2023*c0909341SAndroid Build Coastguard Worker    jl .w16_h16
    ; h >= 32: m8 = dword-wise unsigned max of m0 and the broadcast pw_16 dword,
    ; used below as a permute index vector (not as pixel data).
2024*c0909341SAndroid Build Coastguard Worker    pmaxud               m8, m0, [base+pw_16] {1to16}
2025*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tlq-64*2]
2026*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m0, m6
    ; Flags still come from "cmp hd, 32": the vector instructions in between
    ; do not write EFLAGS, so this is taken for h > 32.
2027*c0909341SAndroid Build Coastguard Worker    jg .w16_h64
2028*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m8, m7
    ; Bit 0x400 of angle set -> the .filter64 call below is skipped.
2029*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2030*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
2031*c0909341SAndroid Build Coastguard Worker    call .filter64
    ; m0 = pw_15 in every word, then the low 256 bits are replaced by the first
    ; 16 words of pw_0to31. Assuming the constants hold what their names say,
    ; the permute keeps words 0..15 of the filtered m7 and replicates word 15
    ; into words 16..31.
2032*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pw_15]
2033*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [base+pw_0to31], 0
2034*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m0, m7
2035*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
    ; h < 32: only the single 32-word load in m6 is used as source.
2036*c0909341SAndroid Build Coastguard Worker.w16_h16:
2037*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq*2-1]
2038*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 31 ; 32 - (h + imin(w, h))
    ; m7 = the word m6[r3d] replicated 32 times (all permute indices equal r3d).
2039*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, r3d
2040*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m1, m6
    ; m6 = m6 permuted by max(m0, r3d): indices below r3d are clamped to r3d.
2041*c0909341SAndroid Build Coastguard Worker    pmaxuw               m1, m0
2042*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m1, m6
2043*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2044*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
    ; r3d = h + 15 is computed right before the call; presumably it is an input
    ; of .filter32, whose beginning is outside this view -- confirm.
2045*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
2046*c0909341SAndroid Build Coastguard Worker    call .filter32
    ; Main setup. Each zmm register holds two 16-pixel rows: the upper 256 bits
    ; are stored at row y, the lower 256 bits at row y + h/2 (see the stores in
    ; the loop), so the low half gets an extra base offset of h/2.
2047*c0909341SAndroid Build Coastguard Worker.w16_main:
2048*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [base+pw_1to32]
2049*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
    ; r2d = h rotated right by one bit (h/2 for the even heights handled here).
2050*c0909341SAndroid Build Coastguard Worker    rorx                r2d, hd, 1
2051*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m0      ; ypos
    ; The ymm-sized broadcast leaves the upper 256 bits of m1 zero, so after
    ; the add m1 = 32704 + h/2 in the low half and 32704 in the high half.
2052*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        ym1, r2d
2053*c0909341SAndroid Build Coastguard Worker    imul                 r2, strideq ; stride * height / 2
2054*c0909341SAndroid Build Coastguard Worker    paddd                m1, [base+pw_32704] {1to16}
    ; r3 = byte offset of row h/2 + 1.
2055*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2+strideq]
2056*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m4, 6
2057*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
    ; 32704 = 0x7FC0 has its low 6 bits clear, so it does not disturb the 6-bit
    ; table index read by vpermt2w/vpermi2w, while the signed saturation of
    ; paddsw at 0x7FFF pins out-of-range bases to table entry 63.
2058*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1      ; base+0
2059*c0909341SAndroid Build Coastguard Worker    vpandd               m4, m14     ; frac << 9
2060*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
2061*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m7  ; left[base+0]
    ; Each iteration produces four rows: y and y+1 from the upper halves,
    ; y + h/2 and y + h/2 + 1 from the lower halves. The blend is
    ; a + pmulhrsw(b - a, frac << 9), i.e. a + (((b - a) * frac + 32) >> 6).
2062*c0909341SAndroid Build Coastguard Worker.w16_loop:
2063*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m15 ; base+1
2064*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1, m15 ; base+2
2065*c0909341SAndroid Build Coastguard Worker    vpermi2w             m1, m6, m7  ; left[base+1]
2066*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
2067*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2068*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
    ; left[base+2] doubles as left[base+0] of the next iteration.
2069*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
2070*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m7  ; left[base+2]
2071*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*0], m0, 1
2072*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r2       ], ym0
2073*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3, m1
2074*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2075*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2076*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
2077*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r3       ], ym0
2078*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2079*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2080*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
2081*c0909341SAndroid Build Coastguard Worker    RET
; .w16_h64 -- 16-wide path for h > 32, reached from .w16 with m6 already
; permuted, m7 = raw load of [tlq-64*2] and m8 = index vector computed in .w16.
; Three 32-word tables are built (m6, m7, m8) and sampled with a two-step
; lookup (vpermt2w over {m7, m6}, then a masked vpermw from m8).
; NOTE(review): m0, m14 and m15 are initialised before this view.
2082*c0909341SAndroid Build Coastguard Worker.w16_h64:
2083*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m0, m7
    ; m8 is used as the index vector and then overwritten with the permuted
    ; third 64-byte block.
2084*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m8, [tlq-64*3]
    ; Bit 0x400 of angle set -> skip all edge filtering below.
2085*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2086*c0909341SAndroid Build Coastguard Worker    jnz .w16_h64_main
    ; m11 = {top 128-bit lane of m7, lanes 0..2 of m8}, saved before .filter64
    ; overwrites m7; it supplies the words preceding each lane of m8.
2087*c0909341SAndroid Build Coastguard Worker    valignq             m11, m8, m7, 6
2088*c0909341SAndroid Build Coastguard Worker    call .filter64
    ; Apply the same smoothing to the low 256 bits (16 words) of m8, reusing
    ; m5 (pw_3) left behind by .filter64:
    ;   (p[i-1] + p[i] + p[i+1] + ((p[i-2] + p[i+2] + 4) >> 1)) >> 2
    ; m2 = m8 moved down by one 128-bit lane (top lane repeated), giving the
    ; following words for the palignr shifts.
2089*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m8, m8, q3321
2090*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pw_15]
2091*c0909341SAndroid Build Coastguard Worker    palignr             ym3, ym8, ym11, 12
2092*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [base+pw_0to31], 0
2093*c0909341SAndroid Build Coastguard Worker    palignr             ym4, ym8, ym11, 14
2094*c0909341SAndroid Build Coastguard Worker    palignr             ym1, ym2, ym8, 4
2095*c0909341SAndroid Build Coastguard Worker    paddw               ym3, ym5
2096*c0909341SAndroid Build Coastguard Worker    palignr             ym2, ym8, 2
2097*c0909341SAndroid Build Coastguard Worker    paddw               ym8, ym4
2098*c0909341SAndroid Build Coastguard Worker    pavgw               ym3, ym1
2099*c0909341SAndroid Build Coastguard Worker    paddw               ym8, ym2
2100*c0909341SAndroid Build Coastguard Worker    paddw               ym8, ym3
2101*c0909341SAndroid Build Coastguard Worker    psrlw               ym8, 2
    ; m0 = {pw_0to31 words 0..15, pw_15 x16}: assuming the constants match
    ; their names, words 0..15 of m8 are kept and word 15 fills words 16..31.
2102*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m0, m8
    ; Main setup. As in the h <= 32 path each zmm holds two rows; here the low
    ; 256 bits belong to row y + 32 (stored at dstq + stride*32).
2103*c0909341SAndroid Build Coastguard Worker.w16_h64_main:
2104*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8      m0, [base+pw_1to32]
2105*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
2106*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m0    ; ypos
    ; The ymm-sized broadcast zeroes the upper 256 bits, so after the add
    ; m1 = 32672 + 32 in the low half and 32672 in the high half.
2107*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym1, [base+pw_32]
2108*c0909341SAndroid Build Coastguard Worker    paddd                m1, [base+pw_32672] {1to16}
2109*c0909341SAndroid Build Coastguard Worker    mov                  r2, strideq
2110*c0909341SAndroid Build Coastguard Worker    shl                  r2, 5      ; stride*32
2111*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pw_32735]
2112*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2+strideq]
2113*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m4, 6
2114*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
    ; 32672 = 0x7FA0 = 32768 - 96: a 96-entry index range ends exactly at the
    ; paddsw saturation point 0x7FFF. Bit 5 of the bias is set, so for entries
    ; 0..31 the 6-bit vpermt2w index selects the second table (m6) and for
    ; entries 32..63 the first (m7) -- hence the swapped table order below.
    ; Biased values above 32735 (entries 64..95) are taken from m8 under k1.
2115*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1     ; base+0
2116*c0909341SAndroid Build Coastguard Worker    vpandd               m4, m14    ; frac << 9
2117*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
2118*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m6
2119*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m2, m9
2120*c0909341SAndroid Build Coastguard Worker    vpermw           m3{k1}, m2, m8 ; left[base+0]
    ; Four rows per iteration (two per 256-bit half). The blend is
    ; a + pmulhrsw(b - a, frac << 9), i.e. a + (((b - a) * frac + 32) >> 6).
2121*c0909341SAndroid Build Coastguard Worker.w16_h64_loop:
2122*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m15    ; base+1
2123*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
2124*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m2, m6
2125*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m2, m9
2126*c0909341SAndroid Build Coastguard Worker    vpermw           m1{k1}, m2, m8 ; left[base+1]
2127*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
2128*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2129*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m15    ; base+2
2130*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
    ; left[base+2] doubles as left[base+0] of the next iteration.
2131*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
2132*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m6
2133*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m2, m9
2134*c0909341SAndroid Build Coastguard Worker    vpermw           m3{k1}, m2, m8 ; left[base+2]
2135*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*0], m0, 1
2136*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r2       ], ym0
2137*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3, m1
2138*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2139*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2140*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
2141*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r3       ], ym0
2142*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2143*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2144*c0909341SAndroid Build Coastguard Worker    jg .w16_h64_loop
2145*c0909341SAndroid Build Coastguard Worker    RET
; .filter64 -- local subroutine (entered with call, leaves with ret) that
; smooths the 64 consecutive words held in m6 (first 32) and m7 (next 32).
;   In:    m6, m7  = pixels to filter
;          [tlq]   = one word, broadcast and used as the words preceding m6
;          m8      = only its lowest 128-bit lane is read (words following m7)
;   Out:   m6, m7  = filtered in place
;          m5      = broadcast pw_3 (callers in this file keep using it)
;   Clobb: m1-m4, m10
; Per word, assuming pw_3 holds words of 3:
;   out[i] = (p[i-1] + p[i] + p[i+1] + ((p[i-2] + p[i+2] + 4) >> 1)) >> 2
; The "+4 >> 1" term comes from adding 3 and letting pavgw add 1 and halve.
; valignq moves whole 128-bit lanes across the register so that the per-lane
; palignr shifts can pull in neighbours from the adjacent lane.
2146*c0909341SAndroid Build Coastguard Worker.filter64:
2147*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [tlq]
2148*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_3]
    ; m2  = {broadcast [tlq] lane, m6 lanes 0..2}  (words before each m6 lane)
    ; m4  = {m6 lanes 1..3, m7 lane 0}             (words after each m6 lane)
    ; m10 = {m6 lane 3, m7 lanes 0..2}, taken before m6 is overwritten
2149*c0909341SAndroid Build Coastguard Worker    valignq              m2, m6, m2, 6
2150*c0909341SAndroid Build Coastguard Worker    valignq              m4, m7, m6, 2
2151*c0909341SAndroid Build Coastguard Worker    valignq             m10, m7, m6, 6
2152*c0909341SAndroid Build Coastguard Worker    palignr              m1, m6, m2, 12
2153*c0909341SAndroid Build Coastguard Worker    palignr              m2, m6, m2, 14
2154*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m6, 4
2155*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2156*c0909341SAndroid Build Coastguard Worker    palignr              m4, m6, 2
2157*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2
    ; Start on m7: m2 = {m7 lanes 1..3, m8 lane 0} (words after each m7 lane).
2158*c0909341SAndroid Build Coastguard Worker    valignq              m2, m8, m7, 2
2159*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m3
2160*c0909341SAndroid Build Coastguard Worker    palignr              m3, m7, m10, 12
2161*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4
2162*c0909341SAndroid Build Coastguard Worker    palignr              m4, m7, m10, 14
2163*c0909341SAndroid Build Coastguard Worker    paddw                m6, m1
2164*c0909341SAndroid Build Coastguard Worker    palignr              m1, m2, m7, 4
2165*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 2
2166*c0909341SAndroid Build Coastguard Worker    palignr              m2, m7, 2
2167*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
2168*c0909341SAndroid Build Coastguard Worker    paddw                m7, m4
2169*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m1
2170*c0909341SAndroid Build Coastguard Worker    paddw                m7, m2
2171*c0909341SAndroid Build Coastguard Worker    paddw                m7, m3
2172*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 2
2173*c0909341SAndroid Build Coastguard Worker    ret
; .w32 -- 32-pixel-wide path. Handles h < 32 (.w32_h16) and h == 32 inline;
; h > 32 branches to .w32_h64. One full zmm register (32 words) is one output
; row; the source is the 64-entry word table {m6, m7}.
; NOTE(review): m0 (permute indices on entry), m14 (mask producing
; "frac << 9") and m15 (per-row base increment) are set up before this view.
2174*c0909341SAndroid Build Coastguard Worker.w32:
2175*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tlq-64*1]
2176*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
2177*c0909341SAndroid Build Coastguard Worker    jl .w32_h16
2178*c0909341SAndroid Build Coastguard Worker    mova                 m8, [tlq-64*2]
2179*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m0, m6
2180*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m0, m8
    ; Flags still come from "cmp hd, 32" (the vector instructions in between do
    ; not write EFLAGS): taken for h > 32.
2181*c0909341SAndroid Build Coastguard Worker    jg .w32_h64
    ; Bit 0x400 of angle set -> skip the edge filter.
2182*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2183*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
    ; xm8 = word 0 of the raw [tlq-64*2] load replicated 8 times; .filter64
    ; reads this lane as the words following m7.
2184*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm8, xm8
2185*c0909341SAndroid Build Coastguard Worker    jmp .w32_filter
    ; h < 32: only the single 32-word load in m6 is used as source.
2186*c0909341SAndroid Build Coastguard Worker.w32_h16:
2187*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq*2-1]
2188*c0909341SAndroid Build Coastguard Worker    xor                 r3d, 31 ; 32 - (h + imin(w, h))
    ; m7 = the word m6[r3d] replicated 32 times.
2189*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, r3d
2190*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m1, m6
    ; m6 = m6 permuted by max(m0, r3d): indices below r3d are clamped to r3d.
2191*c0909341SAndroid Build Coastguard Worker    pmaxuw               m1, m0
2192*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m1, m6
2193*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2194*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
    ; xm8 = top lane of the constant m7, i.e. the same padding word, for use
    ; by .filter64 as the words following m7.
2195*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm8, m7, 3
2196*c0909341SAndroid Build Coastguard Worker.w32_filter:
2197*c0909341SAndroid Build Coastguard Worker    call .filter64
2198*c0909341SAndroid Build Coastguard Worker.w32_main:
2199*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
2200*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_32704]
2201*c0909341SAndroid Build Coastguard Worker    pmullw               m4, [base+pw_1to32] ; ypos
2202*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m4, 6
2203*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
    ; 32704 = 0x7FC0 has its low 6 bits clear, so the 6-bit table index read by
    ; vpermt2w/vpermi2w is unaffected, while the signed saturation of paddsw at
    ; 0x7FFF pins out-of-range bases to table entry 63.
2204*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1      ; base+0
2205*c0909341SAndroid Build Coastguard Worker    vpandd               m4, m14     ; frac << 9
2206*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
2207*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m7  ; left[base+0]
    ; Two rows per iteration. The blend is a + pmulhrsw(b - a, frac << 9),
    ; i.e. a + (((b - a) * frac + 32) >> 6).
2208*c0909341SAndroid Build Coastguard Worker.w32_loop:
2209*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m15 ; base+1
2210*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1, m15 ; base+2
2211*c0909341SAndroid Build Coastguard Worker    vpermi2w             m1, m6, m7  ; left[base+1]
2212*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
2213*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2214*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
    ; left[base+2] doubles as left[base+0] of the next iteration.
2215*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
2216*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m7  ; left[base+2]
2217*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
2218*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3, m1
2219*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2220*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2221*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
2222*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2223*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2224*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
2225*c0909341SAndroid Build Coastguard Worker    RET
; .w32_h64 -- 32-wide path for h > 32, reached from .w32 with m6 and m7
; already permuted. A third 32-word table m8 is added and the lookup becomes
; two-step: vpermt2w over {m7, m6}, then a masked vpermw from m8.
; NOTE(review): m0, m14 and m15 are initialised before this view.
2226*c0909341SAndroid Build Coastguard Worker.w32_h64:
2227*c0909341SAndroid Build Coastguard Worker    mova                 m9, [tlq-64*3]
2228*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m0, m9
    ; Bit 0x400 of angle set -> skip the edge filter.
2229*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2230*c0909341SAndroid Build Coastguard Worker    jnz .w32_h64_main
    ; xm9 = word 0 of the raw [tlq-64*3] load replicated 8 times; .filter96
    ; reads this lane as the words following m8.
2231*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm9, xm9
2232*c0909341SAndroid Build Coastguard Worker    call .filter96
2233*c0909341SAndroid Build Coastguard Worker.w32_h64_main:
2234*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, dyd
2235*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_32672]
2236*c0909341SAndroid Build Coastguard Worker    pmullw               m4, [base+pw_1to32] ; ypos
2237*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+pw_32735]
2238*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m4, 6
2239*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
    ; 32672 = 0x7FA0 = 32768 - 96: a 96-entry index range ends exactly at the
    ; paddsw saturation point 0x7FFF. Bit 5 of the bias is set, so for entries
    ; 0..31 the 6-bit vpermt2w index selects the second table (m6) and for
    ; entries 32..63 the first (m7) -- hence the swapped table order below.
    ; Biased values above 32735 (entries 64..95) are taken from m8 under k1.
2240*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1     ; base+0
2241*c0909341SAndroid Build Coastguard Worker    vpandd               m4, m14    ; frac << 9
2242*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
2243*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m6
2244*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m2, m9
2245*c0909341SAndroid Build Coastguard Worker    vpermw           m3{k1}, m2, m8 ; left[base+0]
    ; Two rows per iteration. The blend is a + pmulhrsw(b - a, frac << 9),
    ; i.e. a + (((b - a) * frac + 32) >> 6).
2246*c0909341SAndroid Build Coastguard Worker.w32_h64_loop:
2247*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m15    ; base+1
2248*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
2249*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m2, m6
2250*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m2, m9
2251*c0909341SAndroid Build Coastguard Worker    vpermw           m1{k1}, m2, m8 ; left[base+1]
2252*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m3
2253*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2254*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m15    ; base+2
2255*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
    ; left[base+2] doubles as left[base+0] of the next iteration.
2256*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
2257*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m2, m6
2258*c0909341SAndroid Build Coastguard Worker    vpcmpgtw             k1, m2, m9
2259*c0909341SAndroid Build Coastguard Worker    vpermw           m3{k1}, m2, m8 ; left[base+2]
2260*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
2261*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3, m1
2262*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
2263*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2264*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
2265*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2266*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2267*c0909341SAndroid Build Coastguard Worker    jg .w32_h64_loop
2268*c0909341SAndroid Build Coastguard Worker    RET
; .filter96 -- local subroutine that smooths 96 consecutive words: m6 and m7
; through .filter64, then m8 with the same arithmetic.
;   In:    m6, m7, m8 = pixels to filter
;          [tlq]      = word used by .filter64 as the words preceding m6
;          m9         = only its lowest 128-bit lane is read (words after m8)
;   Out:   m6, m7, m8 = filtered in place; m5 = pw_3 (set by .filter64)
;   Clobb: m1-m4, m10 (inside .filter64), m11
; Per word, assuming pw_3 holds words of 3:
;   out[i] = (p[i-1] + p[i] + p[i+1] + ((p[i-2] + p[i+2] + 4) >> 1)) >> 2
2269*c0909341SAndroid Build Coastguard Worker.filter96:
    ; m11 = {m7 lane 3, m8 lanes 0..2}: the unfiltered words preceding each
    ; lane of m8, saved before .filter64 overwrites m7.
2270*c0909341SAndroid Build Coastguard Worker    valignq             m11, m8, m7, 6
2271*c0909341SAndroid Build Coastguard Worker    call .filter64
    ; m2 = {m8 lanes 1..3, m9 lane 0}: the words following each lane of m8.
2272*c0909341SAndroid Build Coastguard Worker    valignq              m2, m9, m8, 2
2273*c0909341SAndroid Build Coastguard Worker    palignr              m3, m8, m11, 12
2274*c0909341SAndroid Build Coastguard Worker    palignr              m4, m8, m11, 14
2275*c0909341SAndroid Build Coastguard Worker    palignr              m1, m2, m8, 4
2276*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
2277*c0909341SAndroid Build Coastguard Worker    palignr              m2, m8, 2
2278*c0909341SAndroid Build Coastguard Worker    paddw                m8, m4
2279*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m1
2280*c0909341SAndroid Build Coastguard Worker    paddw                m8, m2
2281*c0909341SAndroid Build Coastguard Worker    paddw                m8, m3
2282*c0909341SAndroid Build Coastguard Worker    psrlw                m8, 2
2283*c0909341SAndroid Build Coastguard Worker    ret
; .w64 -- 64-pixel-wide path. Handles h < 32 (.w64_h16) inline and h == 32
; without filtering; h > 32 goes to .w64_h64 and the filtered h == 32 case
; joins .w64_h64_main. Each output row is two zmm registers (columns 0..31 and
; 32..63) and one row is produced per loop iteration.
; NOTE(review): m0 (permute indices on entry), m14 (mask producing
; "frac << 9") and m15 (per-row base increment) are set up before this view.
2284*c0909341SAndroid Build Coastguard Worker.w64:
2285*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tlq-64*1]
2286*c0909341SAndroid Build Coastguard Worker    vpermw               m6, m0, m7
2287*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
2288*c0909341SAndroid Build Coastguard Worker    jl .w64_h16
2289*c0909341SAndroid Build Coastguard Worker    mova                 m8, [tlq-64*2]
2290*c0909341SAndroid Build Coastguard Worker    vpermw               m7, m0, m8
    ; Flags still come from "cmp hd, 32" (the vector instructions in between do
    ; not write EFLAGS): taken for h > 32.
2291*c0909341SAndroid Build Coastguard Worker    jg .w64_h64
    ; Bit 0x400 of angle set -> skip the edge filter and use the 2-table loop.
2292*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2293*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
    ; h == 32 with filtering: m8 = m9 = word 0 of the raw [tlq-64*2] load
    ; replicated over the whole register, filtered together with m6/m7 by
    ; .filter96. Afterwards m9 = top 128-bit lane of the filtered m8 repeated
    ; in all four lanes, and the 4-table loop of .w64_h64_main is used.
2294*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, xm8
2295*c0909341SAndroid Build Coastguard Worker    mova                 m9, m8
2296*c0909341SAndroid Build Coastguard Worker    call .filter96
2297*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m9, m8, m8, q3333
2298*c0909341SAndroid Build Coastguard Worker    jmp .w64_h64_main
    ; h < 32: m6 holds the permuted load, m7 = word 0 of the raw load
    ; replicated 32 times.
2299*c0909341SAndroid Build Coastguard Worker.w64_h16:
2300*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, xm7
2301*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2302*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
    ; m8 = same constant, read by .filter64 as the words following m7.
2303*c0909341SAndroid Build Coastguard Worker    mova                 m8, m7
2304*c0909341SAndroid Build Coastguard Worker    call .filter64
    ; Main setup for the 64-entry table {m6, m7}.
    ;   m10 = ypos for columns 0..31, later its "frac << 9"
    ;   m11 = dy*32 + m10 = ypos for columns 32..63, later its "frac << 9"
    ;   m8  = biased base for columns 0..31
    ;   m9  = (ypos_hi >> 6) - (ypos_lo >> 6), added to m8 to get the base for
    ;         columns 32..63
2305*c0909341SAndroid Build Coastguard Worker.w64_main:
2306*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, dyd
2307*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_32704]
2308*c0909341SAndroid Build Coastguard Worker    pmullw              m10, m11, [base+pw_1to32] ; ypos
2309*c0909341SAndroid Build Coastguard Worker    psllw               m11, 5
2310*c0909341SAndroid Build Coastguard Worker    psrlw                m8, m10, 6
2311*c0909341SAndroid Build Coastguard Worker    paddw               m11, m10
2312*c0909341SAndroid Build Coastguard Worker    psllw               m10, 9
2313*c0909341SAndroid Build Coastguard Worker    psrlw                m9, m11, 6
2314*c0909341SAndroid Build Coastguard Worker    psllw               m11, 9
2315*c0909341SAndroid Build Coastguard Worker    psubw                m9, m8
    ; 32704 = 0x7FC0 has its low 6 bits clear, so the 6-bit table index is
    ; unaffected, while paddsw saturation at 0x7FFF pins out-of-range bases to
    ; table entry 63.
2316*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m1     ; base+0
2317*c0909341SAndroid Build Coastguard Worker    vpandd              m10, m14    ; frac << 9
2318*c0909341SAndroid Build Coastguard Worker    vpandd              m11, m14    ; frac << 9
2319*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2320*c0909341SAndroid Build Coastguard Worker    vpermt2w             m4, m8, m7 ; left[base+0] ( 0..31)
2321*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m8, m9
2322*c0909341SAndroid Build Coastguard Worker    vpermi2w             m5, m6, m7 ; left[base+0] (32..63)
    ; One row per iteration. The blend is a + pmulhrsw(b - a, frac << 9),
    ; i.e. a + (((b - a) * frac + 32) >> 6). The left[base+1] vectors (m2, m3)
    ; are carried over as left[base+0] (m4, m5) of the next row.
2323*c0909341SAndroid Build Coastguard Worker.w64_loop:
2324*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m15    ; base+1      ( 0..31)
2325*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
2326*c0909341SAndroid Build Coastguard Worker    vpermt2w             m2, m8, m7 ; left[base+1] ( 0..31)
2327*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8, m9 ; base+1      (32..63)
2328*c0909341SAndroid Build Coastguard Worker    vpermi2w             m3, m6, m7 ; left[base+1] (32..63)
2329*c0909341SAndroid Build Coastguard Worker    psubw                m0, m2, m4
2330*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m5
2331*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2332*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11
2333*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
2334*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2335*c0909341SAndroid Build Coastguard Worker    mova                 m4, m2
2336*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
2337*c0909341SAndroid Build Coastguard Worker    mova                 m5, m3
2338*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
2339*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2340*c0909341SAndroid Build Coastguard Worker    dec                  hd
2341*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
2342*c0909341SAndroid Build Coastguard Worker    RET
; .w64_h64 -- 64-wide path for h > 32, reached from .w64 with m6 and m7
; already permuted. Four 32-word tables (m6, m7, m8, m9) form a 128-entry
; source; .w64_h64_main is also entered from the filtered h == 32 case of .w64.
; NOTE(review): m0, m14 and m15 are initialised before this view.
2343*c0909341SAndroid Build Coastguard Worker.w64_h64:
2344*c0909341SAndroid Build Coastguard Worker    vpermw               m8, m0, [tlq-64*3]
2345*c0909341SAndroid Build Coastguard Worker    mova                m13, [tlq-64*4]
2346*c0909341SAndroid Build Coastguard Worker    vpermw               m9, m0, m13
    ; Bit 0x400 of angle set -> skip all edge filtering below.
2347*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2348*c0909341SAndroid Build Coastguard Worker    jnz .w64_h64_main
    ; m12 = {m8 lane 3, m9 lanes 0..2}: the unfiltered words preceding each
    ; lane of m9, saved before .filter96 overwrites m8.
2349*c0909341SAndroid Build Coastguard Worker    valignq             m12, m9, m8, 6
2350*c0909341SAndroid Build Coastguard Worker    call .filter96
    ; Filter m9 with the same arithmetic, reusing m5 (pw_3) from .filter64:
    ;   (p[i-1] + p[i] + p[i+1] + ((p[i-2] + p[i+2] + 4) >> 1)) >> 2
    ; m2 = {m9 lanes 1..3, word 0 of the raw [tlq-64*4] load x8}: the words
    ; following each lane of m9.
2351*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm2, xm13
2352*c0909341SAndroid Build Coastguard Worker    valignq              m2, m9, 2
2353*c0909341SAndroid Build Coastguard Worker    palignr              m3, m9, m12, 12
2354*c0909341SAndroid Build Coastguard Worker    palignr              m4, m9, m12, 14
2355*c0909341SAndroid Build Coastguard Worker    palignr              m1, m2, m9, 4
2356*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
2357*c0909341SAndroid Build Coastguard Worker    palignr              m2, m9, 2
2358*c0909341SAndroid Build Coastguard Worker    paddw                m9, m4
2359*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m1
2360*c0909341SAndroid Build Coastguard Worker    paddw                m9, m2
2361*c0909341SAndroid Build Coastguard Worker    paddw                m9, m3
2362*c0909341SAndroid Build Coastguard Worker    psrlw                m9, 2
    ; Main setup.
    ;   m10 = ypos for columns 0..31, later its "frac << 9"
    ;   m11 = dy*32 + m10 = ypos for columns 32..63, later its "frac << 9"
    ;   m12 = biased base for columns 0..31
    ;   m13 = (ypos_hi >> 6) - (ypos_lo >> 6), added to m12 to get the base for
    ;         columns 32..63
2363*c0909341SAndroid Build Coastguard Worker.w64_h64_main:
2364*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m11, dyd
2365*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_32640]
2366*c0909341SAndroid Build Coastguard Worker    pmullw              m10, m11, [base+pw_1to32] ; ypos
2367*c0909341SAndroid Build Coastguard Worker    psllw               m11, 5
2368*c0909341SAndroid Build Coastguard Worker    psrlw               m12, m10, 6
2369*c0909341SAndroid Build Coastguard Worker    paddw               m11, m10
2370*c0909341SAndroid Build Coastguard Worker    psllw               m10, 9
2371*c0909341SAndroid Build Coastguard Worker    psrlw               m13, m11, 6
2372*c0909341SAndroid Build Coastguard Worker    psllw               m11, 9
2373*c0909341SAndroid Build Coastguard Worker    psubw               m13, m12
    ; 32640 = 0x7F80 = 32768 - 128 has its low 7 bits clear: bits 5:0 of the
    ; biased base index a table pair, bit 6 tells which pair, and paddsw
    ; saturation at 0x7FFF pins out-of-range bases to entry 127.
2374*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m1     ; base+0
2375*c0909341SAndroid Build Coastguard Worker    vpandd              m10, m14    ; frac << 9
2376*c0909341SAndroid Build Coastguard Worker    vpandd              m11, m14    ; frac << 9
    ; m14 is no longer needed as the frac mask and is reused as the bit-6 test
    ; pattern (pw_64) for vptestmw.
2377*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [base+pw_64]
    ; Look every base up in both {m6, m7} and {m8, m9}; k1/k2 (bit 6 set, i.e.
    ; entry >= 64) select the {m8, m9} result in the merge.
2378*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2379*c0909341SAndroid Build Coastguard Worker    vpermt2w             m4, m12, m7
2380*c0909341SAndroid Build Coastguard Worker    vptestmw             k1, m12, m14
2381*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2382*c0909341SAndroid Build Coastguard Worker    vpermt2w             m0, m12, m9
2383*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m12, m13
2384*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
2385*c0909341SAndroid Build Coastguard Worker    vpermt2w             m5, m1, m7
2386*c0909341SAndroid Build Coastguard Worker    vptestmw             k2, m1, m14
2387*c0909341SAndroid Build Coastguard Worker    vpermi2w             m1, m8, m9
2388*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m4{k1}, m0     ; left[base+0] ( 0..31)
2389*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m5{k2}, m1     ; left[base+0] (32..63)
    ; One row per iteration. The blend is a + pmulhrsw(b - a, frac << 9),
    ; i.e. a + (((b - a) * frac + 32) >> 6). The left[base+1] vectors (m2, m3)
    ; are carried over as left[base+0] (m4, m5) of the next row.
2390*c0909341SAndroid Build Coastguard Worker.w64_h64_loop:
2391*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m15    ; base+1
2392*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
2393*c0909341SAndroid Build Coastguard Worker    vpermt2w             m2, m12, m7
2394*c0909341SAndroid Build Coastguard Worker    vptestmw             k1, m12, m14
2395*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2396*c0909341SAndroid Build Coastguard Worker    vpermt2w             m0, m12, m9
2397*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m12, m13
2398*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
2399*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m1, m7
2400*c0909341SAndroid Build Coastguard Worker    vptestmw             k2, m1, m14
2401*c0909341SAndroid Build Coastguard Worker    vpermi2w             m1, m8, m9
2402*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m2{k1}, m0     ; left[base+1] ( 0..31)
2403*c0909341SAndroid Build Coastguard Worker    vmovdqu16        m3{k2}, m1     ; left[base+1] (32..63)
2404*c0909341SAndroid Build Coastguard Worker    psubw                m0, m2, m4
2405*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m5
2406*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2407*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11
2408*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
2409*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2410*c0909341SAndroid Build Coastguard Worker    mova                 m4, m2
2411*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
2412*c0909341SAndroid Build Coastguard Worker    mova                 m5, m3
2413*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
2414*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2415*c0909341SAndroid Build Coastguard Worker    dec                  hd
2416*c0909341SAndroid Build Coastguard Worker    jg .w64_h64_loop
2417*c0909341SAndroid Build Coastguard Worker    RET
2418*c0909341SAndroid Build Coastguard Worker
; pal_pred_16bpc -- entry and width dispatch of the 16 bpc palette predictor.
; Declared through x86inc's cglobal with the named arguments dst, stride, pal,
; idx, w, h (plus the scratch name stride3): 4 arguments are loaded into
; registers, 7 general-purpose and 7 vector registers are requested.
; Registers prepared here for the per-width loops:
;   m3       = pal_pred_perm (not used by .w4; the wider cases lie outside
;              this view)
;   m4, m5   = qwords 0 and 1 of pal_unpack, each broadcast to all qwords
;   m6       = the 16 bytes at palq (8 words) repeated in every 128-bit lane
;   stride3q = stride * 3
; Dispatch: tzcnt(w) indexes a table of 32-bit offsets that are relative to
; the table's own address (r6), and the code jumps to table + offset.
2419*c0909341SAndroid Build Coastguard Workercglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
2420*c0909341SAndroid Build Coastguard Worker    lea                  r6, [pal_pred_16bpc_avx512icl_table]
2421*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
2422*c0909341SAndroid Build Coastguard Worker    mova                 m3, [pal_pred_perm]
2423*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2424*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
2425*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [pal_unpack+0]
2426*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, [pal_unpack+8]
2427*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
2428*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [palq]
2429*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
2430*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2431*c0909341SAndroid Build Coastguard Worker.w4:
2432*c0909341SAndroid Build Coastguard Worker    pmovzxbd            ym0, [idxq]
2433*c0909341SAndroid Build Coastguard Worker    add                idxq, 8
2434*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb      ym0, ym4, ym0
2435*c0909341SAndroid Build Coastguard Worker    vpermw              ym0, ym0, ym6
2436*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
2437*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2438*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
2439*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
2440*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
2441*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2442*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2443*c0909341SAndroid Build Coastguard Worker    jg .w4
2444*c0909341SAndroid Build Coastguard Worker    RET
2445*c0909341SAndroid Build Coastguard Worker.w8:
2446*c0909341SAndroid Build Coastguard Worker    pmovzxbd             m0, [idxq]
2447*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
2448*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m0, m4, m0
2449*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m0, m6
2450*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
2451*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2452*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
2453*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+stride3q ], m0, 3
2454*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2455*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2456*c0909341SAndroid Build Coastguard Worker    jg .w8
2457*c0909341SAndroid Build Coastguard Worker    RET
2458*c0909341SAndroid Build Coastguard Worker.w16:
2459*c0909341SAndroid Build Coastguard Worker    movu                ym1, [idxq]
2460*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
2461*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m3, m1
2462*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m1, m4, m1
2463*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m6
2464*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
2465*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m6
2466*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
2467*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
2468*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
2469*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+stride3q ], m1, 1
2470*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2471*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2472*c0909341SAndroid Build Coastguard Worker    jg .w16
2473*c0909341SAndroid Build Coastguard Worker    RET
2474*c0909341SAndroid Build Coastguard Worker.w32:
2475*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m3, [idxq]
2476*c0909341SAndroid Build Coastguard Worker    add                idxq, 64
2477*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m1, m4, m2
2478*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m2, m5, m2
2479*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m6
2480*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
2481*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m6
2482*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
2483*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
2484*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m2, m6
2485*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 8
2486*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m2, m6
2487*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
2488*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m1
2489*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2490*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2491*c0909341SAndroid Build Coastguard Worker    jg .w32
2492*c0909341SAndroid Build Coastguard Worker    RET
2493*c0909341SAndroid Build Coastguard Worker.w64:
2494*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m3, [idxq]
2495*c0909341SAndroid Build Coastguard Worker    add                idxq, 64
2496*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m1, m4, m2
2497*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb       m2, m5, m2
2498*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m1, m6
2499*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
2500*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m1, m6
2501*c0909341SAndroid Build Coastguard Worker    mova          [dstq+ 0], m0
2502*c0909341SAndroid Build Coastguard Worker    mova          [dstq+64], m1
2503*c0909341SAndroid Build Coastguard Worker    vpermw               m0, m2, m6
2504*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 8
2505*c0909341SAndroid Build Coastguard Worker    vpermw               m1, m2, m6
2506*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq+ 0], m0
2507*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq+64], m1
2508*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2509*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2510*c0909341SAndroid Build Coastguard Worker    jg .w64
2511*c0909341SAndroid Build Coastguard Worker    RET
2512*c0909341SAndroid Build Coastguard Worker
2513*c0909341SAndroid Build Coastguard Worker; The ipred_filter SIMD processes 4x2 blocks in the following order which
2514*c0909341SAndroid Build Coastguard Worker; increases parallelism compared to doing things row by row.
2515*c0909341SAndroid Build Coastguard Worker;     w4     w8       w16             w32
2516*c0909341SAndroid Build Coastguard Worker;     1     1 2     1 2 5 6     1 2 5 6 9 a d e
2517*c0909341SAndroid Build Coastguard Worker;     2     2 3     2 3 6 7     2 3 6 7 a b e f
2518*c0909341SAndroid Build Coastguard Worker;     3     3 4     3 4 7 8     3 4 7 8 b c f g
2519*c0909341SAndroid Build Coastguard Worker;     4     4 5     4 5 8 9     4 5 8 9 c d g h
2520*c0909341SAndroid Build Coastguard Worker
; ipred_filter_16bpc: filter intra prediction for 16-bit pixels, computed in
; 4x2 output blocks.
;   dst, stride - output pointer and row pitch (used as a byte offset)
;   tl          - pointer into the edge pixel buffer; data is read both below
;                 and above it (presumably left/top-left pixels at negative
;                 offsets and the top row from tl+2 on)
;   w, h        - block size; w < 8 takes the .w4 path, wider blocks are
;                 processed as 8-pixel-wide columns
;   filter      - only the low byte is used; it selects a 64-byte tap set in
;                 filter_intra_taps
;   r8m         - bitdepth_max
; Register roles after the setup code:
;   m5      = rounding constant, m6 = per-word right-shift counts (both are
;             selected by is_12bpc)
;   m7/m8   = filter taps sign-extended to words (pre-shifted left by 2 unless
;             12 bpc)
;   m9/m10  = even/odd dwords of filter_permA (input shuffle used by .main4
;             and for the m4 block in .main8)
;   m11/m12 = even/odd dwords of filter_permB (input shuffle for the m0 block
;             in .main8 and .end)
;   m13     = filter_permC (merges new results into m0 before storing)
;   r7/r8d  = top of the current 8-pixel column / saved row counter
; NOTE(review): the raw names xmm0/xmm1 are used next to the x86inc-mapped
; xm0/xm4 below; presumably they are distinct physical registers under the
; AVX-512 register permutation of x86inc - confirm before touching them.
2521*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
2522*c0909341SAndroid Build Coastguard Worker%define base r6-$$
2523*c0909341SAndroid Build Coastguard Worker    lea                  r6, [$$]
2524*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm
2525*c0909341SAndroid Build Coastguard Worker    movzx           filterd, filterb
2526*c0909341SAndroid Build Coastguard Worker%else
2527*c0909341SAndroid Build Coastguard Worker    movzx           filterd, byte filterm
2528*c0909341SAndroid Build Coastguard Worker%endif
    ; 64 bytes of taps per filter
2529*c0909341SAndroid Build Coastguard Worker    shl             filterd, 6
2530*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; initial 16 bytes of edge pixels around tl for the first 4x2 block
2531*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq-6]
2532*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m7, [base+filter_intra_taps+filterq+32*0]
2533*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m8, [base+filter_intra_taps+filterq+32*1]
2534*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r8m ; bitdepth_max
2535*c0909341SAndroid Build Coastguard Worker    movsldup             m9, [base+filter_permA]
2536*c0909341SAndroid Build Coastguard Worker    movshdup            m10, [base+filter_permA]
2537*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 11  ; is_12bpc
2538*c0909341SAndroid Build Coastguard Worker    jnz .12bpc
2539*c0909341SAndroid Build Coastguard Worker    psllw                m7, 2   ; upshift multipliers so that packusdw
2540*c0909341SAndroid Build Coastguard Worker    psllw                m8, 2   ; will perform clipping for free
2541*c0909341SAndroid Build Coastguard Worker.12bpc:
    ; r5 is 0 or 1 here, selecting the first or second 8-byte table entry
2542*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+filter_rnd+r5*8]
2543*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+filter_shift+r5*8]
    ; wd becomes w-8: negative for w == 4, zero for w == 8
2544*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2545*c0909341SAndroid Build Coastguard Worker    jl .w4
2546*c0909341SAndroid Build Coastguard Worker.w8:
    ; first 4x2 block of the left-most column; result is returned in xm0
2547*c0909341SAndroid Build Coastguard Worker    call .main4
2548*c0909341SAndroid Build Coastguard Worker    movsldup            m11, [filter_permB]
2549*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq*2+2]
2550*c0909341SAndroid Build Coastguard Worker    movshdup            m12, [filter_permB]
2551*c0909341SAndroid Build Coastguard Worker    lea                topq, [tlq+2]
2552*c0909341SAndroid Build Coastguard Worker    mova                m13, [filter_permC]
2553*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2554*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [topq], 1 ; a0 b0   t0 t1
    ; rebase tl so that [tlq+hq*2] walks towards lower addresses as hd counts
    ; down (the first access is at the original tl-10)
2555*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r5
    ; r7/r8 lie beyond the 7 GPRs declared to cglobal, so they are saved by
    ; hand on WIN64 (presumably callee-saved registers in that ABI)
2556*c0909341SAndroid Build Coastguard Worker%if WIN64
2557*c0909341SAndroid Build Coastguard Worker    push                 r7
2558*c0909341SAndroid Build Coastguard Worker    push                 r8
2559*c0909341SAndroid Build Coastguard Worker%endif
2560*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2561*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hd
2562*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; xm4 = xm0 with its low 8 bytes replaced by the next 4 edge pixels
2563*c0909341SAndroid Build Coastguard Worker    movlps              xm4, xm0, [tlq+hq*2]
2564*c0909341SAndroid Build Coastguard Worker    call .main8
2565*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2566*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2567*c0909341SAndroid Build Coastguard Worker    jge .w8_loop
    ; w == 8: only the final block of the column remains
2568*c0909341SAndroid Build Coastguard Worker    test                 wd, wd
2569*c0909341SAndroid Build Coastguard Worker    jz .end
    ; tl is not read again after this point; r2 is reused as stride*3
2570*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x0d
2571*c0909341SAndroid Build Coastguard Worker    kmovb                k1, r2d
2572*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2573*c0909341SAndroid Build Coastguard Worker.w16:
    ; start the next 8-pixel column: its left neighbours are read back from
    ; the right-most pixels already stored in rows 0-1 of the previous column
    ; (r7), combined with the top row at topq+8
2574*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [r7+strideq*1+12]
2575*c0909341SAndroid Build Coastguard Worker    vpblendd           xmm0, [topq+8], 0x0e ; t1 t2
2576*c0909341SAndroid Build Coastguard Worker    pinsrw              xm4, xmm0, [r7+strideq*0+14], 2
2577*c0909341SAndroid Build Coastguard Worker    call .main8
2578*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
2579*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, [topq+16], 1   ; a2 b2   t2 t3
    ; restart the row counter and move dst to the top of the new column
2580*c0909341SAndroid Build Coastguard Worker    mov                  hd, r8d
2581*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
2582*c0909341SAndroid Build Coastguard Worker    add                topq, 16
2583*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; gather the two pixels to the left of dst in rows 2 and 3 (written while
    ; processing the previous column) into xm4
2584*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [dstq+strideq*2-4]
2585*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xmm1, xmm0
2586*c0909341SAndroid Build Coastguard Worker    movd               xmm0, [dstq+r2-4]
    ; k1 = 0x0d: dword 1 of xm4 is left untouched by the masked shuffle
2587*c0909341SAndroid Build Coastguard Worker    shufps          xm4{k1}, xmm0, xm0, q3210
2588*c0909341SAndroid Build Coastguard Worker    call .main8
2589*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2590*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2591*c0909341SAndroid Build Coastguard Worker    jge .w16_loop
2592*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2593*c0909341SAndroid Build Coastguard Worker    jg .w16
2594*c0909341SAndroid Build Coastguard Worker.end:
    ; final 4x2 block of the last column: same arithmetic as the m0 half of
    ; .main8, with no second block left to pair it with
2595*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m11, m0
2596*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym5
2597*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m2, m7
2598*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m12, m0
2599*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m2, m8
2600*c0909341SAndroid Build Coastguard Worker%if WIN64
2601*c0909341SAndroid Build Coastguard Worker    pop                  r8
2602*c0909341SAndroid Build Coastguard Worker    pop                  r7
2603*c0909341SAndroid Build Coastguard Worker%endif
2604*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym2, m1, 1
2605*c0909341SAndroid Build Coastguard Worker    paddd               ym1, ym2
2606*c0909341SAndroid Build Coastguard Worker    packusdw            ym1, ym1
2607*c0909341SAndroid Build Coastguard Worker    vpsrlvw             ym1, ym6
2608*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m13, m1
2609*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*0], m0, 2
2610*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2611*c0909341SAndroid Build Coastguard Worker    RET
    ; w == 4: one 4x2 block per iteration. For each further block the low
    ; 8 bytes of xm0 are reloaded from the edge buffer while the high 8 bytes
    ; keep the second row of the block just produced.
2612*c0909341SAndroid Build Coastguard Worker.w4_loop:
2613*c0909341SAndroid Build Coastguard Worker    movlps              xm0, [tlq-10]
2614*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2615*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
2616*c0909341SAndroid Build Coastguard Worker.w4:
2617*c0909341SAndroid Build Coastguard Worker    call .main4
2618*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2619*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
2620*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2621*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
2622*c0909341SAndroid Build Coastguard Worker    RET
2623*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main4: filter one 4x2 block.
    ;   in:  m0 = input pixels, m5-m10 as set up by the caller
    ;   out: xm0 = 8 output pixels (row 0 in the low qword, row 1 in the high)
    ;   clobbers: m1, m2
    ; The rounding constant is copied into the low 256 bits of the accumulator
    ; only, so it is counted once when the two halves are summed.
2624*c0909341SAndroid Build Coastguard Worker.main4:
2625*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m9, m0
2626*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym5
2627*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m2, m7
2628*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m10, m0
2629*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m0, m8
2630*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym0, m1, 1
2631*c0909341SAndroid Build Coastguard Worker    paddd               ym0, ym1
2632*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm1, ym0, 1
2633*c0909341SAndroid Build Coastguard Worker    packusdw            xm0, xm1     ; clip
2634*c0909341SAndroid Build Coastguard Worker    vpsrlvw             xm0, xm6
2635*c0909341SAndroid Build Coastguard Worker    ret
2636*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main8: filter two 4x2 blocks at once and store two 8-pixel rows.
    ;   in:  m0 = inputs of one block (shuffled via m11/m12),
    ;        m4 = inputs of the other block (shuffled via m9/m10)
    ;   out: m0 = previous m0 merged with the new results per m13; 16 bytes
    ;        are stored to each of [dstq] and [dstq+strideq]
    ;   clobbers: m1, m2, m3, m4
2637*c0909341SAndroid Build Coastguard Worker.main8:
2638*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m11, m0
2639*c0909341SAndroid Build Coastguard Worker    mova                ym2, ym5
2640*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m3, m7
2641*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m9, m4
2642*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym5
2643*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m3, m7
2644*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m12, m0
2645*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m3, m8
2646*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m10, m4
2647*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m3, m8
    ; fold the upper 256 bits of each accumulator onto the lower 256 bits
2648*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym4, m2, 1
2649*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m1, 1
2650*c0909341SAndroid Build Coastguard Worker    paddd               ym2, ym4
2651*c0909341SAndroid Build Coastguard Worker    paddd               ym1, ym3
2652*c0909341SAndroid Build Coastguard Worker    packusdw            ym1, ym2     ; clip
2653*c0909341SAndroid Build Coastguard Worker    vpsrlvw             ym1, ym6
2654*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m13, m1 ; c0 d0   b0 b1   a0 a1
2655*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*0], m0, 2
2656*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2657*c0909341SAndroid Build Coastguard Worker    ret
2658*c0909341SAndroid Build Coastguard Worker
2659*c0909341SAndroid Build Coastguard Worker%endif
2660