xref: /aosp_15_r20/external/libdav1d/src/x86/ipred16_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
30*c0909341SAndroid Build Coastguard Worker
; SMOOTH_WEIGHTS w0, w1, ... (one or more arguments)
; Emits two word (dw) tables from the same list of weights:
;   smooth_weights_1d_16bpc: every weight multiplied by 128 (weight << 7)
;   smooth_weights_2d_16bpc: interleaved pairs (weight, 256 - weight)
; Each %rep runs %0 times (%0 = argument count) and rotates the argument list
; by one per iteration, so after the first loop the list is back in its
; original order and the second loop sees the same sequence.
31*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_WEIGHTS 1-*
32*c0909341SAndroid Build Coastguard Workerconst smooth_weights_1d_16bpc ; sm_weights[] << 7
33*c0909341SAndroid Build Coastguard Worker    %rep %0
34*c0909341SAndroid Build Coastguard Worker        dw %1*128
35*c0909341SAndroid Build Coastguard Worker        %rotate 1
36*c0909341SAndroid Build Coastguard Worker    %endrep
37*c0909341SAndroid Build Coastguard Workerconst smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
38*c0909341SAndroid Build Coastguard Worker    %rep %0
39*c0909341SAndroid Build Coastguard Worker        dw %1, 256-%1
40*c0909341SAndroid Build Coastguard Worker        %rotate 1
41*c0909341SAndroid Build Coastguard Worker    %endrep
42*c0909341SAndroid Build Coastguard Worker%endmacro
43*c0909341SAndroid Build Coastguard Worker
; Instantiates both smooth weight tables from 128 weights (16 rows of 8).
; The values fall into runs of 2, 2, 4, 8, 16, 32 and 64 entries; every run
; after the two leading zeros starts at 255 and decreases.
; NOTE(review): presumably the run of length N holds the weights for block
; dimension N, so that run starts at index N — confirm against the C
; sm_weights[] table before relying on this.
44*c0909341SAndroid Build Coastguard WorkerSMOOTH_WEIGHTS   0,   0, 255, 128, 255, 149,  85,  64, \
45*c0909341SAndroid Build Coastguard Worker               255, 197, 146, 105,  73,  50,  37,  32, \
46*c0909341SAndroid Build Coastguard Worker               255, 225, 196, 170, 145, 123, 102,  84, \
47*c0909341SAndroid Build Coastguard Worker                68,  54,  43,  33,  26,  20,  17,  16, \
48*c0909341SAndroid Build Coastguard Worker               255, 240, 225, 210, 196, 182, 169, 157, \
49*c0909341SAndroid Build Coastguard Worker               145, 133, 122, 111, 101,  92,  83,  74, \
50*c0909341SAndroid Build Coastguard Worker                66,  59,  52,  45,  39,  34,  29,  25, \
51*c0909341SAndroid Build Coastguard Worker                21,  17,  14,  12,  10,   9,   8,   8, \
52*c0909341SAndroid Build Coastguard Worker               255, 248, 240, 233, 225, 218, 210, 203, \
53*c0909341SAndroid Build Coastguard Worker               196, 189, 182, 176, 169, 163, 156, 150, \
54*c0909341SAndroid Build Coastguard Worker               144, 138, 133, 127, 121, 116, 111, 106, \
55*c0909341SAndroid Build Coastguard Worker               101,  96,  91,  86,  82,  77,  73,  69, \
56*c0909341SAndroid Build Coastguard Worker                65,  61,  57,  54,  50,  47,  44,  41, \
57*c0909341SAndroid Build Coastguard Worker                38,  35,  32,  29,  27,  25,  22,  20, \
58*c0909341SAndroid Build Coastguard Worker                18,  16,  15,  13,  12,  10,   9,   8, \
59*c0909341SAndroid Build Coastguard Worker                 7,   6,   6,   5,   5,   4,   4,   4
60*c0909341SAndroid Build Coastguard Worker
; Read-only constants for the predictors in this file. Everything from here
; on is assembled only when ARCH_X86_64 is set; the matching %endif lies
; outside this excerpt.
; The tables are emitted back to back with no padding directives, so the
; byte offsets used by the pw_2 / pw_4 / pw_16 aliases below depend on the
; exact element counts of z_filter_k and z2_ymul8 — keep them in sync when
; editing either table.
61*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
62*c0909341SAndroid Build Coastguard Worker
63*c0909341SAndroid Build Coastguard Workeripred_hv_shuf: db  6,  7,  6,  7,  0,  1,  2,  3,  2,  3,  2,  3,  8,  9, 10, 11
64*c0909341SAndroid Build Coastguard Worker               db  4,  5,  4,  5,  4,  5,  6,  7,  0,  1,  0,  1, 12, 13, 14, 15
65*c0909341SAndroid Build Coastguard Workerfilter_shuf1:  db  8,  9,  0,  1,  2,  3,  4,  5,  6,  7, 14, 15, 12, 13, -1, -1
66*c0909341SAndroid Build Coastguard Workerfilter_shuf2:  db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
67*c0909341SAndroid Build Coastguard Workerfilter_shuf3:  db 12, 13,  0,  1,  2,  3,  4,  5,  6,  7, 10, 11,  8,  9, -1, -1
68*c0909341SAndroid Build Coastguard Workerpal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
; z_base_inc: the 16 words i*64 for i = 0..15.
69*c0909341SAndroid Build Coastguard Workerz_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
70*c0909341SAndroid Build Coastguard Worker               dw   8*64,   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64
71*c0909341SAndroid Build Coastguard Workerz_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
72*c0909341SAndroid Build Coastguard Workerz_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
73*c0909341SAndroid Build Coastguard Workerz_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
74*c0909341SAndroid Build Coastguard Worker               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
75*c0909341SAndroid Build Coastguard Workerpw_m1024:      times 2 dw -1024
76*c0909341SAndroid Build Coastguard Workerpw_1to16:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
77*c0909341SAndroid Build Coastguard Workerpw_16to1:      dw 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
78*c0909341SAndroid Build Coastguard Workerz2_ymul:       dw  1,  2,  1,  2,  1,  2,  1,  2,  3,  4,  3,  4,  3,  4,  3,  4
; Words 10 and 11 of z2_ymul8 (byte offset 20) are the pair 16, 16 that the
; pw_16 alias points at.
79*c0909341SAndroid Build Coastguard Workerz2_ymul8:      dw  1,  2,  5,  6,  3,  4,  7,  8,  5,  6, 16, 16,  7,  8
80*c0909341SAndroid Build Coastguard Workerpb_90:         times 4 db 90
81*c0909341SAndroid Build Coastguard Workerz2_y_shuf_h4:  dd  3,  7,  2,  6,  1,  5,  0,  4
82*c0909341SAndroid Build Coastguard Workerz_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
83*c0909341SAndroid Build Coastguard Workerz2_x_shuf:     db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
84*c0909341SAndroid Build Coastguard Workerz2_y_shuf:     db  6,  7, 14, 15,  4,  5, 12, 13,  4,  5, 12, 13,  2,  3, 10, 11
85*c0909341SAndroid Build Coastguard Workerz2_y_shuf_us:  db  6,  7, 14, 15,  2,  3, 10, 11,  4,  5, 12, 13,  0,  1,  8,  9
; z_filter_k: three rows of six words (12 bytes per row). Byte offset 0 holds
; the pair 4, 4 (pw_4); byte offset 32 is words 4..5 of the third row, the
; pair 2, 2 (pw_2).
86*c0909341SAndroid Build Coastguard Workerz_filter_k:    dw  4,  4,  5,  5,  4,  4
87*c0909341SAndroid Build Coastguard Worker               dw  8,  8,  6,  6,  4,  4
88*c0909341SAndroid Build Coastguard Worker               dw  0,  0,  0,  0,  2,  2
89*c0909341SAndroid Build Coastguard Worker
; Pairs of identical words reused from the tables above instead of being
; emitted as separate constants.
90*c0909341SAndroid Build Coastguard Worker%define pw_2  (z_filter_k+32)
91*c0909341SAndroid Build Coastguard Worker%define pw_4  (z_filter_k+ 0)
92*c0909341SAndroid Build Coastguard Worker%define pw_16 (z2_ymul8  +20)
93*c0909341SAndroid Build Coastguard Worker
; Stand-alone 32-bit constants: a word value repeated twice, or one dword.
94*c0909341SAndroid Build Coastguard Workerpw_1:    times 2 dw 1
95*c0909341SAndroid Build Coastguard Workerpw_3:    times 2 dw 3
96*c0909341SAndroid Build Coastguard Workerpw_62:   times 2 dw 62
97*c0909341SAndroid Build Coastguard Workerpw_512:  times 2 dw 512
98*c0909341SAndroid Build Coastguard Workerpw_2048: times 2 dw 2048
99*c0909341SAndroid Build Coastguard Workerpd_8:    dd 8
100*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, suffix, label0, label1, ...
; Emits one dword per label and defines <name>_<suffix>_table as the address
; of the first entry minus 2*4 bytes. Each entry is the offset of the local
; label <function>.<label> from that biased address, where <function> is the
; mangled, private_prefix-qualified symbol for <name>_<suffix>.
; Because of the two-slot bias, index 2 selects the first entry. The callers
; in this file index with tzcnt(size), so a size of 4 (tzcnt = 2) maps to
; label0, 8 to label1, and so on; adding the loaded offset back to the table
; symbol yields the absolute label address.
101*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 3-*
102*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*4)
103*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1_%2)
104*c0909341SAndroid Build Coastguard Worker    %%table:
105*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
106*c0909341SAndroid Build Coastguard Worker        dd %%base %+ .%3 - (%%table - 2*4)
107*c0909341SAndroid Build Coastguard Worker        %rotate 1
108*c0909341SAndroid Build Coastguard Worker    %endrep
109*c0909341SAndroid Build Coastguard Worker%endmacro
110*c0909341SAndroid Build Coastguard Worker
; "Splat" sub-tables: aliases that point past the h/w entries of the dc and
; cfl tables at their trailing s<N> (store loop) entries.
;   dc:  5 h-entries + 5 w-entries = 10 dwords precede s4, hence +10*4
;   cfl: 4 h-entries + 4 w-entries =  8 dwords precede s4, hence + 8*4
; The s<N> labels are passed to JMP_TABLE with the same byte count subtracted
; (s4-10*4, s4-8*4, ...), so that offset + splat table address resolves to
; the label itself, just as offset + base table address does for the others.
111*c0909341SAndroid Build Coastguard Worker%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
112*c0909341SAndroid Build Coastguard Worker%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)
113*c0909341SAndroid Build Coastguard Worker
; One jump table per predictor. Labels named h<N> / w<N> / s<N> refer to local
; labels of the corresponding function; entries must stay in ascending
; power-of-two order because they are indexed by tzcnt of the dimension.
114*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_16bpc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
115*c0909341SAndroid Build Coastguard Worker                                        s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
116*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_left_16bpc,    avx2, h4, h8, h16, h32, h64
117*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_h_16bpc,          avx2, w4, w8, w16, w32, w64
118*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_paeth_16bpc,      avx2, w4, w8, w16, w32, w64
119*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_16bpc,     avx2, w4, w8, w16, w32, w64
120*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_h_16bpc,   avx2, w4, w8, w16, w32, w64
121*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_v_16bpc,   avx2, w4, w8, w16, w32, w64
122*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z1_16bpc,         avx2, w4, w8, w16, w32, w64
123*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z2_16bpc,         avx2, w4, w8, w16, w32, w64
124*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z3_16bpc,         avx2, h4, h8, h16, h32, h64
125*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_filter_16bpc,     avx2, w4, w8, w16, w32
126*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_16bpc,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
127*c0909341SAndroid Build Coastguard Worker                                        s4-8*4, s8-8*4, s16-8*4, s32-8*4
128*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_left_16bpc,   avx2, h4, h8, h16, h32
129*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
130*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_pred_16bpc,         avx2, w4, w8, w16, w32, w64
131*c0909341SAndroid Build Coastguard Worker
; Symbols defined outside this file.
132*c0909341SAndroid Build Coastguard Workercextern dr_intra_derivative
133*c0909341SAndroid Build Coastguard Workercextern filter_intra_taps
134*c0909341SAndroid Build Coastguard Worker
135*c0909341SAndroid Build Coastguard WorkerSECTION .text
136*c0909341SAndroid Build Coastguard Worker
; Selects 256-bit (ymm) registers as m<N> and the avx2 function suffix for
; the cglobal definitions that follow.
137*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
; ipred_dc_top_16bpc(dst, stride, tl, w, h)
; DC prediction from the w 16-bit pixels stored at tl+2 onwards.
; This entry point only prepares state and then jumps into the shared code
; of ipred_dc_left_16bpc, which does the reduction and broadcast:
;   xm3 = 0
;   xm4 = pavgw(w, 0) = (w + 1) >> 1, the rounding bias
;   xm5 = tzcnt(w), the final right-shift count
;   m0  = first 32 bytes (16 pixels) at tl+2
;   r6  = address of ipred_dc_left's .h<w> reduction entry
;   wq  = address of ipred_dc's .s<w> store loop (via the splat table)
; cglobal arguments (x86inc convention): 3 arguments loaded into registers,
; 7 general-purpose registers and 6 vector registers used.
138*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
139*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; step over the 2-byte element at tl so that tlq points at the first of
    ; the w pixels being averaged
140*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2
141*c0909341SAndroid Build Coastguard Worker    movd                xm4, wd
142*c0909341SAndroid Build Coastguard Worker    pxor                xm3, xm3
143*c0909341SAndroid Build Coastguard Worker    pavgw               xm4, xm3
    ; from here on wd holds log2(w) and doubles as the jump table index
144*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
145*c0909341SAndroid Build Coastguard Worker    movd                xm5, wd
146*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
    ; the dc_left table is indexed with w here: its .h<N> code sums N
    ; consecutive pixels starting at tlq, which is what is needed for the
    ; w pixels selected above
147*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
148*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+wq*4]
149*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
    ; rebase r5 onto the splat table to look up the store loop for this width
150*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
151*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
152*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
153*c0909341SAndroid Build Coastguard Worker    jmp                  r6
154*c0909341SAndroid Build Coastguard Worker
; ipred_dc_left_16bpc(dst, stride, tl, w, h)
; DC prediction from the h 16-bit pixels stored immediately before tl.
; The .h<N> labels below are also entered from ipred_dc_top_16bpc, which
; arrives with the same register state:
;   tlq = address of the first pixel to sum
;   m0  = first 32 bytes at tlq
;   xm3 = 0, xm4 = rounding bias, xm5 = shift count (log2 of pixel count)
;   wq  = address of the .s<w> store loop inside ipred_dc_16bpc
; The result is broadcast to m0..m3 and control passes to that store loop,
; which performs the writes to dst and the final RET.
155*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
156*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
    ; the two subtractions of hq move tlq back by 2*h bytes = h pixels
157*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
158*c0909341SAndroid Build Coastguard Worker    movd                xm4, hd
159*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
160*c0909341SAndroid Build Coastguard Worker    pxor                xm3, xm3
    ; xm4 = (h + 0 + 1) >> 1, the rounding bias
161*c0909341SAndroid Build Coastguard Worker    pavgw               xm4, xm3
162*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
163*c0909341SAndroid Build Coastguard Worker    movd                xm5, r6d
164*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
    ; r6 = address of .h<h>
165*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
166*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
167*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
    ; wq = address of ipred_dc's .s<w> store loop
168*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
169*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
170*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
171*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
172*c0909341SAndroid Build Coastguard Worker    jmp                  r6
    ; The reduction is a fall-through ladder: each entry point adds the part
    ; of the input that the shorter cases do not have, then continues into
    ; the next label. Sums stay in 16-bit lanes until .h4.
173*c0909341SAndroid Build Coastguard Worker.h64:
174*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+96]
175*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+64]
176*c0909341SAndroid Build Coastguard Worker.h32:
177*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+32]
178*c0909341SAndroid Build Coastguard Worker.h16:
    ; fold the upper 128-bit lane onto the lower one
179*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
180*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
181*c0909341SAndroid Build Coastguard Worker.h8:
    ; fold words 4..7 onto words 0..3
182*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
183*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
184*c0909341SAndroid Build Coastguard Worker.h4:
    ; widen words 0..3 to dwords (xm3 is zero), then add them horizontally
    ; so that dword 0 holds the total
185*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm3
186*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
187*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
188*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
189*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
    ; dc = (sum + bias) >> log2(count)
190*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm4
191*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
192*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
    ; replicate the result into m0..m3; the widest store loop (.s64) writes
    ; all four registers per row
193*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
194*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
195*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
196*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
197*c0909341SAndroid Build Coastguard Worker    jmp                  wq
198*c0909341SAndroid Build Coastguard Worker
; ipred_dc_16bpc(dst, stride, tl, w, h)
; DC prediction: fills the w x h block at dst with the rounded average of
; the h 16-bit pixels stored immediately before tl and the w pixels stored
; from tl+2 onwards.
;
; Control flow is a two-stage dispatch through ipred_dc_16bpc_avx2_table:
;   .h<h>  loads (and partly sums) the h pixels before tl, then jumps to
;   .w<w>  which adds the w pixels after tl, reduces, divides by w+h and
;          falls through into
;   .s<w>  the store loop for that width.
; The .s<N> loops are also jump targets for ipred_dc_top / ipred_dc_left
; (through the splat table); those arrive with the value already broadcast
; in m0..m3 and stride3q set.
;
; Register roles after the prologue:
;   xm3 = 0
;   xm4 = (w + h) >> 1, the rounding bias
;   xm5 = tzcnt(w + h), the power-of-two part of the divisor
;   r6  = address of .h<h>, wq = address of .w<w>
;
; Division by w+h: when w == h the sum is a power of two and the shift alone
; suffices. Otherwise w+h is 3 or 5 times a power of two; after the shift the
; value is multiplied by 0xAAAB (about 2^17/3) or 0x6667 (about 2^17/5) using
; pmulhuw (keeps bits 16..31 of the product) followed by a shift right by 1.
199*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
200*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
201*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
202*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq+hq]
203*c0909341SAndroid Build Coastguard Worker    movd                xm4, r5d
204*c0909341SAndroid Build Coastguard Worker    tzcnt               r5d, r5d
205*c0909341SAndroid Build Coastguard Worker    movd                xm5, r5d
206*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_16bpc_avx2_table]
207*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
208*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
    ; the five w entries follow the five h entries in the table, hence +5*4
209*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4+5*4]
210*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
211*c0909341SAndroid Build Coastguard Worker    psrlw               xm4, 1
212*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
213*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
214*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
215*c0909341SAndroid Build Coastguard Worker    jmp                  r6
216*c0909341SAndroid Build Coastguard Worker.h4:
    ; the 4 pixels (8 bytes) that end at tl
217*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq-8]
218*c0909341SAndroid Build Coastguard Worker    jmp                  wq
219*c0909341SAndroid Build Coastguard Worker.w4:
    ; m0 holds 4, 8 or 16 words depending on h. Add the 4 pixels after tl
    ; and the bias, then sum within each 64-bit group so that word 0 of
    ; every group holds that group's total.
220*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tlq+2]
221*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
222*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
223*c0909341SAndroid Build Coastguard Worker    psrlq                m1, m0, 32
224*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
225*c0909341SAndroid Build Coastguard Worker    psrld                m1, m0, 16
226*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
227*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
228*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
    ; h == 4: w + h = 8, so the division is a plain shift by 3
229*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 3
230*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
231*c0909341SAndroid Build Coastguard Worker.w4_mul:
    ; h > 4: fold the upper lane, then combine the per-group totals as dwords
232*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
233*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
    ; Multiplier selection: shift 0xAAAB6667 right by 2*h. h == 8 shifts by
    ; 16 and leaves 0xAAAB in the low word (w+h = 12 = 4*3). h == 16 asks for
    ; 32, which the 32-bit shrx reduces modulo 32 to 0, leaving 0x6667 in the
    ; low word (w+h = 20 = 4*5). r2d (the tl register) is free to clobber
    ; because tlq is no longer read on this path.
234*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
235*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB6667
236*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
237*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm3
238*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm3
239*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
240*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
    ; divide by 4, then by 3 or 5 through the reciprocal multiply
241*c0909341SAndroid Build Coastguard Worker    psrld               xm0, 2
242*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
243*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1
244*c0909341SAndroid Build Coastguard Worker.w4_end:
245*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm0, xm0
    ; store loop: 4 rows of 4 pixels (8 bytes) per iteration
246*c0909341SAndroid Build Coastguard Worker.s4:
247*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
248*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm0
249*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
250*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm0
251*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
252*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
253*c0909341SAndroid Build Coastguard Worker    jg .s4
254*c0909341SAndroid Build Coastguard Worker    RET
255*c0909341SAndroid Build Coastguard WorkerALIGN function_align
256*c0909341SAndroid Build Coastguard Worker.h8:
    ; the 8 pixels (16 bytes) that end at tl; mova requires this address to
    ; be 16-byte aligned
257*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-16]
258*c0909341SAndroid Build Coastguard Worker    jmp                  wq
259*c0909341SAndroid Build Coastguard Worker.w8:
    ; xm1 = upper lane of the .h<N> partial sums, taken before xm0 is modified
260*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
261*c0909341SAndroid Build Coastguard Worker    paddw               xm0, [tlq+2]
262*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
263*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
    ; add each odd word onto its even neighbour, then clear the odd words
    ; (mask 0xAA takes them from the zero register) so that every dword holds
    ; one pair sum
264*c0909341SAndroid Build Coastguard Worker    psrld               xm1, xm0, 16
265*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
266*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, xm3, 0xAA
    ; horizontal add of the four dwords into dword 0
267*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
268*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
269*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
270*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
271*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
272*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
273*c0909341SAndroid Build Coastguard Worker    je .w8_end
    ; non-square: h == 32 picks 0x6667 (w+h = 40 = 8*5); the other heights
    ; keep 0xAAAB (e.g. 12 = 4*3, 24 = 8*3)
274*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB
275*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667
276*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
277*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
278*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
279*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
280*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1
281*c0909341SAndroid Build Coastguard Worker.w8_end:
282*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm0, xm0
    ; store loop: 4 rows of 8 pixels (16 bytes) per iteration, aligned stores
283*c0909341SAndroid Build Coastguard Worker.s8:
284*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm0
285*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm0
286*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm0
287*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], xm0
288*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
289*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
290*c0909341SAndroid Build Coastguard Worker    jg .s8
291*c0909341SAndroid Build Coastguard Worker    RET
292*c0909341SAndroid Build Coastguard WorkerALIGN function_align
293*c0909341SAndroid Build Coastguard Worker.h16:
    ; the 16 pixels (32 bytes) that end at tl
294*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]
295*c0909341SAndroid Build Coastguard Worker    jmp                  wq
296*c0909341SAndroid Build Coastguard Worker.w16:
    ; add the 16 pixels after tl, fold the lanes, add the bias
297*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+2]
298*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
299*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
300*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
    ; widen both halves to dwords before the final horizontal add
301*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm3
302*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm3
303*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
304*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
305*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
306*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
307*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
308*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
309*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
310*c0909341SAndroid Build Coastguard Worker    je .w16_end
    ; non-square: ZF is set when h is neither 8 nor 32, i.e. for h == 4 or 64
    ; among the table's heights (w+h = 20 or 80, ratio 5 -> 0x6667);
    ; h == 8 or 32 give 24 or 48 (ratio 3 -> 0xAAAB)
311*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB
312*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667
313*c0909341SAndroid Build Coastguard Worker    test                 hb, 8|32
314*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
315*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
316*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
317*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1
318*c0909341SAndroid Build Coastguard Worker.w16_end:
319*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
    ; store loop: 4 rows of 16 pixels (32 bytes) per iteration
320*c0909341SAndroid Build Coastguard Worker.s16:
321*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
322*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
323*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
324*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m0
325*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
326*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
327*c0909341SAndroid Build Coastguard Worker    jg .s16
328*c0909341SAndroid Build Coastguard Worker    RET
329*c0909341SAndroid Build Coastguard WorkerALIGN function_align
330*c0909341SAndroid Build Coastguard Worker.h32:
    ; the 32 pixels that end at tl, pre-summed into 16 word lanes
331*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-64]
332*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-32]
333*c0909341SAndroid Build Coastguard Worker    jmp                  wq
334*c0909341SAndroid Build Coastguard Worker.w32:
    ; add the 32 pixels after tl (two 32-byte vectors)
335*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+ 2]
336*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+34]
337*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
338*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
339*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
340*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm3
341*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm3
342*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
343*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
344*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
345*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
346*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
347*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
348*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
349*c0909341SAndroid Build Coastguard Worker    je .w32_end
    ; Multiplier selection: shift 0x6667AAAB right by 2*h (32-bit shrx uses
    ; the count modulo 32). h == 8 shifts by 16 and leaves 0x6667 (w+h = 40,
    ; ratio 5). h == 16 and h == 64 reduce to a shift of 0, leaving 0xAAAB in
    ; the low word (w+h = 48 or 96, ratio 3).
350*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
351*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x6667AAAB
352*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
353*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
354*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
355*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1
356*c0909341SAndroid Build Coastguard Worker.w32_end:
357*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
358*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
    ; store loop: 4 rows of 32 pixels (2 x 32 bytes) per iteration
359*c0909341SAndroid Build Coastguard Worker.s32:
360*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
361*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
362*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m0
363*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
364*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*0], m0
365*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*1], m1
366*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*0], m0
367*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*1], m1
368*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
369*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
370*c0909341SAndroid Build Coastguard Worker    jg .s32
371*c0909341SAndroid Build Coastguard Worker    RET
372*c0909341SAndroid Build Coastguard WorkerALIGN function_align
373*c0909341SAndroid Build Coastguard Worker.h64:
    ; the 64 pixels that end at tl, summed into 16 word lanes using two
    ; independent accumulators
374*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-128]
375*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq- 96]
376*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq- 64]
377*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq- 32]
378*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
379*c0909341SAndroid Build Coastguard Worker    jmp                  wq
380*c0909341SAndroid Build Coastguard Worker.w64:
    ; add the 64 pixels after tl (four 32-byte vectors)
381*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 2]
382*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+34]
383*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq+66]
384*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+98]
385*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
386*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
387*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
388*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm3
389*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm3
    ; unlike the narrower paths, the bias is added after widening to dwords
390*c0909341SAndroid Build Coastguard Worker    paddd               xm1, xm4
391*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
392*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
393*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
394*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
395*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
396*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
397*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
398*c0909341SAndroid Build Coastguard Worker    je .w64_end
    ; Multiplier selection: here the shift count is h itself. h == 16 leaves
    ; 0x6667 (w+h = 80, ratio 5); h == 32 reduces modulo 32 to a shift of 0,
    ; leaving 0xAAAB in the low word (w+h = 96, ratio 3).
399*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x6667AAAB
400*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, hd
401*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
402*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
403*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1
404*c0909341SAndroid Build Coastguard Worker.w64_end:
405*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
406*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
407*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
408*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
    ; store loop: 2 rows of 64 pixels (4 x 32 bytes) per iteration
409*c0909341SAndroid Build Coastguard Worker.s64:
410*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
411*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
412*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*2], m2
413*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*3], m3
414*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m0
415*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
416*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*2], m2
417*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*3], m3
418*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
419*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
420*c0909341SAndroid Build Coastguard Worker    jg .s64
421*c0909341SAndroid Build Coastguard Worker    RET
422*c0909341SAndroid Build Coastguard Worker
; ipred_dc_128_16bpc: fill the block with one constant 16-bit value.
; The constant is the dword at pw_512 + 4*(argument 8 >> 11), broadcast to
; m0-m3. The actual stores are done by the width-specific code reached through
; ipred_dc_splat_16bpc_avx2_table; those jump targets are outside this block.
; NOTE(review): argument 8 is presumably bitdepth_max, which would make the
; index 0 for 10-bit and 1 for 12-bit input - confirm against the C prototype.
423*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
424*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m
425*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11
426*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    ; wd = number of trailing zero bits of w (log2(w) when w is a power of two),
    ; used below as the jump-table index.
427*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
428*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; Table entries are signed 32-bit offsets relative to the table base (r5);
    ; the base is added back just before the jump.
429*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
    ; r5 holds the table address, so r5-table+pw_512 addresses pw_512 itself.
430*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
    ; Replicate the fill value into m1-m3 as well.
    ; NOTE(review): presumably the wide splat paths store m0-m3 side by side - confirm.
431*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
432*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
433*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
434*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
435*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
    ; Tail-jump into the splat code; this function has no RET of its own.
436*c0909341SAndroid Build Coastguard Worker    jmp                  wq
437*c0909341SAndroid Build Coastguard Worker
; ipred_v_16bpc: vertical prediction. Loads the pixels that follow tl in memory
; (starting at tl+2) into m0-m3 and tail-jumps into the width-specific code
; reached through ipred_dc_splat_16bpc_avx2_table (targets outside this block).
; NOTE(review): all four 32-byte vectors are loaded regardless of w, so 128
; bytes starting at tl+2 are always read; presumably the edge buffer is large
; enough for that even with narrow blocks - confirm.
438*c0909341SAndroid Build Coastguard Workercglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
439*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
440*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+ 2]
441*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+34]
442*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+66]
443*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+98]
444*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    ; wd = trailing-zero count of w, used as the jump-table index.
445*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
    ; Entries are signed 32-bit offsets relative to the table base in r5.
446*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
447*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
448*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
    ; Tail-jump; the target code is responsible for storing rows and returning.
449*c0909341SAndroid Build Coastguard Worker    jmp                  wq
450*c0909341SAndroid Build Coastguard Worker
; IPRED_H w, store_type
; Loop body for horizontal prediction, four rows per iteration.
;   %1 (w)          - only used to form the loop label: the macro jumps back to
;                     .w%1, which the caller must define right before invoking it.
;   %2 (store_type) - suffix appended to "mov" for the row stores (mov%2).
; Uses tlq, dstq, strideq, stride3q and hd; clobbers m0-m3 and advances tlq/dstq.
; The vector width of m0-m3 is whatever register mode is active where the
; macro is expanded.
451*c0909341SAndroid Build Coastguard Worker%macro IPRED_H 2 ; w, store_type
    ; One 16-bit pixel per row, read downwards in memory from tl-2, each
    ; broadcast across a whole vector.
452*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [tlq-2]
453*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [tlq-4]
454*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [tlq-6]
455*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [tlq-8]
    ; Step tl past the four pixels just consumed.
456*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
457*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+strideq*0], m0
458*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+strideq*1], m1
459*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+strideq*2], m2
460*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+stride3q ], m3
461*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; Loop while rows remain (h is decremented by the four rows just written).
462*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
463*c0909341SAndroid Build Coastguard Worker    jg .w%1
464*c0909341SAndroid Build Coastguard Worker    RET
    ; Align whatever code follows the expansion.
465*c0909341SAndroid Build Coastguard WorkerALIGN function_align
466*c0909341SAndroid Build Coastguard Worker%endmacro
467*c0909341SAndroid Build Coastguard Worker
; ipred_h_16bpc: horizontal prediction. Every output row is filled with a
; single 16-bit pixel taken from the words preceding tl in memory: row 0 uses
; the word at tl-2, row 1 the word at tl-4, and so on.
; Dispatches on the trailing-zero count of w through ipred_h_16bpc_avx2_table.
468*c0909341SAndroid Build Coastguard Workercglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
469*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
470*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_h_16bpc_avx2_table]
471*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
    ; Table entries are signed 32-bit offsets relative to the table base (r5).
472*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
473*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
474*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
475*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; The w4/w8 paths are assembled in XMM mode, so mN in IPRED_H are 16-byte
    ; registers there: w4 stores 8 bytes per row (movq), w8 stores 16 (mova).
476*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
477*c0909341SAndroid Build Coastguard Worker.w4:
478*c0909341SAndroid Build Coastguard Worker    IPRED_H               4, q
479*c0909341SAndroid Build Coastguard Worker.w8:
480*c0909341SAndroid Build Coastguard Worker    IPRED_H               8, a
    ; Back to YMM mode: one 32-byte store covers a 16-pixel row.
481*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
482*c0909341SAndroid Build Coastguard Worker.w16:
483*c0909341SAndroid Build Coastguard Worker    IPRED_H              16, a
    ; w32: four rows per iteration, two 32-byte stores per row.
484*c0909341SAndroid Build Coastguard Worker.w32:
485*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [tlq-2]
486*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [tlq-4]
487*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [tlq-6]
488*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [tlq-8]
489*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
490*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
491*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m0
492*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m1
493*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
494*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*0], m2
495*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*1], m2
496*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*0], m3
497*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*1], m3
498*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
499*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
500*c0909341SAndroid Build Coastguard Worker    jg .w32
501*c0909341SAndroid Build Coastguard Worker    RET
    ; w64: only two rows per iteration (tl advances by 4 bytes), four 32-byte
    ; stores per row.
502*c0909341SAndroid Build Coastguard Worker.w64:
503*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [tlq-2]
504*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [tlq-4]
505*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
506*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
507*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m0
508*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*2], m0
509*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*3], m0
510*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m1
511*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
512*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*2], m1
513*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*3], m1
514*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
515*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
516*c0909341SAndroid Build Coastguard Worker    jg .w64
517*c0909341SAndroid Build Coastguard Worker    RET
518*c0909341SAndroid Build Coastguard Worker
; PAETH top, signed_ldiff, ldiff
; Per-pixel Paeth selection. All three arguments are vector register numbers:
;   %1 = top pixels, %2 = second addend of the base sum (named signed_ldiff),
;   %3 = magnitude that the other two distances are compared against (ldiff).
; Also reads m1 and m3 as the remaining two candidates; the callers visible in
; this file pass m1 = left, m3 = topleft, %2 = top - topleft, %3 = |%2|.
; Result is returned in m0; m7 is clobbered.
; Selection: m1 when %3 <= min(tdiff, tldiff), otherwise %1 when
; tdiff <= tldiff, otherwise m3.
519*c0909341SAndroid Build Coastguard Worker%macro PAETH 3 ; top, signed_ldiff, ldiff
    ; m0 = m%2 + m1, the base value all candidates are measured against.
520*c0909341SAndroid Build Coastguard Worker    paddw               m0, m%2, m1
521*c0909341SAndroid Build Coastguard Worker    psubw               m7, m3, m0  ; tldiff
522*c0909341SAndroid Build Coastguard Worker    psubw               m0, m%1     ; tdiff
    ; Turn both differences into absolute distances.
523*c0909341SAndroid Build Coastguard Worker    pabsw               m7, m7
524*c0909341SAndroid Build Coastguard Worker    pabsw               m0, m0
    ; m7 = min(tldiff, tdiff)
525*c0909341SAndroid Build Coastguard Worker    pminsw              m7, m0
    ; m0 = mask of lanes where tdiff is the minimum (ties go to %1).
526*c0909341SAndroid Build Coastguard Worker    pcmpeqw             m0, m7
    ; m7 = mask of lanes where %3 is strictly greater than that minimum.
527*c0909341SAndroid Build Coastguard Worker    pcmpgtw             m7, m%3, m7
    ; m0 = m%1 where the tdiff mask is set, else m3.
528*c0909341SAndroid Build Coastguard Worker    vpblendvb           m0, m3, m%1, m0
    ; Keep that choice only where %3 lost the comparison; elsewhere take m1.
529*c0909341SAndroid Build Coastguard Worker    vpblendvb           m0, m1, m0, m7
530*c0909341SAndroid Build Coastguard Worker%endmacro
531*c0909341SAndroid Build Coastguard Worker
; ipred_paeth_16bpc: Paeth prediction. Each output pixel is chosen by the PAETH
; macro from the left, top and topleft neighbours.
; Register roles consumed by PAETH:
;   m3 = topleft (word at tl, broadcast)
;   m1 = left pixel(s) for the row(s) being produced, read downwards from tl
;   m2 / m6 / m10 / m13         = top pixels (from tl+2 onwards)
;   m4 / m8 / m11 / m14         = top - topleft
;   m5 / m9 / m12 / m15         = |top - topleft|
; Dispatches on the trailing-zero count of w through ipred_paeth_16bpc_avx2_table.
532*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
533*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_paeth_16bpc_avx2_table
534*c0909341SAndroid Build Coastguard Worker    movifnidn           hd, hm
535*c0909341SAndroid Build Coastguard Worker    lea                 r5, [ipred_paeth_16bpc_avx2_table]
536*c0909341SAndroid Build Coastguard Worker    tzcnt               wd, wd
    ; Table entries are signed 32-bit offsets relative to the table base (r5).
537*c0909341SAndroid Build Coastguard Worker    movsxd              wq, [r5+wq*4]
538*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m3, [tlq]   ; topleft
539*c0909341SAndroid Build Coastguard Worker    add                 wq, r5
540*c0909341SAndroid Build Coastguard Worker    jmp                 wq
    ; w4: four rows per iteration, one row per 64-bit quarter of the vector.
541*c0909341SAndroid Build Coastguard Worker.w4:
542*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m2, [tlq+2] ; top
    ; NOTE(review): ipred_hv_shuf is defined outside this block; presumably the
    ; resulting shuffle spreads each loaded left pixel across its row - confirm.
543*c0909341SAndroid Build Coastguard Worker    movsldup            m6, [base+ipred_hv_shuf]
    ; r3 held the jump target (wq); it is free now and reused as stride*3.
544*c0909341SAndroid Build Coastguard Worker    lea                 r3, [strideq*3]
545*c0909341SAndroid Build Coastguard Worker    psubw               m4, m2, m3
546*c0909341SAndroid Build Coastguard Worker    pabsw               m5, m4
547*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; Step back over the next four left pixels (8 bytes) and load them.
548*c0909341SAndroid Build Coastguard Worker    sub                tlq, 8
549*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m1, [tlq]
550*c0909341SAndroid Build Coastguard Worker    pshufb              m1, m6      ; left
551*c0909341SAndroid Build Coastguard Worker    PAETH                2, 4, 5
    ; Row order in the result: low lane low half, high lane low half,
    ; low lane high half, high lane high half.
552*c0909341SAndroid Build Coastguard Worker    vextracti128       xm1, m0, 1
553*c0909341SAndroid Build Coastguard Worker    movq  [dstq+strideq*0], xm0
554*c0909341SAndroid Build Coastguard Worker    movq  [dstq+strideq*1], xm1
555*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
556*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
557*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*4]
558*c0909341SAndroid Build Coastguard Worker    sub                 hd, 4
559*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
560*c0909341SAndroid Build Coastguard Worker    RET
561*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w8: two rows per iteration, one row per 128-bit lane.
562*c0909341SAndroid Build Coastguard Worker.w8:
563*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m2, [tlq+2]
564*c0909341SAndroid Build Coastguard Worker    movsldup            m6, [base+ipred_hv_shuf]
565*c0909341SAndroid Build Coastguard Worker    psubw               m4, m2, m3
566*c0909341SAndroid Build Coastguard Worker    pabsw               m5, m4
567*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; Two left pixels (4 bytes) per iteration.
568*c0909341SAndroid Build Coastguard Worker    sub                tlq, 4
569*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m1, [tlq]
570*c0909341SAndroid Build Coastguard Worker    pshufb              m1, m6
571*c0909341SAndroid Build Coastguard Worker    PAETH                2, 4, 5
572*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
573*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
574*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*2]
575*c0909341SAndroid Build Coastguard Worker    sub                 hd, 2
576*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
577*c0909341SAndroid Build Coastguard Worker    RET
578*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w16: one row per iteration; the single left pixel is broadcast directly.
579*c0909341SAndroid Build Coastguard Worker.w16:
580*c0909341SAndroid Build Coastguard Worker    movu                m2, [tlq+2]
581*c0909341SAndroid Build Coastguard Worker    psubw               m4, m2, m3
582*c0909341SAndroid Build Coastguard Worker    pabsw               m5, m4
583*c0909341SAndroid Build Coastguard Worker.w16_loop:
584*c0909341SAndroid Build Coastguard Worker    sub                tlq, 2
585*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m1, [tlq]
586*c0909341SAndroid Build Coastguard Worker    PAETH                2, 4, 5
587*c0909341SAndroid Build Coastguard Worker    mova            [dstq], m0
588*c0909341SAndroid Build Coastguard Worker    add               dstq, strideq
589*c0909341SAndroid Build Coastguard Worker    dec                 hd
590*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
591*c0909341SAndroid Build Coastguard Worker    RET
592*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w32: one row per iteration, processed as two 16-pixel halves.
593*c0909341SAndroid Build Coastguard Worker.w32:
594*c0909341SAndroid Build Coastguard Worker    movu                m2, [tlq+2]
595*c0909341SAndroid Build Coastguard Worker    movu                m6, [tlq+34]
    ; m8/m9 are used below. On WIN64 xmm8/xmm9 are saved by hand into the
    ; r4m/r6m argument slots and restored before RET; h was already copied into
    ; hd at function entry.
    ; NOTE(review): movaps needs these slots to be 16-byte aligned and not
    ; needed again - confirm against the x86inc stack layout.
596*c0909341SAndroid Build Coastguard Worker%if WIN64
597*c0909341SAndroid Build Coastguard Worker    movaps             r4m, xmm8
598*c0909341SAndroid Build Coastguard Worker    movaps             r6m, xmm9
599*c0909341SAndroid Build Coastguard Worker%endif
600*c0909341SAndroid Build Coastguard Worker    psubw               m4, m2, m3
601*c0909341SAndroid Build Coastguard Worker    psubw               m8, m6, m3
602*c0909341SAndroid Build Coastguard Worker    pabsw               m5, m4
603*c0909341SAndroid Build Coastguard Worker    pabsw               m9, m8
604*c0909341SAndroid Build Coastguard Worker.w32_loop:
605*c0909341SAndroid Build Coastguard Worker    sub                tlq, 2
606*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m1, [tlq]
    ; PAETH leaves m1 untouched, so the same left pixel serves both halves.
607*c0909341SAndroid Build Coastguard Worker    PAETH                2, 4, 5
608*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*0], m0
609*c0909341SAndroid Build Coastguard Worker    PAETH                6, 8, 9
610*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*1], m0
611*c0909341SAndroid Build Coastguard Worker    add               dstq, strideq
612*c0909341SAndroid Build Coastguard Worker    dec                 hd
613*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
614*c0909341SAndroid Build Coastguard Worker%if WIN64
615*c0909341SAndroid Build Coastguard Worker    movaps            xmm8, r4m
616*c0909341SAndroid Build Coastguard Worker    movaps            xmm9, r6m
617*c0909341SAndroid Build Coastguard Worker%endif
618*c0909341SAndroid Build Coastguard Worker    RET
619*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; w64: one row per iteration, processed as four 16-pixel quarters using
    ; vector registers up to m15.
620*c0909341SAndroid Build Coastguard Worker.w64:
    ; NOTE(review): presumably raises the declared vector register count to 16
    ; so the WIN64 build preserves the extra callee-saved xmm registers - confirm.
621*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM 16
622*c0909341SAndroid Build Coastguard Worker    movu                m2, [tlq+ 2]
623*c0909341SAndroid Build Coastguard Worker    movu                m6, [tlq+34]
624*c0909341SAndroid Build Coastguard Worker    movu               m10, [tlq+66]
625*c0909341SAndroid Build Coastguard Worker    movu               m13, [tlq+98]
626*c0909341SAndroid Build Coastguard Worker    psubw               m4, m2, m3
627*c0909341SAndroid Build Coastguard Worker    psubw               m8, m6, m3
628*c0909341SAndroid Build Coastguard Worker    psubw              m11, m10, m3
629*c0909341SAndroid Build Coastguard Worker    psubw              m14, m13, m3
630*c0909341SAndroid Build Coastguard Worker    pabsw               m5, m4
631*c0909341SAndroid Build Coastguard Worker    pabsw               m9, m8
632*c0909341SAndroid Build Coastguard Worker    pabsw              m12, m11
633*c0909341SAndroid Build Coastguard Worker    pabsw              m15, m14
634*c0909341SAndroid Build Coastguard Worker.w64_loop:
635*c0909341SAndroid Build Coastguard Worker    sub                tlq, 2
636*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m1, [tlq]
637*c0909341SAndroid Build Coastguard Worker    PAETH                2, 4, 5
638*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*0], m0
639*c0909341SAndroid Build Coastguard Worker    PAETH                6, 8, 9
640*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*1], m0
641*c0909341SAndroid Build Coastguard Worker    PAETH               10, 11, 12
642*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*2], m0
643*c0909341SAndroid Build Coastguard Worker    PAETH               13, 14, 15
644*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*3], m0
645*c0909341SAndroid Build Coastguard Worker    add               dstq, strideq
646*c0909341SAndroid Build Coastguard Worker    dec                 hd
647*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
648*c0909341SAndroid Build Coastguard Worker    RET
649*c0909341SAndroid Build Coastguard Worker
; ipred_smooth_v_16bpc: vertical smooth prediction. Each row is computed as
;   bottom + pmulhrsw(top - bottom, weight[row])
; where top is the pixel row starting at tl+2, bottom is the word at tl - 2*h
; and weight[] is a 16-bit table slice selected by h.
; Register roles: m5 = bottom (broadcast), m4 (plus m3/m6/m7 on wide blocks) =
; top - bottom, hq = negative row counter running from -h up to 0.
; Dispatches on the trailing-zero count of w through ipred_smooth_v_16bpc_avx2_table.
650*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
651*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_v_16bpc_avx2_table
652*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_smooth_v_16bpc_avx2_table]
653*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
654*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
    ; Table entries are signed 32-bit offsets relative to the table base (r6).
655*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
    ; weightsq points 4*h bytes into smooth_weights_1d_16bpc. Combined with the
    ; negated hq below, [weightsq+hq*2] starts 2*h bytes into the table and
    ; advances one 16-bit weight per row.
656*c0909341SAndroid Build Coastguard Worker    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
657*c0909341SAndroid Build Coastguard Worker    neg                  hq
658*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, [tlq+hq*2] ; bottom
659*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
660*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w4: four rows per iteration, one row per 64-bit quarter of the vector.
661*c0909341SAndroid Build Coastguard Worker.w4:
662*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [tlq+2]    ; top
    ; NOTE(review): ipred_hv_shuf is defined outside this block; presumably the
    ; resulting shuffle spreads each loaded weight across its row - confirm.
663*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [base+ipred_hv_shuf]
    ; r6 is repurposed as stride*3 from here on, so "base" must not be used
    ; after this point in this path.
664*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
665*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5         ; top - bottom
666*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; Load four consecutive weights and distribute them with the shuffle.
667*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [weightsq+hq*2]
668*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
669*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
670*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
    ; Row order in the result: high lane high half, low lane high half,
    ; high lane low half, low lane low half.
671*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
672*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm1
673*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
674*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
675*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r6       ], xm0
676*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; hq counts up towards zero; loop while it is still negative.
677*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
678*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
679*c0909341SAndroid Build Coastguard Worker.ret:
680*c0909341SAndroid Build Coastguard Worker    RET
    ; w8: four rows per iteration, two vectors with one row per 128-bit lane.
681*c0909341SAndroid Build Coastguard Worker.w8:
682*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [tlq+2]
683*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [base+ipred_hv_shuf]
684*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
685*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
686*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; Two weights per vector: rows 0-1 in m0, rows 2-3 in m1.
687*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [weightsq+hq*2+0]
688*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [weightsq+hq*2+4]
689*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
690*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
691*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
692*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
693*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
694*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
    ; The high lane holds the earlier row of each pair.
695*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*0], m0, 1
696*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*1], xm0
697*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*2], m1, 1
698*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r6       ], xm1
699*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
700*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
701*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
702*c0909341SAndroid Build Coastguard Worker    RET
    ; w16: four rows per iteration, one full vector per row; each row's weight
    ; is broadcast directly, so no shuffle is needed.
703*c0909341SAndroid Build Coastguard Worker.w16:
704*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+2]
705*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
706*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
707*c0909341SAndroid Build Coastguard Worker.w16_loop:
708*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [weightsq+hq*2+0]
709*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [weightsq+hq*2+2]
710*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [weightsq+hq*2+4]
711*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [weightsq+hq*2+6]
712*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
713*c0909341SAndroid Build Coastguard Worker    REPX   {paddw    x, m5}, m0, m1, m2, m3
714*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
715*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
716*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
717*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r6       ], m3
718*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
719*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
720*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
721*c0909341SAndroid Build Coastguard Worker    RET
    ; w32: two rows per iteration; m4/m6 hold top - bottom for the left and
    ; right 16-pixel halves.
722*c0909341SAndroid Build Coastguard Worker.w32:
    ; NOTE(review): presumably raises the declared vector register count so the
    ; WIN64 build preserves m6 - confirm.
723*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
724*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+ 2]
725*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+34]
726*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
727*c0909341SAndroid Build Coastguard Worker    psubw                m6, m5
728*c0909341SAndroid Build Coastguard Worker.w32_loop:
729*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [weightsq+hq*2+0]
730*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [weightsq+hq*2+2]
    ; m0/m1 = row 0 left/right halves, m2/m3 = row 1 left/right halves. The
    ; weight registers m1/m3 are consumed last because they are overwritten.
731*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4, m1
732*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
733*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, m3
734*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
735*c0909341SAndroid Build Coastguard Worker    REPX      {paddw x, m5}, m0, m1, m2, m3
736*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
737*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
738*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m2
739*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m3
740*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
741*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
742*c0909341SAndroid Build Coastguard Worker    jl .w32_loop
743*c0909341SAndroid Build Coastguard Worker    RET
    ; w64: one row per iteration; m3/m4/m6/m7 hold top - bottom for the four
    ; 16-pixel quarters of the row.
744*c0909341SAndroid Build Coastguard Worker.w64:
745*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
746*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+ 2]
747*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+34]
748*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+66]
749*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq+98]
750*c0909341SAndroid Build Coastguard Worker    REPX      {psubw x, m5}, m3, m4, m6, m7
751*c0909341SAndroid Build Coastguard Worker.w64_loop:
    ; m2 = this row's weight; m0/m1 are reused for the second pair of quarters
    ; once the first pair has been stored.
752*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [weightsq+hq*2]
753*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3, m2
754*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4, m2
755*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
756*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
757*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
758*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6, m2
759*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
760*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7, m2
761*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
762*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
763*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
764*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m1
765*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
766*c0909341SAndroid Build Coastguard Worker    inc                  hq
767*c0909341SAndroid Build Coastguard Worker    jl .w64_loop
768*c0909341SAndroid Build Coastguard Worker    RET
769*c0909341SAndroid Build Coastguard Worker
770*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
771*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_h_16bpc_avx2_table
772*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_smooth_h_16bpc_avx2_table]
773*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
774*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
775*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, [tlq+wq*2] ; right
776*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
777*c0909341SAndroid Build Coastguard Worker    add                  hd, hd
778*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
779*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
780*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
781*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
782*c0909341SAndroid Build Coastguard Worker    jmp                  wq
783*c0909341SAndroid Build Coastguard Worker.w4:
784*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [base+smooth_weights_1d_16bpc+4*2]
785*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [base+ipred_hv_shuf]
786*c0909341SAndroid Build Coastguard Worker.w4_loop:
787*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [tlq+hq-8] ; left
788*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
789*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5         ; left - right
790*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
791*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
792*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
793*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
794*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
795*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
796*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
797*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
798*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4*2
799*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
800*c0909341SAndroid Build Coastguard Worker    RET
801*c0909341SAndroid Build Coastguard Worker.w8:
802*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+smooth_weights_1d_16bpc+8*2]
803*c0909341SAndroid Build Coastguard Worker    movsldup             m3, [base+ipred_hv_shuf]
804*c0909341SAndroid Build Coastguard Worker.w8_loop:
805*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+hq-4]
806*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [tlq+hq-8]
807*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
808*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
809*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
810*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
811*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
812*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
813*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
814*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
815*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
816*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
817*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
818*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
819*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
820*c0909341SAndroid Build Coastguard Worker    sub                  hq, 4*2
821*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
822*c0909341SAndroid Build Coastguard Worker    RET
823*c0909341SAndroid Build Coastguard Worker.w16:
824*c0909341SAndroid Build Coastguard Worker    movu                 m4, [base+smooth_weights_1d_16bpc+16*2]
825*c0909341SAndroid Build Coastguard Worker.w16_loop:
826*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [tlq+hq-8]
827*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3
828*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
829*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q3333
830*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q2222
831*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1111
832*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q0000
833*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
834*c0909341SAndroid Build Coastguard Worker    REPX   {paddw    x, m5}, m0, m1, m2, m3
835*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
836*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
837*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
838*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
839*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
840*c0909341SAndroid Build Coastguard Worker    sub                  hq, 4*2
841*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
842*c0909341SAndroid Build Coastguard Worker    RET
843*c0909341SAndroid Build Coastguard Worker.w32:
844*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
845*c0909341SAndroid Build Coastguard Worker    movu                 m4, [base+smooth_weights_1d_16bpc+32*2]
846*c0909341SAndroid Build Coastguard Worker    movu                 m6, [base+smooth_weights_1d_16bpc+32*3]
847*c0909341SAndroid Build Coastguard Worker.w32_loop:
848*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [tlq+hq-2]
849*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [tlq+hq-4]
850*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5
851*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5
852*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4, m1
853*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
854*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, m3
855*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
856*c0909341SAndroid Build Coastguard Worker    REPX      {paddw x, m5}, m0, m1, m2, m3
857*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
858*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
859*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m2
860*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m3
861*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
862*c0909341SAndroid Build Coastguard Worker    sub                  hq, 2*2
863*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
864*c0909341SAndroid Build Coastguard Worker    RET
865*c0909341SAndroid Build Coastguard Worker.w64:
866*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
867*c0909341SAndroid Build Coastguard Worker    movu                 m3, [base+smooth_weights_1d_16bpc+32*4]
868*c0909341SAndroid Build Coastguard Worker    movu                 m4, [base+smooth_weights_1d_16bpc+32*5]
869*c0909341SAndroid Build Coastguard Worker    movu                 m6, [base+smooth_weights_1d_16bpc+32*6]
870*c0909341SAndroid Build Coastguard Worker    movu                 m7, [base+smooth_weights_1d_16bpc+32*7]
871*c0909341SAndroid Build Coastguard Worker.w64_loop:
872*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [tlq+hq-2]
873*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5
874*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3, m2
875*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4, m2
876*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
877*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
878*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
879*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6, m2
880*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
881*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7, m2
882*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
883*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
884*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
885*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m1
886*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
887*c0909341SAndroid Build Coastguard Worker    sub                  hq, 1*2
888*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
889*c0909341SAndroid Build Coastguard Worker    RET
890*c0909341SAndroid Build Coastguard Worker
; SMOOTH_2D_END: shared final blend step for the ipred_smooth_16bpc width paths.
; Per dword it forms (pmaddwd(m%1, m%3) + m%5) >> 8 in m0 and
; (pmaddwd(m%2, m%4) + m%6) >> 8 in m1, packs both to words in m0 and
; averages that with m5. The result is left in m0; m1 is clobbered.
; ipred_smooth_16bpc zeroes m5 beforehand, so there pavgw yields (x + 1) >> 1,
; which together with the >> 8 equals (sum + 256) >> 9 for non-negative sums.
; Callers in this file pass m0/m1 themselves as sources (e.g. "0, 1" and
; "1, 1"); that only works because each source is read before the instruction
; that overwrites it, so the instruction order below must not be changed.
891*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
892*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m%1, m%3 ; word pairs multiplied and summed into dwords
893*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m%2, m%4
894*c0909341SAndroid Build Coastguard Worker    paddd               m0, m%5 ; add the second, precomputed dword term
895*c0909341SAndroid Build Coastguard Worker    paddd               m1, m%6
896*c0909341SAndroid Build Coastguard Worker    psrld               m0, 8 ; logical shift: drop 8 fractional bits
897*c0909341SAndroid Build Coastguard Worker    psrld               m1, 8
898*c0909341SAndroid Build Coastguard Worker    packssdw            m0, m1 ; per 128-bit lane: 4 words from m0, then 4 from m1
899*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m5 ; (m0 + m5 + 1) >> 1
900*c0909341SAndroid Build Coastguard Worker%endmacro
901*c0909341SAndroid Build Coastguard Worker
; ipred_smooth_16bpc: "smooth" intra prediction in both directions for 16-bit
; pixels (AVX2). Register names come from the cglobal line: dst, stride, tl,
; w, h, v_weights; r6 holds the jump-table address and doubles as `base`.
;
; Each output pixel is the sum of two weighted pairs, finished by
; SMOOTH_2D_END:
;   (top[x], bottom) * vertical weights of row y    (pmaddwd)
;   (left[y], right) * horizontal weights of col x  (pmaddwd)
; Edge pixels as addressed below (2 bytes per pixel, tl = incoming pointer):
;   right  = word at tl + w*2         bottom = word at tl - h*2
;   top    = words starting at tl + 2 left   = words just below tl, walked
;                                               towards lower addresses
; Weights come from smooth_weights_2d_16bpc (defined outside this chunk;
; presumably one pair of 16-bit weights per 4-byte entry -- TODO confirm):
; horizontal weights are read at byte offset w*4, vertical ones at h*4.
; Live across all width paths: m4 = right (broadcast), m5 = 0.
902*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
; `base` lets table-relative data be addressed through r6.
903*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_16bpc_avx2_table
904*c0909341SAndroid Build Coastguard Worker    lea                 r6, [ipred_smooth_16bpc_avx2_table]
905*c0909341SAndroid Build Coastguard Worker    mov                 wd, wm ; width argument
906*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m4, [tlq+wq*2] ; right (word at tl + w*2, broadcast)
907*c0909341SAndroid Build Coastguard Worker    tzcnt               wd, wd ; trailing-zero count of w = jump-table index
908*c0909341SAndroid Build Coastguard Worker    mov                 hd, hm ; height argument
909*c0909341SAndroid Build Coastguard Worker    sub                tlq, hq ; two subtractions: tlq = tl - h*2 bytes,
910*c0909341SAndroid Build Coastguard Worker    sub                tlq, hq ; i.e. tlq now points at the "bottom" pixel
911*c0909341SAndroid Build Coastguard Worker    movsxd              wq, [r6+wq*4] ; signed 32-bit table entry
912*c0909341SAndroid Build Coastguard Worker    pxor                m5, m5 ; m5 = 0, consumed by pavgw in SMOOTH_2D_END
913*c0909341SAndroid Build Coastguard Worker    add                 wq, r6 ; entry is relative to the table address
914*c0909341SAndroid Build Coastguard Worker    lea         v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] ; vertical weights for this h
915*c0909341SAndroid Build Coastguard Worker    jmp                 wq ; dispatch to the .wN path
; ---- width 4: four rows per iteration ----
916*c0909341SAndroid Build Coastguard Worker.w4:
; WIN64_SPILL_XMM is defined outside this chunk; presumably it declares how
; many xmm registers this path uses so Win64 callee-saved ones get preserved.
917*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM     11
918*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m0, [tlq] ; bottom
919*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m6, [tlq+hq*2+2] ; four top pixels (8 bytes at tl + 2)
920*c0909341SAndroid Build Coastguard Worker    movsldup            m7, [base+ipred_hv_shuf] ; even dwords of the shuffle table, duplicated
921*c0909341SAndroid Build Coastguard Worker    movshdup            m9, [base+ipred_hv_shuf] ; odd dwords of the shuffle table, duplicated
922*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     m10, [base+smooth_weights_2d_16bpc+4*4] ; horizontal weights for w = 4
923*c0909341SAndroid Build Coastguard Worker    punpcklwd           m6, m0 ; top, bottom
924*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m8, m9, m9 ; m8/m9: pshufb masks applied to the
925*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m9, m9 ; vertical weights in the loop
926*c0909341SAndroid Build Coastguard Worker    lea                 r3, [strideq*3]
927*c0909341SAndroid Build Coastguard Worker.w4_loop:
928*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m3, [tlq+hq*2-8] ; next four left pixels (8 bytes ending at tlq + h*2)
929*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m1, [v_weightsq] ; 16 bytes = vertical weights for four rows
930*c0909341SAndroid Build Coastguard Worker    pshufb              m3, m7 ; rearrange left pixels per ipred_hv_shuf
931*c0909341SAndroid Build Coastguard Worker    punpcklwd           m2, m3, m4 ; left, right
932*c0909341SAndroid Build Coastguard Worker    punpckhwd           m3, m4
933*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m10 ; horizontal term
934*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m10
935*c0909341SAndroid Build Coastguard Worker    pshufb              m0, m1, m8 ; spread the row weights to match m2 / m3
936*c0909341SAndroid Build Coastguard Worker    pshufb              m1, m9
; m0 = blend(m0 * (top, bottom) + m2, m1 * (top, bottom) + m3)
937*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END        0, 1, 6, 6, 2, 3
938*c0909341SAndroid Build Coastguard Worker    vextracti128       xm1, m0, 1 ; upper 128-bit lane
939*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0 ; row 0: low qword of the low lane
940*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1 ; row 1: low qword of the high lane
941*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0 ; row 2: high qword of the low lane
942*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1 ; row 3: high qword of the high lane
943*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*4]
944*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 16 ; four rows of vertical weights consumed
945*c0909341SAndroid Build Coastguard Worker    sub                 hd, 4 ; also moves the left-pixel load 8 bytes lower
946*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
947*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 8: two rows per iteration, one row per 128-bit lane ----
948*c0909341SAndroid Build Coastguard Worker.w8:
949*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM     12
950*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m0, [tlq] ; bottom
951*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m7, [tlq+hq*2+2] ; eight top pixels, copied to both lanes
952*c0909341SAndroid Build Coastguard Worker    movsldup            m8, [base+ipred_hv_shuf] ; pshufb mask for the left pixels
953*c0909341SAndroid Build Coastguard Worker    movshdup            m9, [base+ipred_hv_shuf] ; pshufb mask for the vertical weights
954*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     m10, [base+smooth_weights_2d_16bpc+8*4+16*0] ; horizontal weights, first 16 bytes
955*c0909341SAndroid Build Coastguard Worker    vbroadcasti128     m11, [base+smooth_weights_2d_16bpc+8*4+16*1] ; horizontal weights, next 16 bytes
956*c0909341SAndroid Build Coastguard Worker    punpcklwd           m6, m7, m0 ; top, bottom
957*c0909341SAndroid Build Coastguard Worker    punpckhwd           m7, m0
958*c0909341SAndroid Build Coastguard Worker.w8_loop:
959*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m3, [tlq+hq*2-4] ; next two left pixels
960*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m1, [v_weightsq] ; 8 bytes = vertical weights for two rows
961*c0909341SAndroid Build Coastguard Worker    pshufb              m3, m8
962*c0909341SAndroid Build Coastguard Worker    punpcklwd           m2, m3, m4 ; left, right
963*c0909341SAndroid Build Coastguard Worker    punpckhwd           m3, m4
964*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m10 ; horizontal term
965*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m11
966*c0909341SAndroid Build Coastguard Worker    pshufb              m1, m9
; m1 is the multiplier for both halves; it is read before being overwritten.
967*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END        1, 1, 6, 7, 2, 3
968*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0 ; row 0 = low lane
969*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1 ; row 1 = high lane
970*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*2]
971*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 8
972*c0909341SAndroid Build Coastguard Worker    sub                 hd, 2
973*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
974*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 16: two rows per iteration, one full ymm register per row ----
975*c0909341SAndroid Build Coastguard Worker.w16:
976*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM     11
977*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m0, [tlq] ; bottom
978*c0909341SAndroid Build Coastguard Worker    movu                m7, [tlq+hq*2+2] ; sixteen top pixels
; Horizontal weights: 16-byte blocks 0 and 2 go to m8, blocks 1 and 3 to m9,
; mirroring the per-lane low/high split that punpck{l,h}wd applies to the top row.
979*c0909341SAndroid Build Coastguard Worker    mova               xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
980*c0909341SAndroid Build Coastguard Worker    mova               xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
981*c0909341SAndroid Build Coastguard Worker    vinserti128         m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
982*c0909341SAndroid Build Coastguard Worker    vinserti128         m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
983*c0909341SAndroid Build Coastguard Worker    punpcklwd           m6, m7, m0 ; top, bottom
984*c0909341SAndroid Build Coastguard Worker    punpckhwd           m7, m0
985*c0909341SAndroid Build Coastguard Worker.w16_loop:
986*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m3, [tlq+hq*2-4] ; next two left pixels
987*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m1, [v_weightsq+0] ; vertical weights, first row
988*c0909341SAndroid Build Coastguard Worker    punpcklwd           m3, m4     ; left, right
989*c0909341SAndroid Build Coastguard Worker    pshufd              m2, m3, q1111 ; pair built from the higher-addressed left pixel (first row)
990*c0909341SAndroid Build Coastguard Worker    pmaddwd            m10, m8, m2 ; horizontal term, first row
991*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m9
992*c0909341SAndroid Build Coastguard Worker    pshufd              m3, m3, q0000 ; pair built from the lower-addressed left pixel (second row)
993*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END        1, 1, 6, 7, 10, 2
994*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m1, [v_weightsq+4] ; vertical weights, second row
995*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m8, m3 ; horizontal term, second row
996*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m9
997*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq*0], m0 ; store first row before m0 is reused
998*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END        1, 1, 6, 7, 2, 3
999*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq*1], m0
1000*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dstq+strideq*2]
1001*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 8
1002*c0909341SAndroid Build Coastguard Worker    sub                 hq, 2
1003*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
1004*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 32: one row per iteration, written as two 32-byte halves ----
1005*c0909341SAndroid Build Coastguard Worker.w32:
1006*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM     15
1007*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m0, [tlq] ; bottom
1008*c0909341SAndroid Build Coastguard Worker    movu                m7, [tlq+hq*2+ 2] ; top pixels 0..15
1009*c0909341SAndroid Build Coastguard Worker    movu                m9, [tlq+hq*2+34] ; top pixels 16..31
; Horizontal weights, arranged per lane as in the .w16 path:
; m10/m11 serve the first 32 output bytes, m12/m13 the second.
1010*c0909341SAndroid Build Coastguard Worker    mova              xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
1011*c0909341SAndroid Build Coastguard Worker    mova              xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
1012*c0909341SAndroid Build Coastguard Worker    vinserti128        m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
1013*c0909341SAndroid Build Coastguard Worker    vinserti128        m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
1014*c0909341SAndroid Build Coastguard Worker    mova              xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
1015*c0909341SAndroid Build Coastguard Worker    mova              xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
1016*c0909341SAndroid Build Coastguard Worker    vinserti128        m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
1017*c0909341SAndroid Build Coastguard Worker    vinserti128        m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
1018*c0909341SAndroid Build Coastguard Worker    punpcklwd           m6, m7, m0 ; top, bottom (pixels 0..15)
1019*c0909341SAndroid Build Coastguard Worker    punpckhwd           m7, m0
1020*c0909341SAndroid Build Coastguard Worker    punpcklwd           m8, m9, m0 ; top, bottom (pixels 16..31)
1021*c0909341SAndroid Build Coastguard Worker    punpckhwd           m9, m0
1022*c0909341SAndroid Build Coastguard Worker.w32_loop:
1023*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m3, [tlq+hq*2-2] ; left pixel of this row
1024*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       m14, [v_weightsq] ; vertical weights of this row
1025*c0909341SAndroid Build Coastguard Worker    punpcklwd           m3, m4 ; left, right
1026*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m10, m3 ; horizontal term, first half
1027*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m11, m3
1028*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m6, m14 ; vertical term, first half
1029*c0909341SAndroid Build Coastguard Worker    paddd               m0, m1
1030*c0909341SAndroid Build Coastguard Worker    pmaddwd             m1, m7, m14
1031*c0909341SAndroid Build Coastguard Worker    paddd               m1, m2
1032*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m12, m3 ; horizontal term, second half (used by the macro below)
1033*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m13
; Open-coded equivalent of the SMOOTH_2D_END tail for the first half.
1034*c0909341SAndroid Build Coastguard Worker    psrld               m0, 8
1035*c0909341SAndroid Build Coastguard Worker    psrld               m1, 8
1036*c0909341SAndroid Build Coastguard Worker    packssdw            m0, m1
1037*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m5
1038*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*0], m0
1039*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END       14, 14, 8, 9, 2, 3
1040*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*1], m0
1041*c0909341SAndroid Build Coastguard Worker    add               dstq, strideq
1042*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 4
1043*c0909341SAndroid Build Coastguard Worker    dec                 hd
1044*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
1045*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 64: two passes over 32-pixel-wide column halves (x = 0, 32) ----
1046*c0909341SAndroid Build Coastguard Worker.w64:
; Re-declares the register names for this path (11 GPR names, 16 vector
; registers). NOTE(review): `dummy` presumably just occupies the r6 slot so
; the table address behind `base` is not reassigned -- confirm against x86inc.
1047*c0909341SAndroid Build Coastguard Worker    PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
1048*c0909341SAndroid Build Coastguard Worker    mov          dst_baseq, dstq ; remember the start values for the second pass
1049*c0909341SAndroid Build Coastguard Worker    mov           tl_baseq, tlq ; tl_base = tl - h*2 (address of bottom)
1050*c0909341SAndroid Build Coastguard Worker    mov    v_weights_baseq, v_weightsq
1051*c0909341SAndroid Build Coastguard Worker    xor                 xq, xq ; x = 0
1052*c0909341SAndroid Build Coastguard Worker.w64_loop_x:
1053*c0909341SAndroid Build Coastguard Worker    mov                 yq, hq ; row counter for this pass
1054*c0909341SAndroid Build Coastguard Worker    lea                tlq, [tl_baseq+hq*2] ; back to the incoming tl
1055*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m0, [tl_baseq] ; bottom
1056*c0909341SAndroid Build Coastguard Worker    movu                m7, [tlq+xq*2+ 2] ; top pixels x .. x+15
1057*c0909341SAndroid Build Coastguard Worker    movu                m9, [tlq+xq*2+34] ; top pixels x+16 .. x+31
; Horizontal weights for this half; r6 (and hence base) is advanced at the
; end of the pass, so the same operands address the next 128 bytes next time.
1058*c0909341SAndroid Build Coastguard Worker    mova              xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
1059*c0909341SAndroid Build Coastguard Worker    mova              xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
1060*c0909341SAndroid Build Coastguard Worker    vinserti128        m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
1061*c0909341SAndroid Build Coastguard Worker    vinserti128        m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
1062*c0909341SAndroid Build Coastguard Worker    mova              xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
1063*c0909341SAndroid Build Coastguard Worker    mova              xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
1064*c0909341SAndroid Build Coastguard Worker    vinserti128        m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
1065*c0909341SAndroid Build Coastguard Worker    vinserti128        m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
1066*c0909341SAndroid Build Coastguard Worker    punpcklwd           m6, m7, m0 ; top, bottom
1067*c0909341SAndroid Build Coastguard Worker    punpckhwd           m7, m0
1068*c0909341SAndroid Build Coastguard Worker    punpcklwd           m8, m9, m0
1069*c0909341SAndroid Build Coastguard Worker    punpckhwd           m9, m0
1070*c0909341SAndroid Build Coastguard Worker    lea                tlq, [tl_baseq-2] ; so [tlq+yq*2] is the left pixel of the current row
1071*c0909341SAndroid Build Coastguard Worker.w64_loop_y:
1072*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m3, [tlq+yq*2] ; left pixel of this row
1073*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m1, [v_weightsq] ; vertical weights of this row
1074*c0909341SAndroid Build Coastguard Worker    punpcklwd           m3, m4 ; left, right
1075*c0909341SAndroid Build Coastguard Worker    pmaddwd            m14, m10, m3 ; horizontal terms for all four register quarters
1076*c0909341SAndroid Build Coastguard Worker    pmaddwd            m15, m11, m3
1077*c0909341SAndroid Build Coastguard Worker    pmaddwd             m2, m12, m3
1078*c0909341SAndroid Build Coastguard Worker    pmaddwd             m3, m13
1079*c0909341SAndroid Build Coastguard Worker    pmaddwd             m0, m6, m1 ; vertical term, first 32 bytes
1080*c0909341SAndroid Build Coastguard Worker    paddd               m0, m14
1081*c0909341SAndroid Build Coastguard Worker    pmaddwd            m14, m7, m1
1082*c0909341SAndroid Build Coastguard Worker    paddd              m14, m15
; Open-coded equivalent of the SMOOTH_2D_END tail (m14 in place of m1,
; because m1 still holds the row weights needed by the macro call below).
1083*c0909341SAndroid Build Coastguard Worker    psrld               m0, 8
1084*c0909341SAndroid Build Coastguard Worker    psrld              m14, 8
1085*c0909341SAndroid Build Coastguard Worker    packssdw            m0, m14
1086*c0909341SAndroid Build Coastguard Worker    pavgw               m0, m5
1087*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*0], m0
1088*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END        8, 9, 1, 1, 2, 3
1089*c0909341SAndroid Build Coastguard Worker    mova       [dstq+32*1], m0
1090*c0909341SAndroid Build Coastguard Worker    add               dstq, strideq
1091*c0909341SAndroid Build Coastguard Worker    add         v_weightsq, 4
1092*c0909341SAndroid Build Coastguard Worker    dec                 yq
1093*c0909341SAndroid Build Coastguard Worker    jg .w64_loop_y
1094*c0909341SAndroid Build Coastguard Worker    lea               dstq, [dst_baseq+32*2] ; first row, 64 bytes (32 pixels) to the right
1095*c0909341SAndroid Build Coastguard Worker    add                 r6, 16*8 ; shift base so the weight loads read the next 128 bytes
1096*c0909341SAndroid Build Coastguard Worker    mov         v_weightsq, v_weights_baseq ; restart the vertical weights
1097*c0909341SAndroid Build Coastguard Worker    add                 xq, 32
1098*c0909341SAndroid Build Coastguard Worker    test                xb, 64 ; bit 6 set once x reaches 64, i.e. after two passes
1099*c0909341SAndroid Build Coastguard Worker    jz .w64_loop_x
1100*c0909341SAndroid Build Coastguard Worker    RET
1101*c0909341SAndroid Build Coastguard Worker
1102*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
1103*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_z1_16bpc_avx2_table]
1104*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
1105*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1106*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1107*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dr_intra_derivative]
1108*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
1109*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2
1110*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1111*c0909341SAndroid Build Coastguard Worker    mov                 dxd, angled
1112*c0909341SAndroid Build Coastguard Worker    and                 dxd, 0x7e
1113*c0909341SAndroid Build Coastguard Worker    add              angled, 165 ; ~90
1114*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [r7+dxq]
1115*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x4ff ; d = 90 - angle
1116*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_62]
1117*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1118*c0909341SAndroid Build Coastguard Worker.w4:
1119*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -64, 7
1120*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
1121*c0909341SAndroid Build Coastguard Worker    jae .w4_no_upsample
1122*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-1024]
1123*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 7
1124*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
1125*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
1126*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm3, [tlq+14]
1127*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+ 0]    ; 1 2 3 4 5 6 7 8
1128*c0909341SAndroid Build Coastguard Worker    palignr             xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
1129*c0909341SAndroid Build Coastguard Worker    paddw               xm0, [tlq- 2]    ; 0 1 2 3 4 5 6 7
1130*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1131*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
1132*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm1         ; -1 * a + 9 * b + 9 * c + -1 * d
1133*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm2, xm0    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
1134*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 3           ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
1135*c0909341SAndroid Build Coastguard Worker    pxor                xm4, xm4
1136*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm0
1137*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm0, r8m         ; pixel_max
1138*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], xm3
1139*c0909341SAndroid Build Coastguard Worker    movd                xm3, dxd
1140*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm2, xm4
1141*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd
1142*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm4
1143*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3
1144*c0909341SAndroid Build Coastguard Worker    pminsw              xm2, xm0
1145*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm1, xm2
1146*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm2
1147*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3]
1148*c0909341SAndroid Build Coastguard Worker    pslldq               m2, m3, 8
1149*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], xm0
1150*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm1
1151*c0909341SAndroid Build Coastguard Worker    paddw                m6, m3, m3
1152*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1153*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m6, 0xf0
1154*c0909341SAndroid Build Coastguard Worker    paddw                m6, m6
1155*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; xpos0 xpos1 xpos2 xpos3
1156*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [z_upsample]
1157*c0909341SAndroid Build Coastguard Worker.w4_upsample_loop:
1158*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
1159*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1160*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r3*2]
1161*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
1162*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base1
1163*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+r2*2]
1164*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
1165*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base2
1166*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r3*2], 1 ; 0 2
1167*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
1168*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base3
1169*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+r2*2], 1 ; 1 3
1170*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1171*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1172*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2
1173*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2
1174*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; frac
1175*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
1176*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
1177*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
1178*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6     ; xpos += dx
1179*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1180*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1181*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1182*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
1183*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
1184*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r5       ], xm1
1185*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1186*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1187*c0909341SAndroid Build Coastguard Worker    jg .w4_upsample_loop
1188*c0909341SAndroid Build Coastguard Worker    RET
1189*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1190*c0909341SAndroid Build Coastguard Worker.filter_strength: ; w4/w8/w16
1191*c0909341SAndroid Build Coastguard Worker%define base r3-z_filter_t0
1192*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1193*c0909341SAndroid Build Coastguard Worker    lea                  r3, [z_filter_t0]
1194*c0909341SAndroid Build Coastguard Worker    movd                xm1, angled
1195*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1196*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
1197*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, xm1
1198*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, [base+z_filter_wh]
1199*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r3+angleq*8]
1200*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1
1201*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m2
1202*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m0
1203*c0909341SAndroid Build Coastguard Worker    ret
; .w4_no_upsample: width-4 path used when the edge is not upsampled.
; Optionally smooths the top edge into the stack buffer at rsp (3 taps, plus
; two outer taps when the strength returned via .filter_strength is 3),
; repoints tlq at that buffer and falls through to .w4_main with maxbased
; holding the index of the last usable edge sample.
1204*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
1205*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7 ; max_base_x when no filtering takes place
1206*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1207*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
1208*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+3] ; temporary h+3 input consumed by .filter_strength
1209*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; leaves a bit mask in r5d
1210*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7 ; restore the default max_base_x
1211*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1212*c0909341SAndroid Build Coastguard Worker    jz .w4_main ; filter_strength == 0
1213*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; number of mask bits set = filter strength
1214*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm3, [tlq+14] ; last edge pixel (8 in the diagrams), used as right padding
1215*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq- 2]      ; 0 1 2 3 4 5 6 7
1216*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm1, [base+z_filter_k-4+r5*4+12*1] ; centre-tap coefficient for this strength
1217*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0] ; coefficient shared by the two adjacent taps
1218*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm3, xm0, 4   ; 2 3 4 5 6 7 8 8
1219*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, [tlq+ 0]      ; 1 2 3 4 5 6 7 8
1220*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm0 ; p[i-1] + p[i+1]
1221*c0909341SAndroid Build Coastguard Worker    pmullw              xm2, xm4
1222*c0909341SAndroid Build Coastguard Worker    movd           [rsp+16], xm3 ; two copies of the last pixel right after the 8 filtered samples
1223*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
1224*c0909341SAndroid Build Coastguard Worker    jne .w4_3tap ; only strength 3 adds the outer taps
1225*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2 ; fold the 3-tap sum into xm1 so xm2 can be reused
1226*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm3, xm0, 6   ; 3 4 5 6 7 8 8 8
1227*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
1228*c0909341SAndroid Build Coastguard Worker    movzx               r3d, word [tlq+14] ; top[7]
1229*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq+12] ; top[6]
1230*c0909341SAndroid Build Coastguard Worker    inc            maxbased ; sample 8 (written to [rsp+16] below) becomes the last one
1231*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm0 ; p[i-2] + p[i+2]
1232*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d ; top[6] - top[7]
1233*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm2 ; outer taps carry weight 2
1234*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4] ; top[6] + 7 * top[7] + 4
1235*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
1236*c0909341SAndroid Build Coastguard Worker    mov            [rsp+16], r2w ; replaces sample 8; the word after it keeps the plain last pixel
1237*c0909341SAndroid Build Coastguard Worker.w4_3tap:
1238*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
1239*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2 ; complete weighted sum
1240*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; later reads use the filtered copy on the stack
1241*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 3
1242*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8 ; CF = (h < 8)
1243*c0909341SAndroid Build Coastguard Worker    sbb            maxbased, -1 ; maxbased += 1 - CF, i.e. +1 only when h >= 8
1244*c0909341SAndroid Build Coastguard Worker    pavgw               xm0, xm1 ; (x + 1) >> 1; with the psrlw above: (sum + 8) >> 4
1245*c0909341SAndroid Build Coastguard Worker    mova              [tlq], xm0
; .w4_main: emit the 4-pixel-wide prediction, four rows per loop iteration.
; r3d tracks xpos, which advances by dx per row with 6 fractional bits
; (base = xpos >> 6). Each output is a[base] plus the rounded product of
; (a[base+1] - a[base]) and the masked low bits of xpos; lanes whose position
; has reached max_base_x are replaced by top[max_base_x] (m6).
; m5 and z_base_inc are defined outside this chunk: m5 is the mask ANDed with
; the position vector; z_base_inc is presumably the per-column offset table
; (confirm against its definition).
1246*c0909341SAndroid Build Coastguard Worker.w4_main:
1247*c0909341SAndroid Build Coastguard Worker    movd                xm3, dxd
1248*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [z_base_inc]
1249*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2] ; top[max_base_x]
1250*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; same fixed-point scale as xpos
1251*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3 ; dx in every word
1252*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1253*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd      ; xpos
1254*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1255*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3, m3 ; 2*dx
1256*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0       ; -max_base_x
1257*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m4, 0xcc ; qwords: dx 2dx | dx 2dx
1258*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4, m3 ; qwords: 3dx 4dx | 3dx 4dx
1259*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
1260*c0909341SAndroid Build Coastguard Worker    paddw                m4, m4 ; 4*dx: step per iteration
1261*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1 ; xpos + z_base_inc - (max_base_x << 6); negative while in range
1262*c0909341SAndroid Build Coastguard Worker.w4_loop:
1263*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq] ; xpos of row 1
1264*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1265*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r3*2] ; row 0 source pixels
1266*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq] ; xpos of row 2
1267*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1268*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq+r5*2] ; row 1 source pixels
1269*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq] ; xpos of row 3
1270*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base2
1271*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r3*2], 1 ; 0 2
1272*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq] ; xpos of the next iteration's row 0
1273*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base3
1274*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq+r5*2], 1 ; 1 3
1275*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2 ; a[base+0..3] for rows 0 1 | 2 3
1276*c0909341SAndroid Build Coastguard Worker    psrldq               m1, 2 ; rows 0/2 advanced by one pixel
1277*c0909341SAndroid Build Coastguard Worker    pslldq               m2, 6 ; rows 1/3: pixels 1..4 moved into the high qword
1278*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xcc ; a[base+1..4] for rows 0 1 | 2 3
1279*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; masked position bits (the interpolation fraction)
1280*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; pre-scale so pmulhrsw computes (diff * frac + 32) >> 6
1281*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; a[base+1] - a[base]
1282*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1283*c0909341SAndroid Build Coastguard Worker    psraw                m2, m3, 15 ; xpos < max_base_x
1284*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; advance the positions by four rows
1285*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1286*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m2 ; keep m0 where the mask is set, otherwise top[max_base_x]
1287*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1 ; rows 2 and 3
1288*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0 ; row 0
1289*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0 ; row 1
1290*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1291*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm1 ; row 2
1292*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm1 ; row 3
1293*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1294*c0909341SAndroid Build Coastguard Worker    jz .w4_end
1295*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1296*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, maxbased ; next row's xpos against max_base_x << 6 (unsigned)
1297*c0909341SAndroid Build Coastguard Worker    jb .w4_loop
1298*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
; Every remaining row starts at or beyond max_base_x: fill with m6, 4 rows at a time.
1299*c0909341SAndroid Build Coastguard Worker.w4_end_loop:
1300*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm6
1301*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm6
1302*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm6
1303*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r6       ], xm6
1304*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1305*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1306*c0909341SAndroid Build Coastguard Worker    jg .w4_end_loop
1307*c0909341SAndroid Build Coastguard Worker.w4_end:
1308*c0909341SAndroid Build Coastguard Worker    RET
; .w8: width-8 entry point. When upsampling applies (see the condition on the
; `ja` below) the top edge is doubled in resolution: between each pair of
; neighbours an extra sample clip((9*(p[i]+p[i+1]) - (p[i-1]+p[i+2]) + 8) >> 4)
; is inserted, the interleaved result is stored at rsp+0..63, and rows are
; predicted from it with a doubled dx. This path has no max_base_x clamp.
; m5 (fraction mask) and z_upsample (shuffle table) are defined outside this chunk.
1309*c0909341SAndroid Build Coastguard Worker.w8:
1310*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -64, 7 ; x86inc macro (defined elsewhere); rsp+0..63 is the scratch used below
1311*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+216]
1312*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb ; low byte replaced by h so a single compare covers every condition
1313*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1314*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1315*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+2]    ; 2 3 4 5 6 7 8 9   a b c d e f g _
1316*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+4]    ; 3 4 5 6 7 8 9 a   b c d e f g _ _
1317*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+0]    ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1318*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1319*c0909341SAndroid Build Coastguard Worker    jne .w8_upsample_h8 ; awkward single-pixel edge case
1320*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x20   ; 3 4 5 6 7 8 9 a   b c c _ _ _ _ _
1321*c0909341SAndroid Build Coastguard Worker.w8_upsample_h8:
1322*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1 ; inner pair: p[i] + p[i+1]
1323*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-2]    ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1324*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd ; dx doubled (presumably because the edge now has twice the resolution)
1325*c0909341SAndroid Build Coastguard Worker    psubw                m0, m2, m0 ; inner - outer
1326*c0909341SAndroid Build Coastguard Worker    psraw                m0, 3
1327*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1328*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0 ; inner + ((inner - outer) >> 3) = (9*inner - outer) >> 3
1329*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, r8m ; upper clamp from argument slot r8m (presumably bitdepth_max; confirm)
1330*c0909341SAndroid Build Coastguard Worker    movd                xm3, dxd
1331*c0909341SAndroid Build Coastguard Worker    pmaxsw               m2, m4 ; clamp below at 0
1332*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd ; xpos
1333*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m4 ; (x + 1) >> 1
1334*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3
1335*c0909341SAndroid Build Coastguard Worker    pminsw               m2, m0 ; clamp above
1336*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; interleave original (m1) and interpolated (m2) samples
1337*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
1338*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [z_upsample] ; pshufb control used inside the loop
1339*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], xm0 ; the four 128-bit pieces are stored so memory is in linear order
1340*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm1
1341*c0909341SAndroid Build Coastguard Worker    paddw                m6, m3, m3 ; position step for two rows
1342*c0909341SAndroid Build Coastguard Worker    vextracti128   [rsp+32], m0, 1
1343*c0909341SAndroid Build Coastguard Worker    vextracti128   [rsp+48], m1, 1
1344*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m6, 0xf0 ; xpos0 xpos1
1345*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop:
1346*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq] ; xpos of row 1
1347*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1348*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r3*2] ; 16 consecutive upsampled samples for row 0 (two loads)
1349*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+r3*2+16]
1350*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq] ; xpos of the next iteration's row 0
1351*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base1
1352*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r2*2], 1 ; row 1 goes in the high lanes
1353*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+r2*2+16], 1
1354*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4 ; z_upsample shuffle: evidently groups the two operands into separate qwords
1355*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1356*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2 ; first interpolation operand, 8 pixels per row
1357*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2 ; second interpolation operand
1358*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; masked position bits (the interpolation fraction)
1359*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; pre-scale so pmulhrsw computes (diff * frac + 32) >> 6
1360*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; difference between the two operands
1361*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1362*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6 ; advance the positions by two rows
1363*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1364*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0 ; row 0
1365*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1 ; row 1
1366*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1367*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1368*c0909341SAndroid Build Coastguard Worker    jg .w8_upsample_loop
1369*c0909341SAndroid Build Coastguard Worker    RET
; .w8_no_intra_edge_filter: reached from .w8_no_upsample with maxbased = h+7
; when the edge filter is disabled. Clamps maxbased to 15 with two bit
; operations, then predicts straight from the unfiltered edge at tlq.
1370*c0909341SAndroid Build Coastguard Worker.w8_no_intra_edge_filter:
1371*c0909341SAndroid Build Coastguard Worker    and            maxbased, 7
1372*c0909341SAndroid Build Coastguard Worker    or             maxbased, 8 ; imin(h+7, 15)
1373*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
; .w8_no_upsample: width-8 path without upsampling. Computes the filter
; strength; when it is non-zero, the 16 top-edge samples are smoothed into the
; stack buffer at rsp (3 taps, plus weight-2 outer taps for strength 3 with
; h > 8), tlq is repointed at that buffer, and control falls into .w8_main.
; Right-hand padding depends on h: the diagrams on each line show which
; sample gets replicated.
1374*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
1375*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+7] ; h+7: input to .filter_strength and default max_base_x
1376*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; same bit the w4 path tests for !enable_intra_edge_filter
1377*c0909341SAndroid Build Coastguard Worker    jnz .w8_no_intra_edge_filter
1378*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; leaves a bit mask in r5d
1379*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1380*c0909341SAndroid Build Coastguard Worker    jz .w8_main ; empty mask: predict from the unfiltered edge
1381*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; number of mask bits set = filter strength
1382*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+z_filter_k-4+r5*4+12*1] ; centre-tap coefficient
1383*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0] ; coefficient shared by the two adjacent taps
1384*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-2]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1385*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+0]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1386*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m2 ; centre coefficient * p[i]
1387*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8 ; flags stay live across the SIMD ops below (used by jl and je)
1388*c0909341SAndroid Build Coastguard Worker    jl .w8_filter_h4
1389*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m2 ; top dword now holds the last pixel twice
1390*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9   a b c d e f g g
1391*c0909341SAndroid Build Coastguard Worker    je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
1392*c0909341SAndroid Build Coastguard Worker    movzx               r3d, word [tlq+30] ; last loaded pixel (g)
1393*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 16
1394*c0909341SAndroid Build Coastguard Worker    mov            [rsp+32], r3d ; sample 16 = g (the upper word of this dword store is zero)
1395*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
1396*c0909341SAndroid Build Coastguard Worker    jne .w8_filter_end ; strengths other than 3 stay 3-tap
1397*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm6, xm0, xm0 ; 0 0 1 1 ...: supplies the duplicated first pixel
1398*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, [tlq+4], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g g g
1399*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, [tlq-4], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1400*c0909341SAndroid Build Coastguard Worker    movzx               r5d, word [tlq+28] ; pixel before the last (f)
1401*c0909341SAndroid Build Coastguard Worker    mov            [rsp+34], r3w ; sample 17 = g
1402*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6 ; p[i-2] + p[i+2]
1403*c0909341SAndroid Build Coastguard Worker    sub                 r5d, r3d ; f - g
1404*c0909341SAndroid Build Coastguard Worker    inc            maxbased ; 17
1405*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2 ; outer taps carry weight 2
1406*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+r3*8+4] ; f + 7*g + 4
1407*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1408*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 3
1409*c0909341SAndroid Build Coastguard Worker    mov            [rsp+32], r3w ; sample 16 = (f + 7*g + 4) >> 3
1410*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_end
1411*c0909341SAndroid Build Coastguard Worker.w8_filter_h4:
1412*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m2, q3321 ; high lane: 9 a b c -> a b c c
1413*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tlq+2], 0        ; 2 3 4 5 6 7 8 9   a b c c _ _ _ _
1414*c0909341SAndroid Build Coastguard Worker.w8_filter_end:
1415*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3 ; p[i-1] + p[i+1]
1416*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m4
1417*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; later reads use the filtered copy on the stack
1418*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1419*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; complete weighted sum
1420*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 3
1421*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m2 ; (x + 1) >> 1; with the psrlw above: (sum + 8) >> 4
1422*c0909341SAndroid Build Coastguard Worker    mova              [tlq], m0
; .w8_main: emit the 8-pixel-wide prediction, two rows per loop iteration
; (row 0 in the low 128-bit lane, row 1 in the high lane). r3d tracks xpos
; with 6 fractional bits; lanes whose position has reached max_base_x are
; replaced by top[max_base_x] (m6). m5 and z_base_inc are defined outside
; this chunk.
1423*c0909341SAndroid Build Coastguard Worker.w8_main:
1424*c0909341SAndroid Build Coastguard Worker    movd                xm3, dxd
1425*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [z_base_inc]
1426*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2] ; top[max_base_x]
1427*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; same fixed-point scale as xpos
1428*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3 ; dx in every word
1429*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1430*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd ; xpos
1431*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1432*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3, m3 ; 2*dx: step per iteration
1433*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; z_base_inc - (max_base_x << 6)
1434*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m4, 0xf0 ; xpos0 xpos1
1435*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1 ; negative while the position is below max_base_x
1436*c0909341SAndroid Build Coastguard Worker.w8_loop:
1437*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq] ; xpos of row 1
1438*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1439*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+r3*2] ; a[base0 + 0..7]
1440*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r3*2+2] ; a[base0 + 1..8]
1441*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq] ; xpos of the next iteration's row 0
1442*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1443*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5*2], 1 ; row 1 goes in the high lanes
1444*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r5*2+2], 1
1445*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; masked position bits (the interpolation fraction)
1446*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; pre-scale so pmulhrsw computes (diff * frac + 32) >> 6
1447*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; a[base+1] - a[base]
1448*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1449*c0909341SAndroid Build Coastguard Worker    psraw                m2, m3, 15 ; all-ones where the position is still below max_base_x
1450*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; advance the positions by two rows
1451*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1452*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m2 ; keep m0 where the mask is set, otherwise top[max_base_x]
1453*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0 ; row 0
1454*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1 ; row 1
1455*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1456*c0909341SAndroid Build Coastguard Worker    jz .w8_end
1457*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1458*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, maxbased ; next row's xpos against max_base_x << 6 (unsigned)
1459*c0909341SAndroid Build Coastguard Worker    jb .w8_loop
; Every remaining row starts at or beyond max_base_x: fill with m6, 2 rows at a time.
1460*c0909341SAndroid Build Coastguard Worker.w8_end_loop:
1461*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm6
1462*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm6
1463*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1464*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1465*c0909341SAndroid Build Coastguard Worker    jg .w8_end_loop
1466*c0909341SAndroid Build Coastguard Worker.w8_end:
1467*c0909341SAndroid Build Coastguard Worker    RET
; .w16_no_intra_edge_filter: reached from .w16 with maxbased = h+15 when the
; edge filter is disabled. Clamps maxbased to 31 with two bit operations,
; then predicts straight from the unfiltered edge at tlq.
1468*c0909341SAndroid Build Coastguard Worker.w16_no_intra_edge_filter:
1469*c0909341SAndroid Build Coastguard Worker    and            maxbased, 15
1470*c0909341SAndroid Build Coastguard Worker    or             maxbased, 16 ; imin(h+15, 31)
1471*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
; .w16: width-16 entry point. Computes the filter strength; when non-zero the
; top edge is smoothed in two 16-sample halves (m0 = first half, m1 = second
; half) which are stored at rsp+0 and rsp+32, and tlq is repointed at rsp.
; Strength 3 uses the 5-tap form
;   (((p[i-2] + p[i+2] + 4) >> 1) + p[i-1] + p[i] + p[i+1]) >> 2,
; other strengths use a 3-tap kernel from z_filter_k. For the second half the
; amount of right-hand padding depends on h (h < 8, h == 8, h > 8).
; Diagram labels on the [tlq+28..36] lines restart at 0 for the second half.
1472*c0909341SAndroid Build Coastguard Worker.w16:
1473*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -96, 7 ; x86inc macro (defined elsewhere); rsp+0..67 is the scratch used below
1474*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+15] ; h+15: input to .filter_strength and default max_base_x
1475*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; same bit the w4 path tests for !enable_intra_edge_filter
1476*c0909341SAndroid Build Coastguard Worker    jnz .w16_no_intra_edge_filter
1477*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; leaves a bit mask in r5d
1478*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1479*c0909341SAndroid Build Coastguard Worker    jz .w16_main ; empty mask: predict from the unfiltered edge
1480*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; number of mask bits set = filter strength
1481*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-2]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1482*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, [tlq+2]        ; 2 3 4 5 6 7 8 9   a b c d e f g h
1483*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
1484*c0909341SAndroid Build Coastguard Worker    jne .w16_filter_3tap
1485*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pw_3] ; +3 here and +1 inside pavgw give the +4 rounding term
1486*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm0 ; 0 0 1 1 ...: supplies the duplicated first pixel
1487*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq-4], 0xfe      ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1488*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq+0]            ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1489*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; p[i-2] + 3
1490*c0909341SAndroid Build Coastguard Worker    pavgw                m0, [tlq+4]            ; 3 4 5 6 7 8 9 a   b c d e f g h i
1491*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; add p[i-1] + p[i] + p[i+1]
1492*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2 ; first half finished
1493*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
1494*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1495*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3, [tlq+30]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1496*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8 ; flags stay live across the SIMD op below (used by jl and je)
1497*c0909341SAndroid Build Coastguard Worker    jl .w16_filter_5tap_h4
1498*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m3 ; duplicated high words supply the replicated last pixel
1499*c0909341SAndroid Build Coastguard Worker    je .w16_filter_5tap_h8
1500*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
1501*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
1502*c0909341SAndroid Build Coastguard Worker    movzx               r3d, word [tlq+62] ; last pixel of the second half
1503*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq+60] ; pixel before it
1504*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m4 ; (p[i-2] + p[i+2] + 4) >> 1
1505*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d
1506*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3 ; p[i-1] + p[i] + p[i+1]
1507*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4] ; word[tlq+60] + 7 * word[tlq+62] + 4
1508*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1509*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
1510*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2 ; second half finished
1511*c0909341SAndroid Build Coastguard Worker    mov            [rsp+66], r3w ; sample 33 = last pixel
1512*c0909341SAndroid Build Coastguard Worker    mov            [rsp+64], r2w ; sample 32 = blended tail computed above
1513*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; the original edge is no longer read (set again at .w16_filter_end2)
1514*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 33
1515*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
1516*c0909341SAndroid Build Coastguard Worker    cmovg          maxbased, r3d ; h > 16: max_base_x = 33, otherwise it stays h+15
1517*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_end2
1518*c0909341SAndroid Build Coastguard Worker.w16_filter_5tap_h8:
1519*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
1520*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, [tlq+34], 0x07      ; 3 4 5 6 7 8 9 9
1521*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm4 ; (p[i-2] + p[i+2] + 4) >> 1, low 8 samples only
1522*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
1523*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
1524*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 2
1525*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_end2
1526*c0909341SAndroid Build Coastguard Worker.w16_filter_5tap_h4:
1527*c0909341SAndroid Build Coastguard Worker    pshuflw             xm4, xm3, q3332          ; 4 5 5 5
1528*c0909341SAndroid Build Coastguard Worker    pshuflw             xm3, xm3, q3321          ; 3 4 5 5
1529*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm4 ; (p[i-2] + p[i+2] + 4) >> 1, low 4 samples meaningful
1530*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
1531*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
1532*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 2
1533*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_end2
1534*c0909341SAndroid Build Coastguard Worker.w16_filter_3tap:
1535*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+z_filter_k-4+r5*4+12*1] ; centre-tap coefficient
1536*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0] ; coefficient shared by the two adjacent taps
1537*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m3, [tlq+0]    ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1538*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1539*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4 ; first half: coefficient * (p[i-1] + p[i+1])
1540*c0909341SAndroid Build Coastguard Worker    pmullw               m3, m2 ; second half: centre coefficient * p[i]
1541*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; first-half weighted sum
1542*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
1543*c0909341SAndroid Build Coastguard Worker    je .w16_filter_3tap_h8
1544*c0909341SAndroid Build Coastguard Worker    jl .w16_filter_3tap_h4
1545*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m2 ; top dword now holds the last pixel twice
1546*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9   a b c d e f g g
1547*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_end
1548*c0909341SAndroid Build Coastguard Worker.w16_filter_3tap_h4:
1549*c0909341SAndroid Build Coastguard Worker    pshuflw             xm2, xm2, q3321     ; 2 3 4 4 _ _ _ _
1550*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_end
1551*c0909341SAndroid Build Coastguard Worker.w16_filter_3tap_h8:
1552*c0909341SAndroid Build Coastguard Worker    psrldq              xm2, 2 ; shift down one pixel, then replicate the last one
1553*c0909341SAndroid Build Coastguard Worker    pshufhw             xm2, xm2, q2210     ; 2 3 4 5 6 7 8 8
1554*c0909341SAndroid Build Coastguard Worker.w16_filter_end:
1555*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq+30]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1556*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m4 ; second half: coefficient * (p[i-1] + p[i+1])
1557*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 3
1558*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
1559*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3 ; second-half weighted sum
1560*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 3
1561*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m1 ; (x + 1) >> 1; with the psrlw above: (sum + 8) >> 4
1562*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m2 ; same rounding for the second half, result left in m1
1563*c0909341SAndroid Build Coastguard Worker.w16_filter_end2:
1564*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; later reads use the filtered copy on the stack
1565*c0909341SAndroid Build Coastguard Worker    mova           [tlq+ 0], m0
1566*c0909341SAndroid Build Coastguard Worker    mova           [tlq+32], m1
; .w16_main: emit the 16-pixel-wide prediction, two rows per loop iteration,
; one full 256-bit vector per row. r3d tracks xpos with 6 fractional bits;
; m3 holds xpos + z_base_inc - (max_base_x << 6) and advances by dx after
; each row. Lanes whose position has reached max_base_x are replaced by
; top[max_base_x] (m6). m5 and z_base_inc are defined outside this chunk.
1567*c0909341SAndroid Build Coastguard Worker.w16_main:
1568*c0909341SAndroid Build Coastguard Worker    movd                xm4, dxd
1569*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2] ; top[max_base_x]
1570*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; same fixed-point scale as xpos
1571*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4 ; dx in every word: per-row step
1572*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1573*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd ; xpos
1574*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1575*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4, [z_base_inc] ; dx + z_base_inc
1576*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0 ; negative while the position is below max_base_x
1577*c0909341SAndroid Build Coastguard Worker.w16_loop:
1578*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq] ; xpos of row 1
1579*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1580*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2] ; a[base0 + 0..15]
1581*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+2] ; a[base0 + 1..16]
1582*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq] ; xpos of the next iteration's row 0
1583*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1584*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; masked position bits (the interpolation fraction)
1585*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; pre-scale so pmulhrsw computes (diff * frac + 32) >> 6
1586*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; a[base+1] - a[base]
1587*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1588*c0909341SAndroid Build Coastguard Worker    psraw                m2, m3, 15 ; all-ones where the position is still below max_base_x
1589*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; advance the positions to row 1
1590*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; row 0 result lands in m1 so m0 can be reloaded for row 1
1591*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5*2] ; a[base1 + 0..15]
1592*c0909341SAndroid Build Coastguard Worker    vpblendvb            m2, m6, m1, m2 ; keep m1 where the mask is set, otherwise top[max_base_x]
1593*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2+2] ; a[base1 + 1..16]
1594*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m2 ; row 0
1595*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; same steps repeated for row 1
1596*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
1597*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1598*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1599*c0909341SAndroid Build Coastguard Worker    psraw                m2, m3, 15
1600*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; advance the positions to the next iteration's row 0
1601*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1602*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m2
1603*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0 ; row 1
1604*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1605*c0909341SAndroid Build Coastguard Worker    jz .w16_end
1606*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1607*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, maxbased ; next row's xpos against max_base_x << 6 (unsigned)
1608*c0909341SAndroid Build Coastguard Worker    jb .w16_loop
; Every remaining row starts at or beyond max_base_x: fill with m6, 2 rows at a time.
1609*c0909341SAndroid Build Coastguard Worker.w16_end_loop:
1610*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m6
1611*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m6
1612*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1613*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1614*c0909341SAndroid Build Coastguard Worker    jg .w16_end_loop
1615*c0909341SAndroid Build Coastguard Worker.w16_end:
1616*c0909341SAndroid Build Coastguard Worker    RET
; Width-32 path. Optionally smooths the tlq edge into a stack buffer, then writes
; one row of 32 words (two ymm stores) per iteration by interpolating along the edge.
; m5 is used as the fraction mask below but is loaded before this block (not visible
; here) -- NOTE(review): presumably pw_62; confirm against the function prologue.
1617*c0909341SAndroid Build Coastguard Worker.w32:
1618*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        -160, 8
    ; maxbase = h + 31, replaced by 63 when h > 32 (cmova = unsigned "above")
1619*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+31]
1620*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 63
1621*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1622*c0909341SAndroid Build Coastguard Worker    cmova          maxbased, r3d
    ; Bit 0x400 of angle set -> skip the smoothing pass entirely.
    ; NOTE(review): presumably the "edge filter disabled" flag; its setup is outside this block.
1623*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1624*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
    ; First 16 outputs. Per lane: m0 = p[-1] + p[0] + p[+1],
    ; m1 = pavgw(p[-2] + 3, p[+2]) = (p[-2] + p[+2] + 4) >> 1, result = (m0 + m1) >> 2,
    ; i.e. (p[-2] + 2*p[-1] + 2*p[0] + 2*p[+1] + p[+2] + 4) >> 3.
    ; For lane 0 the first sample is duplicated (punpcklwd) in place of the missing p[-2].
1625*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pw_3]
1626*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-2]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1627*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm0
1628*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1629*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+0]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1630*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1631*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+2]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
1632*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq+4]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    ; r3 = write cursor into the stack buffer; r5d = loop counter (maxbase - 31)
1633*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
1634*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1635*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [maxbaseq-31]
1636*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
1637*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
    ; Same kernel for the next 16 samples on each pass; tlq and r3 both advance by 32 bytes.
1638*c0909341SAndroid Build Coastguard Worker.w32_filter_loop:
1639*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq+30]
1640*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2, [tlq+28]
1641*c0909341SAndroid Build Coastguard Worker    add                 tlq, 32
1642*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+0]
1643*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq+4]
1644*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+2]
1645*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
1646*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1647*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
1648*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
1649*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
1650*c0909341SAndroid Build Coastguard Worker    jg .w32_filter_loop
    ; Tail vector. The jl below still tests the flags of "sub r5d, 16": the vector
    ; instructions in between do not write EFLAGS. A negative counter selects the
    ; 128-bit tail (.w32_filter_h8); zero falls through to the 256-bit tail.
1651*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
1652*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m0
1653*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1654*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1655*c0909341SAndroid Build Coastguard Worker    jl .w32_filter_h8
    ; The top dword of m1 holds the last loaded sample twice (from punpckhwd); blending
    ; it in pads the two rightmost taps instead of using memory past that sample.
1656*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
1657*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
    ; Scalar epilogue: r5 = word [tlq+62], r2 = word [tlq+60];
    ; r2 = (r2 - r5 + 8*r5 + 4) >> 3 = (r2 + 7*r5 + 4) >> 3.
1658*c0909341SAndroid Build Coastguard Worker    movzx               r5d, word [tlq+62]
1659*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq+60]
1660*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
1661*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r5d
1662*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1663*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r5*8+4]
1664*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1665*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
1666*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
    ; Store the last filtered vector, then append the two scalar words after it.
1667*c0909341SAndroid Build Coastguard Worker    mova            [r3+32], m0
1668*c0909341SAndroid Build Coastguard Worker    mov             [r3+66], r5w
1669*c0909341SAndroid Build Coastguard Worker    mov             [r3+64], r2w
    ; From here on the edge is read from the filtered stack copy; maxbase becomes 65 when h == 64.
1670*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
1671*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 65
1672*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1673*c0909341SAndroid Build Coastguard Worker    cmove          maxbased, r3d
1674*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
    ; 128-bit variant of the tail: only 8 more samples are filtered and stored.
1675*c0909341SAndroid Build Coastguard Worker.w32_filter_h8:
1676*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
1677*c0909341SAndroid Build Coastguard Worker    vpblendd            xm1, [tlq+34], 0x07      ; 3 4 5 6 7 8 9 9
1678*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm3
1679*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
1680*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
1681*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
1682*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2
1683*c0909341SAndroid Build Coastguard Worker    mova            [r3+32], xm0
    ; Main setup:
    ;   m6  = word at tlq[maxbase], broadcast (value used once a lane runs off the edge)
    ;   m4  = dx broadcast; r5d = xpos, starting at dx
    ;   m3  = dx + z_base_inc - (maxbase << 6); results are kept only where this is negative
    ;         (z_base_inc is defined elsewhere; presumably per-lane offsets in 1/64 units)
    ;   m7  = -1024, threshold for the second group of 16 lanes
1684*c0909341SAndroid Build Coastguard Worker.w32_main:
1685*c0909341SAndroid Build Coastguard Worker    movd                xm4, dxd
1686*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2]
1687*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
1688*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4
1689*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1690*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
1691*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_m1024] ; -16 * 64
1692*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1693*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4, [z_base_inc]
1694*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
    ; One output row per iteration.
    ;   r3 = xpos >> 6 (integer edge index), m2 = (m3 & m5) << 9 (pmulhrsw weight)
    ;   m0 = a + pmulhrsw(b - a, m2) with a = tl[r3 + i], b = tl[r3 + i + 1]
    ;   lanes  0-15: keep m0 where m3 < 0, otherwise m6
    ;   lanes 16-31: same weights m2; keep where m3 < m7 (-1024), otherwise m6
1695*c0909341SAndroid Build Coastguard Worker.w32_loop:
1696*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r5d
1697*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6
1698*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2]
1699*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+2]
1700*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3
1701*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
1702*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1703*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1704*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1705*c0909341SAndroid Build Coastguard Worker    psraw                m1, m3, 15
1706*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
1707*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
1708*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+32]
1709*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+34]
1710*c0909341SAndroid Build Coastguard Worker    add                 r5d, dxd
1711*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1712*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1713*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m7, m3
1714*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1715*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1716*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m2
1717*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
    ; Continue while rows remain and xpos < (maxbase << 6), compared unsigned.
1718*c0909341SAndroid Build Coastguard Worker    dec                  hd
1719*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1720*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1721*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, maxbased
1722*c0909341SAndroid Build Coastguard Worker    jb .w32_loop
    ; xpos has reached the end of the edge: every remaining row is filled with m6.
1723*c0909341SAndroid Build Coastguard Worker.w32_end_loop:
1724*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m6
1725*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m6
1726*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1727*c0909341SAndroid Build Coastguard Worker    dec                  hd
1728*c0909341SAndroid Build Coastguard Worker    jg .w32_end_loop
1729*c0909341SAndroid Build Coastguard Worker.w32_end:
1730*c0909341SAndroid Build Coastguard Worker    RET
; Width-64 path. Optionally smooths the tlq edge into a stack buffer, then writes
; one row of 64 words (four ymm stores) per iteration by interpolating along the edge.
; m5 is used as the fraction mask below but is loaded before this block (not visible
; here) -- NOTE(review): presumably pw_62; confirm against the function prologue.
1731*c0909341SAndroid Build Coastguard Worker.w64:
1732*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        -256, 10
    ; maxbase = h + 63 (no cap on this path)
1733*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+63]
    ; Bit 0x400 of angle set -> skip the smoothing pass entirely.
    ; NOTE(review): presumably the "edge filter disabled" flag; its setup is outside this block.
1734*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1735*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
    ; First 16 outputs. Per lane: m0 = p[-1] + p[0] + p[+1],
    ; m1 = pavgw(p[-2] + 3, p[+2]) = (p[-2] + p[+2] + 4) >> 1, result = (m0 + m1) >> 2,
    ; i.e. (p[-2] + 2*p[-1] + 2*p[0] + 2*p[+1] + p[+2] + 4) >> 3.
    ; For lane 0 the first sample is duplicated (punpcklwd) in place of the missing p[-2].
1736*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pw_3]
1737*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-2]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1738*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm0
1739*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1740*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+0]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1741*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1742*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+2]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
1743*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq+4]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    ; r3 = write cursor into the stack buffer; r5d = loop counter (h + 32)
1744*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
1745*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1746*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq+32]
1747*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
1748*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
    ; Same kernel for the next 16 samples on each pass; tlq and r3 both advance by 32 bytes.
1749*c0909341SAndroid Build Coastguard Worker.w64_filter_loop:
1750*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq+30]
1751*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2, [tlq+28]
1752*c0909341SAndroid Build Coastguard Worker    add                 tlq, 32
1753*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+0]
1754*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq+4]
1755*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+2]
1756*c0909341SAndroid Build Coastguard Worker    add                  r3, 32
1757*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1758*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
1759*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
1760*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
1761*c0909341SAndroid Build Coastguard Worker    jg .w64_filter_loop
    ; Tail vector: the top dword of m1 holds the last loaded sample twice (from
    ; punpckhwd); blending it in pads the two rightmost taps instead of using memory
    ; past that sample.
1762*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
1763*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m0
1764*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1765*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1766*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
1767*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
1768*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
1769*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1770*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
    ; From here on the edge is read from the filtered stack copy.
1771*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
1772*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
1773*c0909341SAndroid Build Coastguard Worker    mova            [r3+32], m0
    ; Main setup:
    ;   m6  = word at tlq[maxbase], broadcast (value used once a lane runs off the edge)
    ;   m4  = dx broadcast; r5d = xpos, starting at dx
    ;   m3  = dx + z_base_inc - (maxbase << 6); results are kept only where this is negative
    ;         (z_base_inc is defined elsewhere; presumably per-lane offsets in 1/64 units)
    ;   m7/m8/m9 = -1024 / -2048 / -3072, thresholds for lane groups 16-31 / 32-47 / 48-63
1774*c0909341SAndroid Build Coastguard Worker.w64_main:
1775*c0909341SAndroid Build Coastguard Worker    movd                xm4, dxd
1776*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2]
1777*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
1778*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4
1779*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1780*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
1781*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_m1024] ; -16 * 64
1782*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1783*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4, [z_base_inc]
1784*c0909341SAndroid Build Coastguard Worker    paddw                m8, m7, m7     ; -32 * 64
1785*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
1786*c0909341SAndroid Build Coastguard Worker    paddw                m9, m8, m7     ; -48 * 64
    ; One output row per iteration.
    ;   r3 = xpos >> 6 (integer edge index), m2 = (m3 & m5) << 9 (pmulhrsw weight,
    ;   shared by all four 16-lane groups)
    ;   each group: m0 = a + pmulhrsw(b - a, m2) with a = tl[r3 + i], b = tl[r3 + i + 1]
    ;   group 0 keeps m0 where m3 < 0; groups 1-3 where m3 < m7 / m8 / m9; otherwise m6
1787*c0909341SAndroid Build Coastguard Worker.w64_loop:
1788*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r5d
1789*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6
1790*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2]
1791*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+2]
1792*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3
1793*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
1794*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1795*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1796*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1797*c0909341SAndroid Build Coastguard Worker    psraw                m1, m3, 15
1798*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
1799*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
1800*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+32]
1801*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+34]
1802*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1803*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1804*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1805*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m7, m3
1806*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
1807*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
1808*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+64]
1809*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+66]
1810*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1811*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1812*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1813*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m8, m3
1814*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
1815*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
1816*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+96]
1817*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+98]
1818*c0909341SAndroid Build Coastguard Worker    add                 r5d, dxd
1819*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1820*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1821*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m9, m3
1822*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
1823*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1824*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m2
1825*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m0
    ; Continue while rows remain and xpos < (maxbase << 6), compared unsigned.
1826*c0909341SAndroid Build Coastguard Worker    dec                  hd
1827*c0909341SAndroid Build Coastguard Worker    jz .w64_end
1828*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1829*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, maxbased
1830*c0909341SAndroid Build Coastguard Worker    jb .w64_loop
    ; xpos has reached the end of the edge: every remaining row is filled with m6.
1831*c0909341SAndroid Build Coastguard Worker.w64_end_loop:
1832*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m6
1833*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m6
1834*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m6
1835*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m6
1836*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1837*c0909341SAndroid Build Coastguard Worker    dec                  hd
1838*c0909341SAndroid Build Coastguard Worker    jg .w64_end_loop
1839*c0909341SAndroid Build Coastguard Worker.w64_end:
1840*c0909341SAndroid Build Coastguard Worker    RET
1841*c0909341SAndroid Build Coastguard Worker
; ipred_z2_16bpc entry and common setup (the width-specific handlers follow).
; This prologue:
;   - resolves the handler address for the block width from ipred_z2_16bpc_avx2_table
;     (index = tzcnt(w)) and jumps to it at the end;
;   - rebases r9 to z_filter_t0 so that "base+sym" addresses constants relative to it;
;   - looks up dy and dx in dr_intra_derivative and negates both;
;   - flips bit 0x400 of angle (later tested as "!enable_intra_edge_filter");
;   - copies the 160 bytes at [tl-128, tl+32) to [rsp, rsp+160), so tl[0] lands at rsp+128;
;   - loads m11 with pw_62.
1842*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
1843*c0909341SAndroid Build Coastguard Worker%define base r9-z_filter_t0
1844*c0909341SAndroid Build Coastguard Worker    lea                  r9, [ipred_z2_16bpc_avx2_table]
1845*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
1846*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1847*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; dxq = &dr_intra_derivative - 90; a copy is kept in r8 for the dy lookup
1848*c0909341SAndroid Build Coastguard Worker    lea                 dxq, [dr_intra_derivative-90]
1849*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r9+wq*4]
1850*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-  0]
    ; dyd = low byte of angle (unaffected by the 0x400 flip that follows)
1851*c0909341SAndroid Build Coastguard Worker    movzx               dyd, angleb
1852*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
1853*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq- 32]
1854*c0909341SAndroid Build Coastguard Worker    mov                  r8, dxq
1855*c0909341SAndroid Build Coastguard Worker    sub                 dxq, dyq
1856*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq- 64]
    ; wq = absolute handler address; r9 then moves from the jump table to z_filter_t0
1857*c0909341SAndroid Build Coastguard Worker    add                  wq, r9
1858*c0909341SAndroid Build Coastguard Worker    add                  r9, z_filter_t0-ipred_z2_16bpc_avx2_table
1859*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tlq- 96]
    ; clear bit 0 of both table offsets before the 16-bit table loads
1860*c0909341SAndroid Build Coastguard Worker    and                 dyd, ~1
1861*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq-128]
1862*c0909341SAndroid Build Coastguard Worker    and                 dxq, ~1
1863*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [r8+dyq]  ; angle - 90
1864*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [dxq+270] ; 180 - angle
1865*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pw_62]
    ; edge copy to the stack, interleaved with the negation of dx and dy
1866*c0909341SAndroid Build Coastguard Worker    mova          [rsp+128], m1
1867*c0909341SAndroid Build Coastguard Worker    mova          [rsp+ 96], m2
1868*c0909341SAndroid Build Coastguard Worker    mova          [rsp+ 64], m3
1869*c0909341SAndroid Build Coastguard Worker    neg                 dxd
1870*c0909341SAndroid Build Coastguard Worker    mova          [rsp+ 32], m4
1871*c0909341SAndroid Build Coastguard Worker    neg                 dyq
1872*c0909341SAndroid Build Coastguard Worker    mova          [rsp+  0], m5
    ; dispatch to the width-specific handler
1873*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1874*c0909341SAndroid Build Coastguard Worker.w4:
1875*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z2_x_shuf]
1876*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [base+z_base_inc+2]
1877*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [dxq+(65<<6)] ; xpos
1878*c0909341SAndroid Build Coastguard Worker    mov                r10d, (63-4)<<6
1879*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1880*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; !enable_intra_edge_filter
1881*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1882*c0909341SAndroid Build Coastguard Worker    add              angled, 1022
1883*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1884*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1885*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
1886*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq+2]    ; 1 2 3 4
1887*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tlq+0]    ; 0 1 2 3
1888*c0909341SAndroid Build Coastguard Worker    pshuflw             xm2, xm0, q3321 ; 2 3 4 4
1889*c0909341SAndroid Build Coastguard Worker    pshuflw             xm3, xm1, q2100 ; 0 0 1 2
1890*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm4, r8m        ; pixel_max
1891*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z_upsample]
1892*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0
1893*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm3
1894*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [r8+dxq+(1<<6)]
1895*c0909341SAndroid Build Coastguard Worker    psubw               xm2, xm1, xm2
1896*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1897*c0909341SAndroid Build Coastguard Worker    psraw               xm2, 3
1898*c0909341SAndroid Build Coastguard Worker    pxor                xm3, xm3
1899*c0909341SAndroid Build Coastguard Worker    sub                r10d, 3<<6
1900*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
1901*c0909341SAndroid Build Coastguard Worker    paddw                m6, m6
1902*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm1, xm3
1903*c0909341SAndroid Build Coastguard Worker    sub              angled, 1075 ; angle - 53
1904*c0909341SAndroid Build Coastguard Worker    pavgw               xm1, xm3
1905*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1906*c0909341SAndroid Build Coastguard Worker    pminsw              xm1, xm4
1907*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
1908*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0
1909*c0909341SAndroid Build Coastguard Worker    movu          [rsp+130], xm1
1910*c0909341SAndroid Build Coastguard Worker    call .filter_strength
1911*c0909341SAndroid Build Coastguard Worker    jmp .w4_filter_left
1912*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .filter_strength: local subroutine (entered with call, leaves with ret).
; In:  r3d    = size key; its low byte is broadcast and compared against z_filter_wh
;      angled = biased angle; its low byte is broadcast, and angle >> 8 selects the
;               threshold row in the table at r9 (8 bytes per step)
; Out: r3d    = byte mask from pmovmskb; callers test it for zero and popcnt it
; Clobbers m0, and leaves m7 (angle bytes), m8 (size-match mask) and xm9 (threshold
; row) set; a later caller in this file reuses xm7/xm8/xm9 without calling again.
1913*c0909341SAndroid Build Coastguard Worker.filter_strength:
1914*c0909341SAndroid Build Coastguard Worker    movd                xm8, r3d
1915*c0909341SAndroid Build Coastguard Worker    mov                 r3d, angled
1916*c0909341SAndroid Build Coastguard Worker    movd                xm7, angled
1917*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m8, xm8
1918*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 8 ; is_sm << 1
1919*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, xm7
    ; m8 = 0xff in every byte where the size key equals the z_filter_wh entry
1920*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m8, [base+z_filter_wh]
1921*c0909341SAndroid Build Coastguard Worker    mova                xm9, [r9+r3*8]
    ; the angle byte survives only in size-matching positions, then is compared
    ; (signed bytes) against the thresholds in m9
1922*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8, m7
1923*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m9
1924*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m0
1925*c0909341SAndroid Build Coastguard Worker    ret
1926*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .upsample_left: local subroutine (entered with call, leaves with ret) for h == 4 / h == 8.
; Reads the 8 words at [tlq-16] and the 8 at [tlq-14], computes one new sample between
; each neighbouring pair, interleaves the new samples with the originals and stores
; the 32-byte result at [rsp+96+gprsize]. dy is doubled as a side effect.
; Per lane:
;   inner = a + b                  (the two samples on either side of the new one)
;   outer = their outer neighbours (the end sample is repeated where one is missing)
;   t     = inner + ((inner - outer) >> 3)   (arithmetic shift)
;   new   = min(pavgw(max(t, 0), 0), pixel_max) = min((max(t, 0) + 1) >> 1, pixel_max)
; The +gprsize in the store addresses accounts for the return address pushed by call.
; NOTE(review): gprsize is presumably the x86inc pointer-size constant; confirm.
1927*c0909341SAndroid Build Coastguard Worker.upsample_left: ; h4/h8
1928*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-16]            ; 8 7 6 5 4 3 2 1
1929*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq-14]            ; 7 6 5 4 3 2 1 0
1930*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm4, r8m ; pixel_max
1931*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
1932*c0909341SAndroid Build Coastguard Worker    je .upsample_left_h8
    ; h != 8: only the upper four words are shuffled; the lower ones are marked "_"
1933*c0909341SAndroid Build Coastguard Worker    pshufhw             xm2, xm0, q2100          ; _ _ _ _ 4 4 3 2
1934*c0909341SAndroid Build Coastguard Worker    pshufhw             xm3, xm1, q3321          ; _ _ _ _ 2 1 0 0
1935*c0909341SAndroid Build Coastguard Worker    jmp .upsample_left_end
    ; h == 8: all eight lanes are used; the outer taps come from two extra loads
1936*c0909341SAndroid Build Coastguard Worker.upsample_left_h8:
1937*c0909341SAndroid Build Coastguard Worker    pblendw             xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
1938*c0909341SAndroid Build Coastguard Worker    pblendw             xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
1939*c0909341SAndroid Build Coastguard Worker.upsample_left_end:
    ; xm1 = inner sum, xm2 = outer sum -> (inner - outer) >> 3
1940*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0
1941*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm3
1942*c0909341SAndroid Build Coastguard Worker    psubw               xm2, xm1, xm2
    ; dy *= 2
1943*c0909341SAndroid Build Coastguard Worker    add                 dyq, dyq
1944*c0909341SAndroid Build Coastguard Worker    psraw               xm2, 3
1945*c0909341SAndroid Build Coastguard Worker    pxor                xm3, xm3
1946*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
    ; clamp at 0, halve with rounding (pavgw against zero), clamp at pixel_max
1947*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm1, xm3
1948*c0909341SAndroid Build Coastguard Worker    pavgw               xm1, xm3
1949*c0909341SAndroid Build Coastguard Worker    pminsw              xm1, xm4
    ; interleave original (xm0) and new (xm1) samples, low half first
1950*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm0, xm1
1951*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm1
1952*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96+gprsize], xm2
1953*c0909341SAndroid Build Coastguard Worker    mova  [rsp+112+gprsize], xm0
1954*c0909341SAndroid Build Coastguard Worker    ret
1955*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above:
1956*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1957*c0909341SAndroid Build Coastguard Worker    sub              angled, 1112 ; angle - 90
1958*c0909341SAndroid Build Coastguard Worker    call .filter_strength
1959*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
1960*c0909341SAndroid Build Coastguard Worker    jz .w4_no_filter_above
1961*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
1962*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
1963*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*0]
1964*c0909341SAndroid Build Coastguard Worker    psrldq              xm0, xm1, 2     ; 1 2 3 4
1965*c0909341SAndroid Build Coastguard Worker    pshuflw             xm2, xm1, q2100 ; 0 0 1 2
1966*c0909341SAndroid Build Coastguard Worker    pmullw              xm4, xm0
1967*c0909341SAndroid Build Coastguard Worker    pshuflw             xm3, xm0, q3321 ; 2 3 4 4
1968*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3
1969*c0909341SAndroid Build Coastguard Worker    pshuflw             xm3, xm0, q3332 ; 3 4 4 4
1970*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5
1971*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*2]
1972*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm3
1973*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, r6m ; max_width
1974*c0909341SAndroid Build Coastguard Worker    pmullw              xm2, xm5
1975*c0909341SAndroid Build Coastguard Worker    packssdw            xm3, xm3
1976*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm4
1977*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
1978*c0909341SAndroid Build Coastguard Worker    psubw               xm3, [base+pw_1to16]
1979*c0909341SAndroid Build Coastguard Worker    pxor                xm4, xm4
1980*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 3
1981*c0909341SAndroid Build Coastguard Worker    pminsw              xm3, xm11 ; clip to byte range since there's no variable word blend
1982*c0909341SAndroid Build Coastguard Worker    pavgw               xm1, xm4
1983*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm1, xm0, xm3
1984*c0909341SAndroid Build Coastguard Worker    movq          [rsp+130], xm1
1985*c0909341SAndroid Build Coastguard Worker.w4_no_filter_above:
1986*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1987*c0909341SAndroid Build Coastguard Worker    add              angled, 973 ; angle + 883
1988*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1989*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1990*c0909341SAndroid Build Coastguard Worker    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
1991*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [base+pb_90]
1992*c0909341SAndroid Build Coastguard Worker    psubb               xm0, xm7 ; 180 - angle
1993*c0909341SAndroid Build Coastguard Worker    pand                xm0, xm8 ; reuse from previous filter_strength call
1994*c0909341SAndroid Build Coastguard Worker    pcmpgtb             xm0, xm9
1995*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, xm0
; w4: filter the left edge. On entry r3d is a strength bitmask: zero skips
; filtering, otherwise its popcount is the strength. Strength 3 uses the
; dedicated .w4_filter_left_s3 path; other strengths load their taps from
; z_filter_k and build the shifted-neighbour vector in m1.
1996*c0909341SAndroid Build Coastguard Worker.w4_filter_left:
1997*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
1998*c0909341SAndroid Build Coastguard Worker    jz .w4_main
1999*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d ; r3d = filter strength
2000*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]  ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2001*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, r7m ; max_height
2002*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 3
2003*c0909341SAndroid Build Coastguard Worker    je .w4_filter_left_s3
2004*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+z_filter_k-4+r3*4+12*1]
2005*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
2006*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m0 ; tap applied to the pixel itself
2007*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
2008*c0909341SAndroid Build Coastguard Worker    jl .w4_filter_left_h4
2009*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq-34]
2010*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m0
2011*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0xee  ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
; The SIMD instructions above leave EFLAGS untouched, so this je still acts on
; "cmp hd, 8": h == 8 is done here, h > 8 also patches the upper lane.
2012*c0909341SAndroid Build Coastguard Worker    je .w4_filter_left_end
2013*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x10  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2014*c0909341SAndroid Build Coastguard Worker    jmp .w4_filter_left_end
; w4: upsample the left edge via the shared .upsample_left helper, then enter
; the main w4 code at its second entry point with r11 = -16 and m9 = z_upsample
; (the plain .w4_main entry uses r11 = -8 and z2_x_shuf instead).
2015*c0909341SAndroid Build Coastguard Worker.w4_upsample_left:
2016*c0909341SAndroid Build Coastguard Worker    call .upsample_left
2017*c0909341SAndroid Build Coastguard Worker    mov                 r11, -16 ; byte step added to dyq per loop iteration
2018*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [base+z_upsample]
2019*c0909341SAndroid Build Coastguard Worker    jmp .w4_main_upsample_left
; w4: strength-3 left edge filter. With p[k] the pixel k positions away, each
; output is (p[-1] + p[0] + p[1] + ((p[-2] + p[2] + 4) >> 1)) >> 2; the edge
; ends are replicated by the blends (see the lane diagrams). Expects m0 to hold
; the 16 unfiltered pixels loaded from [tlq-32]. Result is left in m1.
2020*c0909341SAndroid Build Coastguard Worker.w4_filter_left_s3: ; can only be h16
2021*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2022*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pw_3]
2023*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, m2 ; p[0] + p[1]
2024*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m2
2025*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, [tlq-28], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
2026*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0, xm0
2027*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4 ; p[2] + 3 (pavgw below adds the final +1)
2028*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
2029*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq-36], 0xfe     ; 0 0 0 1 2 3 4 5   6 8 8 9 a b c d
2030*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4 ; + p[-1]
2031*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3 ; (p[-2] + p[2] + 4) >> 1
2032*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2033*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
2034*c0909341SAndroid Build Coastguard Worker    jmp .w4_filter_left_end2
; w4: finish the 3-tap left edge filter and store the edge to the stack.
;   .w4_filter_left_h4   builds the neighbour vector for the h < 8 case
;   .w4_filter_left_end  m1 = ((m1 + next pixels) * m3 + m2), then a rounded
;                        shift right by 4 (psrlw 3 followed by pavgw with zero)
;   .w4_filter_left_end2 keeps the unfiltered pixels (m0) in lanes where
;                        max_height - pw_16to1 is negative, stores to [rsp+96]
2035*c0909341SAndroid Build Coastguard Worker.w4_filter_left_h4:
2036*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m0, q2100 ; _ _ _ _ _ _ _ _   _ _ _ _ c c d e
2037*c0909341SAndroid Build Coastguard Worker.w4_filter_left_end:
2038*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq-30]  ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2039*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3 ; outer tap * (previous + next)
2040*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; + tap * pixel
2041*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2042*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 3
2043*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m2 ; (x + 1) >> 1
2044*c0909341SAndroid Build Coastguard Worker.w4_filter_left_end2:
2045*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5 ; max_height as words
2046*c0909341SAndroid Build Coastguard Worker    psubw                m5, [base+pw_16to1]
2047*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m11 ; presumably keeps positive words' byte sign bits clear for the byte blend -- confirm
2048*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m0, m5 ; sign bit set in m5 -> take m0 (unfiltered)
2049*c0909341SAndroid Build Coastguard Worker    mova           [rsp+96], m1
; w4: set up the main loop. Two entry points: .w4_main loads z2_x_shuf into m9
; and r11 = -8; .w4_main_upsample_left expects m9 and r11 to be set by the
; caller. Scalar and vector setup are interleaved. On exit:
;   m5          = frac_y << 9 (multiplier for pmulhrsw)
;   m0/m6/m7    = per-row xpos offsets {dx, 2dx, 3dx, 4dx}, running xpos, 4*dx
;   dyq         = pointer into the stack edge buffer (already advanced by r11)
;   r5d/r8d/r9d = (dy >> 6) - ((k*dy) >> 6) for k = 2, 3, 4, taking the low
;                 32 bits of the rorx results as dy >> 5 and dy >> 4; used as
;                 element offsets from dyq for the other three left loads
; NOTE(review): the sign conventions of dx/dy are established before this
; block and are not visible here.
2050*c0909341SAndroid Build Coastguard Worker.w4_main:
2051*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m9, [base+z2_x_shuf]
2052*c0909341SAndroid Build Coastguard Worker    mov                 r11, -8
2053*c0909341SAndroid Build Coastguard Worker.w4_main_upsample_left:
2054*c0909341SAndroid Build Coastguard Worker    movd                xm5, dyd
2055*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z2_y_shuf_h4]
2056*c0909341SAndroid Build Coastguard Worker    mov                 r2d, r8d ; r8d is set before this block; presumably the initial xpos
2057*c0909341SAndroid Build Coastguard Worker    movd                xm0, dxd
2058*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m5, xm5
2059*c0909341SAndroid Build Coastguard Worker    rorx                 r5, dyq, 5
2060*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [dyq*3]
2061*c0909341SAndroid Build Coastguard Worker    pmullw               m5, [base+z2_ymul]
2062*c0909341SAndroid Build Coastguard Worker    rorx                 r9, dyq, 4
2063*c0909341SAndroid Build Coastguard Worker    sar                 dyd, 6
2064*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
2065*c0909341SAndroid Build Coastguard Worker    sar                 r8d, 6
2066*c0909341SAndroid Build Coastguard Worker    pand                 m5, m11       ; frac_y
2067*c0909341SAndroid Build Coastguard Worker    neg                 dyd
2068*c0909341SAndroid Build Coastguard Worker    psllw                m5, 9
2069*c0909341SAndroid Build Coastguard Worker    add                 r5d, dyd
2070*c0909341SAndroid Build Coastguard Worker    add                 r8d, dyd
2071*c0909341SAndroid Build Coastguard Worker    add                 r9d, dyd
2072*c0909341SAndroid Build Coastguard Worker    paddw                m7, m0, m0 ; 2*dx
2073*c0909341SAndroid Build Coastguard Worker    lea                 dyq, [rsp+dyq*2+126] ; dyq becomes a pointer from here on
2074*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0xcc
2075*c0909341SAndroid Build Coastguard Worker    add                 dyq, r11
2076*c0909341SAndroid Build Coastguard Worker    neg                 r5d
2077*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, m7
2078*c0909341SAndroid Build Coastguard Worker    neg                 r8d
2079*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0  ; xpos0 xpos1 xpos2 xpos3
2080*c0909341SAndroid Build Coastguard Worker    neg                 r9d
2081*c0909341SAndroid Build Coastguard Worker    paddw                m7, m7 ; 4*dx: xpos step per 4-row iteration
2082*c0909341SAndroid Build Coastguard Worker    paddw                m6, m0 ; m6 is initialised before this block
; w4 main loop: produces four rows of four pixels per iteration.
; r2d carries the scalar xpos across rows: each lea adds dxq for the next row
; before the shr turns the current value into base_x, which indexes the edge
; buffer at [rsp+base_x*2]. Both interpolations have the form
; a + pmulhrsw(b - a, frac << 9), i.e. a + (((b - a) * frac + 32) >> 6).
; If base_x of the fourth row is >= 64 the left-edge part is skipped
; (.w4_toponly); otherwise left-edge pixels are interpolated and blended in for
; the lanes whose xpos word in m6 is negative.
2083*c0909341SAndroid Build Coastguard Worker.w4_loop:
2084*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2085*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6         ; base_x0
2086*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r2*2]
2087*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2088*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6         ; base_x1
2089*c0909341SAndroid Build Coastguard Worker    movu                xm3, [rsp+r3*2]
2090*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2091*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6         ; base_x2
2092*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r2*2], 1
2093*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq] ; r2d = xpos for the next iteration
2094*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6         ; base_x3
2095*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [rsp+r3*2], 1
2096*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
2097*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
2098*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m6 ; frac_x
2099*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m3
2100*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m3
2101*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2102*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2103*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
2104*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; m0 = prediction from the top edge
2105*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64 ; r3d = base_x3 here
2106*c0909341SAndroid Build Coastguard Worker    jge .w4_toponly
2107*c0909341SAndroid Build Coastguard Worker    movu                xm2, [dyq]
2108*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [dyq+r8*2], 1
2109*c0909341SAndroid Build Coastguard Worker    movu                xm3, [dyq+r5*2]
2110*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [dyq+r9*2], 1
2111*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9
2112*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
2113*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
2114*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
2115*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
2116*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
2117*c0909341SAndroid Build Coastguard Worker    psraw                m3, m6, 15 ; base_x < topleft
2118*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2119*c0909341SAndroid Build Coastguard Worker    vpermd               m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1   a2 b2 c2 d2 a3 b3 c3 d3
2120*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m1, m3 ; take the left prediction where the mask is set
2121*c0909341SAndroid Build Coastguard Worker.w4_toponly:
2122*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7     ; xpos += dx
2123*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2124*c0909341SAndroid Build Coastguard Worker    add                 dyq, r11 ; advance the left-edge pointer for the next 4 rows
2125*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2126*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2127*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
2128*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
2129*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
2130*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2131*c0909341SAndroid Build Coastguard Worker    jz .w4_end
2132*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2133*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r10d ; r10d is set before this block; falling through switches to the left-only loop
2134*c0909341SAndroid Build Coastguard Worker    jge .w4_loop
; w4 tail loop: four rows per iteration, predicted from the left edge only
; (no top-edge loads and no blend). Same left interpolation as in .w4_loop:
; a + pmulhrsw(b - a, m5). Relies on r3 = strideq*3 computed earlier in
; .w4_toponly. Runs until hd reaches zero, then returns.
2135*c0909341SAndroid Build Coastguard Worker.w4_leftonly_loop:
2136*c0909341SAndroid Build Coastguard Worker    movu                xm1, [dyq]
2137*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [dyq+r8*2], 1
2138*c0909341SAndroid Build Coastguard Worker    movu                xm2, [dyq+r5*2]
2139*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [dyq+r9*2], 1
2140*c0909341SAndroid Build Coastguard Worker    add                 dyq, r11 ; step the left-edge pointer for the next 4 rows
2141*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9
2142*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9
2143*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1, m2
2144*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2
2145*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2146*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; m5 = frac_y << 9
2147*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2148*c0909341SAndroid Build Coastguard Worker    vpermd               m0, m4, m0 ; reorder into row-major layout
2149*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2150*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2151*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
2152*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
2153*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
2154*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2155*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2156*c0909341SAndroid Build Coastguard Worker    jg .w4_leftonly_loop
2157*c0909341SAndroid Build Coastguard Worker.w4_end:
2158*c0909341SAndroid Build Coastguard Worker    RET
; w8 entry. Saves h in r10d. If bit 0x400 of angled is set, edge processing is
; skipped entirely (.w8_main). Otherwise either upsamples the top edge here or
; branches to .w8_no_upsample_above.
; Upsampling: each new in-between sample is
;   clip(((p0 + p1) + (((p0 + p1) - (p[-1] + p2)) >> 3) + 1) >> 1, 0, pixel_max)
; and is interleaved with the original pixels into [rsp+130] / [rsp+146].
; r8d is zeroed first and set to -1 (not) only on the upsample path; it is
; tested later ("test r8d, r8d") to select the upsampled main loop. dx is
; doubled on this path.
2159*c0909341SAndroid Build Coastguard Worker.w8:
2160*c0909341SAndroid Build Coastguard Worker    mov                r10d, hd
2161*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2162*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
2163*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+126]
2164*c0909341SAndroid Build Coastguard Worker    xor                 r8d, r8d
2165*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb ; low byte = h, upper bits from angle + 126
2166*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2167*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
2168*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+2]            ; 1 2 3 4 5 6 7 8
2169*c0909341SAndroid Build Coastguard Worker    mova                xm1, [tlq+0]            ; 0 1 2 3 4 5 6 7
2170*c0909341SAndroid Build Coastguard Worker    pblendw             xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
2171*c0909341SAndroid Build Coastguard Worker    pblendw             xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
2172*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm4, r8m ; pixel_max
2173*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0 ; inner pair sum
2174*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm3 ; outer pair sum
2175*c0909341SAndroid Build Coastguard Worker    not                 r8d ; flag: top edge was upsampled
2176*c0909341SAndroid Build Coastguard Worker    psubw               xm2, xm1, xm2
2177*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
2178*c0909341SAndroid Build Coastguard Worker    psraw               xm2, 3
2179*c0909341SAndroid Build Coastguard Worker    sub              angled, 53 ; angle - 53
2180*c0909341SAndroid Build Coastguard Worker    pxor                xm3, xm3
2181*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm1
2182*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2183*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm2, xm3 ; clamp to >= 0
2184*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
2185*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm3 ; (x + 1) >> 1
2186*c0909341SAndroid Build Coastguard Worker    pminsw              xm2, xm4 ; clamp to <= pixel_max
2187*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm0
2188*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm0
2189*c0909341SAndroid Build Coastguard Worker    movu          [rsp+130], xm1
2190*c0909341SAndroid Build Coastguard Worker    movu          [rsp+146], xm2
2191*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2192*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
; w8: top edge is not upsampled; optionally smooth it instead.
; .filter_strength returns a bitmask in r3d: zero means no filtering,
; otherwise its popcount selects three taps from z_filter_k. The filtered
; pixels overwrite the top edge copy at [rsp+130], except in lanes where
; max_width - pw_1to16 is negative, which keep the unfiltered pixels (xm0).
; NOTE(review): xm1 is read below without being loaded in this block; it is
; presumably left holding top pixels 0..7 by earlier code -- confirm.
2193*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above:
2194*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2195*c0909341SAndroid Build Coastguard Worker    sub              angled, 90 ; angle - 90
2196*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2197*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2198*c0909341SAndroid Build Coastguard Worker    jz .w8_no_filter_above
2199*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d ; r3d = filter strength
2200*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
2201*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*0]
2202*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm6, [base+z_filter_k-4+r3*4+12*2]
2203*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+2]            ; 1 2 3 4 5 6 7 8 x
2204*c0909341SAndroid Build Coastguard Worker    pblendw             xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
2205*c0909341SAndroid Build Coastguard Worker    pmullw              xm4, xm0 ; tap applied to the pixel itself
2206*c0909341SAndroid Build Coastguard Worker    pblendw             xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
2207*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm3 ; distance-1 neighbours
2208*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, [tlq+6], 0x07      ; 3 4 5 6 7 8 8 8 x
2209*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm3 ; distance-2 neighbours
2210*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, r6m ; max_width
2211*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm5
2212*c0909341SAndroid Build Coastguard Worker    pmullw              xm2, xm6
2213*c0909341SAndroid Build Coastguard Worker    packssdw            xm3, xm3
2214*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm4
2215*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
2216*c0909341SAndroid Build Coastguard Worker    psubw               xm3, [base+pw_1to16]
2217*c0909341SAndroid Build Coastguard Worker    pxor                xm4, xm4
2218*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 3
2219*c0909341SAndroid Build Coastguard Worker    pminsw              xm3, xm11
2220*c0909341SAndroid Build Coastguard Worker    pavgw               xm1, xm4 ; with the psrlw above: rounded shift right by 4
2221*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm1, xm0, xm3 ; sign bit set in xm3 -> keep unfiltered xm0
2222*c0909341SAndroid Build Coastguard Worker    movu          [rsp+130], xm1
; w8: left-edge decision. Either branch to the left upsampler, or compute the
; left filter-strength bitmask into r3d and dispatch on it in .w8_filter_left:
;   mask == 0     -> .w8_main (no filtering)
;   popcount != 3 -> .w8_filter_left_s12
;   popcount == 3 -> .filter_left_s3b (defined outside this section), entered
;                    with m6 = pw_3, m7 = pw_16 and flags from "cmp hd, 16"
2223*c0909341SAndroid Build Coastguard Worker.w8_no_filter_above:
2224*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-51]
2225*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb ; low byte = h, upper bits from angle - 51
2226*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2227*c0909341SAndroid Build Coastguard Worker    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
2228*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pb_90]
2229*c0909341SAndroid Build Coastguard Worker    psubb                m0, m7
2230*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8
2231*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m9
2232*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m0
2233*c0909341SAndroid Build Coastguard Worker.w8_filter_left:
2234*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2235*c0909341SAndroid Build Coastguard Worker    jz .w8_main
2236*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d ; r3d = filter strength
2237*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 3
2238*c0909341SAndroid Build Coastguard Worker    jne .w8_filter_left_s12
2239*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_3]
2240*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_16]
2241*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16 ; flags needed for later
2242*c0909341SAndroid Build Coastguard Worker    jmp .filter_left_s3b
; w8: upsample the left edge via the shared .upsample_left helper, then enter
; the w8 main code at its second entry point with the upsample-specific
; parameters: m7 = z2_y_shuf_us, r11 = rsp+118, r8 = -8 (the plain .w8_main
; entry uses z2_y_shuf, rsp+120 and -4 instead).
2243*c0909341SAndroid Build Coastguard Worker.w8_upsample_left:
2244*c0909341SAndroid Build Coastguard Worker    call .upsample_left
2245*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+z2_y_shuf_us]
2246*c0909341SAndroid Build Coastguard Worker    lea                 r11, [rsp+118]
2247*c0909341SAndroid Build Coastguard Worker    mov                  r8, -8
2248*c0909341SAndroid Build Coastguard Worker    jmp .w8_main_upsample_left
; Left edge filter for strengths other than 3 (reached when popcount != 3).
; r3 = strength selects the taps from z_filter_k. The .w16 entry first clears
; r8d so that the final "test r8d, r8d" always continues at .w8_main; the .w8
; entry keeps r8d as set by the w8 code (non-zero if the top edge was
; upsampled), in which case execution falls through into the upsample_main
; setup that follows this block.
; Output: filtered left pixels stored at [rsp+96]; lanes where
; max_height - pw_16to1 is negative keep the unfiltered pixels (m0).
2249*c0909341SAndroid Build Coastguard Worker.w16_filter_left_s12:
2250*c0909341SAndroid Build Coastguard Worker    xor                 r8d, r8d
2251*c0909341SAndroid Build Coastguard Worker.w8_filter_left_s12:
2252*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]  ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2253*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, r7m ; max_height
2254*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+z_filter_k-4+r3*4+12*1]
2255*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
2256*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m0 ; tap applied to the pixel itself
2257*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
2258*c0909341SAndroid Build Coastguard Worker    jl .w8_filter_left_h4
2259*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq-34]
2260*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m0
2261*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0xee  ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
; Flags are still those of "cmp hd, 8" (the SIMD instructions above do not
; modify EFLAGS): h == 8 skips the upper-lane patch.
2262*c0909341SAndroid Build Coastguard Worker    je .w8_filter_left_end
2263*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x10  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2264*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left_end
2265*c0909341SAndroid Build Coastguard Worker.w8_filter_left_h4:
2266*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m0, q2100 ; _ _ _ _ _ _ _ _   _ _ _ _ c c d e
2267*c0909341SAndroid Build Coastguard Worker.w8_filter_left_end:
2268*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq-30]  ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2269*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m3 ; outer tap * (previous + next)
2270*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2271*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2272*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 3
2273*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m2 ; with the psrlw above: rounded shift right by 4
2274*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
2275*c0909341SAndroid Build Coastguard Worker    psubw                m5, [base+pw_16to1]
2276*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m11
2277*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m0, m5 ; sign bit set in m5 -> keep unfiltered m0
2278*c0909341SAndroid Build Coastguard Worker    mova           [rsp+96], m1
2279*c0909341SAndroid Build Coastguard Worker    test                r8d, r8d
2280*c0909341SAndroid Build Coastguard Worker    jz .w8_main
2281*c0909341SAndroid Build Coastguard Worker; upsample_main
; Setup for the w8 loop used when the top edge was upsampled (reached by
; falling through when r8d is non-zero). Prepares, for the two-rows-per-
; iteration loop that follows:
;   m10     = z_upsample shuffle, m7 = z2_y_shuf
;   r5      = rsp+120, base address for the left-edge gathers
;   r2d     = dx + (66<<6), scalar xpos
;   m4      = 2*[z_base_inc+2] + {dx (low lane), 2*dx (high lane)}
;   m5      = 2*dx, the per-iteration xpos step (also reused as gather mask)
;   m6      = frac_y << 9, xm8/xm9 = base_y dword indices for the gathers
; Unlike the .w8_main setup, dy << 3 is not computed here: xm1 is zeroed below
; before it is read again, so that value would never be used on this path.
2282*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z_upsample]
2283*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+z2_y_shuf]
2284*c0909341SAndroid Build Coastguard Worker    lea                  r5, [rsp+120]
2285*c0909341SAndroid Build Coastguard Worker    movd                xm1, dyd
2286*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+z_base_inc+2]
2287*c0909341SAndroid Build Coastguard Worker    movd                xm2, dxd
2288*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, xm1
2289*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, xm2
2290*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2291*c0909341SAndroid Build Coastguard Worker    paddw                m4, m4
2292*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m1, [base+z2_ymul8]
2293*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2, m2
2295*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m5, 0xf0
2296*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [dxq+(66<<6)] ; xpos
2297*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
2298*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q2020
2299*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 6
2300*c0909341SAndroid Build Coastguard Worker    pxor                xm1, xm1
2301*c0909341SAndroid Build Coastguard Worker    psubw               xm8, xm1, xm0 ; base_y
2302*c0909341SAndroid Build Coastguard Worker    pand                 m6, m11      ; frac_y
2303*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm9, xm8, xm1 ; base_y words widened to dwords (high half)
2304*c0909341SAndroid Build Coastguard Worker    psllw                m6, 9
2305*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm8, xm1      ; base_y words widened to dwords (low half)
; w8 loop for an upsampled top edge: two rows of eight pixels per iteration.
; r2d is the scalar xpos (stepped by dxq per row); xpos >> 6 indexes the edge
; buffer at [rsp+idx*2], and 32 bytes are loaded per row and shuffled with m10.
; Interpolation is a + pmulhrsw(b - a, frac << 9). If the second row's index
; is >= 64 the left-edge part is skipped; otherwise left pixels are gathered,
; interpolated and blended in where the xpos word in m4 is negative.
; m5 serves as the gather mask; a gather clears its mask register, so m5 is
; saved in m1 and restored between the two gathers. (Using m5 as a mask
; requires the top bit of each qword to be set -- presumably guaranteed by the
; sign of dx; confirm.)
2306*c0909341SAndroid Build Coastguard Worker.w8_upsample_above_loop:
2307*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2308*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6
2309*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r2*2]
2310*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+r2*2+16]
2311*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq] ; xpos for the next iteration
2312*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6
2313*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r3*2], 1
2314*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+r3*2+16], 1
2315*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
2316*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10
2317*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2   ; a0 b0 c0 d0 e0 f0 g0 h0
2318*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2
2319*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m4  ; frac_x
2320*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2321*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2322*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
2323*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1       ; m0 = prediction from the top edge
2324*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64
2325*c0909341SAndroid Build Coastguard Worker    jge .w8_upsample_above_toponly
2326*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5       ; save the mask: the gather clears m5
2327*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m3, [r5+xm9*2], m5
2328*c0909341SAndroid Build Coastguard Worker    mova                 m5, m1       ; restore m5; m1 is consumed as the next mask
2329*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m2, [r5+xm8*2], m1
2330*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
2331*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
2332*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2, m3
2333*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m3
2334*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
2335*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6       ; m6 = frac_y << 9
2336*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2337*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
2338*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15   ; all-ones where the xpos word is negative
2339*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m1, m2
2340*c0909341SAndroid Build Coastguard Worker.w8_upsample_above_toponly:
2341*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5       ; advance xpos by two rows
2342*c0909341SAndroid Build Coastguard Worker    sub                  r5, 4        ; left-edge base moves down two pixels
2343*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
2344*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
2345*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2346*c0909341SAndroid Build Coastguard Worker    jz .w8_ret
2347*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2348*c0909341SAndroid Build Coastguard Worker    jmp .w8_upsample_above_loop
; Setup for the regular w8 main loop. Two entry points: .w8_main loads
; z2_y_shuf into m7, r11 = rsp+120 and r8 = -4; .w8_main_upsample_left expects
; m7, r11 and r8 to be set by the caller. On exit:
;   m0  = dy * [z2_ymul8] (per-column ypos terms)
;   m4  = [z_base_inc+2] + {dx (low lane), 2*dx (high lane)}
;   m5  = 2*dx, the per-iteration xpos step
;   r9d = dx + (65<<6), scalar xpos
;   r7  = copy of dstq; [rsp+284] = low dword of the dy << 3 words
; NOTE(review): r7, r8 and [rsp+284] are consumed outside the visible part of
; this function.
2349*c0909341SAndroid Build Coastguard Worker.w8_main:
2350*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+z2_y_shuf]
2351*c0909341SAndroid Build Coastguard Worker    lea                 r11, [rsp+120]
2352*c0909341SAndroid Build Coastguard Worker    mov                  r8, -4
2353*c0909341SAndroid Build Coastguard Worker.w8_main_upsample_left:
2354*c0909341SAndroid Build Coastguard Worker    movd                xm1, dyd
2355*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+z_base_inc+2]
2356*c0909341SAndroid Build Coastguard Worker    movd                xm2, dxd
2357*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, xm1
2358*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, xm2
2359*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2360*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m1, [base+z2_ymul8]
2361*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2, m2
2362*c0909341SAndroid Build Coastguard Worker    psllw               xm1, 3 ; dy << 3, stored to [rsp+284] below
2363*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m5, 0xf0 ; xpos0 xpos1
2364*c0909341SAndroid Build Coastguard Worker    lea                 r9d, [dxq+(65<<6)] ; xpos
2365*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
2366*c0909341SAndroid Build Coastguard Worker    movd          [rsp+284], xm1
; Head of the outer w8 loop. Saves the current m0 and m4 to the stack
; ([rsp+288], [rsp+320]) before they are modified, reloads the scalar xpos
; (r2d) and left-edge base pointer (r5) from r9d / r11, and derives from m0:
;   m6        = frac_y << 9 (multiplier for pmulhrsw)
;   xm8 / xm9 = base_y (negated m0 >> 6) widened to dwords for the gathers
; NOTE(review): the code that reloads the saved values and branches back here
; lies outside the visible part of the function.
2367*c0909341SAndroid Build Coastguard Worker.w8_loop0:
2368*c0909341SAndroid Build Coastguard Worker    mov                 r2d, r9d
2369*c0909341SAndroid Build Coastguard Worker    mova          [rsp+288], m0
2370*c0909341SAndroid Build Coastguard Worker    mov                  r5, r11
2371*c0909341SAndroid Build Coastguard Worker    mova          [rsp+320], m4
2372*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q2020
2373*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 6
2374*c0909341SAndroid Build Coastguard Worker    pxor                xm1, xm1
2375*c0909341SAndroid Build Coastguard Worker    psubw               xm8, xm1, xm0 ; base_y
2376*c0909341SAndroid Build Coastguard Worker    pand                 m6, m11      ; frac_y
2377*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm9, xm8, xm1 ; base_y 2 3 6 7
2378*c0909341SAndroid Build Coastguard Worker    psllw                m6, 9
2379*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm8, xm1      ; base_y 0 1 4 5
; Inner w8 loop body: two rows of eight pixels per iteration.
; r2d is the scalar xpos, stepped by dxq per row; base_x = xpos >> 6 indexes
; the edge buffer at [rsp+base_x*2], and the neighbouring sample is loaded from
; two bytes further. Interpolation is a + pmulhrsw(b - a, frac << 9), i.e.
; a + (((b - a) * frac + 32) >> 6). If base_x of the second row is >= 64 the
; left-edge part is skipped (.w8_toponly); otherwise left pixels are gathered,
; interpolated with m6 and blended in where the xpos word in m4 is negative.
; m5 doubles as the gather mask; a gather clears its mask register, so m5 is
; saved in m1 and restored between the two gathers.
2380*c0909341SAndroid Build Coastguard Worker.w8_loop:
2381*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2382*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6        ; base_x0
2383*c0909341SAndroid Build Coastguard Worker    movu                xm0, [rsp+r2*2]
2384*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r2*2+2]
2385*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq] ; xpos for the next iteration
2386*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; base_x1
2387*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [rsp+r3*2], 1
2388*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r3*2+2], 1
2389*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m4  ; frac_x
2390*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2391*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2392*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
2393*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1       ; m0 = prediction from the top edge
2394*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64
2395*c0909341SAndroid Build Coastguard Worker    jge .w8_toponly
2396*c0909341SAndroid Build Coastguard Worker    mova                 m1, m5       ; save the mask: the gather clears m5
2397*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m3, [r5+xm9*2], m5
2398*c0909341SAndroid Build Coastguard Worker    mova                 m5, m1       ; restore m5; m1 is consumed as the next mask
2399*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m2, [r5+xm8*2], m1
2400*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7       ; c0 d0 c1 d1               g0 h0 g1 h1
2401*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7       ; a0 b0 a1 b1               e0 f0 e1 f1
2402*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2, m3   ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
2403*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m3
2404*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
2405*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
2406*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2407*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
2408*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15   ; base_x < topleft
2409*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m1, m2
2410*c0909341SAndroid Build Coastguard Worker.w8_toponly:
2411*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5       ; xpos += dx
2412*c0909341SAndroid Build Coastguard Worker    add                  r5, r8
2413*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
2414*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
2415*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2416*c0909341SAndroid Build Coastguard Worker    jz .w8_end
2417*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2418*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, (63-8)<<6
2419*c0909341SAndroid Build Coastguard Worker    jge .w8_loop
2420*c0909341SAndroid Build Coastguard Worker.w8_leftonly_loop:
2421*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
2422*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m4, [r5+xm9*2], m5
2423*c0909341SAndroid Build Coastguard Worker    mova                 m5, m0
2424*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m3, [r5+xm8*2], m0
2425*c0909341SAndroid Build Coastguard Worker    add                  r5, r8
2426*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4, m7
2427*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m7
2428*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
2429*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
2430*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2431*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
2432*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2433*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
2434*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
2435*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
2436*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2437*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2438*c0909341SAndroid Build Coastguard Worker    jg .w8_leftonly_loop
2439*c0909341SAndroid Build Coastguard Worker.w8_end:
2440*c0909341SAndroid Build Coastguard Worker    sub                r10d, 1<<8
2441*c0909341SAndroid Build Coastguard Worker    jl .w8_ret
2442*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [rsp+284]
2443*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
2444*c0909341SAndroid Build Coastguard Worker    paddw                m0, [rsp+288] ; base_y += 8*dy
2445*c0909341SAndroid Build Coastguard Worker    add                 r9d, 8<<6
2446*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_512]
2447*c0909341SAndroid Build Coastguard Worker    movzx                hd, r10b
2448*c0909341SAndroid Build Coastguard Worker    paddw                m4, [rsp+320] ; base_x += 8*64
2449*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
2450*c0909341SAndroid Build Coastguard Worker    jmp .w8_loop0
2451*c0909341SAndroid Build Coastguard Worker.w8_ret:
2452*c0909341SAndroid Build Coastguard Worker    RET
2453*c0909341SAndroid Build Coastguard Worker.w16: ; edge setup for two 8-wide strips (see r10d), then joins the shared .w8_main
2454*c0909341SAndroid Build Coastguard Worker    movd                xm0, [tlq+32]
2455*c0909341SAndroid Build Coastguard Worker    lea                r10d, [hq+(1<<8)] ; low byte = h, bits 8+ = extra strips consumed by .w8_end
2456*c0909341SAndroid Build Coastguard Worker    movd          [rsp+160], xm0 ; copy 4 bytes from tl+32 into the stack edge buffer
2457*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2458*c0909341SAndroid Build Coastguard Worker    jnz .w8_main ; bit 0x400 set: go straight to prediction, no edge filtering
2459*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15] ; r3d is the input/output register of .filter_strength
2460*c0909341SAndroid Build Coastguard Worker    sub              angled, 90
2461*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; this function's own helper, defined outside this chunk; result is tested in r3d
2462*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2463*c0909341SAndroid Build Coastguard Worker    jz .w16_no_filter_above
2464*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d ; number of set mask bits selects the z_filter_k entry
2465*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1] ; coefficient applied to the centre tap (m0)
2466*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*0] ; coefficient applied to the m1 sum
2467*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+z_filter_k-4+r3*4+12*2] ; coefficient applied to the m2 sum
2468*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+2]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2469*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm1, xm1 ; m1 is live-in here (loaded before this chunk)
2470*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, [tlq-2], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2471*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m0
2472*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m0
2473*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq+4], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
2474*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2475*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq+6], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g g g
2476*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
2477*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, r6m ; max_width
2478*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m5
2479*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m6
2480*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m3 ; max_width as saturated words
2481*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
2482*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; weighted sum of all taps
2483*c0909341SAndroid Build Coastguard Worker    psubw                m3, [base+pw_1to16] ; max_width - (1..16), clamped by the pminsw below
2484*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
2485*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 3
2486*c0909341SAndroid Build Coastguard Worker    pminsw               m3, m11
2487*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m4 ; with the shift above: (sum >> 3, +1) >> 1
2488*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m0, m3 ; keep the unfiltered m0 where the m3 mask bytes have their top bit set
2489*c0909341SAndroid Build Coastguard Worker    movu          [rsp+130], m1 ; store the filtered top row into the stack edge buffer
2490*c0909341SAndroid Build Coastguard Worker.w16_no_filter_above: ; decide whether and how to filter the left edge
2491*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pb_90]
2492*c0909341SAndroid Build Coastguard Worker    psubb                m0, m7 ; m7, m8, m9 are live-in (set up outside this chunk)
2493*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8
2494*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m9
2495*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m0
2496*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2497*c0909341SAndroid Build Coastguard Worker    jz .w8_main ; empty mask: the left edge is not filtered
2498*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2499*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 3
2500*c0909341SAndroid Build Coastguard Worker    jne .w16_filter_left_s12 ; popcount other than 3 is handled at .w16_filter_left_s12
2501*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_3]
2502*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_16]
2503*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
2504*c0909341SAndroid Build Coastguard Worker    jne .filter_left_s3 ; h != 4 uses the shared code; the h == 4 case is handled inline below
2505*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq-8]    ; 0 1 2 3
2506*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tlq-6]    ; 1 2 3 4
2507*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, r7m ; max_height
2508*c0909341SAndroid Build Coastguard Worker    movq                xm4, [base+pw_16to1+24] ; 4to1
2509*c0909341SAndroid Build Coastguard Worker    pshuflw             xm2, xm0, q2100 ; 0 0 1 2
2510*c0909341SAndroid Build Coastguard Worker    pshuflw             xm3, xm1, q3321 ; 2 3 4 4
2511*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0
2512*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
2513*c0909341SAndroid Build Coastguard Worker    pshuflw             xm2, xm0, q1000 ; 0 0 0 1
2514*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm6 ; + pw_3 ahead of the pavgw
2515*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5
2516*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm3
2517*c0909341SAndroid Build Coastguard Worker    psubw               xm5, xm4 ; max_height - (4..1)
2518*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
2519*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm11
2520*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 2
2521*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm1, xm0, xm5 ; keep the unfiltered xm0 where the xm5 mask bytes have their top bit set
2522*c0909341SAndroid Build Coastguard Worker    movq          [rsp+120], xm1 ; 4 filtered left pixels into the stack edge buffer
2523*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
2524*c0909341SAndroid Build Coastguard Worker.w32: ; edge setup for four 8-wide strips (see r10d), then the shared filters below
2525*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq+32]
2526*c0909341SAndroid Build Coastguard Worker    movd                xm0, [tlq+64]
2527*c0909341SAndroid Build Coastguard Worker    lea                r10d, [hq+(3<<8)] ; low byte = h, bits 8+ = extra strips consumed by .w8_end
2528*c0909341SAndroid Build Coastguard Worker    mova          [rsp+160], m2 ; copy tl+32..tl+67 into the stack edge buffer
2529*c0909341SAndroid Build Coastguard Worker    movd          [rsp+192], xm0
2530*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2531*c0909341SAndroid Build Coastguard Worker    jnz .w8_main ; bit 0x400 set: go straight to prediction, no edge filtering
2532*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_3]
2533*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, r6m ; max_width
2534*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_16]
2535*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 32 ; byte offset of the last 16-pixel group read via [tlq+r3+...]
2536*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m0
2537*c0909341SAndroid Build Coastguard Worker    psubw                m0, [base+pw_1to16]
2538*c0909341SAndroid Build Coastguard Worker    pminsw               m8, m0, m11 ; m8 = blend mask for the first 16 pixels
2539*c0909341SAndroid Build Coastguard Worker    psubw                m9, m8, m7 ; m9 = m8 - 16, blend mask for the group at offset r3
2540*c0909341SAndroid Build Coastguard Worker.w32_filter_above: ; also entered from .w64 with r3d = 96 and m6-m9 already set
2541*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+2]
2542*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xm1, xm1 ; m1 is live-in here (loaded before this chunk)
2543*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6, [tlq+6]
2544*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
2545*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, [tlq-2], 0xfe        ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2546*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq+4]
2547*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r3+2] ; unfiltered pixels of the last group
2548*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6, [tlq+r3-2]
2549*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m4
2550*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m3
2551*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2552*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
2553*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, [tlq+r3+4], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
2554*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m5
2555*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [tlq+r3]
2556*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2557*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
2558*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
2559*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m0, m8 ; keep the unfiltered m0 where the m8 mask bytes have their top bit set
2560*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
2561*c0909341SAndroid Build Coastguard Worker    vpblendvb            m2, m3, m9 ; same selection for the last group, using m9
2562*c0909341SAndroid Build Coastguard Worker    movu          [rsp+130], m1 ; filtered first 16 pixels into the stack edge buffer
2563*c0909341SAndroid Build Coastguard Worker    movu       [rsp+r3+130], m2 ; filtered last group; execution then falls through into the left-edge filter
2564*c0909341SAndroid Build Coastguard Worker.filter_left_s3: ; left-edge filter; expects m6 = pw_3 and m7 = pw_16 (set by the code that reaches here)
2565*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16 ; these flags stay live until the je and jnp below
2566*c0909341SAndroid Build Coastguard Worker    jl .filter_left_s3_h8 ; h8
2567*c0909341SAndroid Build Coastguard Worker.filter_left_s3b: ; NOTE(review): code jumping here directly presumably arrives with flags equivalent to cmp hd, 16 -- confirm
2568*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
2569*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-30]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
2570*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, r7m ; max_height
2571*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, m2
2572*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m2
2573*c0909341SAndroid Build Coastguard Worker    mov                 r3d, hd
2574*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
2575*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
2576*c0909341SAndroid Build Coastguard Worker    not                  r3 ; r3 = ~h = -(h+1); NOT leaves the flags untouched
2577*c0909341SAndroid Build Coastguard Worker    psubw                m5, [base+pw_16to1]
2578*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
2579*c0909341SAndroid Build Coastguard Worker    pminsw               m8, m11, m5 ; m8 = blend mask for the 16 pixels nearest the top-left
2580*c0909341SAndroid Build Coastguard Worker    je .filter_left_s3_end ; h16 (ZF still comes from cmp hd, 16; nothing in between writes flags)
2581*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq-34]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2582*c0909341SAndroid Build Coastguard Worker    pavgw                m2, [tlq-36]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2583*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2584*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
2585*c0909341SAndroid Build Coastguard Worker    vpblendvb            m3, m1, m0, m8 ; keep the unfiltered m0 where the m8 mask bytes have their top bit set
2586*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-64]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
2587*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, [tlq-62]   ; 3 4 5 6 7 8 9 a   b c d e f g h i
2588*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6, [tlq-60]   ; 4 5 6 7 8 9 a b   c d e f g h i j
2589*c0909341SAndroid Build Coastguard Worker    psubw                m8, m7 ; mask for the next 16 pixels: subtract 16
2590*c0909341SAndroid Build Coastguard Worker    mova           [rsp+96], m3
2591*c0909341SAndroid Build Coastguard Worker    jnp .filter_left_s3_end ; h32 (PF from cmp hd, 16: 32-16 = 0x10 has odd parity so PF=0; 64-16 = 0x30 gives PF=1)
2592*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq-96] ; from here on: h == 64 only
2593*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq-66]
2594*c0909341SAndroid Build Coastguard Worker    pavgw                m2, [tlq-68]
2595*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2596*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5, [tlq-94]
2597*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6, [tlq-92]
2598*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
2599*c0909341SAndroid Build Coastguard Worker    paddw                m4, [tlq- 98]
2600*c0909341SAndroid Build Coastguard Worker    pavgw                m2, [tlq-100]
2601*c0909341SAndroid Build Coastguard Worker    vpblendvb            m3, m1, m0, m8
2602*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-128]
2603*c0909341SAndroid Build Coastguard Worker    psubw                m8, m7
2604*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
2605*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, [tlq-126]
2606*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6, [tlq-124]
2607*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 2
2608*c0909341SAndroid Build Coastguard Worker    mova           [rsp+64], m3
2609*c0909341SAndroid Build Coastguard Worker    vpblendvb            m4, m5, m8
2610*c0909341SAndroid Build Coastguard Worker    psubw                m8, m7
2611*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m4
2612*c0909341SAndroid Build Coastguard Worker.filter_left_s3_end: ; last 16 pixels (farthest from the top-left); m0/m1/m2/m8 were prepared above
2613*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm0, xm0 ; duplicate the first pixel for edge padding
2614*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8   9 a b c d e f g
2615*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq+r3*2-2], 0xfe   ; 2 2 2 3 4 5 6 7   8 9 a b c d e f
2616*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
2617*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
2618*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2619*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
2620*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m0, m8
2621*c0909341SAndroid Build Coastguard Worker    mova     [rsp+r3*2+130], m1 ; r3 = -(h+1), so the address is rsp + 128 - 2*h
2622*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
2623*c0909341SAndroid Build Coastguard Worker.filter_left_s3_h8: ; left-edge filter for heights below 16 (reached via jl after cmp hd, 16); 128-bit only
2624*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-16]            ; 0 1 2 3 4 5 6 7
2625*c0909341SAndroid Build Coastguard Worker    movu                xm3, [tlq-14]            ; 1 2 3 4 5 6 7 8
2626*c0909341SAndroid Build Coastguard Worker    pblendw             xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
2627*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, r7m ; max_height
2628*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0, xm3
2629*c0909341SAndroid Build Coastguard Worker    pblendw             xm3, [tlq-12], 0x7f      ; 2 3 4 5 6 7 8 8
2630*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
2631*c0909341SAndroid Build Coastguard Worker    vpblendd            xm2, [tlq-20], 0x0e      ; 0 0 0 1 2 3 4 5
2632*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm6 ; + xm6 (pw_3 on the path from .w16) ahead of the pavgw
2633*c0909341SAndroid Build Coastguard Worker    packssdw            xm5, xm5
2634*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm3
2635*c0909341SAndroid Build Coastguard Worker    psubw               xm5, [base+pw_16to1+16] ; 8to1
2636*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
2637*c0909341SAndroid Build Coastguard Worker    pminsw              xm5, xm11
2638*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 2
2639*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm1, xm0, xm5 ; keep the unfiltered xm0 where the xm5 mask bytes have their top bit set
2640*c0909341SAndroid Build Coastguard Worker    mova          [rsp+112], xm1 ; 8 filtered left pixels into the stack edge buffer
2641*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
2642*c0909341SAndroid Build Coastguard Worker.w64: ; edge setup for eight 8-wide strips (see r10d); filters the middle 32 pixels, then reuses .w32_filter_above
2643*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq+ 32]
2644*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq+ 64]
2645*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tlq+ 96]
2646*c0909341SAndroid Build Coastguard Worker    movd                xm0, [tlq+128]
2647*c0909341SAndroid Build Coastguard Worker    lea                r10d, [hq+(7<<8)] ; low byte = h, bits 8+ = extra strips consumed by .w8_end
2648*c0909341SAndroid Build Coastguard Worker    mova          [rsp+160], m2 ; copy tl+32..tl+131 into the stack edge buffer
2649*c0909341SAndroid Build Coastguard Worker    mova          [rsp+192], m3
2650*c0909341SAndroid Build Coastguard Worker    mova          [rsp+224], m4
2651*c0909341SAndroid Build Coastguard Worker    movd          [rsp+256], xm0
2652*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2653*c0909341SAndroid Build Coastguard Worker    jnz .w8_main ; bit 0x400 set: go straight to prediction, no edge filtering
2654*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+pw_3]
2655*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+34]     ; 2 3 4 5 6 7 8 9   a b c d e f g h
2656*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2657*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2658*c0909341SAndroid Build Coastguard Worker    pavgw                m2, [tlq+38]     ; 4 5 6 7 8 9 a b   c d e f g h h h
2659*c0909341SAndroid Build Coastguard Worker    paddw                m5, [tlq+36]     ; 3 4 5 6 7 8 9 a   b c d e f g h h
2660*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+66] ; same five-tap pattern for the next 16 pixels
2661*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6, [tlq+62]
2662*c0909341SAndroid Build Coastguard Worker    paddw                m7, m4, [tlq+64]
2663*c0909341SAndroid Build Coastguard Worker    pavgw                m3, [tlq+70]
2664*c0909341SAndroid Build Coastguard Worker    paddw                m7, [tlq+68]
2665*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
2666*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, r6m ; max_width
2667*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 96 ; byte offset of the last 16-pixel group, handled by .w32_filter_above
2668*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
2669*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7
2670*c0909341SAndroid Build Coastguard Worker    psubw                m5, [base+pw_1to16]
2671*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 2
2672*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pw_16]
2673*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
2674*c0909341SAndroid Build Coastguard Worker    pminsw               m8, m11, m5 ; m8 = blend mask for pixels 0..15 (used in .w32_filter_above)
2675*c0909341SAndroid Build Coastguard Worker    psubw                m9, m8, m7 ; mask for pixels 16..31
2676*c0909341SAndroid Build Coastguard Worker    vpblendvb            m2, m0, m9 ; keep the unfiltered m0 where the m9 mask bytes have their top bit set
2677*c0909341SAndroid Build Coastguard Worker    psubw                m9, m7 ; mask for pixels 32..47
2678*c0909341SAndroid Build Coastguard Worker    vpblendvb            m3, m4, m9
2679*c0909341SAndroid Build Coastguard Worker    psubw                m9, m7 ; mask for pixels 48..63, consumed by .w32_filter_above
2680*c0909341SAndroid Build Coastguard Worker    movu          [rsp+162], m2 ; filtered middle groups into the stack edge buffer
2681*c0909341SAndroid Build Coastguard Worker    movu          [rsp+194], m3
2682*c0909341SAndroid Build Coastguard Worker    jmp .w32_filter_above ; first and last 16 pixels are filtered there
2683*c0909341SAndroid Build Coastguard Worker
2684*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
2685*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_z3_16bpc_avx2_table] ; jump table indexed by tzcnt(h)
2686*c0909341SAndroid Build Coastguard Worker    tzcnt                hd, hm ; hd = number of trailing zero bits of h
2687*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
2688*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dr_intra_derivative+45*2-1] ; biased table base for the dy lookup below
2689*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2 ; tl now points one 16-bit pixel below its incoming value
2690*c0909341SAndroid Build Coastguard Worker    movsxd               hq, [r6+hq*4] ; table entries are signed 32-bit offsets from the table base
2691*c0909341SAndroid Build Coastguard Worker    sub              angled, 180
2692*c0909341SAndroid Build Coastguard Worker    add                  hq, r6 ; hq = absolute address of the per-height code
2693*c0909341SAndroid Build Coastguard Worker    mov                 dyd, angled
2694*c0909341SAndroid Build Coastguard Worker    neg                 dyd ; dyd = 180 - angle
2695*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400 ; flip bit 0x400; later tested as !enable_intra_edge_filter
2696*c0909341SAndroid Build Coastguard Worker    or                  dyq, ~0x7e ; keep bits 1..6 of dy and set every other bit
2697*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [r7+dyq] ; dy = 16-bit entry from dr_intra_derivative
2698*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_62] ; word mask 62, applied to ypos before the <<9 / pmulhrsw step
2699*c0909341SAndroid Build Coastguard Worker    mov              org_wd, wd ; keep the original width in org_w
2700*c0909341SAndroid Build Coastguard Worker    jmp                  hq ; dispatch to the per-height code (.h4 follows)
2701*c0909341SAndroid Build Coastguard Worker.h4: ; h == 4 entry: takes the upsampled-edge path when the checks below allow it, else .h4_no_upsample
2702*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -64, 7 ; x86inc stack allocation (64 bytes, 7 vector registers); see x86inc for the negative-size form
2703*c0909341SAndroid Build Coastguard Worker    lea                  r7, [strideq*3] ; r7 = 3*stride, for the fourth row store
2704*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
2705*c0909341SAndroid Build Coastguard Worker    jae .h4_no_upsample ; low byte of angle (after the -180 bias) >= 40 (unsigned): no upsampling
2706*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq-1024]
2707*c0909341SAndroid Build Coastguard Worker    sar                 r4d, 7
2708*c0909341SAndroid Build Coastguard Worker    add                 r4d, wd
2709*c0909341SAndroid Build Coastguard Worker    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
2710*c0909341SAndroid Build Coastguard Worker    mova                xm2, [tlq-14]            ; 0 1 2 3 4 5 6 7
2711*c0909341SAndroid Build Coastguard Worker    pblendw             xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2712*c0909341SAndroid Build Coastguard Worker    vpblendd            xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2713*c0909341SAndroid Build Coastguard Worker    pshufd              xm3, xm1, q0000 ; broadcast pixel 0 as padding for [rsp+0]
2714*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2
2715*c0909341SAndroid Build Coastguard Worker    paddw               xm0, [tlq-12]            ; 1 2 3 4 5 6 7 8
2716*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm4, r8m ; pixel_max
2717*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd ; dy *= 2, since the edge is upsampled by 2
2718*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm1, xm0 ; inner-pair sum minus outer-pair sum
2719*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], xm3
2720*c0909341SAndroid Build Coastguard Worker    movd                xm3, dyd
2721*c0909341SAndroid Build Coastguard Worker    psraw               xm0, 3
2722*c0909341SAndroid Build Coastguard Worker    neg                 dyd ; scalar ypos steps downward by dy
2723*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0 ; inner + (inner - outer) / 8
2724*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
2725*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [dyq+(16<<6)+63] ; ypos
2726*c0909341SAndroid Build Coastguard Worker    pmaxsw              xm1, xm0 ; clamp below at 0
2727*c0909341SAndroid Build Coastguard Worker    pavgw               xm1, xm0 ; (x + 1) >> 1
2728*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3
2729*c0909341SAndroid Build Coastguard Worker    pminsw              xm1, xm4 ; clamp above at pixel_max
2730*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm1, xm2 ; interleave the interpolated samples with the original pixels
2731*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2
2732*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3, m3
2733*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], xm0 ; upsampled edge lives at [rsp+16] .. [rsp+47]
2734*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2
2735*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm1
2736*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2, m2 ; m4 = per-iteration ypos step (4 columns)
2737*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
2738*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m2, 0xf0 ; ypos0 ypos1   ypos2 ypos3
2739*c0909341SAndroid Build Coastguard Worker.h4_upsample_loop: ; 4 output columns (8 bytes of each of the 4 rows) per iteration
2740*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq] ; r2d/r4d alternate as the scalar ypos of successive columns
2741*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6
2742*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r2*2] ; column 0 samples
2743*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq]
2744*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
2745*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+r4*2] ; column 1 samples
2746*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq]
2747*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6
2748*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r2*2], 1 ; column 2 samples in the upper lane
2749*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq] ; ypos for the next iteration
2750*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
2751*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+r4*2], 1 ; column 3 samples in the upper lane
2752*c0909341SAndroid Build Coastguard Worker    psrld                m0, m1, 16
2753*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0   c3 d3 c2 d2 c1 d1 c0 d0
2754*c0909341SAndroid Build Coastguard Worker    pslld                m2, 16
2755*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m2, 0xaa
2756*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; ypos masked with m5 (pw_62 from the entry code)
2757*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; pre-shift so pmulhrsw yields (diff*frac + 32) >> 6
2758*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2759*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
2760*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; advance the vector ypos
2761*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; m1 = m0 + pmulhrsw(m1 - m0, m2)
2762*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m1, 1
2763*c0909341SAndroid Build Coastguard Worker    punpckhdq           xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
2764*c0909341SAndroid Build Coastguard Worker    punpckldq           xm1, xm2      ; a3 b3 c3 d3 a2 b2 c2 d2
2765*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
2766*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm0
2767*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm1
2768*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm1
2769*c0909341SAndroid Build Coastguard Worker    add                dstq, 8 ; 4 pixels * 2 bytes to the right
2770*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
2771*c0909341SAndroid Build Coastguard Worker    jg .h4_upsample_loop
2772*c0909341SAndroid Build Coastguard Worker    RET
2773*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .filter_strength -- local helper shared by the h4/h8/h16 paths; it is entered
; with `call` and left with a plain `ret`.
;   in:  maxbased and angled as prepared by the caller
;   out: r5d = byte mask from pmovmskb; the callers test it for zero and popcnt
;        it to obtain the filter strength
;   r4 is left pointing at z_filter_t0 so `base`-relative table loads in the
;   callers keep working; m0/m1 are overwritten and angled is shifted right by 8.
2774*c0909341SAndroid Build Coastguard Worker.filter_strength: ; h4/h8/h16
2775*c0909341SAndroid Build Coastguard Worker%define base r4-z_filter_t0
2776*c0909341SAndroid Build Coastguard Worker    lea                  r4, [z_filter_t0] ; r4 = anchor for every `base+...` address
2777*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
2778*c0909341SAndroid Build Coastguard Worker    movd                xm1, angled ; copy taken before the shift below
2779*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2780*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0 ; low byte of maxbased in every byte
2781*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, xm1 ; low byte of the unshifted angle in every byte
2782*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, [base+z_filter_wh] ; 0xff where the table byte equals maxbased
2783*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1 ; keep the angle byte only in matching positions
2784*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r4+angleq*8] ; 16 threshold bytes, row picked by the shifted angle
2785*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m1 ; signed byte compare against the thresholds
2786*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m0 ; one result bit per byte that passed
2787*c0909341SAndroid Build Coastguard Worker    ret
; .h4_no_upsample -- h4 path without edge upsampling. Chooses max_base and, when
; .filter_strength reports a non-zero strength, writes a smoothed copy of the
; edge to the stack and retargets tlq at it. Execution then falls through into
; .h4_main.
2788*c0909341SAndroid Build Coastguard Worker.h4_no_upsample:
2789*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7 ; max_base used when no filtering takes place
2790*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2791*c0909341SAndroid Build Coastguard Worker    jnz .h4_main
2792*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+3] ; value handed to .filter_strength in maxbased
2793*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2794*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7 ; restore after using maxbased as a call argument
2795*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2796*c0909341SAndroid Build Coastguard Worker    jz .h4_main ; filter_strength == 0
2797*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; strength = number of set mask bits
2798*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-14]       ; 0 1 2 3 4 5 6 7
2799*c0909341SAndroid Build Coastguard Worker    movu                xm3, [tlq-12]       ; 1 2 3 4 5 6 7 8
2800*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [base+z_filter_k-4+r5*4+12*1] ; coefficient multiplied with the centre sample
2801*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0] ; coefficient multiplied with the +-1 neighbour sum
2802*c0909341SAndroid Build Coastguard Worker    pmullw              xm2, xm0 ; centre term
2803*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2804*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0, xm3 ; sum of the two +-1 neighbours
2805*c0909341SAndroid Build Coastguard Worker    movd           [rsp+12], xm0 ; low dword (the words labelled `0 0` above) goes right below the buffer at rsp+16
2806*c0909341SAndroid Build Coastguard Worker    pmullw              xm1, xm4 ; neighbour term
2807*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
2808*c0909341SAndroid Build Coastguard Worker    jne .h4_filter_3tap ; only strength 3 adds the outer taps below
2809*c0909341SAndroid Build Coastguard Worker    pblendw             xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
2810*c0909341SAndroid Build Coastguard Worker    vpblendd            xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2811*c0909341SAndroid Build Coastguard Worker    movzx               r4d, word [tlq-14] ; a (r4 stops being the `base` anchor here)
2812*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq-12] ; b
2813*c0909341SAndroid Build Coastguard Worker    inc            maxbased ; max_base grows by one along with the extra sample stored at rsp+14
2814*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2 ; fold the centre term in now, xm2 is reused below
2815*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3 ; sum of the two +-2 neighbours
2816*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r4d
2817*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm0, xm0 ; outer-tap sum doubled
2818*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r4*8+4] ; (b - a) + 8*a + 4 = 7*a + b + 4
2819*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
2820*c0909341SAndroid Build Coastguard Worker    mov            [rsp+14], r2w ; replaces the upper word written by the movd above
2821*c0909341SAndroid Build Coastguard Worker.h4_filter_3tap:
2822*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
2823*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2 ; adds the centre term (jne path) or the doubled outer taps (strength 3)
2824*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+30] ; tlq = last word of the 16 bytes stored at rsp+16 below
2825*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, 3
2826*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2827*c0909341SAndroid Build Coastguard Worker    sbb            maxbased, -1 ; maxbased += 1 - CF, i.e. +1 unless w < 8
2828*c0909341SAndroid Build Coastguard Worker    pavgw               xm0, xm1 ; (x + 1) >> 1 against zero; with the psrlw this equals (sum + 8) >> 4
2829*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm0 ; filtered edge samples
; .h4_main -- h4 prediction loop. Each iteration produces four columns by
; interpolating between neighbouring edge samples and stores them as four rows
; of four 16-bit pixels. Once the scalar position reaches the limit derived
; from max_base, the remaining columns are filled with the sample at max_base.
;   in: tlq = edge pointer, maxbased = max_base, dyd = per-column step
;   m5 is set up outside this block; NOTE(review): presumably the mask that
;   selects the fractional position bits -- confirm at its definition.
2830*c0909341SAndroid Build Coastguard Worker.h4_main:
2831*c0909341SAndroid Build Coastguard Worker    movd                xm3, dyd
2832*c0909341SAndroid Build Coastguard Worker    neg            maxbaseq
2833*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [z_base_inc]
2834*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2] ; sample max_base words below tlq: the fill value
2835*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; -max_base with 6 fractional bits (positions are shifted right by 6 to get a base)
2836*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3 ; dy in every word
2837*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [maxbaseq+3*64]
2838*c0909341SAndroid Build Coastguard Worker    neg                 dyq ; positions decrease from here on
2839*c0909341SAndroid Build Coastguard Worker    movd                xm2, r4d
2840*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
2841*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63] ; ypos
2842*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m1
2843*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3, m3 ; dy*2
2844*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, xm2
2845*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0 ; dy and dy*2 interleaved
2846*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0, m0 ; dy*4: vector step per iteration (four columns)
2847*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3 ; dy*3 and dy*4 interleaved
2848*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
2849*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, 0xf0 ; ypos0 ypos1   ypos2 ypos3
2850*c0909341SAndroid Build Coastguard Worker    or             maxbased, 63 ; scalar limit compared against r4d at the loop bottom
2851*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2 ; bias so the sign bit carries the max_base test taken by psraw in the loop
2852*c0909341SAndroid Build Coastguard Worker.h4_loop:
2853*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq] ; position of the next column
2854*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base0
2855*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r4*2]
2856*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq]
2857*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base1
2858*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq+r5*2]
2859*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
2860*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base2
2861*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r4*2], 1
2862*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; r4 = position for the next iteration
2863*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base3
2864*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq+r5*2], 1
2865*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1, m2
2866*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2
2867*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3 ; position bits selected by the m5 mask
2868*c0909341SAndroid Build Coastguard Worker    palignr              m0, m1, 4    ; a3 b3 a2 b2 a1 b1 a0 b0   c3 d3 c2 d2 c1 d1 c0 d0
2869*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; scale the weight for pmulhrsw
2870*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; difference between the two neighbouring samples
2871*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2 ; difference * weight, rounded
2872*c0909341SAndroid Build Coastguard Worker    psraw                m2, m3, 15   ; ypos < max_base_y
2873*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4 ; advance the vector positions by four columns
2874*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; interpolated samples
2875*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m6, m1, m2 ; interpolated value where the mask is set, fill sample elsewhere
2876*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m1, 1
2877*c0909341SAndroid Build Coastguard Worker    punpckhdq           xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
2878*c0909341SAndroid Build Coastguard Worker    punpckldq           xm1, xm2      ; a3 b3 c3 d3 a2 b2 c2 d2
2879*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm0
2880*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm0
2881*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm1
2882*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm1
2883*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4 ; four columns written
2884*c0909341SAndroid Build Coastguard Worker    jz .h4_end
2885*c0909341SAndroid Build Coastguard Worker    add                dstq, 8 ; four 16-bit pixels to the right
2886*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
2887*c0909341SAndroid Build Coastguard Worker    jg .h4_loop ; keep interpolating while the position is above the limit
2888*c0909341SAndroid Build Coastguard Worker.h4_end_loop: ; remaining columns: constant fill with the max_base sample in m6
2889*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm6
2890*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm6
2891*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm6
2892*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm6
2893*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2894*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
2895*c0909341SAndroid Build Coastguard Worker    jg .h4_end_loop
2896*c0909341SAndroid Build Coastguard Worker.h4_end:
2897*c0909341SAndroid Build Coastguard Worker    RET
; .h8 -- entry for the h8 case. Reserves the stack buffer and checks whether the
; edge is to be upsampled; if not it branches to .h8_no_upsample. Otherwise it
; builds an edge with one new sample interleaved per original sample at
; rsp+0..63 and runs .h8_upsample_loop, which writes four columns of eight rows
; per iteration.
;   m5 is set up outside this block; NOTE(review): presumably the mask that
;   selects the fractional position bits -- confirm at its definition.
2898*c0909341SAndroid Build Coastguard Worker.h8:
2899*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq+216]
2900*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -64, 8
2901*c0909341SAndroid Build Coastguard Worker    mov                 r4b, wb ; low byte := w, so the single unsigned compare below also rejects w > 8
2902*c0909341SAndroid Build Coastguard Worker    lea                  r7, [strideq*3] ; r7 = stride*3
2903*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, 8
2904*c0909341SAndroid Build Coastguard Worker    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
2905*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-30]     ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2906*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6   7 8 9 a b c d e
2907*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq-34]     ; _ _ 0 1 2 3 4 5   6 7 8 9 a b c d
2908*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2909*c0909341SAndroid Build Coastguard Worker    je .h8_upsample_w8
2910*c0909341SAndroid Build Coastguard Worker    pshufhw             xm3, xm2, q1000
2911*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m3, 0x0f     ; _ _ _ _ 4 4 4 5   6 7 8 9 a b c d
2912*c0909341SAndroid Build Coastguard Worker.h8_upsample_w8:
2913*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-28]     ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2914*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, r8m ; pixel_max
2915*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd ; dy*2, used as the step over the interleaved edge
2916*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1, m0 ; inner pair sum minus outer pair sum
2917*c0909341SAndroid Build Coastguard Worker    movd                xm6, dyd
2918*c0909341SAndroid Build Coastguard Worker    psraw                m0, 3
2919*c0909341SAndroid Build Coastguard Worker    neg                 dyd
2920*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; inner + ((inner - outer) >> 3)
2921*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2922*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m0 ; clamp at 0
2923*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [dyq+(16<<6)+63] ; ypos
2924*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m0 ; (x + 1) >> 1
2925*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
2926*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m4 ; clamp at pixel_max
2927*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1, m2 ; interleave the new samples with the original ones
2928*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2
2929*c0909341SAndroid Build Coastguard Worker    vextracti128   [rsp+48], m0, 1 ; the four stores lay the interleaved edge out at rsp+0..63
2930*c0909341SAndroid Build Coastguard Worker    vextracti128   [rsp+32], m1, 1
2931*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6 ; vector step covering two columns
2932*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm0
2933*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], xm1
2934*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7 ; ypos0 ypos1
2935*c0909341SAndroid Build Coastguard Worker.h8_upsample_loop:
2936*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq] ; position of the next column
2937*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; base0
2938*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r4*2]
2939*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq]
2940*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base1
2941*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r2*2]
2942*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq]
2943*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; base2
2944*c0909341SAndroid Build Coastguard Worker    movu                 m3, [rsp+r4*2]
2945*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq] ; r4d = position for the next iteration
2946*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base3
2947*c0909341SAndroid Build Coastguard Worker    movu                 m4, [rsp+r2*2]
2948*c0909341SAndroid Build Coastguard Worker    psrld                m0, m1, 16
2949*c0909341SAndroid Build Coastguard Worker    pblendw              m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4   a3 b3 a2 b2 a1 b1 a0 b0
2950*c0909341SAndroid Build Coastguard Worker    pslld                m2, 16
2951*c0909341SAndroid Build Coastguard Worker    pblendw              m1, m2, 0xaa
2952*c0909341SAndroid Build Coastguard Worker    psrld                m2, m3, 16
2953*c0909341SAndroid Build Coastguard Worker    pblendw              m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4   c3 d3 c2 d2 c1 d1 c0 d0
2954*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16
2955*c0909341SAndroid Build Coastguard Worker    pblendw              m3, m4, 0xaa
2956*c0909341SAndroid Build Coastguard Worker    pand                 m4, m5, m6 ; position bits of columns a/b selected by the m5 mask
2957*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7 ; step to columns c/d
2958*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9 ; scale the weight for pmulhrsw
2959*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2960*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4 ; difference * weight, rounded (columns a/b)
2961*c0909341SAndroid Build Coastguard Worker    pand                 m4, m5, m6 ; same for columns c/d
2962*c0909341SAndroid Build Coastguard Worker    psllw                m4, 9
2963*c0909341SAndroid Build Coastguard Worker    psubw                m3, m2
2964*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
2965*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7 ; step to the first column pair of the next iteration
2966*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*4] ; r2 = address of row 4
2967*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; interpolated columns a/b
2968*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2 ; interpolated columns c/d
2969*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m1, m3   ; a5 b5 c5 d5 a4 b4 c4 d4   a1 b1 c1 d1 a0 b0 c0 d0
2970*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3       ; a7 b7 c7 d7 a6 b6 c6 d6   a3 b3 c3 d3 a2 b2 c2 d2
2971*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
2972*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m1, 1
2973*c0909341SAndroid Build Coastguard Worker    movhps [r2  +strideq*0], xm0
2974*c0909341SAndroid Build Coastguard Worker    movq   [r2  +strideq*1], xm0
2975*c0909341SAndroid Build Coastguard Worker    movhps [r2  +strideq*2], xm1
2976*c0909341SAndroid Build Coastguard Worker    movq   [r2  +r7       ], xm1
2977*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm2
2978*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm2
2979*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm3
2980*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm3
2981*c0909341SAndroid Build Coastguard Worker    add                dstq, 8 ; four 16-bit pixels to the right
2982*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
2983*c0909341SAndroid Build Coastguard Worker    jg .h8_upsample_loop
2984*c0909341SAndroid Build Coastguard Worker    RET
; .h8_no_intra_edge_filter -- taken from .h8_no_upsample when bit 0x400 of
; angled is set. maxbased holds w+7 on entry; it is clamped before jumping to
; the h8 prediction loop.
2985*c0909341SAndroid Build Coastguard Worker.h8_no_intra_edge_filter:
2986*c0909341SAndroid Build Coastguard Worker    and            maxbased, 7 ; keep the low three bits of w+7 ...
2987*c0909341SAndroid Build Coastguard Worker    or             maxbased, 8 ; imin(w+7, 15)
2988*c0909341SAndroid Build Coastguard Worker    jmp .h8_main
; .h8_no_upsample -- h8 path without edge upsampling. Sets maxbased to w+7,
; asks .filter_strength for a strength and, when it is non-zero, writes a
; smoothed copy of the edge to rsp+32..63 and retargets tlq at it. Execution
; then falls through into .h8_main.
2989*c0909341SAndroid Build Coastguard Worker.h8_no_upsample:
2990*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+7] ; w+7: passed to .filter_strength and kept as max_base
2991*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; same bit the h4 path tests for !enable_intra_edge_filter
2992*c0909341SAndroid Build Coastguard Worker    jnz .h8_no_intra_edge_filter
2993*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2994*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2995*c0909341SAndroid Build Coastguard Worker    jz .h8_main ; empty mask: no filtering
2996*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; strength = number of set mask bits
2997*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-30]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2998*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-28]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2999*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+z_filter_k-4+r5*4+12*1] ; coefficient multiplied with the centre sample
3000*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0] ; coefficient multiplied with the +-1 neighbour sum
3001*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m0 ; centre term
3002*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8 ; flags feed the jl below and, later, the cmovg
3003*c0909341SAndroid Build Coastguard Worker    jl .h8_filter_w4
3004*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm0
3005*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3006*c0909341SAndroid Build Coastguard Worker    movd           [rsp+28], xm0 ; low dword goes right below the buffer at rsp+32
3007*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3 ; sum of the two +-1 neighbours
3008*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 16 ; mov leaves the flags alone; r4 stops being the `base` anchor here
3009*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4 ; neighbour term
3010*c0909341SAndroid Build Coastguard Worker    cmovg          maxbased, r4d ; still the flags of `cmp wd, 8`: w > 8 selects max_base = 16
3011*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
3012*c0909341SAndroid Build Coastguard Worker    jne .h8_filter_3tap ; only strength 3 adds the outer taps below
3013*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m3
3014*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq-34], 0xfe     ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3015*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq-26], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
3016*c0909341SAndroid Build Coastguard Worker    movzx               r4d, word [tlq-30] ; a
3017*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq-28] ; b
3018*c0909341SAndroid Build Coastguard Worker    inc            maxbased ; max_base grows by one along with the extra sample stored at rsp+30
3019*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; fold the centre term in now, m2 is reused below
3020*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3 ; sum of the two +-2 neighbours
3021*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r4d
3022*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0, m0 ; outer-tap sum doubled
3023*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r4*8+4] ; (b - a) + 8*a + 4 = 7*a + b + 4
3024*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
3025*c0909341SAndroid Build Coastguard Worker    mov            [rsp+30], r2w ; replaces the upper word written by the movd above
3026*c0909341SAndroid Build Coastguard Worker    jmp .h8_filter_3tap
3027*c0909341SAndroid Build Coastguard Worker.h8_filter_w4: ; w < 8: always ends in the 3-tap sum, whatever the strength
3028*c0909341SAndroid Build Coastguard Worker    pshufhw             xm1, xm0, q2100
3029*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq-16], 1        ; _ _ _ _ 4 4 5 6   7 8 9 a b c d e
3030*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3 ; sum of the two +-1 neighbours
3031*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4 ; neighbour term
3032*c0909341SAndroid Build Coastguard Worker.h8_filter_3tap:
3033*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3034*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; adds the centre term or, after the strength-3 code, the doubled outer taps
3035*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+62] ; tlq = last word of the 32 bytes stored at rsp+32 below
3036*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 3
3037*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m1 ; (x + 1) >> 1 against zero; with the psrlw this equals (sum + 8) >> 4
3038*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m0 ; filtered edge samples
; .h8_main -- h8 prediction loop. Each iteration interpolates four columns of
; eight samples from the edge, transposes them and stores eight rows of four
; 16-bit pixels. Once the scalar position reaches the limit derived from
; max_base, the remaining columns are filled with the sample at max_base.
;   in: tlq = edge pointer, maxbased = max_base, dyd = per-column step
;   m5 is set up outside this block; NOTE(review): presumably the mask that
;   selects the fractional position bits -- confirm at its definition.
3039*c0909341SAndroid Build Coastguard Worker.h8_main:
3040*c0909341SAndroid Build Coastguard Worker    movd                xm4, dyd
3041*c0909341SAndroid Build Coastguard Worker    neg            maxbaseq
3042*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [z_base_inc]
3043*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [tlq+maxbaseq*2] ; sample max_base words below tlq: the fill value
3044*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; -max_base with 6 fractional bits (positions are shifted right by 6 to get a base)
3045*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4 ; dy in every word
3046*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [maxbaseq+7*64]
3047*c0909341SAndroid Build Coastguard Worker    neg                 dyq ; positions decrease from here on
3048*c0909341SAndroid Build Coastguard Worker    movd                xm2, r4d
3049*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16
3050*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63] ; scalar position of the first column
3051*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4, m4 ; dy*2: vector step per column pair
3052*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, xm2
3053*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m6, 0xf0 ; ypos0 ypos1
3054*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
3055*c0909341SAndroid Build Coastguard Worker    or             maxbased, 63 ; scalar limit compared against r4d at the loop bottom
3056*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2 ; bias so the sign bit carries the max_base test taken by psraw in the loop
3057*c0909341SAndroid Build Coastguard Worker.h8_loop:
3058*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq] ; position of the next column
3059*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base0
3060*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+r4*2+2]
3061*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r4*2]
3062*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq]
3063*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base1
3064*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5*2+2], 1
3065*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r5*2], 1
3066*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3067*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base2
3068*c0909341SAndroid Build Coastguard Worker    pand                 m3, m5, m4 ; position bits of columns a/b selected by the m5 mask
3069*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9 ; scale the weight for pmulhrsw
3070*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; difference between the two neighbouring samples
3071*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3 ; difference * weight, rounded
3072*c0909341SAndroid Build Coastguard Worker    psraw                m3, m4, 15 ; all-ones words where the biased position is negative
3073*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6 ; step to columns c/d
3074*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; interpolated columns a/b
3075*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r4*2+2]
3076*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq+r4*2]
3077*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; r4 = position for the next iteration
3078*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base3
3079*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m3 ; interpolated value where the mask is set, fill sample elsewhere
3080*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r5*2+2], 1
3081*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq+r5*2], 1
3082*c0909341SAndroid Build Coastguard Worker    pand                 m3, m5, m4 ; same sequence for columns c/d
3083*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9
3084*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
3085*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
3086*c0909341SAndroid Build Coastguard Worker    psraw                m3, m4, 15
3087*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6 ; step to the first column pair of the next iteration
3088*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*4] ; r5 = address of row 4
3089*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; interpolated columns c/d
3090*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m7, m1, m3
3091*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1   ; a3 c3 a2 c2 a1 c1 a0 c0   b3 d3 b2 d2 b1 d1 b0 d0
3092*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m2, 1
3093*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1       ; a7 c7 a6 c6 a5 c5 a4 c4   b7 d7 b6 d6 b5 d5 b4 d4
3094*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
3095*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm3      ; a3 b3 c3 d3 a2 b2 c2 d2
3096*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m0, 1
3097*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm1
3098*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
3099*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
3100*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm2
3101*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
3102*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm3      ; a7 b7 c7 d7 a6 b6 c6 d6
3103*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*0], xm1
3104*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*1], xm1
3105*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*2], xm0
3106*c0909341SAndroid Build Coastguard Worker    movq   [r5  +r7       ], xm0
3107*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4 ; four columns written
3108*c0909341SAndroid Build Coastguard Worker    jz .h8_end
3109*c0909341SAndroid Build Coastguard Worker    add                dstq, 8 ; four 16-bit pixels to the right
3110*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3111*c0909341SAndroid Build Coastguard Worker    jg .h8_loop ; keep interpolating while the position is above the limit
3112*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*5] ; stride*5
3113*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq+r7*2] ; stride*7
3114*c0909341SAndroid Build Coastguard Worker    test                 wd, 4 ; is there an odd group of four columns left?
3115*c0909341SAndroid Build Coastguard Worker    jz .h8_end_loop
3116*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm7 ; fill one group of four columns in all eight rows
3117*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm7
3118*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm7
3119*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm7
3120*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*4], xm7
3121*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r6       ], xm7
3122*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7*2     ], xm7
3123*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r2       ], xm7
3124*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
3125*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3126*c0909341SAndroid Build Coastguard Worker    jz .h8_end
3127*c0909341SAndroid Build Coastguard Worker.h8_end_loop: ; constant fill, eight columns (16 bytes) per row per iteration
3128*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm7
3129*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm7
3130*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm7
3131*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r7       ], xm7
3132*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*4], xm7
3133*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r6       ], xm7
3134*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r7*2     ], xm7
3135*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2       ], xm7
3136*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
3137*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
3138*c0909341SAndroid Build Coastguard Worker    jg .h8_end_loop
3139*c0909341SAndroid Build Coastguard Worker.h8_end:
3140*c0909341SAndroid Build Coastguard Worker    RET
; .h16_no_intra_edge_filter -- taken from .h16 when bit 0x400 of angled is set.
; maxbased holds w+15 on entry; it is clamped before jumping to the h16
; prediction loop.
3141*c0909341SAndroid Build Coastguard Worker.h16_no_intra_edge_filter:
3142*c0909341SAndroid Build Coastguard Worker    and            maxbased, 15 ; keep the low four bits of w+15 ...
3143*c0909341SAndroid Build Coastguard Worker    or             maxbased, 16 ; imin(w+15, 31)
3144*c0909341SAndroid Build Coastguard Worker    jmp .h16_main
3145*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; h == 16 path. Unless bit 0x400 of angled is set or .filter_strength returns 0,
; the edge pixels below tlq are smoothed into a stack buffer and tlq is
; repointed at that buffer before .h16_main runs.
3146*c0909341SAndroid Build Coastguard Worker.h16:
3147*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -96, 10 ; 96 bytes of scratch; m0-m9 are used (negative-size semantics: see x86inc)
3148*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+15]
3149*c0909341SAndroid Build Coastguard Worker    lea                  r7, [strideq*3]
3150*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
3151*c0909341SAndroid Build Coastguard Worker    jnz .h16_no_intra_edge_filter
3152*c0909341SAndroid Build Coastguard Worker    call .filter_strength
3153*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
3154*c0909341SAndroid Build Coastguard Worker    jz .h16_main ; filter_strength == 0
; r5d = popcount of the returned value: it indexes z_filter_k and, when equal
; to 3, selects the 5-tap variant further down.
3155*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d
3156*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq-28]            ; 3 4 5 6 7 8 9 a   b c d e f g h i
3157*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, [tlq-32]        ; 1 2 3 4 5 6 7 8   9 a b c d e f g
; m6 = centre-tap weight, m7 = weight of the two adjacent taps.
3158*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [base+z_filter_k-4+r5*4+12*1]
3159*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
3160*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m6, [tlq-30]        ; 2 3 4 5 6 7 8 9   a b c d e f g h
3161*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m7
3162*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
; Three-way split on w: >8 -> w16 path, <8 -> w4 path, ==8 falls through.
; The mova/pmullw between cmp and jl do not touch EFLAGS, so jl still sees cmp wd, 8.
3163*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3164*c0909341SAndroid Build Coastguard Worker    jg .h16_filter_w16
3165*c0909341SAndroid Build Coastguard Worker    mova                xm3, [tlq-46]            ; 0 1 2 3 4 5 6 7
3166*c0909341SAndroid Build Coastguard Worker    pmullw              xm6, xm3
3167*c0909341SAndroid Build Coastguard Worker    jl .h16_filter_w4
; w == 8: build the "previous pixel" vectors with pixel 0 replicated at the end of the edge.
3168*c0909341SAndroid Build Coastguard Worker    pblendw             xm3, [tlq-48], 0xfe      ; 0 0 1 2 3 4 5 6
3169*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
3170*c0909341SAndroid Build Coastguard Worker    jne .h16_filter_w8_3tap
3171*c0909341SAndroid Build Coastguard Worker    vpblendd            xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
; 5-tap only: add the taps at distance 2, doubled (weight 2), to both partial sums.
3172*c0909341SAndroid Build Coastguard Worker.h16_filter_w8_5tap:
3173*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m0
3174*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq-26], 0x7f      ; 4 5 6 7 8 9 a b   c d e f g h i i
3175*c0909341SAndroid Build Coastguard Worker    paddw               xm4, [tlq-42]            ; 2 3 4 5 6 7 8 9
3176*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-34]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3177*c0909341SAndroid Build Coastguard Worker    paddw               xm4, xm4
3178*c0909341SAndroid Build Coastguard Worker    paddw                m0, m0
3179*c0909341SAndroid Build Coastguard Worker    paddw               xm6, xm4
3180*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
; Finish the extra 8 pixels: add the adjacent taps, then round as
; ((sum >> 3) + 1) >> 1 (psrlw followed by pavgw against zero).
; m0 is left zeroed because .h16_filter_end reuses it for the same rounding of m1.
3181*c0909341SAndroid Build Coastguard Worker.h16_filter_w8_3tap:
3182*c0909341SAndroid Build Coastguard Worker    paddw               xm3, [tlq-44]            ; 1 2 3 4 5 6 7 8
3183*c0909341SAndroid Build Coastguard Worker    pmullw              xm3, xm7
3184*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3185*c0909341SAndroid Build Coastguard Worker    paddw               xm3, xm6
3186*c0909341SAndroid Build Coastguard Worker    psrlw               xm3, 3
3187*c0909341SAndroid Build Coastguard Worker    pavgw               xm3, xm0
3188*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], xm3
3189*c0909341SAndroid Build Coastguard Worker    jmp .h16_filter_end
; w < 8 variant of the "previous pixel" set-up: only the upper four word lanes
; are meaningful (lower lanes shown as "_"); pixel 4 is replicated instead of
; reading further down the edge. Rejoins the shared w8 3-tap/5-tap code.
3190*c0909341SAndroid Build Coastguard Worker.h16_filter_w4:
3191*c0909341SAndroid Build Coastguard Worker    pshufhw             xm3, xm3, q2100          ; _ _ _ _ 4 4 5 6
3192*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
3193*c0909341SAndroid Build Coastguard Worker    jne .h16_filter_w8_3tap
3194*c0909341SAndroid Build Coastguard Worker    pshufhw             xm4, xm3, q2100          ; _ _ _ _ 4 4 4 5
3195*c0909341SAndroid Build Coastguard Worker    jmp .h16_filter_w8_5tap
; w > 8: filter a further 16 edge pixels (the ones at tlq-62).
; On entry m1 holds the partial sum for the first 16 pixels, m6/m7 the
; centre/adjacent tap weights and r5d the tap selector (3 => 5-tap).
3196*c0909341SAndroid Build Coastguard Worker.h16_filter_w16:
3197*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-62]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3198*c0909341SAndroid Build Coastguard Worker    pmullw               m6, m3
3199*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm3
3200*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m3, [tlq-64], 0xfe  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3201*c0909341SAndroid Build Coastguard Worker    paddw                m4, [tlq-60]            ; 1 2 3 4 5 6 7 8   9 a b c d e f g
; maxbase becomes 32 when w > 16, otherwise it keeps the w+15 set at .h16.
3202*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 32
3203*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
3204*c0909341SAndroid Build Coastguard Worker    cmovg          maxbased, r4d
; Pixel 0 duplicated into the two words just below the filtered block (rsp+28, rsp+30).
3205*c0909341SAndroid Build Coastguard Worker    movd           [rsp+28], xm3
3206*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m7
3207*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
3208*c0909341SAndroid Build Coastguard Worker    jne .h16_filter_w16_3tap
; 5-tap only: add the distance-2 taps (doubled) to both halves; scalar code
; interleaved below overwrites the word at rsp+30 with (7*p0 + p1 + 4) >> 3,
; where p0 = word [tlq-62] and p1 = word [tlq-60].
3209*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m0
3210*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, [tlq-66], 0xfe      ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3211*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq-26], 0x7f      ; 4 5 6 7 8 9 a b   c d e f g h i i
3212*c0909341SAndroid Build Coastguard Worker    paddw                m3, [tlq-58]            ; 2 3 4 5 6 7 8 9   a b c d e f g h
3213*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-34]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3214*c0909341SAndroid Build Coastguard Worker    movzx               r4d, word [tlq-62]
3215*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq-60]
3216*c0909341SAndroid Build Coastguard Worker    or             maxbased, 1
3217*c0909341SAndroid Build Coastguard Worker    paddw                m3, m3
3218*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r4d
3219*c0909341SAndroid Build Coastguard Worker    paddw                m0, m0
3220*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r4*8+4]
3221*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
3222*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
3223*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
3224*c0909341SAndroid Build Coastguard Worker    mov            [rsp+30], r2w
; Round the second 16 pixels as ((sum >> 3) + 1) >> 1 and store them at rsp+32.
3225*c0909341SAndroid Build Coastguard Worker.h16_filter_w16_3tap:
3226*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3227*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
3228*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 3
3229*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m0
3230*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m4
; Common exit of every filter path (m0 must be zero here): round the first
; 16 pixels the same way, store them at rsp+64, and make tlq point at the
; last word of the stack copy (rsp+94).
3231*c0909341SAndroid Build Coastguard Worker.h16_filter_end:
3232*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 3
3233*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+94]
3234*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m0
3235*c0909341SAndroid Build Coastguard Worker    mova           [rsp+64], m1
; Set-up for .h16_loop. Statement order matters: maxbase is negated, then
; scaled by 64, and dy is consumed once before it is negated itself.
;   m8       = dy broadcast to every word lane
;   m9       = edge pixel at word index -maxbase from tlq (fill value past the edge)
;   m7       = broadcast(-maxbase*64 + dy + 15*64) - z_base_inc
;   r4       = 63 - dy, starting position for the first column
;   maxbased = (-maxbase*64) | 63, bound that r4d is compared against in the loop
;   tlq      is lowered by 32 bytes so the loop's 32-byte loads cover the 16 pixels below it
3236*c0909341SAndroid Build Coastguard Worker.h16_main:
3237*c0909341SAndroid Build Coastguard Worker    movd                xm8, dyd
3238*c0909341SAndroid Build Coastguard Worker    neg            maxbaseq
3239*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, [tlq+maxbaseq*2]
3240*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3241*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m8, xm8
3242*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [maxbaseq+dyq+15*64]
3243*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3244*c0909341SAndroid Build Coastguard Worker    movd                xm7, r4d
3245*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 32
3246*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63]
3247*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, xm7
3248*c0909341SAndroid Build Coastguard Worker    or             maxbased, 63
3249*c0909341SAndroid Build Coastguard Worker    psubw                m7, [z_base_inc]
; Main loop: each pass produces 4 output columns of 16 rows.
; r4/r5 alternate as the running edge position with 6 fractional bits (dyq is
; already negated, so positions decrease); position >> 6 is the word offset
; from tlq. The per-lane fraction is m7 & m5 (m5 is set up earlier in the
; function, before this point).
; Per column: out = p[base+1] + (((p[base] - p[base+1]) * frac + 32) >> 6),
; obtained with pmulhrsw on frac << 9. Lanes whose m7 value is non-negative
; are replaced by m9; m7 advances by m8 (dy) after every column.
3250*c0909341SAndroid Build Coastguard Worker.h16_loop:
3251*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3252*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base0
3253*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2+2]
3254*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r4*2]
3255*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq]
3256*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base1
3257*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2+2]
3258*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r5*2]
3259*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3260*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base2
3261*c0909341SAndroid Build Coastguard Worker    pand                 m6, m5, m7
3262*c0909341SAndroid Build Coastguard Worker    psllw                m6, 9
3263*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
3264*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
3265*c0909341SAndroid Build Coastguard Worker    psraw                m6, m7, 15 ; all-ones where m7 < 0 (lane keeps the interpolated value)
3266*c0909341SAndroid Build Coastguard Worker    paddw                m7, m8
3267*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3268*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r4*2+2]
3269*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+r4*2]
3270*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; unshifted position for the next pass
3271*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base3
3272*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m9, m0, m6
3273*c0909341SAndroid Build Coastguard Worker    pand                 m6, m5, m7
3274*c0909341SAndroid Build Coastguard Worker    psllw                m6, 9
3275*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
3276*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
3277*c0909341SAndroid Build Coastguard Worker    psraw                m6, m7, 15
3278*c0909341SAndroid Build Coastguard Worker    paddw                m7, m8
3279*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
3280*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m9, m1, m6
3281*c0909341SAndroid Build Coastguard Worker    pand                 m6, m5, m7
3282*c0909341SAndroid Build Coastguard Worker    psllw                m6, 9
3283*c0909341SAndroid Build Coastguard Worker    psubw                m4, m2
3284*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m6
3285*c0909341SAndroid Build Coastguard Worker    psraw                m6, m7, 15
3286*c0909341SAndroid Build Coastguard Worker    paddw                m7, m8
3287*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
3288*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r5*2+2]
3289*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+r5*2]
3290*c0909341SAndroid Build Coastguard Worker    vpblendvb            m2, m9, m2, m6
3291*c0909341SAndroid Build Coastguard Worker    pand                 m6, m5, m7
3292*c0909341SAndroid Build Coastguard Worker    psllw                m6, 9
3293*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3
3294*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m6
3295*c0909341SAndroid Build Coastguard Worker    psraw                m6, m7, 15
3296*c0909341SAndroid Build Coastguard Worker    paddw                m7, m8
3297*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*4]
3298*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
3299*c0909341SAndroid Build Coastguard Worker    vpblendvb            m3, m9, m3, m6
; Transpose the four 16-pixel columns (a..d) into 16 rows of 4 pixels.
3300*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8   a3 b3 a2 b2 a1 b1 a0 b0
3301*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1     ; af bf ae be ad bd ac bc   a7 b7 a6 b6 a5 b5 a4 b4
3302*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3 ; cb db ca da c9 d9 c8 d8   c3 d3 c2 d2 c1 d1 c0 d0
3303*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3     ; cf df ce de cd dd cc dc   c7 d7 c6 d6 c5 d5 c4 d4
3304*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8   a1 b1 c1 d1 a0 b0 c0 d0
3305*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m3, 1
3306*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m1     ; ab bb cb db aa ba ca da   a3 b3 c3 d3 a2 b2 c2 d2
3307*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2 ; ad bd cd dd ac bc cc dc   a5 b5 c5 d5 a4 b4 c4 d4
3308*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2     ; af bf cf df ae be ce de   a7 b7 c7 d7 a6 b6 c6 d6
3309*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m4, 1
; 8-byte (4-pixel) stores: rows 0-3 via dstq, rows 4-7 via r5 = dstq+stride*4.
3310*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], xm6
3311*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm6
3312*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m1, 1
3313*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm2
3314*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm2
3315*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
3316*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*0], xm6
3317*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*1], xm6
3318*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*2], xm2
3319*c0909341SAndroid Build Coastguard Worker    movq   [r5  +r7       ], xm2
; Rows 8-11 via r5 = dstq+stride*8, rows 12-15 after advancing r5 by stride*4.
3320*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*8]
3321*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*0], xm3
3322*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*1], xm3
3323*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*2], xm4
3324*c0909341SAndroid Build Coastguard Worker    movq   [r5  +r7       ], xm4
3325*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+strideq*4]
3326*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*0], xm1
3327*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*1], xm1
3328*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*2], xm0
3329*c0909341SAndroid Build Coastguard Worker    movq   [r5  +r7       ], xm0
; 4 columns done. Stop when w is exhausted; otherwise step dst by 4 pixels and
; keep looping while the next position is still above the maxbase bound.
3330*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3331*c0909341SAndroid Build Coastguard Worker    jz .h16_end
3332*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
3333*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3334*c0909341SAndroid Build Coastguard Worker    jg .h16_loop
; Tail fill: the remaining wd columns all take the constant pixel in m9.
; The outer loop handles 4 rows per pass (hd counts 4 passes = 16 rows);
; r2 remembers the row-group start and r6d is a per-pass copy of the width.
3335*c0909341SAndroid Build Coastguard Worker    mov                  hd, 4
3336*c0909341SAndroid Build Coastguard Worker.h16_end_loop0:
3337*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
3338*c0909341SAndroid Build Coastguard Worker    mov                  r2, dstq
; If the remaining width has bit 2 set, emit one 4-pixel column group first.
3339*c0909341SAndroid Build Coastguard Worker    test                 wb, 4
3340*c0909341SAndroid Build Coastguard Worker    jz .h16_end_loop
3341*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm9
3342*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm9
3343*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm9
3344*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r7       ], xm9
3345*c0909341SAndroid Build Coastguard Worker    and                 r6d, 120 ; keep only the multiple-of-8 part of the width
3346*c0909341SAndroid Build Coastguard Worker    jz .h16_end_w4
3347*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
; Then 8 pixels (16 bytes) per step across the rest of the 4 rows.
3348*c0909341SAndroid Build Coastguard Worker.h16_end_loop:
3349*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm9
3350*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm9
3351*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm9
3352*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r7       ], xm9
3353*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
3354*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 8
3355*c0909341SAndroid Build Coastguard Worker    jg .h16_end_loop
; Move down to the next group of 4 rows, starting again at the saved column.
3356*c0909341SAndroid Build Coastguard Worker.h16_end_w4:
3357*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r2+strideq*4]
3358*c0909341SAndroid Build Coastguard Worker    dec                  hd
3359*c0909341SAndroid Build Coastguard Worker    jg .h16_end_loop0
3360*c0909341SAndroid Build Coastguard Worker.h16_end:
3361*c0909341SAndroid Build Coastguard Worker    RET
; h == 32 path. Unless bit 0x400 of angled is set, the edge is smoothed into
; a stack buffer with a fixed 5-tap kernel. For five consecutive pixels a..e
; the output is (((a + e + 4) >> 1) + b + c + d) >> 2: pavgw supplies the
; "+1, >>1" on a + (e + 3).
3362*c0909341SAndroid Build Coastguard Worker.h32:
3363*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        -160, 9
3364*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+31]
3365*c0909341SAndroid Build Coastguard Worker    and            maxbased, 31
3366*c0909341SAndroid Build Coastguard Worker    or             maxbased, 32 ; imin(w+31, 63)
3367*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
3368*c0909341SAndroid Build Coastguard Worker    jnz .h32_main
3369*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pw_3] ; rounding bias added to one outer tap before pavgw
3370*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq-28]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
3371*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m0
3372*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
3373*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-30]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
3374*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3375*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3376*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq-34]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3377*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+128] ; r4 = write cursor in the stack copy (moves downwards)
3378*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3379*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [maxbaseq-31] ; loop counter for .h32_filter_loop (16 per pass)
3380*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
3381*c0909341SAndroid Build Coastguard Worker    mova               [r4], m0
; Filters 16 more edge pixels per pass with the same 5-tap kernel
; (m2 = pw_3 bias), moving tlq and the stack cursor r4 down by 32 bytes.
; r5d counts down by 16: >0 loops again, <0 jumps to the 8-pixel tail,
; ==0 falls through to the final 16-pixel group below.
3382*c0909341SAndroid Build Coastguard Worker.h32_filter_loop:
3383*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-62]
3384*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2, [tlq-66]
3385*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-64]
3386*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq-58]
3387*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-60]
3388*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 32
3389*c0909341SAndroid Build Coastguard Worker    sub                  r4, 32
3390*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3391*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
3392*c0909341SAndroid Build Coastguard Worker    mova               [r4], m0
3393*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
3394*c0909341SAndroid Build Coastguard Worker    jg .h32_filter_loop
3395*c0909341SAndroid Build Coastguard Worker    jl .h32_filter_h8
; Final 16 pixels: taps that would read past pixel 0 use pixel 0 replicated.
; Scalar code interleaved here also computes (7*p0 + p1 + 4) >> 3 from
; p0 = word [tlq-62] and p1 = word [tlq-60]; p0 is stored at r4-36 and the
; computed value at r4-34, just below the 16 filtered pixels written at r4-32.
3396*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-62]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3397*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm0
3398*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq-58]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
3399*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-60]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3400*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3401*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq-64], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3402*c0909341SAndroid Build Coastguard Worker    movzx               r5d, word [tlq-62]
3403*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [tlq-60]
3404*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
3405*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r5d
3406*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3407*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r5*8+4]
3408*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3409*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
3410*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
3411*c0909341SAndroid Build Coastguard Worker    mova            [r4-32], m0
3412*c0909341SAndroid Build Coastguard Worker    mov             [r4-36], r5w
3413*c0909341SAndroid Build Coastguard Worker    mov             [r4-34], r2w
; Point tlq at the last word of the stack copy; for w == 64 maxbase becomes 65.
3414*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+158]
3415*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 65
3416*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
3417*c0909341SAndroid Build Coastguard Worker    cmove          maxbased, r4d
3418*c0909341SAndroid Build Coastguard Worker    jmp .h32_main
; Tail of the edge filter when fewer than 16 pixels remain (r5d went negative
; in .h32_filter_loop): filters the final 8 pixels with pixel 0 replicated for
; the out-of-range taps, stores them at r4-16 and repoints tlq at the last
; word of the stack copy. m2 still holds the pw_3 bias on entry.
; Falls through into .h32_main.
3419*c0909341SAndroid Build Coastguard Worker.h32_filter_h8:
3420*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-46]            ; 0 1 2 3 4 5 6 7
3421*c0909341SAndroid Build Coastguard Worker    pblendw             xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
3422*c0909341SAndroid Build Coastguard Worker    paddw               xm2, [tlq-42]            ; 2 3 4 5 6 7 8 9
3423*c0909341SAndroid Build Coastguard Worker    paddw               xm0, [tlq-44]            ; 1 2 3 4 5 6 7 8
3424*c0909341SAndroid Build Coastguard Worker    vpblendd            xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
3425*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+158]
3426*c0909341SAndroid Build Coastguard Worker    pavgw               xm2, xm3
3427*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
3428*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
3429*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2
3430*c0909341SAndroid Build Coastguard Worker    mova            [r4-16], xm0
; Set-up for .h32_loop. Statement order matters: maxbase is negated, then
; scaled by 64, and dy is consumed once before it is negated itself.
;   m6       = dy broadcast to every word lane
;   m7       = edge pixel at word index -maxbase from tlq (fill value past the edge)
;   m4       = broadcast(-maxbase*64 + dy + 15*64) - z_base_inc
;   m8       = pw_m1024 constant, threshold for the first 16 lanes of each column
;   r4       = 63 - dy, starting position for the first column
;   maxbased = (-maxbase*64) | 63, bound that r4d is compared against in the loop
3431*c0909341SAndroid Build Coastguard Worker.h32_main:
3432*c0909341SAndroid Build Coastguard Worker    movd                xm6, dyd
3433*c0909341SAndroid Build Coastguard Worker    neg            maxbaseq
3434*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [tlq+maxbaseq*2]
3435*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3436*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
3437*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [maxbaseq+dyq+15*64]
3438*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3439*c0909341SAndroid Build Coastguard Worker    movd                xm4, r4d
3440*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [pw_m1024]
3441*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63]
3442*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4
3443*c0909341SAndroid Build Coastguard Worker    or             maxbased, 63
3444*c0909341SAndroid Build Coastguard Worker    psubw                m4, [z_base_inc]
; One output column (32 pixels = two ymm vectors) per pass. Each column is
; pushed onto the stack (rsp drops by 64 per column) and transposed later.
; r4 is the running edge position with 6 fractional bits; m5 & m4 is the
; per-lane fraction (m5 is set up earlier in the function, before this point).
; Interpolation: p[base+1] + (((p[base] - p[base+1]) * frac + 32) >> 6) via
; pmulhrsw on frac << 9; the same fraction vector serves both halves.
3445*c0909341SAndroid Build Coastguard Worker.h32_loop:
3446*c0909341SAndroid Build Coastguard Worker    mov                  r5, r4
3447*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6
3448*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-64]
3449*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5*2-62]
3450*c0909341SAndroid Build Coastguard Worker    pand                 m3, m5, m4
3451*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9
3452*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
3453*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3454*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m8, m4 ; lanes with m4 < m8 keep the interpolated value
3455*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3456*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m2
3457*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2-32]
3458*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-30]
3459*c0909341SAndroid Build Coastguard Worker    add                  r4, dyq
3460*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64 ; make room for this column
3461*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
3462*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
3463*c0909341SAndroid Build Coastguard Worker    psraw                m3, m4, 15 ; lanes with m4 < 0 keep the interpolated value
3464*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
3465*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m0
3466*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3467*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m7, m1, m3
3468*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m1
3469*c0909341SAndroid Build Coastguard Worker    dec                  wd
3470*c0909341SAndroid Build Coastguard Worker    jz .h32_transpose
3471*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3472*c0909341SAndroid Build Coastguard Worker    jg .h32_loop
; Position has reached the maxbase bound: every remaining column is the constant pixel m7.
3473*c0909341SAndroid Build Coastguard Worker.h32_end_loop:
3474*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64
3475*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m7
3476*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
3477*c0909341SAndroid Build Coastguard Worker    dec                  wd
3478*c0909341SAndroid Build Coastguard Worker    jg .h32_end_loop
; Transposes the columns stacked by .h32_loop/.h32_end_loop into dst rows.
; Columns are 64 bytes apart on the stack, the most recently pushed at rsp.
; Outer loop (.h32_transpose_loop0): 8 columns per pass, written to the 8
; pixels ending at column org_w of the row, then popped (rsp += 64*8) and
; org_w reduced by 8. org_wq/org_wd are defined outside this excerpt.
; Inner loop: two passes (r6 = rsp+32, then rsp), each writing 16 rows of
; 8 pixels. r3/r4/r5 = stride*3 / stride*5 / stride*7.
3479*c0909341SAndroid Build Coastguard Worker.h32_transpose:
3480*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3481*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*5]
3482*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
3483*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq+r3*2]
3484*c0909341SAndroid Build Coastguard Worker.h32_transpose_loop0:
3485*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+32]
3486*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r8+org_wq*2-16]
3487*c0909341SAndroid Build Coastguard Worker.h32_transpose_loop:
; Load 16 pixels from each of the 8 columns; column "a" is the oldest on the stack.
3488*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r6+64*7]
3489*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r6+64*6]
3490*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r6+64*5]
3491*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r6+64*4]
3492*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r6+64*3]
3493*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r6+64*2]
3494*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r6+64*1]
3495*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r6+64*0]
3496*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
3497*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1     ; a7 b7 a6 b6 a5 b5 a4 b4
3498*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
3499*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3     ; c7 d7 c6 d6 c5 d5 c4 d4
3500*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
3501*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5     ; e7 f7 e6 f6 e5 f5 e4 f4
3502*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
3503*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7     ; g7 h7 g6 h6 g5 h5 g4 h4
; r2 addresses the first 8 rows of this pass (upper 128-bit lanes),
; dstq = r2 + stride*8 the next 8 rows (lower lanes).
3504*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r2+strideq*8]
3505*c0909341SAndroid Build Coastguard Worker    sub                  r6, 32
3506*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
3507*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m1     ; a3 b3 c3 d3 a2 b2 c2 d2
3508*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
3509*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m5     ; e3 f3 g3 h3 e2 f2 g2 h2
3510*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7, m1 ;  8  0
3511*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +strideq*0], m5, 1
3512*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m1     ;  9  1
3513*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm5
3514*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m8, m3 ; 10  2
3515*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +strideq*1], m7, 1
3516*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m3     ; 11  3
3517*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*1], xm7
3518*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
3519*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +strideq*2], m1, 1
3520*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2     ; a7 b7 c7 d7 a6 b6 c6 d6
3521*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
3522*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
3523*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +r3       ], m8, 1
3524*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m6     ; e7 f7 g7 h7 e6 f6 g6 h6
3525*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r3       ], xm8
3526*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m3, m2 ; 12  4
3527*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +strideq*4], m6, 1
3528*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m2     ; 13  5
3529*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*4], xm6
3530*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m0, m4 ; 14  6
3531*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +r4       ], m3, 1
3532*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m4     ; 15  7
3533*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r4       ], xm3
3534*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +r3*2     ], m2, 1
3535*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r3*2     ], xm2
3536*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2  +r5       ], m0, 1
3537*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r5       ], xm0
; Next 16 rows; the unsigned compare ends the inner loop once r6 drops below rsp.
3538*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*8]
3539*c0909341SAndroid Build Coastguard Worker    cmp                  r6, rsp
3540*c0909341SAndroid Build Coastguard Worker    jae .h32_transpose_loop
3541*c0909341SAndroid Build Coastguard Worker    add                 rsp, 64*8 ; pop the 8 columns just written
3542*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8
3543*c0909341SAndroid Build Coastguard Worker    jg .h32_transpose_loop0
3544*c0909341SAndroid Build Coastguard Worker.h32_end:
3545*c0909341SAndroid Build Coastguard Worker    RET
3546*c0909341SAndroid Build Coastguard Worker.h64:
3547*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        -256, 10
3548*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+63]
3549*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
3550*c0909341SAndroid Build Coastguard Worker    jnz .h64_main
3551*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pw_3]
3552*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq-28]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
3553*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m0
3554*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
3555*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-30]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
3556*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3557*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3558*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq-34]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3559*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+224]
3560*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3561*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq+32]
3562*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
3563*c0909341SAndroid Build Coastguard Worker    mova               [r4], m0
3564*c0909341SAndroid Build Coastguard Worker.h64_filter_loop:
3565*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-62]
3566*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2, [tlq-66]
3567*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-64]
3568*c0909341SAndroid Build Coastguard Worker    pavgw                m1, [tlq-58]
3569*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-60]
3570*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 32
3571*c0909341SAndroid Build Coastguard Worker    sub                  r4, 32
3572*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3573*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
3574*c0909341SAndroid Build Coastguard Worker    mova               [r4], m0
3575*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
3576*c0909341SAndroid Build Coastguard Worker    jg .h64_filter_loop
3577*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-62]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3578*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm0
3579*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq-58]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
3580*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-60]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3581*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3582*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, [tlq-64], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3583*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+254]
3584*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
3585*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3586*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3587*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
3588*c0909341SAndroid Build Coastguard Worker    mova            [r4-32], m0
3589*c0909341SAndroid Build Coastguard Worker.h64_main:
3590*c0909341SAndroid Build Coastguard Worker    neg            maxbaseq
3591*c0909341SAndroid Build Coastguard Worker    movd                xm4, dyd
3592*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [tlq+maxbaseq*2]
3593*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3594*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, xm4
3595*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [maxbaseq+dyq+15*64]
3596*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3597*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_m1024]
3598*c0909341SAndroid Build Coastguard Worker    movd                xm3, r4d
3599*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63]
3600*c0909341SAndroid Build Coastguard Worker    paddw                m8, m7, m7
3601*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3
3602*c0909341SAndroid Build Coastguard Worker    or             maxbased, 63
3603*c0909341SAndroid Build Coastguard Worker    paddw                m9, m8, m7
3604*c0909341SAndroid Build Coastguard Worker    psubw                m3, [z_base_inc]
3605*c0909341SAndroid Build Coastguard Worker.h64_loop:
3606*c0909341SAndroid Build Coastguard Worker    mov                  r5, r4
3607*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6
3608*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-128]
3609*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5*2-126]
3610*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5, m3
3611*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
3612*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
3613*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
3614*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 128
3615*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3616*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m3
3617*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
3618*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m0
3619*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-96]
3620*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5*2-94]
3621*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
3622*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
3623*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3624*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m8, m3
3625*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
3626*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m0
3627*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-64]
3628*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5*2-62]
3629*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
3630*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
3631*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3632*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m7, m3
3633*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m1
3634*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m0
3635*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-32]
3636*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5*2-30]
3637*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
3638*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
3639*c0909341SAndroid Build Coastguard Worker    add                  r4, dyq
3640*c0909341SAndroid Build Coastguard Worker    psraw                m2, m3, 15
3641*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
3642*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3643*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m6, m0, m2
3644*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*3], m0
3645*c0909341SAndroid Build Coastguard Worker    dec                  wd
3646*c0909341SAndroid Build Coastguard Worker    jz .h64_transpose
3647*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3648*c0909341SAndroid Build Coastguard Worker    jg .h64_loop
3649*c0909341SAndroid Build Coastguard Worker.h64_end_loop:
3650*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 128
3651*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6
3652*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m6
3653*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m6
3654*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*3], m6
3655*c0909341SAndroid Build Coastguard Worker    dec                  wd
3656*c0909341SAndroid Build Coastguard Worker    jg .h64_end_loop
3657*c0909341SAndroid Build Coastguard Worker.h64_transpose:
3658*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
3659*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*5]
3660*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
3661*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq+r2*2]
3662*c0909341SAndroid Build Coastguard Worker.h64_transpose_loop0:
3663*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+112]
3664*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+org_wq*2-32]
3665*c0909341SAndroid Build Coastguard Worker.h64_transpose_loop:
3666*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r6+128*15]
3667*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r6+128* 7], 1
3668*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r6+128*14]
3669*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r6+128* 6], 1
3670*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r6+128*13]
3671*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r6+128* 5], 1
3672*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r6+128*12]
3673*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [r6+128* 4], 1
3674*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r6+128*11]
3675*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r6+128* 3], 1
3676*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r6+128*10]
3677*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r6+128* 2], 1
3678*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r6+128* 9]
3679*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r6+128* 1], 1
3680*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r6+128* 8]
3681*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r6+128* 0], 1
3682*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m1
3683*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
3684*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
3685*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
3686*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4, m5
3687*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
3688*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6, m7
3689*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7
3690*c0909341SAndroid Build Coastguard Worker    sub                  r6, 16
3691*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m8, m1
3692*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m1
3693*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m3, m5
3694*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m5
3695*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7, m1
3696*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m1
3697*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m8, m3
3698*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m3
3699*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
3700*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m5
3701*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3702*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m7
3703*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m4, m6
3704*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m1
3705*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m6
3706*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2       ], m8
3707*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m3, m2
3708*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*4], m6
3709*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m2
3710*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r3       ], m3
3711*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m0, m4
3712*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2*2     ], m2
3713*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m4
3714*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r4       ], m0
3715*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
3716*c0909341SAndroid Build Coastguard Worker    cmp                  r6, rsp
3717*c0909341SAndroid Build Coastguard Worker    jae .h64_transpose_loop
3718*c0909341SAndroid Build Coastguard Worker    add                 rsp, 128*16
3719*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 16
3720*c0909341SAndroid Build Coastguard Worker    jg .h64_transpose_loop0
3721*c0909341SAndroid Build Coastguard Worker.h64_end:
3722*c0909341SAndroid Build Coastguard Worker    RET
3723*c0909341SAndroid Build Coastguard Worker
; FILTER_1BLK dst, src, tmp, shuf, bdmax
; Shuffles one 128-bit input, multiply-accumulates it against m2-m5, adds m1,
; shifts right by 4 and leaves clamped 16-bit results in m%1.
;   %1 dst   - index of the register that receives the result
;   %2 src   - index of the register whose low 128 bits hold the input;
;              overwritten (shuffled, then its low lane copied to the high lane)
;   %3 tmp   - index of a scratch register; clobbered
;   %4 shuf  - pshufb control: either a register index (numeric argument) or
;              any other operand text such as a memory reference
;   %5 bdmax - index of the register holding the per-word upper bound
; Implicit inputs (read, never written): m1 as the dword addend and m2-m5 as
; the word multipliers. ipred_filter_16bpc loads m1 from pd_8 and m2-m5 with
; pmovsxbw from filterq+16*0..3 before using this macro.
; NOTE(review): the q0000..q3333 shuffle immediates are defined outside this
; chunk; presumably qNNNN replicates dword N across each lane - confirm.
3724*c0909341SAndroid Build Coastguard Worker%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
3725*c0909341SAndroid Build Coastguard Worker%ifnum %4
3726*c0909341SAndroid Build Coastguard Worker    pshufb             xm%2, xm%4 ; shuf given as a register index
3727*c0909341SAndroid Build Coastguard Worker%else
3728*c0909341SAndroid Build Coastguard Worker    pshufb             xm%2, %4 ; shuf given as an operand, used verbatim
3729*c0909341SAndroid Build Coastguard Worker%endif
3730*c0909341SAndroid Build Coastguard Worker    vinserti128         m%2, xm%2, 1 ; high lane = copy of the shuffled low lane
3731*c0909341SAndroid Build Coastguard Worker    pshufd              m%1, m%2, q0000
3732*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m2 ; word pairs * m2, adjacent products summed into dwords
3733*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%2, q1111
3734*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m3
3735*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m1 ; add the m1 dword addend once
3736*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3 ; accumulate the m3 partial sum
3737*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%2, q2222
3738*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m4
3739*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3 ; accumulate the m4 partial sum
3740*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%2, q3333
3741*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m5
3742*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3 ; accumulate the m5 partial sum
3743*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 4 ; arithmetic shift right by 4
3744*c0909341SAndroid Build Coastguard Worker    packusdw            m%1, m%1 ; dwords -> words with unsigned saturation; both halves of each lane end up equal
3745*c0909341SAndroid Build Coastguard Worker    pminsw              m%1, m%5 ; per-word signed minimum against bdmax
3746*c0909341SAndroid Build Coastguard Worker%endmacro
3747*c0909341SAndroid Build Coastguard Worker
; FILTER_2BLK dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
; Two-input variant of the same multiply-accumulate: the low and high 128-bit
; lanes of m%2 are processed as separate inputs, interleaved instruction by
; instruction, and the two results are packed together into m%1.
;   %1 dst     - index of the register that receives the packed result; also
;                the accumulator for the low-lane input
;   %2 src     - index of the register holding both inputs (one per lane);
;                overwritten (shuffled, then its low lane copied to the high lane)
;   %3 tmp_dst - index of the accumulator for the high-lane input; clobbered
;   %4 tmp_src - index of the register that receives a copy of the shuffled
;                high-lane input; clobbered
;   %5 tmp     - index of a scratch register; clobbered
;   %6 shuf    - index of the register holding the pshufb control (register only)
;   %7 bdmax   - index of the register holding the per-word upper bound
; Implicit inputs (read, never written): m1 as the dword addend and m2-m5 as
; the word multipliers. ipred_filter_16bpc loads m1 from pd_8 and m2-m5 with
; pmovsxbw from filterq+16*0..3 before using this macro.
; NOTE(review): the q#### shuffle immediates are defined outside this chunk;
; presumably q3232 selects qwords 2,3 for both lanes and qNNNN replicates
; dword N across each lane - confirm.
3748*c0909341SAndroid Build Coastguard Worker%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
3749*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m%6 ; shuffle both lanes with the shuf control
3750*c0909341SAndroid Build Coastguard Worker    vpermq              m%4, m%2, q3232 ; tmp_src = shuffled high-lane input (presumably in both lanes)
3751*c0909341SAndroid Build Coastguard Worker    vinserti128         m%2, xm%2, 1 ; src = shuffled low-lane input in both lanes
3752*c0909341SAndroid Build Coastguard Worker    pshufd              m%1, m%2, q0000
3753*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%4, q0000
3754*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m2 ; word pairs * m2, adjacent products summed into dwords
3755*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m2
3756*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m1 ; add the m1 dword addend to each accumulator once
3757*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m1
3758*c0909341SAndroid Build Coastguard Worker    pshufd              m%5, m%2, q1111
3759*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m3
3760*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5 ; low-lane input: accumulate the m3 partial sum
3761*c0909341SAndroid Build Coastguard Worker    pshufd              m%5, m%4, q1111
3762*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m3
3763*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5 ; high-lane input: accumulate the m3 partial sum
3764*c0909341SAndroid Build Coastguard Worker    pshufd              m%5, m%2, q2222
3765*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m4
3766*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5 ; low-lane input: accumulate the m4 partial sum
3767*c0909341SAndroid Build Coastguard Worker    pshufd              m%5, m%4, q2222
3768*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m4
3769*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5 ; high-lane input: accumulate the m4 partial sum
3770*c0909341SAndroid Build Coastguard Worker    pshufd              m%5, m%2, q3333
3771*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m5
3772*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5 ; low-lane input: accumulate the m5 partial sum
3773*c0909341SAndroid Build Coastguard Worker    pshufd              m%5, m%4, q3333
3774*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%5, m5
3775*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5 ; high-lane input: accumulate the m5 partial sum
3776*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 4 ; arithmetic shift right by 4 on both accumulators
3777*c0909341SAndroid Build Coastguard Worker    psrad               m%3, 4
3778*c0909341SAndroid Build Coastguard Worker    packusdw            m%1, m%3 ; dwords -> words with unsigned saturation; per lane: m%1 words low, m%3 words high
3779*c0909341SAndroid Build Coastguard Worker    pminsw              m%1, m%7 ; per-word signed minimum against bdmax
3780*c0909341SAndroid Build Coastguard Worker%endmacro
3781*c0909341SAndroid Build Coastguard Worker
3782*c0909341SAndroid Build Coastguard Worker; The ipred_filter SIMD processes 4x2 blocks in the following order which
3783*c0909341SAndroid Build Coastguard Worker; increases parallelism compared to doing things row by row. One redundant
3784*c0909341SAndroid Build Coastguard Worker; block is calculated for w8 and w16, two for w32.
3785*c0909341SAndroid Build Coastguard Worker;     w4     w8       w16             w32
3786*c0909341SAndroid Build Coastguard Worker;     1     1 2     1 2 3 5     1 2 3 5 b c d f
3787*c0909341SAndroid Build Coastguard Worker;     2     2 3     2 4 5 7     2 4 5 7 c e f h
3788*c0909341SAndroid Build Coastguard Worker;     3     3 4     4 6 7 9     4 6 7 9 e g h j
3789*c0909341SAndroid Build Coastguard Worker; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
3790*c0909341SAndroid Build Coastguard Worker;           5       8           8       i
3791*c0909341SAndroid Build Coastguard Worker
3792*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
3793*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_filter_16bpc_avx2_table
3794*c0909341SAndroid Build Coastguard Worker    lea                  r6, [filter_intra_taps]
3795*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3796*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm
3797*c0909341SAndroid Build Coastguard Worker    movzx           filterd, filterb
3798*c0909341SAndroid Build Coastguard Worker%else
3799*c0909341SAndroid Build Coastguard Worker    movzx           filterd, byte filterm
3800*c0909341SAndroid Build Coastguard Worker%endif
3801*c0909341SAndroid Build Coastguard Worker    shl             filterd, 6
3802*c0909341SAndroid Build Coastguard Worker    add             filterq, r6
3803*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_filter_16bpc_avx2_table]
3804*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [tlq-6]
3805*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
3806*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pd_8]
3807*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m2, [filterq+16*0]
3808*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m3, [filterq+16*1]
3809*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m4, [filterq+16*2]
3810*c0909341SAndroid Build Coastguard Worker    pmovsxbw             m5, [filterq+16*3]
3811*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
3812*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
3813*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3814*c0909341SAndroid Build Coastguard Worker.w4:
3815*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      10
3816*c0909341SAndroid Build Coastguard Worker    mova                xm8, [base+filter_shuf2]
3817*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, r8m ; bitdepth_max
3818*c0909341SAndroid Build Coastguard Worker    lea                  r7, [6+hq*2]
3819*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r7
3820*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop_start
3821*c0909341SAndroid Build Coastguard Worker.w4_loop:
3822*c0909341SAndroid Build Coastguard Worker    pinsrq              xm0, [tlq+hq*2], 0
3823*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3824*c0909341SAndroid Build Coastguard Worker.w4_loop_start:
3825*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK           6, 0, 7, 8, 9
3826*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m6, 1
3827*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm6
3828*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm0
3829*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3830*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
3831*c0909341SAndroid Build Coastguard Worker    RET
3832*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3833*c0909341SAndroid Build Coastguard Worker.w8:
3834*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      16
3835*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m14, [base+filter_shuf3]
3836*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m ; bitdepth_max
3837*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK          10, 0, 7, [base+filter_shuf2], 15
3838*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m10, q1302         ; ____ ____ | ____ 4321
3839*c0909341SAndroid Build Coastguard Worker    pslldq               m8, m0, 4
3840*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m6, 2
3841*c0909341SAndroid Build Coastguard Worker    psrldq               m0, m6, 10
3842*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m0
3843*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m6, 0x33           ; _0__ 4321 | ____ 4321
3844*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m7, 0x40           ; _056 4321 | ____ 4321
3845*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, [tlq-6], 0x30      ; _056 4321 | ____ 4321
3846*c0909341SAndroid Build Coastguard Worker    lea                  r7, [16+hq*2]
3847*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r7
3848*c0909341SAndroid Build Coastguard Worker    jmp .w8_loop_start
3849*c0909341SAndroid Build Coastguard Worker.w8_loop:
3850*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m9, q1302          ; ____ 4321 | ____ 4321
3851*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m9, q2031
3852*c0909341SAndroid Build Coastguard Worker    psrldq               m0, m6, 2
3853*c0909341SAndroid Build Coastguard Worker    psrldq               m6, 10
3854*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0
3855*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m7, 0x80           ; _0__ 4321 | ____ 4321
3856*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m6, 0x40           ; _056 4321 | ____ 4321
3857*c0909341SAndroid Build Coastguard Worker    mova                m10, m9
3858*c0909341SAndroid Build Coastguard Worker.w8_loop_start:
3859*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, [tlq+hq*2], 0x0C   ; _056 4321 | _056 4321
3860*c0909341SAndroid Build Coastguard Worker    call .main
3861*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m9, 0xCC
3862*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm10
3863*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m10, 1
3864*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3865*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3866*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
3867*c0909341SAndroid Build Coastguard Worker    RET
3868*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3869*c0909341SAndroid Build Coastguard Worker.w16:
3870*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK          32, 16
3871*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m ; bitdepth_max
3872*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3873*c0909341SAndroid Build Coastguard Worker    TAIL_CALL .w16_main, 0
3874*c0909341SAndroid Build Coastguard Worker.w16_main:
3875*c0909341SAndroid Build Coastguard Worker    mova               xm10, [base+filter_shuf2]
3876*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK          13, 0, 6, 10, 15
3877*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m13, q3120
3878*c0909341SAndroid Build Coastguard Worker    mova               xm14, [base+filter_shuf3]
3879*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [base+filter_shuf1], 1
3880*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [tlq+10]
3881*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq-16], 0x4C     ; ___0 4321 | _056 ____
3882*c0909341SAndroid Build Coastguard Worker    psrldq               m6, m12, 8
3883*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, 0x03           ; ___0 4321 | _056 4321
3884*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m12
3885*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, 0x80           ; 56_0 4321 | _056 4321
3886*c0909341SAndroid Build Coastguard Worker    FILTER_2BLK          12, 0, 6, 7, 8, 14, 15
3887*c0909341SAndroid Build Coastguard Worker    vpblendd            m13, m12, 0xCC
3888*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m12, q2031         ; 6___ 5___
3889*c0909341SAndroid Build Coastguard Worker    psrldq              xm6, xm12, 2
3890*c0909341SAndroid Build Coastguard Worker    psrldq              xm8, xm12, 12
3891*c0909341SAndroid Build Coastguard Worker    vpblendd            xm6, xm8, 0x01
3892*c0909341SAndroid Build Coastguard Worker    pblendw             xm6, [tlq+10], 0xF8     ; 4321 056_
3893*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK          11, 6, 8, 10, 15
3894*c0909341SAndroid Build Coastguard Worker    vpermq              m11, m11, q3120
3895*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m11, q1032
3896*c0909341SAndroid Build Coastguard Worker    movu                 m8, [tlq+6]            ; __43 210_ | ____ ____
3897*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m8, q3021          ; __0_ 4321 | ____ ____
3898*c0909341SAndroid Build Coastguard Worker    pshufhw              m8, m8, q3201          ; ___0 4321 | ____ ____
3899*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m8, 0x70           ; ___0 4321 | ____ 4321
3900*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm13
3901*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m13, 1
3902*c0909341SAndroid Build Coastguard Worker    lea                  r7, [20+hq*2]
3903*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r7
3904*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m12, q0123         ; ____ 4321 | ____ 4321
3905*c0909341SAndroid Build Coastguard Worker    jmp .w16_loop_start
3906*c0909341SAndroid Build Coastguard Worker.w16_loop:
3907*c0909341SAndroid Build Coastguard Worker    vpermq              m13, m13, q3322
3908*c0909341SAndroid Build Coastguard Worker    vpermq              m11,  m9, q2020
3909*c0909341SAndroid Build Coastguard Worker    vpermq               m9,  m9, q1302
3910*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m12, q0123
3911*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
3912*c0909341SAndroid Build Coastguard Worker    vpblendd            m13, m10, 0xCC
3913*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m7, 0x40
3914*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+8]
3915*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm13
3916*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m13, 1
3917*c0909341SAndroid Build Coastguard Worker.w16_loop_start:
3918*c0909341SAndroid Build Coastguard Worker    mova                m13, m12
3919*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq+hq*2], 0x0C
3920*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m12, 8
3921*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m12
3922*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, 0x33           ; ___0 4321 | _056 4321
3923*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0x80           ; 56_0 4321 | _056 4321
3924*c0909341SAndroid Build Coastguard Worker    FILTER_2BLK          10, 0, 6, 7, 8, 14, 15
3925*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m10, q2031
3926*c0909341SAndroid Build Coastguard Worker    mova            [rsp+8], m0
3927*c0909341SAndroid Build Coastguard Worker    psrldq               m8, m11, 8
3928*c0909341SAndroid Build Coastguard Worker    psrldq              xm6, xm12, 2
3929*c0909341SAndroid Build Coastguard Worker    psrldq              xm7, xm12, 10
3930*c0909341SAndroid Build Coastguard Worker    psrldq              xm0, xm13, 2
3931*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m11
3932*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm7, xm6
3933*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m9, 0x73           ; 56_0 4321 | ____ 4321
3934*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m7, 0x04           ; 56_0 4321 | __56 4321
3935*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m0, 0x08           ; 56_0 4321 | _056 4321
3936*c0909341SAndroid Build Coastguard Worker    call .main
3937*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m11, q3120
3938*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m8, m9, 0xCC
3939*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0+16], xm6
3940*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1+16], m6, 1
3941*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3942*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3943*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
3944*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m9, q3120
3945*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m8, 1              ; 4321 ____
3946*c0909341SAndroid Build Coastguard Worker    pshufd             xm11, xm11, q1032
3947*c0909341SAndroid Build Coastguard Worker    vpblendd            xm0, xm11, 0x02         ; 4321 0___
3948*c0909341SAndroid Build Coastguard Worker    psrldq              xm6, xm8, 2
3949*c0909341SAndroid Build Coastguard Worker    psrldq              xm7, xm8, 12
3950*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, xm6, 0x4           ; 4321 05__
3951*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, xm7, 0x2           ; 4321 056_
3952*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK           6, 0, 7, [base+filter_shuf2], 15
3953*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m13, q1302
3954*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m10, 0xCC
3955*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m6, 0xCC
3956*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0+ 0], xm12
3957*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0+16], xm9
3958*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1+ 0], m12, 1
3959*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1+16], m9, 1
3960*c0909341SAndroid Build Coastguard Worker    ret
3961*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .w32: setup for the 32-wide path. Calls .w16_main first (defined outside
; this view; presumably it handles the left 16 columns -- confirm), then
; re-points dstq at dst+32 bytes, renames the registers via DEFINE_ARGS and
; primes the vector registers and the stack scratch that .w32_loop consumes,
; before entering the loop at .w32_loop_start.
3962*c0909341SAndroid Build Coastguard Worker.w32:
3963*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK          64, 16 ; x86inc macro; [rsp+0] and [rsp+32..] are used as scratch below
3964*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m15, r8m ; bitdepth_max
3965*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3966*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+32] ; r3 = dst + 32 bytes, moved into dstq after the call
3967*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hd*2+20] ; r5d = 2*hd + 20 (decoded again after the call)
3968*c0909341SAndroid Build Coastguard Worker    call .w16_main
3969*c0909341SAndroid Build Coastguard Worker    mov                dstq, r3
3970*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+r5+32] ; assumes .w16_main left r5 untouched -- TODO confirm
3971*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 20
3972*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 1
3973*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2 ; r5d = (r5d - 20) / 2 - 2; r5 is named h by DEFINE_ARGS below
3974*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*2-2] ; r4 (named left below) = 2 bytes before row 2 of the new dst
3975*c0909341SAndroid Build Coastguard WorkerDEFINE_ARGS dst, stride, tl, stride3, left, h
3976*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3977*c0909341SAndroid Build Coastguard Worker    movu                 m8, [tlq-6]                        ; 4321 0___
3978*c0909341SAndroid Build Coastguard Worker    mova               xm10, [base+filter_shuf2]
3979*c0909341SAndroid Build Coastguard Worker    pinsrw              xm0, xm8, [dstq+strideq*0-2], 2
3980*c0909341SAndroid Build Coastguard Worker    pinsrw              xm0, xm0, [dstq+strideq*1-2], 1     ; 4321 056_
3981*c0909341SAndroid Build Coastguard Worker    pinsrw              xm9, [leftq+strideq*0], 5
3982*c0909341SAndroid Build Coastguard Worker    pinsrw              xm9, [leftq+strideq*1], 4
3983*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK          13, 0, 6, 10, 15
3984*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m13, q3120
3985*c0909341SAndroid Build Coastguard Worker    mova               xm14, [base+filter_shuf3]
3986*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [base+filter_shuf1], 1 ; m14 = filter_shuf3 (low lane) | filter_shuf1 (high lane)
3987*c0909341SAndroid Build Coastguard Worker    psrldq               m6, m12, 8
3988*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m6, m12
3989*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, 0x03           ; ___0 ____ | _0__ 4321
3990*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0x80           ; 56_0 ____ | _0__ 4321
3991*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m8, 0x30           ; 56_0 4321 | _0__ 4321
3992*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m9, 0x04           ; 56_0 4321 | _056 4321
3993*c0909341SAndroid Build Coastguard Worker    FILTER_2BLK          12, 0, 6, 7, 8, 14, 15
3994*c0909341SAndroid Build Coastguard Worker    vpblendd            m13, m12, 0xCC
3995*c0909341SAndroid Build Coastguard Worker    pinsrw              xm9, [leftq+strideq*2], 3
3996*c0909341SAndroid Build Coastguard Worker    pinsrw              xm9, [leftq+stride3q ], 2
3997*c0909341SAndroid Build Coastguard Worker    lea               leftq, [leftq+strideq*4]
3998*c0909341SAndroid Build Coastguard Worker    pinsrw              xm9, [leftq+strideq*0], 1
3999*c0909341SAndroid Build Coastguard Worker    pinsrw              xm9, [leftq+strideq*1], 0
4000*c0909341SAndroid Build Coastguard Worker    movq           [rsp+32], xm9 ; spill the low 8 bytes of gathered words to the scratch at rsp+32
4001*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 1 ; r7d = countdown consumed by .w32_loop_left
4002*c0909341SAndroid Build Coastguard Worker    pslldq               m8, m9, 4
4003*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m8, 0x0C           ; ___0 ____ | _056 ____
4004*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m12, q2031         ; 6___ 5___
4005*c0909341SAndroid Build Coastguard Worker    psrldq              xm6, xm12, 2
4006*c0909341SAndroid Build Coastguard Worker    psrldq              xm7, xm12, 12
4007*c0909341SAndroid Build Coastguard Worker    vpblendd            xm6, xm7, 0x01          ; ____ _56_
4008*c0909341SAndroid Build Coastguard Worker    pblendw             xm6, [tlq+10], 0xF8     ; 4321 056_
4009*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK          11, 6, 7, 10, 15
4010*c0909341SAndroid Build Coastguard Worker    vpermq              m11, m11, q3120
4011*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m11, q1032
4012*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [tlq+22]           ; __43 210_ | ____ ____
4013*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m8, q3021          ; __0_ 4321 | ____ ____
4014*c0909341SAndroid Build Coastguard Worker    pshufhw              m8, m8, q3201          ; ___0 4321 | ____ ____
4015*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m8, 0x70           ; ___0 4321 | ____ 4321
4016*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm13 ; store 16 bytes to each of rows 0 and 1
4017*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m13, 1
4018*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m12, q0123         ; ____ 4321 | ____ 4321
4019*c0909341SAndroid Build Coastguard Worker    jmp .w32_loop_start
; .w32_loop_last / .w32_loop_left: re-entry stubs for .w32_loop. Both reload
; m0 from the stack slot [rsp+0] (saved in .w32_loop_start). .w32_loop_left
; additionally merges two dwords from the scratch at rsp+32 into m0 and, once
; the r7d countdown has expired and hd != 2, regathers words from the column
; at leftq into that scratch and restarts the countdown at 4.
4020*c0909341SAndroid Build Coastguard Worker.w32_loop_last:
4021*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0]
4022*c0909341SAndroid Build Coastguard Worker    jmp .w32_loop
4023*c0909341SAndroid Build Coastguard Worker.w32_loop_left:
4024*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0]
4025*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [rsp+32+r7*4-12], 0x0C ; dwords 2-3 of m0 = the 8 bytes at rsp+32+r7*4-4
4026*c0909341SAndroid Build Coastguard Worker    dec                 r7d
4027*c0909341SAndroid Build Coastguard Worker    jg .w32_loop ; scratch still holds unconsumed entries
4028*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 2
4029*c0909341SAndroid Build Coastguard Worker    je .w32_loop ; hd == 2: skip the refill
4030*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [rsp+32], 6 ; carry over the first word of the old scratch
4031*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+strideq*2], 5
4032*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+stride3q ], 4
4033*c0909341SAndroid Build Coastguard Worker    lea               leftq, [leftq+strideq*4]
4034*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+strideq*0], 3
4035*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+strideq*1], 2
4036*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+strideq*2], 1
4037*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+stride3q ], 0
4038*c0909341SAndroid Build Coastguard Worker    lea               leftq, [leftq+strideq*4]
4039*c0909341SAndroid Build Coastguard Worker    movu           [rsp+36], xm6 ; 16 bytes -> scratch bytes 36..51
4040*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+strideq*0], 1
4041*c0909341SAndroid Build Coastguard Worker    pinsrw              xm6, [leftq+strideq*1], 0
4042*c0909341SAndroid Build Coastguard Worker    movd           [rsp+32], xm6 ; low 4 bytes -> scratch bytes 32..35
4043*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 4 ; restart the countdown
; .w32_loop: main loop of the 32-wide path. Each pass stores 16 bytes at
; offset 0 of two rows (top of the loop) and 16 bytes at offset 16 of two
; rows (after call .main), then advances dstq by two rows and subtracts 2
; from hd. The flags of that subtraction pick the continuation: hd > 0
; -> .w32_loop_left, hd == 0 -> .w32_loop_last, hd < 0 -> fall through to
; the final FILTER_1BLK, the last four stores and RET.
4044*c0909341SAndroid Build Coastguard Worker.w32_loop:
4045*c0909341SAndroid Build Coastguard Worker    vpermq              m13, m13, q3322
4046*c0909341SAndroid Build Coastguard Worker    vpermq              m11,  m9, q2020
4047*c0909341SAndroid Build Coastguard Worker    vpermq               m9,  m9, q1302
4048*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m12, q0123
4049*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
4050*c0909341SAndroid Build Coastguard Worker    vpblendd            m13, m10, 0xCC
4051*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m7, 0x40           ; ___0 4321 | ____ 4321
4052*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm13
4053*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m13, 1
4054*c0909341SAndroid Build Coastguard Worker.w32_loop_start:
4055*c0909341SAndroid Build Coastguard Worker    mova                m13, m12
4056*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m12, 8
4057*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m12
4058*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, 0x33           ; ___0 4321 | _056 4321
4059*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0x80           ; 56_0 4321 | _056 4321
4060*c0909341SAndroid Build Coastguard Worker    FILTER_2BLK          10, 0, 6, 7, 8, 14, 15
4061*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m10, q2031
4062*c0909341SAndroid Build Coastguard Worker    mova            [rsp+0], m0 ; save m0; reloaded by .w32_loop_last / .w32_loop_left
4063*c0909341SAndroid Build Coastguard Worker    psrldq               m8, m11, 8
4064*c0909341SAndroid Build Coastguard Worker    psrldq              xm6, xm12, 2
4065*c0909341SAndroid Build Coastguard Worker    psrldq              xm7, xm12, 10
4066*c0909341SAndroid Build Coastguard Worker    psrldq              xm0, xm13, 2
4067*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m11
4068*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm7, xm6
4069*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m9, 0x73           ; 56_0 4321 | ____ 4321
4070*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m7, 0x04           ; 56_0 4321 | __56 4321
4071*c0909341SAndroid Build Coastguard Worker    vpblendd             m8, m0, 0x08           ; 56_0 4321 | _056 4321
4072*c0909341SAndroid Build Coastguard Worker    call .main ; runs FILTER_2BLK 9, 8, 6, 7, 0, 14, 15
4073*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m11, q3120
4074*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m8, m9, 0xCC
4075*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0+16], xm6
4076*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1+16], m6, 1
4077*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2] ; lea leaves flags alone, but sub below sets them anyway
4078*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4079*c0909341SAndroid Build Coastguard Worker    jg .w32_loop_left ; hd > 0
4080*c0909341SAndroid Build Coastguard Worker    jz .w32_loop_last ; hd == 0 (flags still from the sub above)
4081*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m9, q3120
4082*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m8, 1              ; 4321 ____
4083*c0909341SAndroid Build Coastguard Worker    pshufd             xm11, xm11, q1032
4084*c0909341SAndroid Build Coastguard Worker    vpblendd            xm0, xm11, 0x02         ; 4321 0___
4085*c0909341SAndroid Build Coastguard Worker    psrldq              xm6, xm8, 2
4086*c0909341SAndroid Build Coastguard Worker    psrldq              xm7, xm8, 12
4087*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, xm6, 0x4           ; 4321 05__
4088*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, xm7, 0x2           ; 4321 056_
4089*c0909341SAndroid Build Coastguard Worker    FILTER_1BLK           6, 0, 7, [base+filter_shuf2], 15
4090*c0909341SAndroid Build Coastguard Worker    vpermq              m12, m13, q1302
4091*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m10, 0xCC
4092*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m6, 0xCC
4093*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0+ 0], xm12
4094*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0+16], xm9
4095*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1+ 0], m12, 1
4096*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1+16], m9, 1
4097*c0909341SAndroid Build Coastguard Worker    RET
; .main: shared subroutine that wraps a single FILTER_2BLK invocation with
; fixed register operands. It is reached with `call .main` and returns with
; a plain `ret` rather than the RET macro.
4098*c0909341SAndroid Build Coastguard Worker.main:
4099*c0909341SAndroid Build Coastguard Worker    FILTER_2BLK           9, 8, 6, 7, 0, 14, 15
4100*c0909341SAndroid Build Coastguard Worker    ret
4101*c0909341SAndroid Build Coastguard Worker
; Pick the register behind the temporary t0 used by the ipred_cfl_* functions
; below: r5 on WIN64 builds, r7 everywhere else.
; NOTE(review): presumably chosen so t0 never aliases a live argument
; register under either ABI -- confirm against x86inc's register mapping.
4102*c0909341SAndroid Build Coastguard Worker%if WIN64
4103*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
4104*c0909341SAndroid Build Coastguard Worker%else
4105*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
4106*c0909341SAndroid Build Coastguard Worker%endif
4107*c0909341SAndroid Build Coastguard Worker
; IPRED_CFL reg: in-place transform of m<reg>.
;   In:   m<reg> = signed 16-bit ac values
;         m0     = value added to every lane at the end (the callers
;                  broadcast the computed average into it)
;         m1     = alpha broadcast to all words (only its sign is used here)
;         m2     = |alpha| << 9 (set up by the callers with pabsw + psllw 9)
;   Out:  m<reg> = m0 + sign(ac) * sign(alpha) * pmulhrsw(|ac|, m2)
;   Clobbers: m3
; The result is not clamped here; the callers apply pmaxsw/pminsw afterwards.
4108*c0909341SAndroid Build Coastguard Worker%macro IPRED_CFL 1 ; ac in, unpacked pixels out
4109*c0909341SAndroid Build Coastguard Worker    psignw               m3, m%1, m1 ; m3 = ac with alpha's sign applied (0 where alpha == 0)
4110*c0909341SAndroid Build Coastguard Worker    pabsw               m%1, m%1 ; work on the magnitude
4111*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m2 ; rounded high multiply by |alpha| << 9
4112*c0909341SAndroid Build Coastguard Worker    psignw              m%1, m3 ; restore the combined sign
4113*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m0 ; add the m0 term
4114*c0909341SAndroid Build Coastguard Worker%endmacro
4115*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_top_16bpc(dst, stride, tl, w, h, ac, alpha, <8th arg via r7m>)
; Entry stub: loads 32 bytes starting 2 bytes after tl, prepares the
; rounding term and shift derived from w, then jumps through
; ipred_cfl_left_16bpc_avx2_table (indexed by tzcnt(w)) with wq holding the
; matching entry of ipred_cfl_splat_16bpc_avx2_table. The tables themselves
; are defined outside this view; presumably the first jump lands on the .hN
; summation labels of ipred_cfl_left_16bpc -- confirm.
4116*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
4117*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4118*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2 ; step over one 16-bit element
4119*c0909341SAndroid Build Coastguard Worker    movd                xm4, wd
4120*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6 ; m6 = 0 (also the lower clamp bound in the splat loops)
4121*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, r7m ; upper clamp bound for pminsw; presumably bitdepth_max
4122*c0909341SAndroid Build Coastguard Worker    pavgw               xm4, xm6 ; xm4 = (w + 1) >> 1, the rounding term
4123*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; wd = index of lowest set bit (log2 for a power of two)
4124*c0909341SAndroid Build Coastguard Worker    movd                xm5, wd ; xm5 = shift count for the final psrld
4125*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
4126*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_left_16bpc_avx2_table]
4127*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+wq*4] ; table entries are 32-bit offsets relative to the table
4128*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
4129*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
4130*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
4131*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; wq = second-stage target
4132*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4133*c0909341SAndroid Build Coastguard Worker    jmp                  r6
4134*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_left_16bpc(dst, stride, tl, w, h, ac, alpha, <8th arg via r7m>)
; Moves tl back by 2*h bytes, loads 32 bytes from there and dispatches through
; ipred_cfl_left_16bpc_avx2_table indexed by tzcnt(h). The .h32 -> .h16 ->
; .h8 -> .h4 labels form a fall-through reduction chain that sums the loaded
; words; .h4 finishes with (sum + xm4) >> xm5, broadcasts the low word of the
; result into m0 and jumps to wq (an entry of
; ipred_cfl_splat_16bpc_avx2_table selected by tzcnt(w)).
4135*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
4136*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm ; zero upper half
4137*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
4138*c0909341SAndroid Build Coastguard Worker    movd                xm4, hd
4139*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq ; tl -= 2*h bytes in total (h 16-bit elements)
4140*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6 ; m6 = 0
4141*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, r7m ; upper clamp bound for pminsw; presumably bitdepth_max
4142*c0909341SAndroid Build Coastguard Worker    pavgw               xm4, xm6 ; xm4 = (h + 1) >> 1, the rounding term
4143*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
4144*c0909341SAndroid Build Coastguard Worker    movd                xm5, r6d ; xm5 = shift count for the final psrld
4145*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
4146*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_left_16bpc_avx2_table]
4147*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]
4148*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
4149*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
4150*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
4151*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
4152*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; wq = second-stage target
4153*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4154*c0909341SAndroid Build Coastguard Worker    jmp                  r6
4155*c0909341SAndroid Build Coastguard Worker.h32:
4156*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+32] ; fold in the second 32 bytes
4157*c0909341SAndroid Build Coastguard Worker.h16:
4158*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4159*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1 ; 16 words -> 8 words
4160*c0909341SAndroid Build Coastguard Worker.h8:
4161*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
4162*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1 ; 8 words -> 4 words (in the low half)
4163*c0909341SAndroid Build Coastguard Worker.h4:
4164*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm6 ; zero-extend the low 4 words to dwords
4165*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
4166*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4167*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
4168*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1 ; dword 0 = total
4169*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm4 ; + rounding term
4170*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5 ; >> shift count
4171*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4172*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4173*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_16bpc(dst, stride, tl, w, h, ac, alpha, <8th arg via r7m>)
; Entry/dispatch: prepares xm4 = (w + h) >> 1 as rounding term and
; xm5 = tzcnt(w + h) as shift count, then jumps through
; ipred_cfl_16bpc_avx2_table: first to the entry at index tzcnt(h) (r6), with
; wq preloaded from the entry at index tzcnt(w) + 4. The table is defined
; outside this view; presumably the two index ranges map to the .hN and .wN
; labels below -- confirm.
4174*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
4175*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4176*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
4177*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
4178*c0909341SAndroid Build Coastguard Worker    lea                 t0d, [wq+hq] ; t0d = w + h
4179*c0909341SAndroid Build Coastguard Worker    movd                xm4, t0d
4180*c0909341SAndroid Build Coastguard Worker    tzcnt               t0d, t0d
4181*c0909341SAndroid Build Coastguard Worker    movd                xm5, t0d ; xm5 = tzcnt(w + h), the power-of-two part of the divisor
4182*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_16bpc_avx2_table]
4183*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
4184*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]
4185*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4+4*4] ; second group of entries starts 4 slots further in
4186*c0909341SAndroid Build Coastguard Worker    psrlw               xm4, 1 ; xm4 = (w + h) >> 1, the rounding term
4187*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6 ; m6 = 0 (zero-extension source and lower clamp bound)
4188*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, r7m ; upper clamp bound for pminsw; presumably bitdepth_max
4189*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
4190*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
4191*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4192*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; .h4 / .w4 / .s4: 4-wide handling.
;   .h4  loads the 4 words immediately below tl and jumps to wq.
;   .w4  adds the rounding term and the 4 words starting at tl+2, reduces
;        horizontally and divides: a plain >> 3 when hd <= 4, otherwise a
;        >> 2 followed by a 16-bit reciprocal multiply (.w4_mul).
;   .s4  applies IPRED_CFL to 32 bytes of ac per iteration, clamps to
;        [m6, m7] and writes four rows of 8 bytes each.
4193*c0909341SAndroid Build Coastguard Worker.h4:
4194*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq-8]
4195*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4196*c0909341SAndroid Build Coastguard Worker.w4:
4197*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tlq+2]
4198*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4 ; rounding term lands in word 0 only
4199*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4200*c0909341SAndroid Build Coastguard Worker    psrlq                m1, m0, 32
4201*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4202*c0909341SAndroid Build Coastguard Worker    psrld                m1, m0, 16
4203*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; low word of each qword = sum of that qword's 4 words
4204*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4205*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
4206*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 3 ; hd <= 4: divide by 8
4207*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
4208*c0909341SAndroid Build Coastguard Worker.w4_mul:
4209*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4210*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1 ; fold the upper lane in
4211*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
4212*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB6667 ; two 16-bit multipliers packed in one dword
4213*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d ; count 16 -> low word 0xAAAB; count 32 wraps to 0 -> low word 0x6667
4214*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm6
4215*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm6
4216*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1 ; dword 0 = word 0 + word 4 = total
4217*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4218*c0909341SAndroid Build Coastguard Worker    psrld               xm0, 2
4219*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1 ; uses the low word of r6d as multiplier
4220*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1 ; with the multiply: ~1/3 for 0xAAAB, ~1/5 for 0x6667
4221*c0909341SAndroid Build Coastguard Worker.w4_end:
4222*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4223*c0909341SAndroid Build Coastguard Worker.s4:
4224*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4225*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
4226*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4227*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; m2 = |alpha| << 9, the pmulhrsw factor used by IPRED_CFL
4228*c0909341SAndroid Build Coastguard Worker.s4_loop:
4229*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
4230*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4231*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m6 ; clamp below at 0
4232*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m7 ; clamp above at m7
4233*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4234*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm4 ; bytes  0..7  -> row 0
4235*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm5 ; bytes 16..23 -> row 2
4236*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm4 ; bytes  8..15 -> row 1
4237*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r6       ], xm5 ; bytes 24..31 -> row 3
4238*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4239*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4240*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4241*c0909341SAndroid Build Coastguard Worker    jg .s4_loop
4242*c0909341SAndroid Build Coastguard Worker    RET
4243*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .h8 / .w8 / .s8: 8-wide handling.
;   .h8  loads the 8 words immediately below tl and jumps to wq.
;   .w8  adds the 8 words starting at tl+2 plus the rounding term, reduces
;        to a 32-bit total, shifts by xm5 and, unless hd == 8, applies a
;        16-bit reciprocal multiply (0x6667 when hd == 32, else 0xAAAB).
;   .s8  applies IPRED_CFL to 64 bytes of ac per iteration, clamps to
;        [m6, m7] and writes four rows of 16 bytes each.
4244*c0909341SAndroid Build Coastguard Worker.h8:
4245*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-16]
4246*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4247*c0909341SAndroid Build Coastguard Worker.w8:
4248*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1 ; upper lane of the words loaded by the .hN entry
4249*c0909341SAndroid Build Coastguard Worker    paddw               xm0, [tlq+2]
4250*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
4251*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4252*c0909341SAndroid Build Coastguard Worker    psrld               xm1, xm0, 16
4253*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1 ; even words = pairwise sums
4254*c0909341SAndroid Build Coastguard Worker    pblendw             xm0, xm6, 0xAA ; zero the odd words -> four 32-bit partial sums
4255*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
4256*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4257*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
4258*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1 ; dword 0 = total
4259*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
4260*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
4261*c0909341SAndroid Build Coastguard Worker    je .w8_end ; hd == 8: the shift alone completes the division
4262*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB
4263*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667
4264*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
4265*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d ; hd == 32 -> 0x6667, otherwise keep 0xAAAB
4266*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4267*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
4268*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1 ; with the multiply: ~1/3 for 0xAAAB, ~1/5 for 0x6667
4269*c0909341SAndroid Build Coastguard Worker.w8_end:
4270*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4271*c0909341SAndroid Build Coastguard Worker.s8:
4272*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4273*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
4274*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4275*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; m2 = |alpha| << 9, the pmulhrsw factor used by IPRED_CFL
4276*c0909341SAndroid Build Coastguard Worker.s8_loop:
4277*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
4278*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+32]
4279*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4280*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
4281*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m6
4282*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6
4283*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m7
4284*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m7
4285*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm4 ; m4 low lane  -> row 0
4286*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm5 ; m5 low lane  -> row 2
4287*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m4, 1 ; m4 high lane -> row 1
4288*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r6       ], m5, 1 ; m5 high lane -> row 3
4289*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4290*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4291*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4292*c0909341SAndroid Build Coastguard Worker    jg .s8_loop
4293*c0909341SAndroid Build Coastguard Worker    RET
4294*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .h16 / .w16 / .s16: 16-wide handling.
;   .h16 loads the 16 words immediately below tl and jumps to wq.
;   .w16 adds the 16 words starting at tl+2 plus the rounding term, reduces
;        to a 32-bit total, shifts by xm5 and, unless hd == 16, applies a
;        16-bit reciprocal multiply (0x6667 when neither bit 3 nor bit 5 of
;        h is set, else 0xAAAB).
;   .s16 applies IPRED_CFL to 64 bytes of ac per iteration, clamps to
;        [m6, m7] and writes two rows of 32 bytes each.
4295*c0909341SAndroid Build Coastguard Worker.h16:
4296*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]
4297*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4298*c0909341SAndroid Build Coastguard Worker.w16:
4299*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+2]
4300*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4301*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
4302*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1 ; 16 words -> 8 words (+ rounding term in word 0)
4303*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm0, xm6
4304*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm6 ; zero-extend to dwords before summing further
4305*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4306*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
4307*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4308*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
4309*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1 ; dword 0 = total
4310*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
4311*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
4312*c0909341SAndroid Build Coastguard Worker    je .w16_end ; hd == 16: the shift alone completes the division
4313*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB
4314*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667
4315*c0909341SAndroid Build Coastguard Worker    test                 hb, 8|32
4316*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d ; h has neither bit 3 nor bit 5 set -> 0x6667
4317*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4318*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
4319*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1 ; with the multiply: ~1/3 for 0xAAAB, ~1/5 for 0x6667
4320*c0909341SAndroid Build Coastguard Worker.w16_end:
4321*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4322*c0909341SAndroid Build Coastguard Worker.s16:
4323*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4324*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4325*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; m2 = |alpha| << 9, the pmulhrsw factor used by IPRED_CFL
4326*c0909341SAndroid Build Coastguard Worker.s16_loop:
4327*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
4328*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+32]
4329*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4330*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
4331*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m6
4332*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6
4333*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m7
4334*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m7
4335*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m4 ; one full 32-byte row per register
4336*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m5
4337*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4338*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4339*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4340*c0909341SAndroid Build Coastguard Worker    jg .s16_loop
4341*c0909341SAndroid Build Coastguard Worker    RET
4342*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4343*c0909341SAndroid Build Coastguard Worker.h32:
4344*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-64]
4345*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-32]
4346*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4347*c0909341SAndroid Build Coastguard Worker.w32:
4348*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+ 2]
4349*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq+34]
4350*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4351*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm4
4352*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4353*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm0, xm6
4354*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm6
4355*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4356*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
4357*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4358*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, xm0, 8
4359*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4360*c0909341SAndroid Build Coastguard Worker    psrld               xm0, xm5
4361*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
4362*c0909341SAndroid Build Coastguard Worker    je .w32_end
4363*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
4364*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x6667AAAB
4365*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
4366*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4367*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
4368*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 1
4369*c0909341SAndroid Build Coastguard Worker.w32_end:
4370*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4371*c0909341SAndroid Build Coastguard Worker.s32:
4372*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4373*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4374*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
4375*c0909341SAndroid Build Coastguard Worker.s32_loop:
4376*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
4377*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+32]
4378*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4379*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
4380*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m6
4381*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m6
4382*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m7
4383*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m7
4384*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m4
4385*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m5
4386*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4387*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4388*c0909341SAndroid Build Coastguard Worker    dec                  hd
4389*c0909341SAndroid Build Coastguard Worker    jg .s32_loop
4390*c0909341SAndroid Build Coastguard Worker    RET
4391*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_128_16bpc(dst, stride, tl, w, h, ac, alpha, <8th argument, r7m>)
; Entry stub: sets up m0/m6/m7 and tail-jumps to a width-specific label taken
; from ipred_cfl_splat_16bpc_avx2_table, indexed by tzcnt(w).
;   m0 = dword broadcast from the table at pw_512, entry (r7m >> 11)
;   m6 = 0
;   m7 = r7m broadcast to every word
; NOTE(review): r7m is presumably bitdepth_max, which would make m0 the
; mid-range DC value and m6/m7 the lower/upper clamp used by the splat loops
; (the .s16/.s32 loops in this file clamp with pmaxsw m6 / pminsw m7) -
; confirm against the C prototype and the pw_512 definition.
4392*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
4393*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; r6d = 8th argument
4394*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11 ; r6 = dword index into the pw_512 table
4395*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_splat_16bpc_avx2_table] ; t0 = jump table base
    ; NOTE(review): w is read from its register although only 3 arguments are
    ; declared as loaded; presumably fine because the supported 64-bit ABIs
    ; already pass it in a register - confirm.
4396*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; wd = trailing zero count of w (log2 for power-of-two w)
4397*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4398*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4] ; wq = signed 32-bit table entry (offset from t0)
    ; The address below reduces to pw_512+r6*4; it is written relative to t0 so
    ; the already-loaded table base doubles as the base register.
4399*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
4400*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6 ; m6 = 0
4401*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, r7m ; m7 = 8th argument in every word
4402*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; wq = absolute address of the target label
4403*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4404*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; tail-jump; the target is responsible for RET
4405*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_ac_420_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Fills ac[] with 16-bit words derived from the 16-bit samples at ypx. Each
; output word combines a 2x2 group of input samples: pmaddwd with pw_2 sums
; each horizontal pair (scaled by the pw_2 words, presumably 2) and the
; results of two consecutive input rows are added. The sum of all outputs is
; accumulated as dwords in m4; at .dc its rounded average is subtracted from
; every stored word.
; Register roles:
;   r5    = start of ac; at .dc it becomes the negative byte length written
;   hd    = h - 4*hpad: output rows produced from real input
;   hpadd = 4*hpad: output rows produced by repeating the last output row
;   wpadd = right padding selector; nonzero paths replicate the last valid
;           sample pair instead of reading further to the right
;   m4    = dword accumulator, m5 = pmaddwd multiplier
; The .hpad, .w16_hpad and .dc labels are also entered from the 422 and 444
; variants in this file, so their register contract (m0 = dword sums to add,
; m1 = words to store, m4, r5, acq, hpadd) must not change.
4406*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
4407*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4408*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_2] ; m5 = pmaddwd multiplier
4409*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4410*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2 ; hpadd = 4*hpad
4411*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4 ; clear the running sum
4412*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd ; hd = rows computed from real input
4413*c0909341SAndroid Build Coastguard Worker    cmp            dword wm, 8
4414*c0909341SAndroid Build Coastguard Worker    jg .w16 ; w > 8
4415*c0909341SAndroid Build Coastguard Worker    je .w8 ; w == 8; anything smaller falls through to .w4
4416*c0909341SAndroid Build Coastguard Worker.w4:
4417*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3] ; r3 = 3*stride (reuses the wpad register)
4418*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq ; remember the start of ac for .dc
4419*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; Four input rows -> two output rows of 4 words per pass.
    ; Low lane holds input rows 2/3, high lane holds input rows 0/1.
4420*c0909341SAndroid Build Coastguard Worker    mova                xm0, [ypxq+strideq*2]
4421*c0909341SAndroid Build Coastguard Worker    mova                xm1, [ypxq+r3       ]
4422*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [ypxq+strideq*0], 1
4423*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [ypxq+strideq*1], 1
4424*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
4425*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5 ; horizontal pair sums (scaled)
4426*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
4427*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1 ; low lane = rows 2+3, high lane = rows 0+1
4428*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1 ; xm1 = first output row (dwords)
4429*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0 ; accumulate both output rows
4430*c0909341SAndroid Build Coastguard Worker    packssdw            xm1, xm0 ; xm1 = first row words | second row words
4431*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm1
4432*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4433*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two output rows done
4434*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4435*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4436*c0909341SAndroid Build Coastguard Worker    jz .dc ; no bottom padding
4437*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q1111 ; qword 1 = last output row, copied to all 4 qwords
4438*c0909341SAndroid Build Coastguard Worker    pslld               xm0, 2 ; xm0 = 4x last row's sums (upper lane presumably zeroed by the VEX xmm op)
4439*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
    ; Each pass emits four copies of the last row (32 bytes).
4440*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4441*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4442*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4443*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
4444*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
4445*c0909341SAndroid Build Coastguard Worker    jmp .dc
4446*c0909341SAndroid Build Coastguard Worker.w8:
4447*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq ; remember the start of ac for .dc
4448*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4449*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad1 ; right padding requested
4450*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; Two 32-byte input rows -> one output row of 8 words per pass.
4451*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0]
4452*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*1]
4453*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4454*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1 ; m0 = 8 dword outputs
4455*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4456*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4457*c0909341SAndroid Build Coastguard Worker    packssdw            xm1, xm0, xm1 ; xm1 = the row as 8 words, in order
4458*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm1
4459*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4460*c0909341SAndroid Build Coastguard Worker    dec                  hd
4461*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4462*c0909341SAndroid Build Coastguard Worker.w8_hpad:
4463*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4464*c0909341SAndroid Build Coastguard Worker    jz .dc
4465*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm1, 1 ; m1 = last row in both lanes (2 rows)
4466*c0909341SAndroid Build Coastguard Worker    pslld                m0, 2 ; m0 = 4x last row's sums; .hpad stores 4 rows per pass
4467*c0909341SAndroid Build Coastguard Worker    jmp .hpad
4468*c0909341SAndroid Build Coastguard Worker.w8_wpad1:
    ; Only the first 16 bytes of each input row are read; the last pair sum
    ; is replicated to fill the right half of the output row.
4469*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm5, [ypxq+strideq*0]
4470*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm3, xm5, [ypxq+strideq*1]
4471*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4472*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm3 ; 4 real outputs
4473*c0909341SAndroid Build Coastguard Worker    pshufd              xm3, xm0, q3333 ; xm3 = last real output x4
4474*c0909341SAndroid Build Coastguard Worker    packssdw            xm1, xm0, xm3 ; xm1 = 4 real words | 4 padded words
4475*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm3 ; xm0 = row sums including the padding
4476*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm0
4477*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm1
4478*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4479*c0909341SAndroid Build Coastguard Worker    dec                  hd
4480*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad1
4481*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
4482*c0909341SAndroid Build Coastguard Worker.w16_wpad:
    ; m0/m1 = left 32 bytes of the two input rows, m2/m3 = right halves,
    ; built according to wpad (1, 2 or 3 groups of four padded output words).
4483*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+strideq*0+ 0]
4484*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ypxq+strideq*1+ 0]
4485*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
4486*c0909341SAndroid Build Coastguard Worker    jl .w16_wpad1
4487*c0909341SAndroid Build Coastguard Worker    je .w16_wpad2
    ; wpad > 2: only the first 16 bytes are valid; the pair at byte 12 fills
    ; the upper lane of m0/m1 and all of m2/m3.
4488*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [ypxq+strideq*0+12]
4489*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [ypxq+strideq*1+12]
4490*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0xf0
4491*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xf0
4492*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4493*c0909341SAndroid Build Coastguard Worker.w16_wpad2:
    ; wpad == 2: the first 32 bytes are valid; the pair at byte 28 fills m2/m3.
4494*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [ypxq+strideq*0+28]
4495*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [ypxq+strideq*1+28]
4496*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4497*c0909341SAndroid Build Coastguard Worker.w16_wpad1:
    ; wpad < 2: the first 48 bytes are valid; the pair at byte 44 fills the
    ; upper lane of m2/m3, the low lane is loaded from byte 32.
4498*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [ypxq+strideq*0+44]
4499*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [ypxq+strideq*1+44]
4500*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [ypxq+strideq*0+32], 0
4501*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [ypxq+strideq*1+32], 0
4502*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
4503*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4504*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m5}, m0, m1, m2, m3
4505*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1 ; left 8 outputs (dwords)
4506*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3 ; right 8 outputs (dwords)
4507*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0, m2 ; words, interleaved per 128-bit lane
4508*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2 ; m0 = row sums (kept for .w16_hpad)
4509*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120 ; swap middle qwords -> left-to-right order
4510*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4511*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4512*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4513*c0909341SAndroid Build Coastguard Worker    dec                  hd
4514*c0909341SAndroid Build Coastguard Worker    jg .w16_wpad
4515*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad
4516*c0909341SAndroid Build Coastguard Worker.w16:
4517*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq ; remember the start of ac for .dc
4518*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4519*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4520*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; Two 64-byte input rows -> one output row of 16 words per pass.
4521*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0+ 0]
4522*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+strideq*0+32]
4523*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*1+ 0]
4524*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+strideq*1+32]
4525*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4526*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1 ; left 8 outputs (dwords)
4527*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3 ; right 8 outputs (dwords)
4528*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0, m2
4529*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2 ; m0 = row sums (kept for .w16_hpad)
4530*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120 ; restore left-to-right order after the per-lane pack
4531*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4532*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4533*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4534*c0909341SAndroid Build Coastguard Worker    dec                  hd
4535*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4536*c0909341SAndroid Build Coastguard Worker.w16_hpad:
    ; Expects m1 = last 16-word output row and m0 = its dword sums.
4537*c0909341SAndroid Build Coastguard Worker    add               hpadd, hpadd ; .hpad stores 2 such rows per pass but subtracts 4; ZF set if no padding
4538*c0909341SAndroid Build Coastguard Worker    jz .dc
4539*c0909341SAndroid Build Coastguard Worker    paddd                m0, m0 ; m0 = 2x row sums, matching the two stores per pass
4540*c0909341SAndroid Build Coastguard Worker.hpad:
    ; Shared bottom-padding loop: each pass stores m1 twice (64 bytes) and
    ; adds m0 to the running sum once.
4541*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m1
4542*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4543*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4544*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4545*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
4546*c0909341SAndroid Build Coastguard Worker    jg .hpad
4547*c0909341SAndroid Build Coastguard Worker.dc:
    ; Reduce m4 to a single total, turn it into a rounded average and subtract
    ; that average from every word written to ac.
4548*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m4, 1
4549*c0909341SAndroid Build Coastguard Worker    sub                  r5, acq ; -w*h*2
4550*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, r5d ; trailing zeros of the byte count (its log2 when a power of two)
4551*c0909341SAndroid Build Coastguard Worker    paddd               xm4, xm1 ; fold the upper lane into the lower
4552*c0909341SAndroid Build Coastguard Worker    sub                 r1d, 2 ; shift = log2(w*h) - 1; pavgw below does the last halving
4553*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm4, xm4
4554*c0909341SAndroid Build Coastguard Worker    movd                xm0, r1d ; xm0 = shift count for psrld
4555*c0909341SAndroid Build Coastguard Worker    paddd               xm1, xm4 ; fold the upper qword
4556*c0909341SAndroid Build Coastguard Worker    pshuflw             xm4, xm1, q1032 ; swap the two low dwords
4557*c0909341SAndroid Build Coastguard Worker    paddd               xm1, xm4 ; low dword = total sum
4558*c0909341SAndroid Build Coastguard Worker    psrld               xm1, xm0
4559*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
4560*c0909341SAndroid Build Coastguard Worker    pavgw               xm1, xm0 ; (x + 1) >> 1: final halving with rounding
4561*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, xm1 ; m1 = average in every word
4562*c0909341SAndroid Build Coastguard Worker.dc_loop:
    ; r5 is a negative byte offset from the end of the buffer, stepping up
    ; to zero 32 bytes at a time.
4563*c0909341SAndroid Build Coastguard Worker    mova                 m0, [acq+r5]
4564*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1
4565*c0909341SAndroid Build Coastguard Worker    mova           [acq+r5], m0
4566*c0909341SAndroid Build Coastguard Worker    add                  r5, 32
4567*c0909341SAndroid Build Coastguard Worker    jl .dc_loop
4568*c0909341SAndroid Build Coastguard Worker    RET
4569*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_ac_422_16bpc(ac, ypx, stride, wpad, hpad, w, h)
; Fills ac[] with 16-bit words derived from the 16-bit samples at ypx. Each
; output word is one horizontal pair of input samples summed by pmaddwd with
; pw_4 (presumably every word = 4); rows are not combined vertically, so one
; input row yields one output row. The dword sum of all outputs is kept in m4.
; Bottom padding and the final average subtraction are done by jumping into
; the .hpad / .w16_hpad / .dc labels of ipred_cfl_ac_420_16bpc, which expect
;   m0 = dword sums to add, m1 = words to store, m4 = running sum,
;   r5 = start of ac, acq = current write position, hpadd = 4*hpad.
; hd = h - 4*hpad is the number of rows computed from real input.
4570*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
4571*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4572*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_4] ; m5 = pmaddwd multiplier
4573*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4574*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2 ; hpadd = 4*hpad
4575*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4 ; clear the running sum
4576*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd ; hd = rows computed from real input
4577*c0909341SAndroid Build Coastguard Worker    cmp            dword wm, 8
4578*c0909341SAndroid Build Coastguard Worker    jg .w16 ; w > 8
4579*c0909341SAndroid Build Coastguard Worker    je .w8 ; w == 8; anything smaller falls through to .w4
4580*c0909341SAndroid Build Coastguard Worker.w4:
4581*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3] ; r3 = 3*stride (reuses the wpad register)
4582*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq ; remember the start of ac for .dc
4583*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; Four input rows -> four output rows of 4 words per pass.
    ; m0 = rows 0 (low lane) and 2 (high lane), m1 = rows 1 and 3.
4584*c0909341SAndroid Build Coastguard Worker    mova                xm0, [ypxq+strideq*0]
4585*c0909341SAndroid Build Coastguard Worker    mova                xm1, [ypxq+strideq*1]
4586*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [ypxq+strideq*2], 1
4587*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [ypxq+r3       ], 1
4588*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
4589*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5
4590*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
4591*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0 ; accumulate before m0 is overwritten by the pack
4592*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1 ; per lane: row 0|1 and row 2|3 -> rows already in order
4593*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4594*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4595*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4596*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4597*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4598*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4599*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4600*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1 ; xm1 = dword sums of the last row
4601*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3333 ; qword 3 = last row's words, copied to all 4 qwords
4602*c0909341SAndroid Build Coastguard Worker    pslld               xm1, 2 ; 4x, one per padded row (upper lane presumably zeroed by the VEX xmm op)
4603*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
    ; Each pass emits four copies of the last row (32 bytes).
4604*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4605*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4606*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4607*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
4608*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
4609*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4610*c0909341SAndroid Build Coastguard Worker.w8:
4611*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq ; remember the start of ac for .dc
4612*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4613*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad1 ; right padding requested
4614*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; Two 32-byte input rows -> two output rows of 8 words per pass.
4615*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*0]
4616*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*1]
4617*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4618*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4619*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0 ; qwords: row0 lo, row1 lo, row0 hi, row1 hi
4620*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0 ; m0 keeps the second row's sums for .w8_hpad
4621*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m1, q3120 ; swap middle qwords -> row0, row1 in order
4622*c0909341SAndroid Build Coastguard Worker    mova              [acq], m2
4623*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4624*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4625*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4626*c0909341SAndroid Build Coastguard Worker.w8_hpad:
4627*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4628*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4629*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3131 ; qwords 1 and 3 = last row, repeated -> 2 copies of it
4630*c0909341SAndroid Build Coastguard Worker    pslld                m0, 2 ; m0 = 4x last row's sums; .hpad stores 4 rows per pass
4631*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
4632*c0909341SAndroid Build Coastguard Worker.w8_wpad1:
    ; Only the first 16 bytes of each input row are valid: the pair at byte 12
    ; is broadcast, then the low lane is replaced with the real data.
4633*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [ypxq+strideq*0+12]
4634*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [ypxq+strideq*1+12]
4635*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [ypxq+strideq*0+ 0], 0
4636*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [ypxq+strideq*1+ 0], 0
4637*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4638*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
4639*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5
4640*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4641*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0 ; same layout as in .w8_loop
4642*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4643*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m1, q3120
4644*c0909341SAndroid Build Coastguard Worker    mova              [acq], m2
4645*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4646*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4647*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad1
4648*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
4649*c0909341SAndroid Build Coastguard Worker.w16:
4650*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq ; remember the start of ac for .dc
4651*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4652*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4653*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; Two 64-byte input rows -> two output rows of 16 words per pass.
    ; m2/m1 = left/right half of row 0, m0/m3 = left/right half of row 1.
4654*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+strideq*0+ 0]
4655*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*0+32]
4656*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*1+ 0]
4657*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+strideq*1+32]
4658*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4659*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4660*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1 ; row 0 as words (per-lane interleaved)
4661*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4662*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0, m3 ; row 1 as words (per-lane interleaved)
4663*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3 ; m0 = row 1 sums (kept for .w16_hpad)
4664*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120 ; restore left-to-right order
4665*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4666*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120 ; m1 = last row's words (kept for .w16_hpad)
4667*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m2
4668*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4669*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4670*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4671*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4672*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
4673*c0909341SAndroid Build Coastguard Worker.w16_wpad:
    ; m2/m0 = left 32 bytes of rows 0/1, m1/m3 = right halves, built according
    ; to wpad (1, 2 or 3 groups of four padded output words).
4674*c0909341SAndroid Build Coastguard Worker    mova                 m2, [ypxq+strideq*0+ 0]
4675*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+strideq*1+ 0]
4676*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
4677*c0909341SAndroid Build Coastguard Worker    jl .w16_wpad1
4678*c0909341SAndroid Build Coastguard Worker    je .w16_wpad2
    ; wpad > 2: only the first 16 bytes are valid; the pair at byte 12 fills
    ; the upper lane of m2/m0 and all of m1/m3.
4679*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [ypxq+strideq*0+12]
4680*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [ypxq+strideq*1+12]
4681*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m1, 0xf0
4682*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m3, 0xf0
4683*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4684*c0909341SAndroid Build Coastguard Worker.w16_wpad2:
    ; wpad == 2: the first 32 bytes are valid; the pair at byte 28 fills m1/m3.
4685*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [ypxq+strideq*0+28]
4686*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [ypxq+strideq*1+28]
4687*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4688*c0909341SAndroid Build Coastguard Worker.w16_wpad1:
    ; wpad < 2: the first 48 bytes are valid; the pair at byte 44 fills the
    ; upper lane of m1/m3, the low lane is loaded from byte 32.
4689*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [ypxq+strideq*0+44]
4690*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [ypxq+strideq*1+44]
4691*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [ypxq+strideq*0+32], 0
4692*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [ypxq+strideq*1+32], 0
4693*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
    ; From here on identical to the body of .w16_loop.
4694*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4695*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m5}, m2, m0, m1, m3
4696*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4697*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1 ; row 0 as words (per-lane interleaved)
4698*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4699*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0, m3 ; row 1 as words (per-lane interleaved)
4700*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3 ; m0 = row 1 sums (kept for .w16_hpad)
4701*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120
4702*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4703*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120 ; m1 = last row's words (kept for .w16_hpad)
4704*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m2
4705*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4706*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4707*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4708*c0909341SAndroid Build Coastguard Worker    jg .w16_wpad
4709*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
4710*c0909341SAndroid Build Coastguard Worker
4711*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
4712*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_cfl_ac_444_16bpc_avx2_table]
4713*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4714*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4715*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_1]
4716*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
4717*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2
4718*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
4719*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4720*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
4721*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd
4722*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4723*c0909341SAndroid Build Coastguard Worker.w4:
4724*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4725*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
4726*c0909341SAndroid Build Coastguard Worker.w4_loop:
4727*c0909341SAndroid Build Coastguard Worker    movq                xm0, [ypxq+strideq*0]
4728*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [ypxq+strideq*1]
4729*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [ypxq+strideq*2]
4730*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [ypxq+r3       ]
4731*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
4732*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0x30
4733*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0xc0
4734*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
4735*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m0, m5
4736*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4737*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4738*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4739*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4740*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4741*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4742*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4743*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3333
4744*c0909341SAndroid Build Coastguard Worker    paddd                m1, m1
4745*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m0
4746*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3333
4747*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m0
4748*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4749*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
4750*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4751*c0909341SAndroid Build Coastguard Worker.w8:
4752*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4753*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
4754*c0909341SAndroid Build Coastguard Worker.w8_loop:
4755*c0909341SAndroid Build Coastguard Worker    mova                xm2, [ypxq+strideq*0]
4756*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [ypxq+strideq*1], 1
4757*c0909341SAndroid Build Coastguard Worker    mova                xm1, [ypxq+strideq*2]
4758*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [ypxq+r3       ], 1
4759*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
4760*c0909341SAndroid Build Coastguard Worker    psllw                m2, 3
4761*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
4762*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m2
4763*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
4764*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4765*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, m5
4766*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4767*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4768*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4769*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4770*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4771*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4772*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4773*c0909341SAndroid Build Coastguard Worker    vperm2i128           m1, m1, 0x11
4774*c0909341SAndroid Build Coastguard Worker    pslld                m0, 2
4775*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
4776*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x0f
4777*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
4778*c0909341SAndroid Build Coastguard Worker.w16_wpad2:
4779*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, [ypxq+strideq*0+14]
4780*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [ypxq+strideq*1+14]
4781*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0xf0
4782*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0xf0
4783*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4784*c0909341SAndroid Build Coastguard Worker.w16:
4785*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
4786*c0909341SAndroid Build Coastguard Worker.w16_loop:
4787*c0909341SAndroid Build Coastguard Worker    mova                 m2, [ypxq+strideq*0]
4788*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ypxq+strideq*1]
4789*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4790*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad2
4791*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
4792*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
4793*c0909341SAndroid Build Coastguard Worker    psllw                m2, 3
4794*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
4795*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m2
4796*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
4797*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4798*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, m5
4799*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4800*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4801*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4802*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4803*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4804*c0909341SAndroid Build Coastguard Worker    add               hpadd, hpadd
4805*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4806*c0909341SAndroid Build Coastguard Worker    paddd                m0, m0
4807*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
4808*c0909341SAndroid Build Coastguard Worker.w32:
4809*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
4810*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4811*c0909341SAndroid Build Coastguard Worker    jnz .w32_wpad
4812*c0909341SAndroid Build Coastguard Worker.w32_loop:
4813*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+ 0]
4814*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ypxq+32]
4815*c0909341SAndroid Build Coastguard Worker    add                ypxq, strideq
4816*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
4817*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
4818*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m0, m5
4819*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m0
4820*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m1, m5
4821*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4822*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4823*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
4824*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4825*c0909341SAndroid Build Coastguard Worker    dec                  hd
4826*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
4827*c0909341SAndroid Build Coastguard Worker.w32_hpad:
4828*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4829*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4830*c0909341SAndroid Build Coastguard Worker    paddd                m2, m2
4831*c0909341SAndroid Build Coastguard Worker.w32_hpad_loop:
4832*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m0
4833*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4834*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4835*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*2], m0
4836*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*3], m1
4837*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*4
4838*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4839*c0909341SAndroid Build Coastguard Worker    jg .w32_hpad_loop
4840*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4841*c0909341SAndroid Build Coastguard Worker.w32_wpad:
4842*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+ 0]
4843*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 4
4844*c0909341SAndroid Build Coastguard Worker    jl .w32_wpad2
4845*c0909341SAndroid Build Coastguard Worker    je .w32_wpad4
4846*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [ypxq+14]
4847*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0
4848*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
4849*c0909341SAndroid Build Coastguard Worker.w32_wpad4:
4850*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [ypxq+30]
4851*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
4852*c0909341SAndroid Build Coastguard Worker.w32_wpad2:
4853*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [ypxq+46]
4854*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [ypxq+32], 0
4855*c0909341SAndroid Build Coastguard Worker.w32_wpad_end:
4856*c0909341SAndroid Build Coastguard Worker    add                ypxq, strideq
4857*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
4858*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
4859*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m0, m5
4860*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*0], m0
4861*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m1, m5
4862*c0909341SAndroid Build Coastguard Worker    mova         [acq+32*1], m1
4863*c0909341SAndroid Build Coastguard Worker    add                 acq, 32*2
4864*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
4865*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
4866*c0909341SAndroid Build Coastguard Worker    dec                  hd
4867*c0909341SAndroid Build Coastguard Worker    jg .w32_wpad
4868*c0909341SAndroid Build Coastguard Worker    jmp .w32_hpad
4869*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; pal_pred_16bpc (AVX2)
; Expands packed 4-bit indices read from idx into 16-bit pixels written
; to dst, using a 16-byte lookup table loaded from pal.
;
; cglobal args: dst, stride, pal, idx, w, h  (4 loaded into registers,
; 6 GPRs and 6 vector registers declared). w and h are fetched below via
; wm / hm.
;
; Register roles after the setup code:
;   m4 = pal bytes shuffled by pal_pred_shuf
;   m5 = upper qword of m4 duplicated into both qwords of each lane
;   NOTE(review): pal_pred_shuf is defined outside this chunk; presumably
;   it gathers the low bytes of the palette entries into the low qword
;   and the high bytes into the high qword, so that pshufb on m4 / m5
;   yields the low / high byte of each pixel - confirm.
;   wq = address of the width-specific loop (.w4 ... .w64)
;   r2 = table base during setup, renamed stride3 (= stride*3) afterwards
;
; Every loop consumes idx linearly and produces 2 pixels per idx byte
; (low nibble first). pshufb only looks at bits 0-3 and bit 7 of each
; selector byte, so stray bits 4-6 are harmless.
; NOTE(review): the nibble tricks below assume every index is < 8 (bit 3
; of each nibble clear), otherwise bit 7 of a selector could be set and
; zero the output byte - confirm against the caller.
; Stores use mova (aligned move); dst rows are assumed to be suitably
; aligned for the store width used - TODO confirm.
;-----------------------------------------------------------------------
4870*c0909341SAndroid Build Coastguard Workercglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
; Load the 16 palette bytes into both 128-bit lanes before r2 (palq) is
; reused as the jump-table base.
4871*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [palq]
4872*c0909341SAndroid Build Coastguard Worker    lea                  r2, [pal_pred_16bpc_avx2_table]
; tzcnt(w) = log2(w) for power-of-two widths; used as the table index.
4873*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
4874*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [pal_pred_shuf]
4875*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
; Table entries are 32-bit offsets relative to the table base.
4876*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r2+wq*4]
4877*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5
; m5 = high qword of each lane of m4, duplicated.
4878*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m4, m4
4879*c0909341SAndroid Build Coastguard Worker    add                  wq, r2
; pal is no longer needed: its register slot becomes stride3.
4880*c0909341SAndroid Build Coastguard WorkerDEFINE_ARGS dst, stride, stride3, idx, w, h
4881*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4882*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- width 4: 8 idx bytes -> 16 pixels = 4 rows per iteration ----
4883*c0909341SAndroid Build Coastguard Worker.w4:
4884*c0909341SAndroid Build Coastguard Worker    movq                xm0, [idxq]
4885*c0909341SAndroid Build Coastguard Worker    add                idxq, 8
; Word shift by 4 brings each byte's high nibble into bits 0-3 (bits
; shifted in from the neighbouring byte land in bits 4-7).
4886*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, xm0, 4
; Interleave: selector bytes become (b0, b0>>4, b1, b1>>4, ...).
4887*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1
; xm1 = bytes looked up in m4, xm2 = bytes looked up in m5.
4888*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm4, xm0
4889*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, xm5, xm0
; Interleave the two byte planes into 16-bit pixels:
; xm0 = pixels 0-7 (rows 0,1), xm1 = pixels 8-15 (rows 2,3).
4890*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1, xm2
4891*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm1, xm2
4892*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
4893*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
4894*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
4895*c0909341SAndroid Build Coastguard Worker    movhps [dstq+stride3q ], xm1
4896*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4897*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4898*c0909341SAndroid Build Coastguard Worker    jg .w4
4899*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 8: 16 idx bytes -> 32 pixels = 4 rows per iteration ----
4900*c0909341SAndroid Build Coastguard Worker.w8:
; Zero-extend each idx byte to a word, then OR with itself shifted left
; by 4: the high byte of each word receives the high nibble, the low
; byte keeps the low nibble in bits 0-3 -> selector pair (lo, hi).
4901*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, [idxq]
4902*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
4903*c0909341SAndroid Build Coastguard Worker    psllw                m1, m2, 4
4904*c0909341SAndroid Build Coastguard Worker    por                  m2, m1
4905*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
4906*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m2
; Unpacks are per 128-bit lane: m0 = pixels 0-7 | 16-23,
; m1 = pixels 8-15 | 24-31 (low lane | high lane).
4907*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4908*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
; Rows 0/1 come from the low lanes, rows 2/3 from the high lanes.
4909*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
4910*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*1], xm1
4911*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*2], m0, 1
4912*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+stride3q ], m1, 1
4913*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4914*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4915*c0909341SAndroid Build Coastguard Worker    jg .w8
4916*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 16: 32 idx bytes -> 64 pixels = 4 rows per iteration ----
4917*c0909341SAndroid Build Coastguard Worker.w16:
; The two rounds of in-lane unpacks below (nibble expansion, then byte
; plane interleave) would scramble pixel order. Pre-permuting the idx
; dwords D0..D7 to (D0,D2,D4,D6 | D1,D3,D5,D7) compensates, so every
; final register holds 16 consecutive pixels.
4918*c0909341SAndroid Build Coastguard Worker    pshufd               m3, [idxq], q3120
4919*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
4920*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
4921*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
; m2 = selectors for pixels 0-31, m3 = selectors for pixels 32-63.
4922*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m1
4923*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4924*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
4925*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m2
4926*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4927*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
; Rows 0 and 1 (pixels 0-15 and 16-31).
4928*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
4929*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
4930*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4931*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m3
4932*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m3
4933*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3
; Rows 2 and 3 (pixels 32-47 and 48-63).
4934*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
4935*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m1
4936*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4937*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4938*c0909341SAndroid Build Coastguard Worker    jg .w16
4939*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 32: 32 idx bytes -> 64 pixels = 2 rows per iteration ----
; Same index expansion as .w16; only the store addresses differ.
4940*c0909341SAndroid Build Coastguard Worker.w32:
4941*c0909341SAndroid Build Coastguard Worker    pshufd               m3, [idxq], q3120
4942*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
4943*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
4944*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4945*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m1
4946*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4947*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
4948*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m2
4949*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4950*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
; First row: pixels 0-15 and 16-31.
4951*c0909341SAndroid Build Coastguard Worker    mova          [dstq+ 0], m0
4952*c0909341SAndroid Build Coastguard Worker    mova          [dstq+32], m1
4953*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4954*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m3
4955*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m3
4956*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3
; Second row: pixels 32-47 and 48-63.
4957*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq+ 0], m0
4958*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq+32], m1
4959*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4960*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4961*c0909341SAndroid Build Coastguard Worker    jg .w32
4962*c0909341SAndroid Build Coastguard Worker    RET
; ---- width 64: 32 idx bytes -> 64 pixels = 1 row per iteration ----
; Same index expansion as .w16; all four 32-byte stores go to one row.
4963*c0909341SAndroid Build Coastguard Worker.w64:
4964*c0909341SAndroid Build Coastguard Worker    pshufd               m3, [idxq], q3120
4965*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
4966*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
4967*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4968*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m1
4969*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4970*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
4971*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m2
4972*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4973*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4974*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
4975*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
4976*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4977*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m3
4978*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m3
4979*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3
4980*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*2], m0
4981*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*3], m1
4982*c0909341SAndroid Build Coastguard Worker    add                 dstq, strideq
4983*c0909341SAndroid Build Coastguard Worker    dec                   hd
4984*c0909341SAndroid Build Coastguard Worker    jg .w64
4985*c0909341SAndroid Build Coastguard Worker    RET
4986*c0909341SAndroid Build Coastguard Worker
4987*c0909341SAndroid Build Coastguard Worker%endif
4988