xref: /aosp_15_r20/external/libdav1d/src/x86/ipred16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA
30*c0909341SAndroid Build Coastguard Worker
; Read-only tables and constants for the predictors in this file.
; Only layout facts visible here are documented; most consumers are outside
; this view. Tables declared with 16 `db` entries (or 8 `dw` entries) occupy
; 16 bytes each. Names containing "shuf" are presumably byte-shuffle control
; masks -- TODO confirm at the use sites.
31*c0909341SAndroid Build Coastguard Workerfilter_shuf:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
32*c0909341SAndroid Build Coastguard Workerpal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
; z_base_inc / z_base_inc_z2: eight words k*64 for k = 0..7, ascending and
; descending respectively.
33*c0909341SAndroid Build Coastguard Workerz_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
34*c0909341SAndroid Build Coastguard Workerz_base_inc_z2: dw   7*64,   6*64,   5*64,   4*64,   3*64,   2*64,   1*64,   0*64
35*c0909341SAndroid Build Coastguard Workerz_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
; z2_upsample_l spans two 16-byte rows. The first 8 bytes, read as
; little-endian words, are -1, -2, -3, -4; that is why pw_m1to4 (defined at
; the end of this block) aliases this label.
36*c0909341SAndroid Build Coastguard Workerz2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1,  8,  9,  8,  9, 10, 11, 12, 13
37*c0909341SAndroid Build Coastguard Worker               db  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
38*c0909341SAndroid Build Coastguard Workerz2_top_shufA:  db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
39*c0909341SAndroid Build Coastguard Workerz2_top_shufB:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
40*c0909341SAndroid Build Coastguard Workerz2_left_shufA: db 14, 15, 12, 13, 10, 11,  8,  9, 12, 13, 10, 11,  8,  9,  6,  7
41*c0909341SAndroid Build Coastguard Workerz2_left_shufB: db 14, 15, 10, 11,  6,  7,  2,  3, 12, 13,  8,  9,  4,  5,  0,  1
; z_filt_*: byte tables; z_filt_t_w48 spans two 16-byte rows. Their meaning
; is defined by code outside this view.
42*c0909341SAndroid Build Coastguard Workerz_filt_wh16:   db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
43*c0909341SAndroid Build Coastguard Workerz_filt_t_w48:  db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
44*c0909341SAndroid Build Coastguard Worker               db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
45*c0909341SAndroid Build Coastguard Workerz_filt_t_w16:  db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
; z_filt_wh4 holds only 4 bytes, so z_filt_wh8 starts immediately after it.
; NOTE(review): the db list below ends with a trailing comma -- confirm the
; assembler version in use accepts that.
46*c0909341SAndroid Build Coastguard Workerz_filt_wh4:    db  7,  7, 19,  7,
47*c0909341SAndroid Build Coastguard Workerz_filt_wh8:    db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
; Everything below is an 8-byte constant (or a sequence of 8-byte rows), hence
; the 8-byte alignment.
48*c0909341SAndroid Build Coastguard WorkerALIGN 8
; pb_2_3: the byte pair (2, 3) repeated four times.
49*c0909341SAndroid Build Coastguard Workerpb_2_3:   times 4 db 2, 3
50*c0909341SAndroid Build Coastguard Workerz2_dy_offset:     dw 96*64, 96*64, 95*64, 95*64
; z_filt_k: four rows of four identical words: 8, 6, 4, 5. Rows 0 and 2 double
; as the pw_8 and pw_4 constants (see the %defines at the end of this block).
51*c0909341SAndroid Build Coastguard Workerz_filt_k: times 4 dw 8
52*c0909341SAndroid Build Coastguard Worker          times 4 dw 6
53*c0909341SAndroid Build Coastguard Worker          times 4 dw 4
54*c0909341SAndroid Build Coastguard Worker          times 4 dw 5
; pw_<N> / pw_m<N>: four 16-bit words, each equal to N / -N.
55*c0909341SAndroid Build Coastguard Workerpw_m3584: times 4 dw -3584
56*c0909341SAndroid Build Coastguard Workerpw_m3072: times 4 dw -3072
57*c0909341SAndroid Build Coastguard Workerpw_m2560: times 4 dw -2560
58*c0909341SAndroid Build Coastguard Workerpw_m2048: times 4 dw -2048
59*c0909341SAndroid Build Coastguard Workerpw_m1536: times 4 dw -1536
60*c0909341SAndroid Build Coastguard Workerpw_m1024: times 4 dw -1024
61*c0909341SAndroid Build Coastguard Workerpw_m512:  times 4 dw -512
62*c0909341SAndroid Build Coastguard Workerpw_1:     times 4 dw 1
63*c0909341SAndroid Build Coastguard Workerpw_2:     times 4 dw 2
64*c0909341SAndroid Build Coastguard Workerpw_3:     times 4 dw 3
65*c0909341SAndroid Build Coastguard Workerpw_62:    times 4 dw 62
66*c0909341SAndroid Build Coastguard Workerpw_256:   times 4 dw 256
67*c0909341SAndroid Build Coastguard Workerpw_512:   times 4 dw 512
68*c0909341SAndroid Build Coastguard Workerpw_2048:  times 4 dw 2048
69*c0909341SAndroid Build Coastguard Worker
; Aliases that reuse existing data instead of emitting new constants:
;   pw_4     -> z_filt_k row 2 (byte offset 16): four words of 4
;   pw_8     -> z_filt_k row 0 (byte offset 0):  four words of 8
;   pw_m1to4 -> first 8 bytes of z2_upsample_l:  words -1, -2, -3, -4
70*c0909341SAndroid Build Coastguard Worker%define pw_4 (z_filt_k+8*2)
71*c0909341SAndroid Build Coastguard Worker%define pw_8 (z_filt_k+8*0)
72*c0909341SAndroid Build Coastguard Worker%define pw_m1to4 z2_upsample_l
73*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, suffix, label1 [, label2 ...]
;
; Emits a table of 32-bit offsets, one per label argument, and defines
; <name>_<suffix>_table as the biased table base (%%table - 2*4).
;   %1     function name (without prefix)
;   %2     instruction-set suffix
;   %3...  local labels inside that function; each entry is
;          <mangled function symbol>.<label> - (%%table - 2*4)
; A consumer therefore loads the entry with a sign-extending 32-bit load and
; adds the biased base to obtain the label address. The bias of two entries
; means index 2 selects the first entry; the callers visible in this file
; index with tzcnt of a block dimension, and the first label is the size-4
; case (tzcnt(4) == 2).
; A label argument may carry a trailing constant (e.g. "s4-10*4"); it is
; pasted verbatim into the expression and so biases the stored offset.
74*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 3-*
75*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*4)
76*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1_%2)
77*c0909341SAndroid Build Coastguard Worker    %%table:
    ; One dword per label argument: %0 is the argument count, minus name and suffix.
78*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
79*c0909341SAndroid Build Coastguard Worker        dd %%base %+ .%3 - (%%table - 2*4)
    ; Shift the argument list so the next label becomes %3.
80*c0909341SAndroid Build Coastguard Worker        %rotate 1
81*c0909341SAndroid Build Coastguard Worker    %endrep
82*c0909341SAndroid Build Coastguard Worker%endmacro
83*c0909341SAndroid Build Coastguard Worker
; Sub-table aliases: these are views into the ipred_dc / ipred_cfl tables that
; start at entry 10, 15 and 8 respectively. The matching entries in the
; JMP_TABLE invocations below are written as "<label>-<N>*4" so the stored
; offset is pre-biased by the same amount; adding the sub-table base then
; yields the plain label address.
84*c0909341SAndroid Build Coastguard Worker%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
85*c0909341SAndroid Build Coastguard Worker%define ipred_dc_128_16bpc_ssse3_table   (ipred_dc_16bpc_ssse3_table + 15*4)
86*c0909341SAndroid Build Coastguard Worker%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
87*c0909341SAndroid Build Coastguard Worker
; ipred_dc table layout (entries):
;    0..4   .h4-.h64  load/sum of the left edge
;    5..9   .w4-.w64  load/sum of the top edge plus the final division
;   10..14  .s4-.s64  store loops (the "splat" sub-table)
;   15..19  .s4, .s8, .s16c, .s32c, .s64 (the "dc_128" sub-table; the "c"
;           entry points first copy m0 into the other store registers)
88*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_left_16bpc,    ssse3, h4, h8, h16, h32, h64
89*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_16bpc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
90*c0909341SAndroid Build Coastguard Worker                                         s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
91*c0909341SAndroid Build Coastguard Worker                                         s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
92*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_h_16bpc,          ssse3, w4, w8, w16, w32, w64
93*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z1_16bpc,         ssse3, w4, w8, w16, w32, w64
94*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z2_16bpc,         ssse3, w4, w8, w16, w32, w64
95*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z3_16bpc,         ssse3, h4, h8, h16, h32, h64
; ipred_cfl table layout: 0..3 .h4-.h32, 4..7 .w4-.w32, 8..11 .s4-.s32 (the
; "cfl_splat" sub-table).
96*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_16bpc,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
97*c0909341SAndroid Build Coastguard Worker                                         s4-8*4, s8-8*4, s16-8*4, s32-8*4
98*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_left_16bpc,   ssse3, h4, h8, h16, h32
99*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
100*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_pred_16bpc,         ssse3, w4, w8, w16, w32, w64
101*c0909341SAndroid Build Coastguard Worker
102*c0909341SAndroid Build Coastguard Workercextern smooth_weights_1d_16bpc
103*c0909341SAndroid Build Coastguard Workercextern smooth_weights_2d_16bpc
104*c0909341SAndroid Build Coastguard Workercextern dr_intra_derivative
105*c0909341SAndroid Build Coastguard Workercextern filter_intra_taps
106*c0909341SAndroid Build Coastguard Worker
107*c0909341SAndroid Build Coastguard WorkerSECTION .text
108*c0909341SAndroid Build Coastguard Worker
109*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
; ipred_dc_top_16bpc(dst, stride, tl, w, h)
;
; Fills the block with the rounded average of w 16-bit samples starting at
; tl + 2 bytes (presumably the row above the block, with tl pointing at the
; top-left sample -- TODO confirm against the C prototype).
; This function only does setup. It jumps into ipred_dc_left_16bpc's .h<w>
; summation code, which finishes with `jmp wq` into the dc_128 store loops of
; ipred_dc_16bpc.
; State handed over to that code:
;   m0 = first 8 words at tl+2      m3 = 0
;   m4 = (w + 1) >> 1 (rounding)    m5 = tzcnt(w) (shift count)
;   wq = address of the store loop  hd = h
110*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
111*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
112*c0909341SAndroid Build Coastguard Worker    movd                 m4, wm
    ; wd = tzcnt(w): jump-table index and, for a power-of-two w, log2(w).
113*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
    ; Step over one 16-bit sample.
114*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2
115*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
116*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
    ; pavgw against zero: m4 = (w + 1) >> 1, the rounding term for the average.
117*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m3
118*c0909341SAndroid Build Coastguard Worker    movd                 m5, wd
119*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
    ; r6 = address of ipred_dc_left's .h<w> label (reuses its summation code).
120*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+wq*4]
121*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
    ; Rebase r5 onto the dc_128 sub-table, then wq = address of the store loop
    ; for this width.
122*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
123*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
124*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
125*c0909341SAndroid Build Coastguard Worker    jmp                  r6
126*c0909341SAndroid Build Coastguard Worker
; ipred_dc_left_16bpc(dst, stride, tl, w, h)
;
; Fills the block with the rounded average of the h 16-bit samples located in
; the 2*h bytes immediately below tl (presumably the left edge -- TODO confirm
; against the C prototype).
; The .h64 .. .h4 labels form a fall-through chain: each larger size adds its
; extra loads into m0 and then continues into the smaller size's code. The
; chain is also entered from ipred_dc_top_16bpc with the same register state:
;   m0 = first 8 words of the edge   m3 = 0
;   m4 = (n + 1) >> 1                m5 = tzcnt(n)     (n = number of samples)
;   wq = address of the store loop   tlq = start of the edge samples
; NOTE(review): partial sums are accumulated in 16-bit lanes (up to 8 samples
; per lane for .h64); this assumes sample values are small enough not to
; overflow -- confirm against the supported bit depths.
127*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
128*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
129*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
130*c0909341SAndroid Build Coastguard Worker    movd                 m4, hm
    ; r6d = tzcnt(h): jump-table index and shift count.
131*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
    ; Two subtractions of h in total: tlq -= 2*h bytes, i.e. h 16-bit samples.
132*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
133*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
134*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
135*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
    ; pavgw against zero: m4 = (h + 1) >> 1, the rounding term.
136*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m3
137*c0909341SAndroid Build Coastguard Worker    movd                 m5, r6d
138*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
    ; r6 = address of the .h<h> label below.
139*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
140*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
    ; Rebase r5 onto the dc_128 sub-table; wq = address of the store loop for
    ; this width inside ipred_dc_16bpc.
141*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
142*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
143*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
144*c0909341SAndroid Build Coastguard Worker    jmp                  r6
    ; 64 samples: add bytes 64..127 of the edge into m0, then fall through.
145*c0909341SAndroid Build Coastguard Worker.h64:
146*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+112]
147*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 96]
148*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
149*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 80]
150*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
151*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 64]
152*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
153*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
    ; 32 samples: add bytes 32..63.
154*c0909341SAndroid Build Coastguard Worker.h32:
155*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 48]
156*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 32]
157*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
158*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
    ; 16 samples: add bytes 16..31.
159*c0909341SAndroid Build Coastguard Worker.h16:
160*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 16]
161*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
    ; 8 lanes -> 4: fold the high four words of m0 onto the low four.
162*c0909341SAndroid Build Coastguard Worker.h8:
163*c0909341SAndroid Build Coastguard Worker    movhlps              m1, m0
164*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
    ; Final reduction. Only the low four words of m0 are used from here on;
    ; they are widened to dwords, the rounding term is added, and the four
    ; lanes are summed horizontally into dword 0.
165*c0909341SAndroid Build Coastguard Worker.h4:
166*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
167*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
168*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
169*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
170*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m0, q1032
171*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
    ; Divide by the sample count (shift by tzcnt(n)).
172*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5
    ; stride3q shares a register with the table base r5, which is no longer
    ; needed at this point.
173*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
    ; Broadcast the result word to all eight lanes of m0.
174*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
175*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; Continue in the store loop selected during setup.
176*c0909341SAndroid Build Coastguard Worker    jmp                  wq
177*c0909341SAndroid Build Coastguard Worker
; ipred_dc_16bpc(dst, stride, tl, w, h)
;
; Fills the w x h block at dst with the rounded average of the h 16-bit
; samples below tl and the w 16-bit samples starting at tl + 2 bytes
; (presumably the left and top edges -- TODO confirm against the C prototype).
;
; Control flow: setup -> .h<h> (load/sum samples below tl into m0)
;                     -> .w<w> (add samples above, reduce, divide)
;                     -> .s<w> (store loop).
; The .s* labels are also jump-table targets for other entry points (the
; splat and dc_128 sub-tables), which arrive with the value to store already
; in m0 (and m1..m3 where the store loop uses them).
;
; Division: the sum is first shifted right by tzcnt(w+h). When w != h the
; remaining divisor is 3 or 5, applied as pmulhuw by 0xAAAB (~2/3 * 2^16) or
; 0x6667 (~2/5 * 2^16) followed by a shift right by 1.
;
; Registers after setup:
;   m3 = 0, m4 = (w+h) >> 1 (rounding term), m5 = tzcnt(w+h)
;   r6 = address of .h<h>, wq = address of .w<w>, stride3q = stride*3
; r2d/r3d (the tl and w argument registers) are reused as scratch for the
; multiplier once the edge loads and the `jmp wq` are done.
; NOTE(review): partial sums are accumulated in 16-bit lanes before widening;
; this assumes sample values are small enough not to overflow -- confirm
; against the supported bit depths.
; NOTE(review): the mova loads/stores require 16-byte-aligned addresses;
; alignment of tl and dst is assumed from the callers, not checked here.
178*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
179*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
180*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
181*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq+hq]
182*c0909341SAndroid Build Coastguard Worker    movd                 m4, r5d
183*c0909341SAndroid Build Coastguard Worker    tzcnt               r5d, r5d
184*c0909341SAndroid Build Coastguard Worker    movd                 m5, r5d
185*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_16bpc_ssse3_table
186*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
187*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
    ; +5*4 skips the five .h* entries to reach the .w* entries.
188*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4+5*4]
189*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
    ; m4 = (w+h) >> 1: rounding term added before the division.
190*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 1
191*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
192*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
    ; stride3q shares a register with r5; the table base is dead from here on.
193*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
194*c0909341SAndroid Build Coastguard Worker    jmp                  r6
    ; h == 4: the four words just below tl (movq zeroes the upper half of m0).
195*c0909341SAndroid Build Coastguard Worker.h4:
196*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq-8]
197*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 4: m0 may hold four or eight lanes of left-side sums depending on h,
    ; so its high half is widened separately from the low half (which also
    ; receives the four top samples).
198*c0909341SAndroid Build Coastguard Worker.w4:
199*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tlq+2]
200*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
201*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3
202*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3
203*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
    ; Add the rounding term and sum the four dword lanes into dword 0.
204*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
205*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
206*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
207*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
208*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
209*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
210*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
    ; 4x4: w+h == 8, a plain shift by 3 completes the division.
211*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 3
212*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
    ; 4x8 / 4x16: w+h is 12 or 20; shift by 2, then divide by 3 (h == 8) or
    ; 5 (h == 16) with the fixed-point multiplier.
213*c0909341SAndroid Build Coastguard Worker.w4_mul:
214*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0xAAAB
215*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x6667
216*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
217*c0909341SAndroid Build Coastguard Worker    cmove               r2d, r3d
218*c0909341SAndroid Build Coastguard Worker    psrld                m0, 2
219*c0909341SAndroid Build Coastguard Worker    movd                 m1, r2d
220*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
221*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
    ; Broadcast the result word to the low four words of m0.
222*c0909341SAndroid Build Coastguard Worker.w4_end:
223*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
    ; Store loop, width 4: four rows of 8 bytes per iteration.
224*c0909341SAndroid Build Coastguard Worker.s4:
225*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
226*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m0
227*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m0
228*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], m0
229*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
230*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
231*c0909341SAndroid Build Coastguard Worker    jg .s4
232*c0909341SAndroid Build Coastguard Worker    RET
    ; h == 8: the eight words just below tl.
233*c0909341SAndroid Build Coastguard Worker.h8:
234*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16]
235*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 8: add the eight top samples, widen to dwords, add rounding, reduce.
236*c0909341SAndroid Build Coastguard Worker.w8:
237*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+2]
238*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
239*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m3
240*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3
241*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
242*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
243*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
244*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
245*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
246*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
247*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5
    ; Square block: the shift alone is the exact division.
248*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
249*c0909341SAndroid Build Coastguard Worker    je .w8_end
    ; Otherwise divide by 3, or by 5 when h == 32 (w+h == 40).
250*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0xAAAB
251*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x6667
252*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
253*c0909341SAndroid Build Coastguard Worker    cmove               r2d, r3d
254*c0909341SAndroid Build Coastguard Worker    movd                 m1, r2d
255*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
256*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
    ; Broadcast the result word to all eight words of m0.
257*c0909341SAndroid Build Coastguard Worker.w8_end:
258*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
259*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; Store loop, width 8: four rows of 16 bytes per iteration.
260*c0909341SAndroid Build Coastguard Worker.s8:
261*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
262*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
263*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
264*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m0
265*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
266*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
267*c0909341SAndroid Build Coastguard Worker    jg .s8
268*c0909341SAndroid Build Coastguard Worker    RET
    ; h == 16: lane-wise sum of the 16 words below tl.
269*c0909341SAndroid Build Coastguard Worker.h16:
270*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]
271*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-16]
272*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 16: add the 16 top samples, then reduce as in .w8.
273*c0909341SAndroid Build Coastguard Worker.w16:
274*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 2]
275*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+18]
276*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
277*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
278*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
279*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
280*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
281*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
282*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
283*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
284*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
285*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
286*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5
287*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
288*c0909341SAndroid Build Coastguard Worker    je .w16_end
    ; h == 8 or 32 (w+h == 24 or 48): divide by 3; any other h (w+h == 20 or
    ; 80): divide by 5. The test leaves ZF set when h has neither bit.
289*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0xAAAB
290*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x6667
291*c0909341SAndroid Build Coastguard Worker    test                 hd, 8|32
292*c0909341SAndroid Build Coastguard Worker    cmovz               r2d, r3d
293*c0909341SAndroid Build Coastguard Worker    movd                 m1, r2d
294*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
295*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
296*c0909341SAndroid Build Coastguard Worker.w16_end:
297*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
298*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; .s16c: entry for a single constant value -- duplicate m0 into m1.
    ; .s16:  entry where m0/m1 already hold the two 16-byte halves of a row.
299*c0909341SAndroid Build Coastguard Worker.s16c:
300*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
    ; Store loop, width 16: four rows of 32 bytes per iteration.
301*c0909341SAndroid Build Coastguard Worker.s16:
302*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
303*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
304*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
305*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
306*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+16*0], m0
307*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+16*1], m1
308*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +16*0], m0
309*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +16*1], m1
310*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
311*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
312*c0909341SAndroid Build Coastguard Worker    jg .s16
313*c0909341SAndroid Build Coastguard Worker    RET
    ; h == 32: lane-wise sum of the 32 words below tl.
314*c0909341SAndroid Build Coastguard Worker.h32:
315*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-64]
316*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-48]
317*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-32]
318*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-16]
319*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 32: add the 32 top samples (two interleaved accumulators), reduce.
320*c0909341SAndroid Build Coastguard Worker.w32:
321*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 2]
322*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+18]
323*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
324*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+34]
325*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
326*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+50]
327*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
328*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
329*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m3
330*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3
331*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
332*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
333*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
334*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
335*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
336*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
337*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5
338*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
339*c0909341SAndroid Build Coastguard Worker    je .w32_end
    ; Divide by 3, or by 5 when h == 8 (w+h == 40).
340*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0xAAAB
341*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x6667
342*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
343*c0909341SAndroid Build Coastguard Worker    cmove               r2d, r3d
344*c0909341SAndroid Build Coastguard Worker    movd                 m1, r2d
345*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
346*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
347*c0909341SAndroid Build Coastguard Worker.w32_end:
348*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
349*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; .s32c: entry for a single constant value -- duplicate m0 into m1..m3.
    ; .s32:  entry where m0..m3 already hold the four 16-byte parts of a row.
350*c0909341SAndroid Build Coastguard Worker.s32c:
351*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
352*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
353*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
    ; Store loop, width 32: two rows of 64 bytes per iteration.
354*c0909341SAndroid Build Coastguard Worker.s32:
355*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
356*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m1
357*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*2], m2
358*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*3], m3
359*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m0
360*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
361*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m2
362*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m3
363*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
364*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
365*c0909341SAndroid Build Coastguard Worker    jg .s32
366*c0909341SAndroid Build Coastguard Worker    RET
    ; h == 64: lane-wise sum of the 64 words below tl (two accumulators).
367*c0909341SAndroid Build Coastguard Worker.h64:
368*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-128]
369*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-112]
370*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq- 96]
371*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq- 80]
372*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq- 64]
373*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq- 48]
374*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq- 32]
375*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tlq- 16]
376*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
377*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; w == 64: add the 64 top samples (two interleaved accumulators), reduce.
378*c0909341SAndroid Build Coastguard Worker.w64:
379*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+  2]
380*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 18]
381*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
382*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 34]
383*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
384*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 50]
385*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
386*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 66]
387*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
388*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 82]
389*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
390*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 98]
391*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
392*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+114]
393*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
394*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
395*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m3
396*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3
397*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
398*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
399*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
400*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
401*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
402*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
403*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5
404*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
405*c0909341SAndroid Build Coastguard Worker    je .w64_end
    ; Divide by 3, or by 5 when h == 16 (w+h == 80).
406*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0xAAAB
407*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x6667
408*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
409*c0909341SAndroid Build Coastguard Worker    cmove               r2d, r3d
410*c0909341SAndroid Build Coastguard Worker    movd                 m1, r2d
411*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
412*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
413*c0909341SAndroid Build Coastguard Worker.w64_end:
414*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
415*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; Store loop, width 64: one row of 128 bytes per iteration, all from m0.
416*c0909341SAndroid Build Coastguard Worker.s64:
417*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
418*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
419*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
420*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
421*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
422*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m0
423*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
424*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m0
425*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
426*c0909341SAndroid Build Coastguard Worker    dec                  hd
427*c0909341SAndroid Build Coastguard Worker    jg .s64
428*c0909341SAndroid Build Coastguard Worker    RET
429*c0909341SAndroid Build Coastguard Worker
; ipred_dc_128_16bpc(dst, stride, tl, w, h, ...)
; Entry stub only: it loads a fill constant into m0, precomputes stride*3 and
; then jumps through ipred_dc_128_16bpc_ssse3_table, indexed by the number of
; trailing zero bits of w. The code that actually stores rows lives at the
; jump targets, which are not part of this block.
; NOTE(review): r8m is presumably bitdepth_max, so (r8m >> 11) would be 0 for
; 10-bit and 1 for 12-bit content and m0 would become the mid-range pixel
; value -- confirm against the C prototype and the pw_512 constant table.
430*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
431*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r8m
432*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_128_16bpc_ssse3_table
433*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; table index = trailing zero count of w
434*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11 ; r6 selects which 8-byte constant after pw_512 is used
435*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
436*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; 32-bit table entry, relative to the table base
437*c0909341SAndroid Build Coastguard Worker    movddup              m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] ; 8 bytes duplicated into both halves of m0
438*c0909341SAndroid Build Coastguard Worker    add                  wq, r5 ; turn the relative entry into an absolute address
439*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
440*c0909341SAndroid Build Coastguard Worker    jmp                  wq
441*c0909341SAndroid Build Coastguard Worker
; ipred_v_16bpc(dst, stride, tl, w, h)
; Copies the row that starts at tl+2 into every one of the h output rows.
; The first 64 bytes of that row (32 pixels at 2 bytes each) are always loaded
; into m0-m3 up front. For w == 64 the remaining 64 bytes are loaded into
; m4-m7 and the rows are stored by the local loop; for every other width the
; function jumps through ipred_dc_splat_16bpc_ssse3_table (targets are outside
; this block) with m0-m3, hd and stride3q already prepared.
; NOTE(review): tl+2 is presumably the first pixel of the top edge, with tl
; itself being the top-left corner -- confirm against the caller.
442*c0909341SAndroid Build Coastguard Workercglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
443*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_splat_16bpc_ssse3_table
444*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
445*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+  2]
446*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 18]
447*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 34]
448*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+ 50]
449*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
450*c0909341SAndroid Build Coastguard Worker    je .w64
451*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd ; table index = trailing zero count of w
452*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; 32-bit table entry, relative to the table base
453*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
454*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
455*c0909341SAndroid Build Coastguard Worker    jmp                  wq
456*c0909341SAndroid Build Coastguard Worker.w64:
; NOTE(review): WIN64_SPILL_XMM is defined outside this chunk; presumably it
; preserves the callee-saved xmm6/xmm7 on Win64 before m6/m7 are used -- confirm.
457*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM 8
458*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+ 66]
459*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+ 82]
460*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+ 98]
461*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq+114]
; One 128-byte row per iteration, h iterations in total.
462*c0909341SAndroid Build Coastguard Worker.w64_loop:
463*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
464*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
465*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m2
466*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m3
467*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m4
468*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m5
469*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m6
470*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m7
471*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
472*c0909341SAndroid Build Coastguard Worker    dec                  hd
473*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
474*c0909341SAndroid Build Coastguard Worker    RET
475*c0909341SAndroid Build Coastguard Worker
; ipred_h_16bpc(dst, stride, tl, w, h)
; Fills each output row with a single 16-bit pixel taken from the memory just
; below tl: row 0 uses the word at tl-2, row 1 the word at tl-4, and so on
; (tlq is decremented as rows are produced). Dispatch is through
; ipred_h_16bpc_ssse3_table, indexed by the trailing zero count of w, to one
; of the .w4/.w8/.w16/.w32/.w64 loops below.
; m2/m3 are preloaded with the pw_256 / pb_2_3 constants and serve as pshufb
; controls in .w16/.w32/.w64; the .w4 and .w8 paths overwrite them and use
; pshuflw/pshufd instead.
; NOTE(review): pw_256 and pb_2_3 are defined outside this chunk; presumably
; they broadcast word 0 and word 1 of the source respectively -- confirm.
476*c0909341SAndroid Build Coastguard Workercglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
477*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_h_16bpc_ssse3_table
478*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
479*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_h_16bpc_ssse3_table
480*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
481*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; 32-bit table entry, relative to the table base
482*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_256]
483*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pb_2_3]
484*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
485*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
486*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; 4 pixels wide: 4 rows (8 bytes each) per iteration.
487*c0909341SAndroid Build Coastguard Worker.w4:
488*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8 ; step back over the 4 edge pixels used this iteration
489*c0909341SAndroid Build Coastguard Worker    movq                 m3, [tlq]
490*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m3, q3333 ; highest-address pixel -> first row
491*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m3, q2222
492*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, m3, q1111
493*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q0000 ; lowest-address pixel -> fourth row
494*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
495*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m1
496*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m2
497*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], m3
498*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
499*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
500*c0909341SAndroid Build Coastguard Worker    jg .w4
501*c0909341SAndroid Build Coastguard Worker    RET
; 8 pixels wide: 4 rows (16 bytes each) per iteration.
502*c0909341SAndroid Build Coastguard Worker.w8:
503*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
504*c0909341SAndroid Build Coastguard Worker    movq                 m3, [tlq]
505*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3 ; duplicate each pixel so every dword holds one pixel twice
506*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q3333
507*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q2222
508*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1111
509*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q0000
510*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
511*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
512*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
513*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m3
514*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
515*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
516*c0909341SAndroid Build Coastguard Worker    jg .w8
517*c0909341SAndroid Build Coastguard Worker    RET
; 16 pixels wide: 2 rows (32 bytes each) per iteration.
518*c0909341SAndroid Build Coastguard Worker.w16:
519*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4 ; two edge pixels per iteration
520*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq]
521*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m3 ; first row, shuffled with the pb_2_3 control
522*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2 ; second row, shuffled with the pw_256 control
523*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
524*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m0
525*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m1
526*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
527*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
528*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
529*c0909341SAndroid Build Coastguard Worker    jg .w16
530*c0909341SAndroid Build Coastguard Worker    RET
; 32 pixels wide: 2 rows (64 bytes each) per iteration.
531*c0909341SAndroid Build Coastguard Worker.w32:
532*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
533*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq]
534*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m3
535*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2
536*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
537*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m0
538*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*2], m0
539*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*3], m0
540*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*0], m1
541*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m1
542*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*2], m1
543*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m1
544*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
545*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
546*c0909341SAndroid Build Coastguard Worker    jg .w32
547*c0909341SAndroid Build Coastguard Worker    RET
; 64 pixels wide: 1 row (128 bytes) per iteration.
548*c0909341SAndroid Build Coastguard Worker.w64:
549*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2 ; one edge pixel per iteration
550*c0909341SAndroid Build Coastguard Worker    movd                 m0, [tlq]
551*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
552*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
553*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
554*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
555*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
556*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
557*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m0
558*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
559*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m0
560*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
561*c0909341SAndroid Build Coastguard Worker    dec                  hd
562*c0909341SAndroid Build Coastguard Worker    jg .w64
563*c0909341SAndroid Build Coastguard Worker    RET
564*c0909341SAndroid Build Coastguard Worker
; ipred_paeth_16bpc(dst, stride, tl, w, h)
; Per output pixel, picks one of left / top / topleft: with
; base = left + top - topleft, the candidate whose absolute distance to base
; is smallest wins (ties prefer left, then top).
; Register roles throughout:
;   m4 = topleft broadcast, m5 = top pixels of the current column group,
;   m6 = top - topleft, m7 = |top - topleft| (equals |base - left|),
;   m1 = left pixel of the current row(s), m0 = result.
; hd is doubled on entry so that hq is a byte offset into the left column, and
; leftq = tl - 2*h is the lowest address of that column.
; Note: `base` is defined below but not referenced anywhere in this function.
565*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
566*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_paeth_16bpc_ssse3_table
567*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
568*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, [tlq], q0000
569*c0909341SAndroid Build Coastguard Worker    mov               leftq, tlq
570*c0909341SAndroid Build Coastguard Worker    add                  hd, hd
571*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m4      ; topleft
572*c0909341SAndroid Build Coastguard Worker    sub               leftq, hq
573*c0909341SAndroid Build Coastguard Worker    and                  wd, ~7 ; becomes zero only when w < 8
574*c0909341SAndroid Build Coastguard Worker    jnz .w8
575*c0909341SAndroid Build Coastguard Worker    movddup              m5, [tlq+2] ; top
576*c0909341SAndroid Build Coastguard Worker    psubw                m6, m5, m4
577*c0909341SAndroid Build Coastguard Worker    pabsw                m7, m6
; Narrow path: two 4-pixel rows per iteration. The pixel at the higher address
; ends up in the high half of m1 and belongs to the first of the two rows.
578*c0909341SAndroid Build Coastguard Worker.w4_loop:
579*c0909341SAndroid Build Coastguard Worker    movd                 m1, [leftq+hq-4]
580*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m1
581*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m1      ; left
; PAETH: expects m1 = left, m4 = topleft, m5 = top, m6 = top - topleft,
; m7 = |top - topleft|. Produces the selected pixels in m0 and clobbers m2, m3.
582*c0909341SAndroid Build Coastguard Worker%macro PAETH 0
583*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6, m1
584*c0909341SAndroid Build Coastguard Worker    psubw                m2, m4, m0  ; tldiff
585*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5      ; tdiff
586*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m2
587*c0909341SAndroid Build Coastguard Worker    pabsw                m0, m0
588*c0909341SAndroid Build Coastguard Worker    pminsw               m2, m0
589*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m0, m2
590*c0909341SAndroid Build Coastguard Worker    pand                 m3, m5, m0
591*c0909341SAndroid Build Coastguard Worker    pandn                m0, m4
592*c0909341SAndroid Build Coastguard Worker    por                  m0, m3
593*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m7, m2
594*c0909341SAndroid Build Coastguard Worker    pand                 m0, m3
595*c0909341SAndroid Build Coastguard Worker    pandn                m3, m1
596*c0909341SAndroid Build Coastguard Worker    por                  m0, m3
597*c0909341SAndroid Build Coastguard Worker%endmacro
598*c0909341SAndroid Build Coastguard Worker    PAETH
599*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], m0
600*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m0
601*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
602*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2*2 ; two rows consumed, hd counts bytes
603*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
604*c0909341SAndroid Build Coastguard Worker    RET
; Wide path (w >= 8): outer loop over 8-pixel column groups, inner loop over
; rows. r6 is the row pointer inside the current column group; r7d keeps the
; doubled height so hd can be reloaded for each column group (on x86-32 r7d is
; aliased to the hm stack slot instead of a register).
605*c0909341SAndroid Build Coastguard Worker.w8:
606*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
607*c0909341SAndroid Build Coastguard Worker    PUSH                 r6
608*c0909341SAndroid Build Coastguard Worker    %define             r7d  hm
609*c0909341SAndroid Build Coastguard Worker    %assign regs_used     7
610*c0909341SAndroid Build Coastguard Worker%elif WIN64
611*c0909341SAndroid Build Coastguard Worker    movaps              r4m, m8
612*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
613*c0909341SAndroid Build Coastguard Worker    %assign regs_used     8
614*c0909341SAndroid Build Coastguard Worker%endif
615*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
616*c0909341SAndroid Build Coastguard Worker    movddup              m8, [pw_256]
617*c0909341SAndroid Build Coastguard Worker%endif
618*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+wq*2+2] ; tlq -> one past the last top pixel
619*c0909341SAndroid Build Coastguard Worker    neg                  wq ; negative column index, counts up towards zero
620*c0909341SAndroid Build Coastguard Worker    mov                 r7d, hd
621*c0909341SAndroid Build Coastguard Worker.w8_loop0:
622*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+wq*2]
623*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
624*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
625*c0909341SAndroid Build Coastguard Worker    psubw                m6, m5, m4
626*c0909341SAndroid Build Coastguard Worker    pabsw                m7, m6
627*c0909341SAndroid Build Coastguard Worker.w8_loop:
628*c0909341SAndroid Build Coastguard Worker    movd                 m1, [leftq+hq-2]
; Broadcast the low word of m1 (the left pixel of this row) to all 8 lanes.
629*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
630*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
631*c0909341SAndroid Build Coastguard Worker%else
632*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
633*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
634*c0909341SAndroid Build Coastguard Worker%endif
635*c0909341SAndroid Build Coastguard Worker    PAETH
636*c0909341SAndroid Build Coastguard Worker    mova               [r6], m0
637*c0909341SAndroid Build Coastguard Worker    add                  r6, strideq
638*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1*2
639*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
640*c0909341SAndroid Build Coastguard Worker    mov                  hd, r7d ; restore the row counter for the next column group
641*c0909341SAndroid Build Coastguard Worker    add                  wq, 8
642*c0909341SAndroid Build Coastguard Worker    jl .w8_loop0
643*c0909341SAndroid Build Coastguard Worker%if WIN64
644*c0909341SAndroid Build Coastguard Worker    movaps               m8, r4m ; restore xmm8 saved on entry to .w8
645*c0909341SAndroid Build Coastguard Worker%endif
646*c0909341SAndroid Build Coastguard Worker    RET
647*c0909341SAndroid Build Coastguard Worker
; Choose the general-purpose register behind the t0 alias used by the
; functions that follow.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; maps t0 to r7 on x86-64 and to r4 on x86-32 -- confirm.
648*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
649*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
650*c0909341SAndroid Build Coastguard Worker%else
651*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4
652*c0909341SAndroid Build Coastguard Worker%endif
653*c0909341SAndroid Build Coastguard Worker
; ipred_smooth_v_16bpc(dst, stride, tl, w, h)
; Vertical blend between the top row and the "bottom" pixel (the word at
; tl - 2*h): each output is pmulhrsw(weight[y], top[x] - bottom) + bottom,
; where pmulhrsw is a rounded (a*b) >> 15 on signed 16-bit lanes.
; hq is negated and used as an index that counts up to zero; weightsq is
; pre-offset by h*4 bytes so that [weightsq + hq*2] starts h*2 bytes into
; smooth_weights_1d_16bpc and advances one 16-bit weight per row.
; m5 = bottom broadcast, m4 = top - bottom for the current columns.
654*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
655*c0909341SAndroid Build Coastguard Worker    LEA            weightsq, smooth_weights_1d_16bpc
656*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
657*c0909341SAndroid Build Coastguard Worker    lea            weightsq, [weightsq+hq*4]
658*c0909341SAndroid Build Coastguard Worker    neg                  hq
659*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+hq*2] ; bottom
660*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q0000
661*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m5
662*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
663*c0909341SAndroid Build Coastguard Worker    jne .w8
664*c0909341SAndroid Build Coastguard Worker    movddup              m4, [tlq+2]    ; top
665*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3] ; w is no longer needed on this path, r3 is reused as stride*3
666*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5         ; top - bottom
; w == 4: four rows per iteration, two rows packed into each xmm register.
667*c0909341SAndroid Build Coastguard Worker.w4_loop:
668*c0909341SAndroid Build Coastguard Worker    movq                 m1, [weightsq+hq*2] ; weights for 4 consecutive rows
669*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m1
670*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m1, q1100 ; rows 0 and 1: each weight replicated over 4 lanes
671*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m1 ; rows 2 and 3
672*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
673*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
674*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
675*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
676*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
677*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
678*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m1
679*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], m1
680*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
681*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
682*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
683*c0909341SAndroid Build Coastguard Worker    RET
; w != 4: outer loop over 8-pixel column groups (wd counts down by 8), inner
; loop over rows, 4 rows per iteration. t0 is the per-column copy of the
; negative row index and r6 the row pointer. On x86-32 the negated h is stored
; to the hm stack slot and hq is redefined to read it from there.
684*c0909341SAndroid Build Coastguard Worker.w8:
685*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
686*c0909341SAndroid Build Coastguard Worker    PUSH                 r6
687*c0909341SAndroid Build Coastguard Worker    %assign regs_used     7
688*c0909341SAndroid Build Coastguard Worker    mov                  hm, hq
689*c0909341SAndroid Build Coastguard Worker    %define              hq  hm
690*c0909341SAndroid Build Coastguard Worker%elif WIN64
691*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
692*c0909341SAndroid Build Coastguard Worker    %assign regs_used     8
693*c0909341SAndroid Build Coastguard Worker%endif
694*c0909341SAndroid Build Coastguard Worker.w8_loop0:
695*c0909341SAndroid Build Coastguard Worker    mov                  t0, hq
696*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+2] ; next 8 top pixels
697*c0909341SAndroid Build Coastguard Worker    add                 tlq, 16
698*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
699*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
700*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5 ; top - bottom
701*c0909341SAndroid Build Coastguard Worker.w8_loop:
702*c0909341SAndroid Build Coastguard Worker    movq                 m3, [weightsq+t0*2] ; weights for 4 consecutive rows
703*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3
704*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000 ; one weight broadcast per register
705*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
706*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
707*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
708*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
709*c0909341SAndroid Build Coastguard Worker    REPX   {paddw    x, m5}, m0, m1, m2, m3
710*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*0], m0
711*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*1], m1
712*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
713*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*0], m2
714*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*1], m3
715*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
716*c0909341SAndroid Build Coastguard Worker    add                  t0, 4
717*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
718*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
719*c0909341SAndroid Build Coastguard Worker    jg .w8_loop0
720*c0909341SAndroid Build Coastguard Worker    RET
721*c0909341SAndroid Build Coastguard Worker
; ipred_smooth_h_16bpc(dst, stride, tl, w, h)
; Horizontal blend between the left column and the "right" pixel (the word at
; tl + 2*w): each output is pmulhrsw(weight[x], left[y] - right) + right,
; where pmulhrsw is a rounded (a*b) >> 15 on signed 16-bit lanes.
; hd is doubled so that hq is a byte offset; tlq is rebased to tl - 8 - 2*h so
; that [tlq + hq] addresses the 4 left pixels closest to tl, and decreasing
; the offset walks further down the left column.
; m5 = right broadcast, m4 = weights for the current columns.
722*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
723*c0909341SAndroid Build Coastguard Worker    LEA            weightsq, smooth_weights_1d_16bpc
724*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
725*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
726*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+wq*2] ; right
727*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
728*c0909341SAndroid Build Coastguard Worker    add                  hd, hd
729*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q0000
730*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
731*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m5
732*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
733*c0909341SAndroid Build Coastguard Worker    jne .w8
734*c0909341SAndroid Build Coastguard Worker    movddup              m4, [weightsq+4*2] ; the 4 weights at byte offset w*2 (w == 4), duplicated
735*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3] ; w is no longer needed on this path, r3 is reused as stride*3
; w == 4: four rows per iteration, two rows packed into each xmm register.
; The left pixel at the highest address belongs to the first row.
736*c0909341SAndroid Build Coastguard Worker.w4_loop:
737*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tlq+hq]   ; left
738*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m1
739*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5         ; left - right
740*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m1, q3322 ; rows 1 (low half) and 0 (high half)
741*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m1 ; rows 3 (low half) and 2 (high half)
742*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
743*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
744*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
745*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
746*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], m0
747*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m0
748*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], m1
749*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r3       ], m1
750*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
751*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4*2 ; four rows consumed, hd counts bytes
752*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
753*c0909341SAndroid Build Coastguard Worker    RET
; w != 4: outer loop over 8-pixel column groups (wq is negated and counts up
; by 8 to zero), inner loop over rows, 4 rows per iteration. weightsq is
; pre-offset by w*4 bytes so that [weightsq + wq*2] starts w*2 bytes into
; smooth_weights_1d_16bpc.
; On x86-32 hd is redefined to the hm stack slot, which still holds the
; undoubled h; the row index t0 is therefore scaled by 2 in the address and
; decremented by 4, versus scale 1 and a decrement of 8 on x86-64.
754*c0909341SAndroid Build Coastguard Worker.w8:
755*c0909341SAndroid Build Coastguard Worker    lea            weightsq, [weightsq+wq*4]
756*c0909341SAndroid Build Coastguard Worker    neg                  wq
757*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
758*c0909341SAndroid Build Coastguard Worker    PUSH                 r6
759*c0909341SAndroid Build Coastguard Worker    %assign regs_used     7
760*c0909341SAndroid Build Coastguard Worker    %define              hd  hm
761*c0909341SAndroid Build Coastguard Worker%elif WIN64
762*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
763*c0909341SAndroid Build Coastguard Worker    %assign regs_used     8
764*c0909341SAndroid Build Coastguard Worker%endif
765*c0909341SAndroid Build Coastguard Worker.w8_loop0:
766*c0909341SAndroid Build Coastguard Worker    mov                 t0d, hd
767*c0909341SAndroid Build Coastguard Worker    mova                 m4, [weightsq+wq*2] ; 8 weights for this column group
768*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
769*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
770*c0909341SAndroid Build Coastguard Worker.w8_loop:
771*c0909341SAndroid Build Coastguard Worker    movq                 m3, [tlq+t0*(1+ARCH_X86_32)] ; 4 left pixels
772*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3
773*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5 ; left - right
774*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q3333 ; highest-address left pixel -> first row
775*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q2222
776*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1111
777*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q0000
778*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
779*c0909341SAndroid Build Coastguard Worker    REPX   {paddw    x, m5}, m0, m1, m2, m3
780*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*0], m0
781*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*1], m1
782*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
783*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*0], m2
784*c0909341SAndroid Build Coastguard Worker    mova     [r6+strideq*1], m3
785*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
786*c0909341SAndroid Build Coastguard Worker    sub                 t0d, 4*(1+ARCH_X86_64)
787*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
788*c0909341SAndroid Build Coastguard Worker    add                  wq, 8
789*c0909341SAndroid Build Coastguard Worker    jl .w8_loop0
790*c0909341SAndroid Build Coastguard Worker    RET
791*c0909341SAndroid Build Coastguard Worker
; Rebind the t0 alias for ipred_smooth_16bpc, which follows.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; maps t0 to r10 on x86-64 and to r3 on x86-32 -- confirm.
792*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
793*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 10
794*c0909341SAndroid Build Coastguard Worker%else
795*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 3
796*c0909341SAndroid Build Coastguard Worker%endif
797*c0909341SAndroid Build Coastguard Worker
798*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
799*c0909341SAndroid Build Coastguard Worker                                     h_weights, v_weights, top
800*c0909341SAndroid Build Coastguard Worker    LEA          h_weightsq, smooth_weights_2d_16bpc
801*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
802*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
803*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq+wq*2] ; right
804*c0909341SAndroid Build Coastguard Worker    lea          v_weightsq, [h_weightsq+hq*8]
805*c0909341SAndroid Build Coastguard Worker    neg                  hq
806*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq+hq*2] ; bottom
807*c0909341SAndroid Build Coastguard Worker    pshuflw              m7, m7, q0000
808*c0909341SAndroid Build Coastguard Worker    pshuflw              m6, m6, q0000
; 4-pixel-wide path; any other width branches to .w8. This is the tail of a
; routine whose entry lies outside this excerpt. Going by the existing
; comments, m6 holds the "bottom" pixel and m7 the "right" pixel on arrival.
809*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
810*c0909341SAndroid Build Coastguard Worker    jne .w8
811*c0909341SAndroid Build Coastguard Worker    movq                 m4, [tlq+2]    ; top
812*c0909341SAndroid Build Coastguard Worker    mova                 m5, [h_weightsq+4*4] ; weights multiplied with the (left, right) pairs below
813*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m6         ; top, bottom
814*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6         ; m6 = 0 from here on (pavgw operand)
; Two rows per iteration. hq is a negative counter stepping up by 2 until it
; reaches zero (add hq, 2 / jl); tlq steps backwards two pixels at a time.
815*c0909341SAndroid Build Coastguard Worker.w4_loop:
816*c0909341SAndroid Build Coastguard Worker    movq                 m1, [v_weightsq+hq*4] ; two dwords of weights, one per row
817*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
818*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]      ; left
819*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m1, q0000  ; first row's weight dword in every lane
820*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1111  ; second row's weight dword in every lane
821*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4         ; weighted (top, bottom) sum, first row
822*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7         ; left, right
823*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4         ; weighted (top, bottom) sum, second row
824*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1111  ; pair built from the pixel at [tlq+2] -> first row
825*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q0000  ; pair built from the pixel at [tlq+0] -> second row
826*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5         ; weighted (left, right) sum, first row
827*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5         ; weighted (left, right) sum, second row
828*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
829*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
830*c0909341SAndroid Build Coastguard Worker    psrld                m0, 8
831*c0909341SAndroid Build Coastguard Worker    psrld                m1, 8
832*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1         ; low qword = first row, high qword = second row
833*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m6         ; m6 is zero: (x + 1) >> 1, i.e. the final rounding step
834*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
835*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
836*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
837*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
838*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
839*c0909341SAndroid Build Coastguard Worker    RET
; Path for every width other than 4. The block is produced in 8-pixel-wide
; column strips (.w8_loop0), one row per inner iteration (.w8_loop).
; Per the existing comments, m7 holds the "right" pixel; m6 is set up before
; this excerpt (the 4-wide path labels it "bottom").
840*c0909341SAndroid Build Coastguard Worker.w8:
841*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
842*c0909341SAndroid Build Coastguard Worker    lea          h_weightsq, [h_weightsq+wq*4] ; weights for this width start at byte offset w*4
843*c0909341SAndroid Build Coastguard Worker    mov                  t0, tlq        ; t0 = running pointer into the top row
844*c0909341SAndroid Build Coastguard Worker    mov                 r1m, tlq        ; spill tlq; reloaded after each strip
845*c0909341SAndroid Build Coastguard Worker    mov                 r2m, hq         ; spill the row counter; reloaded after each strip
; x86-32 has no xmm8/xmm9, so m8/m9 are aliased to the weights in memory.
846*c0909341SAndroid Build Coastguard Worker    %define              m8  [h_weightsq+16*0]
847*c0909341SAndroid Build Coastguard Worker    %define              m9  [h_weightsq+16*1]
848*c0909341SAndroid Build Coastguard Worker%else
849*c0909341SAndroid Build Coastguard Worker%if WIN64
; Win64: stash xmm8/xmm9 in the r4m/r6m slots; they are reloaded before RET.
850*c0909341SAndroid Build Coastguard Worker    movaps              r4m, m8
851*c0909341SAndroid Build Coastguard Worker    movaps              r6m, m9
852*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
853*c0909341SAndroid Build Coastguard Worker    PUSH                 r8
854*c0909341SAndroid Build Coastguard Worker%endif
855*c0909341SAndroid Build Coastguard Worker    PUSH                 r9
856*c0909341SAndroid Build Coastguard Worker    PUSH                r10
; NOTE(review): regs_used is raised to cover the registers pushed above,
; presumably so the RET macro restores them - confirm against x86inc.asm.
857*c0909341SAndroid Build Coastguard Worker    %assign       regs_used  11
858*c0909341SAndroid Build Coastguard Worker    lea          h_weightsq, [h_weightsq+wq*8] ; biased so [h_weightsq+wq*4] with wq = -w hits byte offset w*4
859*c0909341SAndroid Build Coastguard Worker    lea                topq, [tlq+wq*2] ; biased the same way for the top row
860*c0909341SAndroid Build Coastguard Worker    neg                  wq             ; wq now counts up from -w towards 0
861*c0909341SAndroid Build Coastguard Worker    mov                  r8, tlq        ; saved copy, restored after each strip
862*c0909341SAndroid Build Coastguard Worker    mov                  r9, hq         ; saved copy, restored after each strip
863*c0909341SAndroid Build Coastguard Worker%endif
864*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6         ; replicate the low qword of m6 into the high qword
; Outer loop: one 8-pixel-wide column strip per iteration.
865*c0909341SAndroid Build Coastguard Worker.w8_loop0:
866*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
867*c0909341SAndroid Build Coastguard Worker    movu                 m5, [t0+2]     ; 8 top pixels of this strip
868*c0909341SAndroid Build Coastguard Worker    add                  t0, 16
869*c0909341SAndroid Build Coastguard Worker    mov                 r0m, t0         ; spill the top pointer: t0 is reused as the row pointer below
870*c0909341SAndroid Build Coastguard Worker%else
871*c0909341SAndroid Build Coastguard Worker    movu                 m5, [topq+wq*2+2] ; 8 top pixels of this strip
872*c0909341SAndroid Build Coastguard Worker    mova                 m8, [h_weightsq+wq*4+16*0] ; weights for pixels 0-3 of the strip
873*c0909341SAndroid Build Coastguard Worker    mova                 m9, [h_weightsq+wq*4+16*1] ; weights for pixels 4-7 of the strip
874*c0909341SAndroid Build Coastguard Worker%endif
875*c0909341SAndroid Build Coastguard Worker    mov                  t0, dstq       ; t0 = destination row pointer within this strip
876*c0909341SAndroid Build Coastguard Worker    add                dstq, 16         ; dstq moves on to the next strip (8 pixels * 2 bytes)
877*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6     ; top pixels 0-3 interleaved with m6
878*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6         ; top pixels 4-7 interleaved with m6
; Inner loop: one row of 8 pixels per iteration; hq counts up to zero and
; tlq steps backwards one left pixel per row.
879*c0909341SAndroid Build Coastguard Worker.w8_loop:
880*c0909341SAndroid Build Coastguard Worker    movd                 m1, [v_weightsq+hq*4] ; this row's weight dword
881*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
882*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]      ; left
883*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q0000  ; weight dword in every lane
884*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4, m1     ; vertical term, pixels 0-3
885*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q0000  ; left pixel in the four low words
886*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5         ; vertical term, pixels 4-7
887*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7         ; left, right
888*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8, m3     ; horizontal term, pixels 0-3
889*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9         ; horizontal term, pixels 4-7
890*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
891*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
892*c0909341SAndroid Build Coastguard Worker    psrld                m0, 8
893*c0909341SAndroid Build Coastguard Worker    psrld                m1, 8
894*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
895*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1         ; zero operand for pavgw (m6 is still in use here)
896*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m1         ; (x + 1) >> 1, the final rounding step
897*c0909341SAndroid Build Coastguard Worker    mova               [t0], m0         ; store 8 pixels (mova is the aligned move)
898*c0909341SAndroid Build Coastguard Worker    add                  t0, strideq
899*c0909341SAndroid Build Coastguard Worker    inc                  hq
900*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
; Strip finished: restore the per-strip state and advance to the next strip.
901*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
902*c0909341SAndroid Build Coastguard Worker    mov                  t0, r0m        ; back to the top-row pointer
903*c0909341SAndroid Build Coastguard Worker    mov                 tlq, r1m
904*c0909341SAndroid Build Coastguard Worker    add          h_weightsq, 16*2       ; next 8 pixels' weights (what m8/m9 alias)
905*c0909341SAndroid Build Coastguard Worker    mov                  hq, r2m
906*c0909341SAndroid Build Coastguard Worker    sub            dword wm, 8          ; remaining width is tracked in the w argument slot
907*c0909341SAndroid Build Coastguard Worker    jg .w8_loop0
908*c0909341SAndroid Build Coastguard Worker%else
909*c0909341SAndroid Build Coastguard Worker    mov                 tlq, r8
910*c0909341SAndroid Build Coastguard Worker    mov                  hq, r9
911*c0909341SAndroid Build Coastguard Worker    add                  wq, 8          ; wq is negative until every strip is done
912*c0909341SAndroid Build Coastguard Worker    jl .w8_loop0
913*c0909341SAndroid Build Coastguard Worker%endif
914*c0909341SAndroid Build Coastguard Worker%if WIN64
915*c0909341SAndroid Build Coastguard Worker    movaps               m8, r4m        ; restore the xmm8/xmm9 values stashed at .w8
916*c0909341SAndroid Build Coastguard Worker    movaps               m9, r6m
917*c0909341SAndroid Build Coastguard Worker%endif
918*c0909341SAndroid Build Coastguard Worker    RET
919*c0909341SAndroid Build Coastguard Worker
; ipred_z1_16bpc entry: common setup, then an indirect jump to the
; width-specific code (.w4, .w8, .w16, ...) through
; ipred_z1_16bpc_ssse3_table.
; Named registers: dst, stride, tl, w, h, angle, dx. w, h and angle are
; fetched from their argument slots below (wm, hm, anglem); dx is computed
; here from the angle rather than read as an argument.
; State handed to the width-specific code:
;   tlq    = incoming tl pointer + 2
;   dxd    = 16-bit entry of dr_intra_derivative selected by (angle & 0x7e)
;   angled = (angle + 165) ^ 0x4ff; per the comments in the code below, the
;            low byte is d = 90 - angle, bit 9 is is_sm and bit 10 is set
;            when the intra edge filter is disabled
;   m0     = pw_256, used later as a pshufb control
;   m7     = pw_62, used later to mask the fractional position
; NOTE(review): the meaning of the stack-size argument (16*18, negative on
; x86-32) is defined by x86inc.asm - confirm there.
920*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
921*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx
; Constants are addressed relative to $$ through r7.
922*c0909341SAndroid Build Coastguard Worker    %define            base  r7-$$
923*c0909341SAndroid Build Coastguard Worker    %define          bdmaxm  r8m
924*c0909341SAndroid Build Coastguard Worker    lea                  r7, [$$]
925*c0909341SAndroid Build Coastguard Worker%else
926*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx
; x86-32: r1 becomes the constant base, so stride is spilled to [rsp+0];
; the r8m argument is copied to [rsp+4] so it can be read as bdmaxm.
927*c0909341SAndroid Build Coastguard Worker    %define            base  r1-$$
928*c0909341SAndroid Build Coastguard Worker    %define        stridemp  [rsp+4*0]
929*c0909341SAndroid Build Coastguard Worker    %define          bdmaxm  [rsp+4*1]
930*c0909341SAndroid Build Coastguard Worker    mov                  r3, r8m
931*c0909341SAndroid Build Coastguard Worker    mov            stridemp, r1
932*c0909341SAndroid Build Coastguard Worker    mov              bdmaxm, r3
933*c0909341SAndroid Build Coastguard Worker    LEA                  r1, $$
934*c0909341SAndroid Build Coastguard Worker%endif
935*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm         ; index of the lowest set bit of w (log2 w for a power of two)
936*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
937*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
938*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2          ; step one pixel past the incoming pointer
939*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] ; signed 32-bit offset from the jump table
940*c0909341SAndroid Build Coastguard Worker    mov                 dxd, angled
941*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pw_256]
942*c0909341SAndroid Build Coastguard Worker    and                 dxd, 0x7e       ; even byte offset into dr_intra_derivative (16-bit entries)
943*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+pw_62]
944*c0909341SAndroid Build Coastguard Worker    add              angled, 165 ; ~90
945*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+wq+ipred_z1_16bpc_ssse3_table] ; table offset -> absolute jump target
946*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [base+dr_intra_derivative+dxq]
947*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x4ff ; d = 90 - angle
948*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- ipred_z1_16bpc, width 4 ------------------------------------------------
; State from the function entry: angled = (angle + 165) ^ 0x4ff (low byte is
; d = 90 - angle, bit 9 is is_sm, bit 10 is set when the edge filter is
; disabled), dxd = dr_intra_derivative entry, m0 = pw_256, m7 = pw_62,
; tlq = incoming tl pointer + 2.
; "pshufb x, m0" is used throughout to broadcast the low word of x (this
; assumes pw_256 is words of 256, i.e. byte indices 0,1 repeated).
; Three sub-paths: edge upsampling, optional edge filtering (.filter_edge,
; defined outside this excerpt), and the main interpolation loop.
949*c0909341SAndroid Build Coastguard Worker.w4:
950*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+88] ; d + 88 sets bit 7 once d >= 40
951*c0909341SAndroid Build Coastguard Worker    test                r3d, 0x480
952*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample ; !enable_intra_edge_filter || d >= 40 (d = 90 - angle)
953*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 9          ; shifts out everything below the is_sm bit
954*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
955*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
956*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
; Upsampling path: build a 2x upsampled edge on the stack by interleaving the
; original pixels with values interpolated by the (-1, 9, 9, -1) kernel
; described in the comments below, clamped to [0, bitdepth max].
957*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq+14]
958*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+ 0]  ; 1 2 3 4 5 6 7 8
959*c0909341SAndroid Build Coastguard Worker    movd                 m1, bdmaxm
960*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0         ; pixel 8 in every word
961*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8
962*c0909341SAndroid Build Coastguard Worker    paddw                m4, [tlq- 2]  ; 0 1 2 3 4 5 6 7
963*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd        ; dx is doubled on this path (the edge has twice the samples)
964*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m3         ; replicated pixel 8, placed right after the 32-byte upsampled edge
965*c0909341SAndroid Build Coastguard Worker    palignr              m3, m2, 2     ; 2 3 4 5 6 7 8 8
966*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0         ; bdmaxm value in every word
967*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2        ; -1 * a + 9 * b + 9 * c + -1 * d
968*c0909341SAndroid Build Coastguard Worker    psubw                m5, m3, m4    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
969*c0909341SAndroid Build Coastguard Worker    movd                 m4, dxd
970*c0909341SAndroid Build Coastguard Worker    psraw                m5, 3         ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
971*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
972*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
973*c0909341SAndroid Build Coastguard Worker    pmaxsw               m3, m5         ; clamp negatives to 0 before the unsigned average
974*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd        ; r3d = xpos of the first row
975*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m5         ; m5 is zero: (x + 1) >> 1
976*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0         ; dx in every word
977*c0909341SAndroid Build Coastguard Worker    pminsw               m3, m1         ; clamp to the bdmaxm value
978*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3     ; interleave original and interpolated pixels
979*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3
980*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_upsample] ; shuffle control; judging by its use below it gathers the "a" samples into the low qword and the "b" samples into the high qword
981*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
982*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], m1         ; upsampled edge, first 8 samples
983*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, m4     ; per-iteration step: 2*dx (two rows per iteration)
984*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m2         ; upsampled edge, next 8 samples
985*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5 ; xpos0 xpos1
; Two rows per iteration; r3d/r2d hold the scalar xpos of each row and the
; edge is read back from the stack buffer filled above.
986*c0909341SAndroid Build Coastguard Worker.w4_upsample_loop:
987*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]   ; xpos of the second row
988*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
989*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r3*2]
990*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]   ; xpos of the first row of the next iteration
991*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base1
992*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r2*2]
993*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
994*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
995*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2     ; "a" samples of both rows (m0 no longer holds pw_256 from here)
996*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2         ; "b" samples of both rows
997*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4 ; frac
998*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
999*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
1000*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
1001*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5     ; xpos += dx
1002*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1003*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
1004*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
1005*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1006*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1007*c0909341SAndroid Build Coastguard Worker    jg .w4_upsample_loop
1008*c0909341SAndroid Build Coastguard Worker    RET
; No upsampling: decide whether the edge needs filtering. r3d carries
; max_base into .w4_main.
1009*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
1010*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 7     ; max_base
1011*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1012*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
1013*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]     ; h + 3, matched against the z_filt_wh4 table below
1014*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d
1015*c0909341SAndroid Build Coastguard Worker    movd                 m3, angled
1016*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1017*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1018*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2         ; all-zero control: low byte (h + 3) in every byte
1019*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2         ; low byte of the pre-shift angled value in every byte
1020*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, [base+z_filt_wh4]
1021*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3         ; keep that byte only where the size matched
1022*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, [base+z_filt_t_w48+angleq*8] ; compare against thresholds; the row is chosen by is_sm << 1
1023*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
1024*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 7          ; back to max_base = 7 (r3d was reused for h + 3)
1025*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1026*c0909341SAndroid Build Coastguard Worker    jz .w4_main ; filter_strength == 0
; Filtering needed: copy the edge to [rsp+16], with the pixel at [tlq-2]
; replicated just before it and the pixel at max_base replicated just after,
; then run .filter_edge on the copy.
1027*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq-2], q0000
1028*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*0]
1029*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555 ; with the shift below, folds the compare mask into a small integer
1030*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq+r3*2] ; edge pixel at index max_base
1031*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
1032*c0909341SAndroid Build Coastguard Worker    movd           [rsp+12], m1         ; two copies of the [tlq-2] pixel directly before the edge copy
1033*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q0000
1034*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2         ; edge copy
1035*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+2]
1036*c0909341SAndroid Build Coastguard Worker    movq      [rsp+r3*2+18], m3         ; four copies of edge[max_base] at indices max_base+1..+4 of the copy
1037*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
1038*c0909341SAndroid Build Coastguard Worker    cmovae              r3d, r2d        ; max_base becomes 9 when h >= 8
1039*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*1] ; the code below reads the stack copy from now on
1040*c0909341SAndroid Build Coastguard Worker    call .filter_edge
; Main path. r3d = max_base. tlq is moved to &top[max_base] and xpos is kept
; relative to it, so in-range positions are negative.
1041*c0909341SAndroid Build Coastguard Worker.w4_main:
1042*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+r3*2]
1043*c0909341SAndroid Build Coastguard Worker    movd                 m4, dxd
1044*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+z_base_inc] ; base_inc << 6
1045*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq] ; top[max_base_x]
1046*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6          ; max_base in the same 1/64-pixel units as xpos
1047*c0909341SAndroid Build Coastguard Worker    movd                 m3, r3d
1048*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0         ; dx in every word
1049*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1050*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0         ; top[max_base_x] in every word (fill value)
1051*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3         ; r5 = xpos - (max_base << 6)
1052*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0
1053*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, m4     ; per-iteration step: 2*dx (two rows per iteration)
1054*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1 ; max_base_x
1055*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5 ; xpos0 xpos1
1056*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
; Two rows per iteration: low qwords carry the first row, high qwords the
; second. Lanes at or beyond max_base_x are replaced by top[max_base_x].
1057*c0909341SAndroid Build Coastguard Worker.w4_loop:
1058*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r5+dxq]   ; relative xpos of the second row
1059*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6      ; base0
1060*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq+r5*2+0] ; "a" samples, first row
1061*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tlq+r5*2+2] ; "b" samples (one pixel further), first row
1062*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r3+dxq]   ; relative xpos of the first row of the next iteration
1063*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6      ; base1
1064*c0909341SAndroid Build Coastguard Worker    movhps               m0, [tlq+r3*2+0]
1065*c0909341SAndroid Build Coastguard Worker    movhps               m1, [tlq+r3*2+2]
1066*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4     ; frac = xpos & 62
1067*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9          ; frac << 9, so pmulhrsw computes ((b - a) * frac + 32) >> 6
1068*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1069*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1070*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m3, m4 ; xpos < max_base_x
1071*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5     ; xpos += dx
1072*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1073*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2         ; keep interpolated values where in range...
1074*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6         ; ...and take top[max_base_x] elsewhere
1075*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1076*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
1077*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
1078*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1079*c0909341SAndroid Build Coastguard Worker    jz .w4_end
1080*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1081*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1082*c0909341SAndroid Build Coastguard Worker    jl .w4_loop                         ; keep interpolating while the next row's relative xpos is negative
; Every remaining row starts at or past max_base: fill with top[max_base_x].
1083*c0909341SAndroid Build Coastguard Worker.w4_end_loop:
1084*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m6
1085*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m6
1086*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1087*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1088*c0909341SAndroid Build Coastguard Worker    jg .w4_end_loop
1089*c0909341SAndroid Build Coastguard Worker.w4_end:
1090*c0909341SAndroid Build Coastguard Worker    RET
; ---- ipred_z1_16bpc, width 8 ------------------------------------------------
; State from the function entry: angled = (angle + 165) ^ 0x4ff (low byte is
; d = 90 - angle, bit 9 is is_sm, bit 10 is set when the edge filter is
; disabled), dxd = dr_intra_derivative entry, m0 = pw_256, m7 = pw_62,
; tlq = incoming tl pointer + 2.
; "pshufb x, m0" broadcasts the low word of x (this assumes pw_256 is words
; of 256, i.e. byte indices 0,1 repeated).
; Same structure as the width-4 code, but one 8-pixel row per iteration.
1091*c0909341SAndroid Build Coastguard Worker.w8:
1092*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+88] ; d + 88 sets bit 7 once d >= 40
1093*c0909341SAndroid Build Coastguard Worker    and                 r3d, ~0x7f      ; keep only bit 7 and above (d >= 40 and the flag bits)
1094*c0909341SAndroid Build Coastguard Worker    or                  r3d, hd         ; fold h in so a single unsigned compare covers every condition
1095*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1096*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
; Upsampling path: the same (-1, 9, 9, -1) interpolation as the width-4 code,
; done on two 8-pixel halves (m5 for pixels 1-8, m6 for pixels 9-g).
1097*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 0]  ; 1 2 3 4 5 6 7 8
1098*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+ 2]  ; 2 3 4 5 6 7 8 9
1099*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+ 4]  ; 3 4 5 6 7 8 9 a
1100*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1         ; b + c, first half
1101*c0909341SAndroid Build Coastguard Worker    paddw                m3, [tlq- 2]  ; 0 1 2 3 4 5 6 7
1102*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m3     ; (b + c) - (a + d), first half
1103*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+18]  ; a b c d e f g _
1104*c0909341SAndroid Build Coastguard Worker    psraw                m2, 3
1105*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+20]  ; b c d e f g _ _
1106*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
1107*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16]  ; 9 a b c d e f g
1108*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2         ; b + c, second half
1109*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd        ; dx is doubled on this path (the edge has twice the samples)
1110*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1111*c0909341SAndroid Build Coastguard Worker    jne .w8_upsample_h8 ; awkward single-pixel edge case
1112*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q1110 ; b c c _ _ _ _ _
1113*c0909341SAndroid Build Coastguard Worker.w8_upsample_h8:
1114*c0909341SAndroid Build Coastguard Worker    paddw                m3, [tlq+14]  ; 8 9 a b c d e f
1115*c0909341SAndroid Build Coastguard Worker    psubw                m4, m6, m3     ; (b + c) - (a + d), second half
1116*c0909341SAndroid Build Coastguard Worker    movd                 m3, bdmaxm
1117*c0909341SAndroid Build Coastguard Worker    psraw                m4, 3
1118*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd        ; r3d = xpos of the first row
1119*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4
1120*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1121*c0909341SAndroid Build Coastguard Worker    pmaxsw               m5, m4         ; clamp negatives to 0 before the unsigned average
1122*c0909341SAndroid Build Coastguard Worker    pmaxsw               m6, m4
1123*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0         ; bdmaxm value in every word
1124*c0909341SAndroid Build Coastguard Worker    pavgw                m5, m4         ; m4 is zero: (x + 1) >> 1
1125*c0909341SAndroid Build Coastguard Worker    pavgw                m6, m4
1126*c0909341SAndroid Build Coastguard Worker    movd                 m4, dxd
1127*c0909341SAndroid Build Coastguard Worker    pminsw               m5, m3         ; clamp to the bdmaxm value
1128*c0909341SAndroid Build Coastguard Worker    pminsw               m6, m3
1129*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_upsample] ; shuffle control; judging by its use below it gathers the "a" samples into the low qword and the "b" samples into the high qword
1130*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0         ; dx in every word (one row per iteration, so all lanes share xpos)
1131*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
; Write the interleaved (original, interpolated) edge to [rsp+0 .. rsp+63].
1132*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m5     ; m0 no longer holds pw_256 from here
1133*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], m0
1134*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m5
1135*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m1
1136*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2, m6
1137*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m0
1138*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m6
1139*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m2
1140*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4         ; m5 = per-row step, m4 = running xpos
; One row per iteration: read 16 upsampled samples starting at base = xpos >> 6.
1141*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop:
1142*c0909341SAndroid Build Coastguard Worker    mov                 r2d, r3d
1143*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6          ; base
1144*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r2*2+ 0]
1145*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r2*2+16]
1146*c0909341SAndroid Build Coastguard Worker    add                 r3d, dxd        ; scalar xpos for the next row
1147*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
1148*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
1149*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2     ; eight "a" samples
1150*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2         ; eight "b" samples
1151*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4     ; frac = xpos & 62
1152*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9          ; frac << 9, so pmulhrsw computes ((b - a) * frac + 32) >> 6
1153*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1154*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1155*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5         ; xpos += dx
1156*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1157*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1158*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1159*c0909341SAndroid Build Coastguard Worker    dec                  hd
1160*c0909341SAndroid Build Coastguard Worker    jg .w8_upsample_loop
1161*c0909341SAndroid Build Coastguard Worker    RET
; No upsampling: compute max_base, then decide whether the edge needs
; filtering. r3d carries max_base into .w8_main.
1162*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
1163*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
1164*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d        ; h + 7, matched against the z_filt_wh8 table below
1165*c0909341SAndroid Build Coastguard Worker    and                 r3d, 7
1166*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8 ; imin(h+7, 15)
1167*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400      ; set when the intra edge filter is disabled
1168*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
1169*c0909341SAndroid Build Coastguard Worker    movd                 m3, angled
1170*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1171*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1172*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2         ; all-zero control: low byte (h + 7) in every byte
1173*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2         ; low byte of the pre-shift angled value in every byte
1174*c0909341SAndroid Build Coastguard Worker    movu                 m2, [base+z_filt_wh8]
1175*c0909341SAndroid Build Coastguard Worker    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4 ; same table as the width-4 code, read 4 bytes further in
1176*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m2, m1
1177*c0909341SAndroid Build Coastguard Worker    pand                 m2, m3         ; keep that byte only where the size matched
1178*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, m4         ; compare against the thresholds
1179*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m2
1180*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1181*c0909341SAndroid Build Coastguard Worker    jz .w8_main ; filter_strength == 0
; Filtering needed: copy the edge to [rsp+16], with the pixel at [tlq-2]
; replicated just before it and the pixel at max_base replicated just after,
; then run .filter_edge (defined outside this excerpt) on the copy.
1182*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq-2], q0000
1183*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*0]
1184*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555 ; with the shift below, folds the compare mask into a small integer
1185*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+16*1]
1186*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq+r3*2] ; edge pixel at index max_base
1187*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
1188*c0909341SAndroid Build Coastguard Worker    movd           [rsp+12], m1         ; two copies of the [tlq-2] pixel directly before the edge copy
1189*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2         ; edge copy, pixels 0-7
1190*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m4, q0000
1191*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m3         ; edge copy, pixels 8-15
1192*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+2]
1193*c0909341SAndroid Build Coastguard Worker    movq      [rsp+r3*2+18], m4         ; four copies of edge[max_base] at indices max_base+1..+4 of the copy
1194*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
1195*c0909341SAndroid Build Coastguard Worker    cmovae              r3d, r2d        ; max_base grows by 2 when h >= 16
1196*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*1] ; the code below reads the stack copy from now on
1197*c0909341SAndroid Build Coastguard Worker    call .filter_edge
; Main path. r3d = max_base. tlq is moved to &top[max_base] and xpos is kept
; relative to it, so in-range positions are negative.
1198*c0909341SAndroid Build Coastguard Worker.w8_main:
1199*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+r3*2]
1200*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1201*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_base_inc] ; per-lane offsets (labelled "base_inc << 6" in the width-4 code)
1202*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6          ; max_base in the same 1/64-pixel units as xpos
1203*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq] ; top[max_base_x]
1204*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d
1205*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0         ; dx in every word
1206*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1207*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0
1208*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3         ; r5 = xpos - (max_base << 6)
1209*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1 ; max_base_x
1210*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0         ; top[max_base_x] in every word (fill value)
1211*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5         ; m4 = per-lane xpos relative to the limit: negative while in range
; One row per iteration. Lanes whose relative position is no longer negative
; are replaced by top[max_base_x].
1212*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
1213*c0909341SAndroid Build Coastguard Worker.w8_loop:
1214*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1215*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6          ; base, as a negative pixel offset from tlq
1216*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+0] ; "a" samples
1217*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+2] ; "b" samples (one pixel further)
1218*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4     ; frac (masked with pw_62)
1219*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9          ; frac << 9, so pmulhrsw computes ((b - a) * frac + 32) >> 6
1220*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1221*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1222*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15 ; xpos < max_base_x
1223*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5     ; xpos += dx
1224*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1225*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2         ; keep interpolated values where in range...
1226*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6         ; ...and take top[max_base_x] elsewhere
1227*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1228*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1229*c0909341SAndroid Build Coastguard Worker    dec                  hd
1230*c0909341SAndroid Build Coastguard Worker    jz .w8_end
1231*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1232*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1233*c0909341SAndroid Build Coastguard Worker    jl .w8_loop                         ; keep interpolating while the relative xpos is negative
; Every remaining row starts at or past max_base: fill with top[max_base_x].
1234*c0909341SAndroid Build Coastguard Worker.w8_end_loop:
1235*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m6
1236*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1237*c0909341SAndroid Build Coastguard Worker    dec                  hd
1238*c0909341SAndroid Build Coastguard Worker    jg .w8_end_loop
1239*c0909341SAndroid Build Coastguard Worker.w8_end:
1240*c0909341SAndroid Build Coastguard Worker    RET
1241*c0909341SAndroid Build Coastguard Worker.w16:
; 16-pixel-wide path. Optionally smooths the top edge into a copy on the
; stack, then predicts one 16-pixel row per iteration by interpolating
; between neighbouring edge pixels; rows that start at or beyond
; top[max_base_x] are filled with that pixel.
; Uses tlq (edge pointer), hd (rows left), angled (angle plus flag bits),
; dxd/dxq (per-row position step) and dstq/stride.
; NOTE(review): m0 and m7 are loaded before this chunk; m0 is used below as a
; pshufb mask (presumably a word broadcast) and m7 as the fraction mask
; (presumably pw_62) -- confirm against the function prologue.
1242*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1243*c0909341SAndroid Build Coastguard Worker    %define         strideq  r3
1244*c0909341SAndroid Build Coastguard Worker%endif
1245*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
1246*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d ; keep the unclamped h+15 for the strength lookup
1247*c0909341SAndroid Build Coastguard Worker    and                 r3d, 15
1248*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; imin(h+15, 31)
1249*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1250*c0909341SAndroid Build Coastguard Worker    jnz .w16_main ; bit 0x400 set: use the edge unfiltered
1251*c0909341SAndroid Build Coastguard Worker    movd                 m3, angled
1252*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1253*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1254*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2 ; zero mask: replicate byte 0 (h+15) to all bytes
1255*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2 ; replicate the low angle byte to all bytes
1256*c0909341SAndroid Build Coastguard Worker    movq                 m4, [base+z_filt_t_w16+angleq*4] ; threshold row selected by the shifted flag bits
1257*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, [base+z_filt_wh16] ; mask of table entries equal to h+15
1258*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3 ; angle byte where matched, 0 elsewhere
1259*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4 ; signed byte compare against the thresholds
1260*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
1261*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1262*c0909341SAndroid Build Coastguard Worker    jz .w16_main ; filter_strength == 0
; Filtering is needed: copy the edge to the stack (the scalar work that turns
; the compare mask into a strength is interleaved with the SIMD copy).
1263*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq-2], q0000 ; replicate the pixel just before the edge
1264*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*0]
1265*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924
1266*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+16*1]
1267*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+16*2]
1268*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; leaves CF for the adc below; the SIMD moves in between do not touch flags
1269*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+16*3]
1270*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq+r3*2] ; pixel at index r3 (last one inside the limit)
1271*c0909341SAndroid Build Coastguard Worker    adc                 r5d, -1 ; filter_strength
1272*c0909341SAndroid Build Coastguard Worker    movd           [rsp+12], m1 ; two copies of the leading pixel right below the buffer start (rsp+16)
1273*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1274*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m3
1275*c0909341SAndroid Build Coastguard Worker    pshuflw              m6, m6, q0000
1276*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m4
1277*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m5
1278*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+2]
1279*c0909341SAndroid Build Coastguard Worker    movq      [rsp+r3*2+18], m6 ; pad buffer indices r3+1..r3+4 with the last pixel
1280*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1281*c0909341SAndroid Build Coastguard Worker    cmovae              r3d, r2d ; h >= 32: limit becomes r3+2 (presumably so the last filtered samples are kept -- confirm)
1282*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*1] ; continue with the stack copy
1283*c0909341SAndroid Build Coastguard Worker    call .filter_edge ; in: r5d = strength, r3d = limit, tlq = buffer
1284*c0909341SAndroid Build Coastguard Worker.w16_main:
1285*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+r3*2] ; rebase tlq so index 0 is top[max_base_x]
1286*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1287*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_base_inc]
1288*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; max_base_x in 1/64 pixel units
1289*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq] ; top[max_base_x]
1290*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d
1291*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
1292*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1293*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0
1294*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; xpos relative to max_base_x: negative while inside the edge
1295*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1 ; max_base_x
1296*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
1297*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; per-lane position of the first row
1298*c0909341SAndroid Build Coastguard Worker.w16_loop:
1299*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1300*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (negative, relative to top[max_base_x])
1301*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+ 0]
1302*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+ 2]
1303*c0909341SAndroid Build Coastguard Worker    pand                 m3, m7, m4 ; fractional part of the position
1304*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9 ; scale so pmulhrsw yields (diff*frac + 32) >> 6
1305*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1306*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1307*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16]
1308*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 0-7 interpolated
1309*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+18]
1310*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1311*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1312*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m512]
1313*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 8-15 interpolated
1314*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15 ; lanes 0-7: all-ones where the position is negative
1315*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4 ; lanes 8-15: 8 pixels (8<<6 = 512) further along
1316*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; advance the lane positions by dx for the next row
1317*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1318*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1319*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
1320*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
1321*c0909341SAndroid Build Coastguard Worker    por                  m0, m2 ; out-of-range lanes take top[max_base_x]
1322*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
1323*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
1324*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
1325*c0909341SAndroid Build Coastguard Worker    dec                  hd
1326*c0909341SAndroid Build Coastguard Worker    jz .w16_end
1327*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; on x86-32 strideq aliases r3 (see %define above), so reload it each row
1328*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1329*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1330*c0909341SAndroid Build Coastguard Worker    jl .w16_loop ; keep interpolating while the row start is still below max_base_x
1331*c0909341SAndroid Build Coastguard Worker.w16_end_loop:
; Remaining rows lie entirely past the limit: fill them with top[max_base_x].
1332*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m6
1333*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m6
1334*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1335*c0909341SAndroid Build Coastguard Worker    dec                  hd
1336*c0909341SAndroid Build Coastguard Worker    jg .w16_end_loop
1337*c0909341SAndroid Build Coastguard Worker.w16_end:
1338*c0909341SAndroid Build Coastguard Worker    RET
1339*c0909341SAndroid Build Coastguard Worker.w32:
; 32-pixel-wide path. When edge filtering is enabled the edge is copied to the
; stack and always run through the .filter_edge_s3 filter; each row is then
; produced as four 8-pixel groups with the same interpolate-or-clamp scheme.
; NOTE(review): m0 (pshufb mask) and m7 (fraction mask) are loaded before this
; chunk -- confirm their contents against the function prologue.
1340*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+31]
1341*c0909341SAndroid Build Coastguard Worker    and                 r3d, 31
1342*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32    ; imin(h+31, 63)
1343*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1344*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
1345*c0909341SAndroid Build Coastguard Worker    call .filter_copy ; copies the edge to the stack and repoints tlq at the copy
1346*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+2]
1347*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1348*c0909341SAndroid Build Coastguard Worker    cmove               r3d, r5d ; h == 64: limit becomes r3+2
1349*c0909341SAndroid Build Coastguard Worker    call .filter_edge_s3 ; in: r3d = limit, tlq = buffer
1350*c0909341SAndroid Build Coastguard Worker.w32_main:
1351*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+r3*2] ; rebase tlq so index 0 is top[max_base_x]
1352*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1353*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_base_inc]
1354*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; max_base_x in 1/64 pixel units
1355*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq] ; top[max_base_x]
1356*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d
1357*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
1358*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1359*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0
1360*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; xpos relative to max_base_x: negative while inside the edge
1361*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1 ; max_base_x
1362*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
1363*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; per-lane position of the first row
1364*c0909341SAndroid Build Coastguard Worker.w32_loop:
1365*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1366*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (negative, relative to top[max_base_x])
1367*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+ 0]
1368*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+ 2]
1369*c0909341SAndroid Build Coastguard Worker    pand                 m3, m7, m4 ; fractional part of the position
1370*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9 ; scale so pmulhrsw yields (diff*frac + 32) >> 6; m3 is reused for all four groups
1371*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1372*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1373*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16]
1374*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 0-7 interpolated
1375*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+18]
1376*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1377*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1378*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 8-15 interpolated
1379*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15 ; group 0 mask: position negative
1380*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1381*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1382*c0909341SAndroid Build Coastguard Worker    por                  m0, m2 ; out-of-range lanes take top[max_base_x]
1383*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m512]
1384*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 1 mask: 8 pixels (512) further along
1385*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
1386*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1387*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
1388*c0909341SAndroid Build Coastguard Worker    por                  m1, m2
1389*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
1390*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+32]
1391*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+34]
1392*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1393*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1394*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+48]
1395*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 16-23 interpolated
1396*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+50]
1397*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1398*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3 ; last use of the fraction; m3 is overwritten below
1399*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 24-31 interpolated
1400*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m1024]
1401*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m1536]
1402*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 2 mask
1403*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4 ; group 3 mask
1404*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; advance the lane positions by dx for the next row
1405*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1406*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1407*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
1408*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
1409*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1410*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
1411*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
1412*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
1413*c0909341SAndroid Build Coastguard Worker    dec                  hd
1414*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1415*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; reloaded every row; r3 is reused as the pixel offset at the loop top
1416*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1417*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1418*c0909341SAndroid Build Coastguard Worker    jl .w32_loop ; keep interpolating while the row start is still below max_base_x
1419*c0909341SAndroid Build Coastguard Worker.w32_end_loop:
; Remaining rows lie entirely past the limit: fill them with top[max_base_x].
1420*c0909341SAndroid Build Coastguard Worker    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3
1421*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1422*c0909341SAndroid Build Coastguard Worker    dec                  hd
1423*c0909341SAndroid Build Coastguard Worker    jg .w32_end_loop
1424*c0909341SAndroid Build Coastguard Worker.w32_end:
1425*c0909341SAndroid Build Coastguard Worker    RET
1426*c0909341SAndroid Build Coastguard Worker.w64:
; 64-pixel-wide path. Same scheme as the narrower paths, with eight 8-pixel
; groups per row. The limit is h+63 with no clamp and no +2 adjustment here.
; NOTE(review): m0 (pshufb mask) and m7 (fraction mask) are loaded before this
; chunk -- confirm their contents against the function prologue.
1427*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+63]
1428*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1429*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
1430*c0909341SAndroid Build Coastguard Worker    call .filter_copy ; copies the edge to the stack and repoints tlq at the copy
1431*c0909341SAndroid Build Coastguard Worker    call .filter_edge_s3 ; in: r3d = limit, tlq = buffer
1432*c0909341SAndroid Build Coastguard Worker.w64_main:
1433*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [tlq+r3*2] ; rebase tlq so index 0 is top[max_base_x]
1434*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1435*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_base_inc]
1436*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; max_base_x in 1/64 pixel units
1437*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq] ; top[max_base_x]
1438*c0909341SAndroid Build Coastguard Worker    movd                 m1, r3d
1439*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
1440*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1441*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0
1442*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; xpos relative to max_base_x: negative while inside the edge
1443*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1 ; max_base_x
1444*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
1445*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; per-lane position of the first row
1446*c0909341SAndroid Build Coastguard Worker.w64_loop:
1447*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1448*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (negative, relative to top[max_base_x])
1449*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+ 0]
1450*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+ 2]
1451*c0909341SAndroid Build Coastguard Worker    pand                 m3, m7, m4 ; fractional part of the position
1452*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9 ; scale so pmulhrsw yields (diff*frac + 32) >> 6; m3 is reused for all eight groups
1453*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1454*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1455*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16]
1456*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 0-7 interpolated
1457*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+18]
1458*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1459*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1460*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 8-15 interpolated
1461*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15 ; group 0 mask: position negative
1462*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1463*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1464*c0909341SAndroid Build Coastguard Worker    por                  m0, m2 ; out-of-range lanes take top[max_base_x]
1465*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m512]
1466*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 1 mask: 8 pixels (512) further along
1467*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
1468*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1469*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
1470*c0909341SAndroid Build Coastguard Worker    por                  m1, m2
1471*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
1472*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+32]
1473*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+34]
1474*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1475*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1476*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+48]
1477*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 16-23 interpolated
1478*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+50]
1479*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1480*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1481*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 24-31 interpolated
1482*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m1024]
1483*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 2 mask
1484*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1485*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1486*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1487*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m1536]
1488*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 3 mask
1489*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
1490*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1491*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
1492*c0909341SAndroid Build Coastguard Worker    por                  m1, m2
1493*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
1494*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+64]
1495*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+66]
1496*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1497*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1498*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+80]
1499*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 32-39 interpolated
1500*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+82]
1501*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1502*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1503*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 40-47 interpolated
1504*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m2048]
1505*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 4 mask
1506*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1507*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1508*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1509*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m2560]
1510*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 5 mask
1511*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
1512*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1513*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
1514*c0909341SAndroid Build Coastguard Worker    por                  m1, m2
1515*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m1
1516*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3*2+96]
1517*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+98]
1518*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1519*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1520*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+112]
1521*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; pixels 48-55 interpolated
1522*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+114]
1523*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1524*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3 ; last use of the fraction; m3 is overwritten below
1525*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; pixels 56-63 interpolated
1526*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m3072]
1527*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m3584]
1528*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4 ; group 6 mask
1529*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4 ; group 7 mask
1530*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; advance the lane positions by dx for the next row
1531*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1532*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
1533*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
1534*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
1535*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1536*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
1537*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
1538*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m1
1539*c0909341SAndroid Build Coastguard Worker    dec                  hd
1540*c0909341SAndroid Build Coastguard Worker    jz .w64_end
1541*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; reloaded every row; r3 is reused as the pixel offset at the loop top
1542*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1543*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1544*c0909341SAndroid Build Coastguard Worker    jl .w64_loop ; keep interpolating while the row start is still below max_base_x
1545*c0909341SAndroid Build Coastguard Worker.w64_end_loop:
; Remaining rows lie entirely past the limit: fill them with top[max_base_x].
1546*c0909341SAndroid Build Coastguard Worker    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
1547*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1548*c0909341SAndroid Build Coastguard Worker    dec                  hd
1549*c0909341SAndroid Build Coastguard Worker    jg .w64_end_loop
1550*c0909341SAndroid Build Coastguard Worker.w64_end:
1551*c0909341SAndroid Build Coastguard Worker    RET
1552*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1553*c0909341SAndroid Build Coastguard Worker.filter_copy:
; Subroutine: copy the edge at tlq into a stack buffer, padded on both sides,
; so it can be filtered in place.
; In:  tlq = source edge, r3 = index of the last pixel to keep
; Out: tlq = start of the stack copy
; Clobbers: m1, m2, m3, r5, flags
; Every stack offset includes gprsize to skip the return address pushed by the
; call, giving the same layout as the inline copy in .w16 (buffer at
; caller-rsp+16, leading padding at caller-rsp+12).
1554*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, [tlq-2], q0000 ; replicate the pixel just before the edge
1555*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, [tlq+r3*2], q0000 ; replicate the pixel at index r3
1556*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
1557*c0909341SAndroid Build Coastguard Worker    movd   [rsp+gprsize+12], m2 ; two copies of the leading pixel right below the buffer start
1558*c0909341SAndroid Build Coastguard Worker.filter_copy_loop:
1559*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2+16*0]
1560*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2+16*1]
1561*c0909341SAndroid Build Coastguard Worker    add                 r5d, 16 ; 16 pixels per iteration; r5 is already advanced for the stores below
1562*c0909341SAndroid Build Coastguard Worker    mova [rsp+r5*2+gprsize-16*1], m1 ; equals buffer + old_r5*2
1563*c0909341SAndroid Build Coastguard Worker    mova [rsp+r5*2+gprsize-16*0], m2
1564*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, r3d
1565*c0909341SAndroid Build Coastguard Worker    jle .filter_copy_loop ; continue until more than r3 pixels have been copied
1566*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+gprsize+16*1]
1567*c0909341SAndroid Build Coastguard Worker    movq       [tlq+r3*2+2], m3 ; pad indices r3+1..r3+4 with the last pixel
1568*c0909341SAndroid Build Coastguard Worker    ret
1569*c0909341SAndroid Build Coastguard Worker.filter_edge:
; Subroutine: smooth the edge buffer at tlq in place, 8 pixels per iteration.
; In:  r5d = filter strength, r3d = limit (pixels are processed in groups of 8
;      while the running index is below it), tlq = buffer with padding before
;      and after the data
; Clobbers: m1-m6, r5, flags
; Strength 3 is handed off to .filter_edge_s3; other strengths use the 3-tap
;   out[i] = (c0*p[i] + c1*(p[i-1] + p[i+1]) + 8) >> 4
; with c0/c1 taken from z_filt_k (table layout is defined outside this chunk).
1570*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
1571*c0909341SAndroid Build Coastguard Worker    je .filter_edge_s3
1572*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+z_filt_k+r5*8-8] ; c0: centre tap
1573*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+z_filt_k+r5*8+8] ; c1: neighbour tap
1574*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
1575*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+pw_8] ; rounding term for the >> 4
1576*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-2] ; p[i-1] for the first group (reads the leading padding)
1577*c0909341SAndroid Build Coastguard Worker    jmp .filter_edge_start
1578*c0909341SAndroid Build Coastguard Worker.filter_edge_loop:
1579*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2-2] ; must read the unfiltered last pixel of the previous group...
1580*c0909341SAndroid Build Coastguard Worker    mova      [tlq+r5*2-16], m1 ; ...before the previous group's result overwrites it
1581*c0909341SAndroid Build Coastguard Worker.filter_edge_start:
1582*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [tlq+r5*2] ; c0 * p[i]
1583*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r5*2+2] ; p[i+1]
1584*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
1585*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5 ; c1 * (p[i-1] + p[i+1])
1586*c0909341SAndroid Build Coastguard Worker    add                 r5d, 8
1587*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
1588*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1589*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
1590*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, r3d
1591*c0909341SAndroid Build Coastguard Worker    jl .filter_edge_loop
1592*c0909341SAndroid Build Coastguard Worker    mova      [tlq+r5*2-16], m1 ; flush the final group
1593*c0909341SAndroid Build Coastguard Worker    ret
1594*c0909341SAndroid Build Coastguard Worker.filter_edge_s3:
; Subroutine: 5-tap in-place smoothing of the edge buffer at tlq, 8 pixels per
; iteration. Reached from .filter_edge for strength 3 and called directly by
; the .w32/.w64 paths.
; In:  r3d = limit (groups of 8 are processed while the running index is below
;      it), tlq = buffer with padding before and after the data
; Clobbers: m1-m5, r5, flags
; Per pixel:
;   out[i] = (p[i-1] + p[i] + p[i+1] + ((p[i-2] + p[i+2] + 4) >> 1)) >> 2
; The inner term comes from pavgw, whose built-in +1 combines with the +3 bias.
1595*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pw_3]
1596*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
1597*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-2] ; p[i-1] for the first group (reads the leading padding)
1598*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-4] ; p[i-2] for the first group
1599*c0909341SAndroid Build Coastguard Worker    jmp .filter_edge_s3_start
1600*c0909341SAndroid Build Coastguard Worker.filter_edge_s3_loop:
1601*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2-2] ; read the unfiltered tail of the previous group...
1602*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r5*2-4]
1603*c0909341SAndroid Build Coastguard Worker    mova      [tlq+r5*2-16], m1 ; ...before the previous group's result overwrites it
1604*c0909341SAndroid Build Coastguard Worker.filter_edge_s3_start:
1605*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq+r5*2+0] ; p[i-1] + p[i]
1606*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5 ; p[i-2] + 3
1607*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2+2] ; p[i+1]
1608*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+r5*2+4] ; p[i+2]
1609*c0909341SAndroid Build Coastguard Worker    add                 r5d, 8
1610*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; p[i-1] + p[i] + p[i+1]
1611*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m4 ; (p[i-2] + p[i+2] + 4) >> 1
1612*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1613*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
1614*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, r3d
1615*c0909341SAndroid Build Coastguard Worker    jl .filter_edge_s3_loop
1616*c0909341SAndroid Build Coastguard Worker    mova      [tlq+r5*2-16], m1 ; flush the final group
1617*c0909341SAndroid Build Coastguard Worker    ret
1618*c0909341SAndroid Build Coastguard Worker
1619*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1620*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
1621*c0909341SAndroid Build Coastguard Worker    %define            base  r7-$$
1622*c0909341SAndroid Build Coastguard Worker    %define           maxwm  r6m
1623*c0909341SAndroid Build Coastguard Worker    %define           maxhm  r7m
1624*c0909341SAndroid Build Coastguard Worker    %define          bdmaxm  r8m
1625*c0909341SAndroid Build Coastguard Worker    lea                  r7, [$$]
1626*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
1627*c0909341SAndroid Build Coastguard Worker    movddup              m8, [base+pw_62]
1628*c0909341SAndroid Build Coastguard Worker    lea                 r9d, [wq-4]
1629*c0909341SAndroid Build Coastguard Worker    shl                 r9d, 6
1630*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+z2_top_shufA]
1631*c0909341SAndroid Build Coastguard Worker    or                  r9d, hd
1632*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+z2_left_shufA]
1633*c0909341SAndroid Build Coastguard Worker%else
1634*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
1635*c0909341SAndroid Build Coastguard Worker    %define            base  r1-$$
1636*c0909341SAndroid Build Coastguard Worker    %define             r9b  byte  [rsp+16*26+4*0]
1637*c0909341SAndroid Build Coastguard Worker    %define             r9d  dword [rsp+16*26+4*0]
1638*c0909341SAndroid Build Coastguard Worker    %define            r10d  dword [rsp+16*26+4*1]
1639*c0909341SAndroid Build Coastguard Worker    %define            r11d  dword [rsp+16*26+4*2]
1640*c0909341SAndroid Build Coastguard Worker    %define           maxwm  [rsp+16*2+4*0]
1641*c0909341SAndroid Build Coastguard Worker    %define           maxhm  [rsp+16*2+4*1]
1642*c0909341SAndroid Build Coastguard Worker    %define          bdmaxm  [rsp+16*2+4*2]
1643*c0909341SAndroid Build Coastguard Worker    %define        stridemp  [rsp+16*26+4*3]
1644*c0909341SAndroid Build Coastguard Worker    %define         strideq  r3
1645*c0909341SAndroid Build Coastguard Worker    %define             dyd  r4
1646*c0909341SAndroid Build Coastguard Worker    %define             dyq  r4
1647*c0909341SAndroid Build Coastguard Worker    mov            stridemp, r1
1648*c0909341SAndroid Build Coastguard Worker    mov                 r1d, r6m
1649*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r7m
1650*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r8m
1651*c0909341SAndroid Build Coastguard Worker    mov               maxwm, r1d
1652*c0909341SAndroid Build Coastguard Worker    mov               maxhm, r4d
1653*c0909341SAndroid Build Coastguard Worker    mov              bdmaxm, r5d
1654*c0909341SAndroid Build Coastguard Worker    LEA                  r1, $$
1655*c0909341SAndroid Build Coastguard Worker    lea                  hd, [wq-4]
1656*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+z2_top_shufA]
1657*c0909341SAndroid Build Coastguard Worker    shl                  hd, 6
1658*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+z2_left_shufA]
1659*c0909341SAndroid Build Coastguard Worker    or                   hd, hm
1660*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*24], m0
1661*c0909341SAndroid Build Coastguard Worker    mov                 r9d, hd
1662*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*25], m1
1663*c0909341SAndroid Build Coastguard Worker%endif
1664*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1665*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1666*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16*8]
1667*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-16*7]
1668*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*6]
1669*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*5]
1670*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z2_16bpc_ssse3_table+wq*4]
1671*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1672*c0909341SAndroid Build Coastguard Worker    movzx               dxd, angleb
1673*c0909341SAndroid Build Coastguard Worker%else
1674*c0909341SAndroid Build Coastguard Worker    movzx               dxd, byte anglem
1675*c0909341SAndroid Build Coastguard Worker%endif
1676*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tlq-16*4]
1677*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq-16*3]
1678*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tlq-16*2]
1679*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tlq-16*1]
1680*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 5], m0
1681*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
1682*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 6], m1
1683*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dxd
1684*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 7], m2
1685*c0909341SAndroid Build Coastguard Worker    neg                 dxq
1686*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 8], m3
1687*c0909341SAndroid Build Coastguard Worker    and                 dyd, ~1
1688*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 9], m4
1689*c0909341SAndroid Build Coastguard Worker    and                 dxq, ~1
1690*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m5
1691*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z2_16bpc_ssse3_table+wq]
1692*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m6
1693*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
1694*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m7
1695*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [base+dr_intra_derivative+dyq-90]  ; angle - 90
1696*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
1697*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pw_256] ; 4<<6
1698*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq]
1699*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+16*0+2]
1700*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+16*1+2]
1701*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z2_dy_offset]
1702*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0
1703*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base+z_base_inc+2]
1704*c0909341SAndroid Build Coastguard Worker    mov                r11d, (112-4)<<6
1705*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*13], m4
1706*c0909341SAndroid Build Coastguard Worker    neg                 dxd
1707*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*14], m5
1708*c0909341SAndroid Build Coastguard Worker    or                  dyd, 4<<16
1709*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*15], m6
1710*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1711*c0909341SAndroid Build Coastguard Worker    lea                r10d, [dxq+(112<<6)] ; xpos
1712*c0909341SAndroid Build Coastguard Worker%else
1713*c0909341SAndroid Build Coastguard Worker    mov           [rsp+8*3], dyd
1714*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [dxq+(112<<6)]
1715*c0909341SAndroid Build Coastguard Worker    mov                r10d, r4d
1716*c0909341SAndroid Build Coastguard Worker    movzx                hd, r9b
1717*c0909341SAndroid Build Coastguard Worker%endif
1718*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*0], m1
1719*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*1], m0
1720*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*2], m7
1721*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .w4: entry for 4-pixel-wide blocks (presumably one of the targets of the
; `jmp wq` table dispatch above). Chooses between upsampling and filtering
; the top edge, then continues in the shared left-edge code.
; On entry m5 = 8 words loaded from [tlq+2], m3 = 0 and m0 = pw_256
; (all three are set up before the dispatch).
1722*c0909341SAndroid Build Coastguard Worker.w4:
; Bit 0x400 was toggled by the `xor angled, 0x400` before the dispatch; if it
; is set here, all edge preprocessing is skipped.
1723*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1724*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
; r3d = (h+2)<<6 is a bit mask that is tested against the biased angle below.
1725*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1726*c0909341SAndroid Build Coastguard Worker    add              angled, 1022
; Replicate word 3 of m5 and store it to the upper half of the stack copy at
; rsp+16*14, i.e. pad the 4 top pixels with copies of the last one.
1727*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m5, q3333
1728*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1729*c0909341SAndroid Build Coastguard Worker    movq      [rsp+16*14+8], m1
1730*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1731*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
1732*c0909341SAndroid Build Coastguard Worker    call .upsample_above
; 1075 = 1022 (bias added above) + 53
1733*c0909341SAndroid Build Coastguard Worker    sub              angled, 1075 ; angle - 53
1734*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
1735*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
1736*c0909341SAndroid Build Coastguard Worker    movd                 m2, r3d
1737*c0909341SAndroid Build Coastguard Worker    movd                 m7, angled
1738*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
; m3 is still zero, so pshufb broadcasts byte 0 of m2 (h+3) and m7 (180-angle).
1739*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
1740*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m3
; Build a per-byte mask in m7 from the z_filt_wh4 / z_filt_t_w48 tables.
; NOTE(review): presumably this is the left-edge filter-strength selector
; consumed by .w8_filter_left, which is outside this view - confirm there.
1741*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m2, [base+z_filt_wh4]
1742*c0909341SAndroid Build Coastguard Worker    pand                 m7, m2
1743*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m7, [base+z_filt_t_w48+angleq*8]
1744*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
; .upsample_above: local subroutine (entered with `call`, left with `ret`)
; shared by the w4 and w8 paths. Every stack reference carries an extra
; +gprsize because the return address sits on top of the caller's frame.
; In:  m5 = 8 top words from [tlq+2], m0 = pw_256, m3 = 0,
;      m7 = low qword of z_base_inc+2, dxd = x step.
; Out: rsp+16*14 and rsp+16*15 (caller view) hold the interpolated samples
;      interleaved with the original top pixels; dxd and the increment at
;      [rsp+8*2] are doubled; r10d, r11d and the top shuffle mask are
;      switched to their upsampled values.
; Clobbers m1, m2, m4, m6, m7 (plus m9 on x86-64, r3d on x86-32).
1745*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8
; m2 = inner pair sum: [tlq+2*i] + [tlq+2+2*i] (m5 was loaded from tlq+2)
1746*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5, [tlq]
; m1 = samples one word to the right, m4 = samples two words to the left
; (the words just below rsp+16*14 belong to the broadcast top-left stored
; at rsp+16*13).
1747*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+gprsize+16*14+2]
1748*c0909341SAndroid Build Coastguard Worker    movu                 m4, [rsp+gprsize+16*14-4]
1749*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1750*c0909341SAndroid Build Coastguard Worker    movd                 m6, r9m ; bdmax, offset due to call
1751*c0909341SAndroid Build Coastguard Worker%else
1752*c0909341SAndroid Build Coastguard Worker    movd                 m6, [rsp+gprsize+16*2+4*2]
1753*c0909341SAndroid Build Coastguard Worker%endif
; m4 = outer pair sum; result = inner + ((inner - outer) >> 3)
1754*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
1755*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2, m4
; broadcast the low word of bdmax to all lanes (m0 = pw_256 shuffle mask)
1756*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
1757*c0909341SAndroid Build Coastguard Worker    psraw                m1, 3
1758*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
; the upsampled edge has twice the resolution, so the x step is doubled
1759*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
; clamp below at 0, then pavgw with 0 yields (x + 1) >> 1, then clamp to bdmax
1760*c0909341SAndroid Build Coastguard Worker    pmaxsw               m2, m3
1761*c0909341SAndroid Build Coastguard Worker    paddw                m7, m7
1762*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m3
1763*c0909341SAndroid Build Coastguard Worker    pminsw               m2, m6
; Replace the top shuffle mask (variant B), the starting xpos (r10d) and the
; left-only threshold (r11d) with the values used for the upsampled edge.
; On x86-32 these live in the stack slots aliased by r10d/r11d/[rsp+16*24].
1764*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1765*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+z2_top_shufB]
1766*c0909341SAndroid Build Coastguard Worker    lea                r10d, [dxq+(113<<6)]
1767*c0909341SAndroid Build Coastguard Worker    mov                r11d, (112-7)<<6
1768*c0909341SAndroid Build Coastguard Worker%else
1769*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+z2_top_shufB]
1770*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [dxq+(113<<6)]
1771*c0909341SAndroid Build Coastguard Worker    mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6
1772*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+16*26+4*1], r3d
1773*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*24], m1
1774*c0909341SAndroid Build Coastguard Worker%endif
; interleave: interpolated sample first, then the original pixel
1775*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m5
1776*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m5
; store the doubled base increment back to the caller's [rsp+8*2]
1777*c0909341SAndroid Build Coastguard Worker    movq  [rsp+gprsize+8*2], m7
1778*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*14], m1
1779*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*15], m2
1780*c0909341SAndroid Build Coastguard Worker    ret
; .w4_no_upsample_above: w4 path taken when the top edge is not upsampled.
; Sets up registers and calls .w8_filter_top with r3d = 4, then decides
; whether the left edge is upsampled (fall through into .upsample_left) or
; handled by .w8_filter_left.
; NOTE(review): .w8_filter_top and .w8_filter_left are outside this view;
; m1, m2, m4, m5, m7 and r3d are presumably their inputs - confirm there.
1781*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above:
1782*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
; keep the biased angle (angle + 1022) for the left-edge decision below
1783*c0909341SAndroid Build Coastguard Worker    mov          [rsp+16*4], angled
; 1112 = 1022 (bias added in .w4) + 90
1784*c0909341SAndroid Build Coastguard Worker    sub              angled, 1112 ; angle - 90
1785*c0909341SAndroid Build Coastguard Worker    movd                 m2, r3d
1786*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 90
1787*c0909341SAndroid Build Coastguard Worker    movd                 m1, angled
1788*c0909341SAndroid Build Coastguard Worker    sub                 r3d, angled ; 180 - angle
1789*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1790*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_filt_wh4]
1791*c0909341SAndroid Build Coastguard Worker    movd                 m7, r3d
1792*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+z_filt_t_w48+angleq*8]
1793*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 4
1794*c0909341SAndroid Build Coastguard Worker    call .w8_filter_top
; reload the biased angle and test it against the mask (h+2)<<6
1795*c0909341SAndroid Build Coastguard Worker    mov              angled, [rsp+16*4]
1796*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
1797*c0909341SAndroid Build Coastguard Worker    sub              angled, 139
1798*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
1799*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
1800*c0909341SAndroid Build Coastguard Worker    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
; .upsample_left: doubles the resolution of the left edge, using the same
; arithmetic as .upsample_above: inner + ((inner - outer) >> 3), clamped at
; 0, (x + 1) >> 1, clamped to bdmax. Reached by fall-through from
; .w4_no_upsample_above; the label comment says it is shared with w8.
; Side effects: rsp+16*11 and rsp+16*12 receive the interleaved edge, the
; left shuffle mask is switched to variant B, dy is doubled, and [rsp+8*0]
; gets the odd dwords of z2_dy_offset. Falls through into .w4_main.
1801*c0909341SAndroid Build Coastguard Worker.upsample_left: ; w4/w8
; m2 = the 8 words ending just below tlq; m3 = the same window moved up by
; one word (its last word is [tlq]).
1802*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16]
; (h-4)*4 is the byte offset into z2_upsample_l: 0 for h == 4, 16 for h == 8
1803*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-4]
1804*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-14]
; window moved up by two words, read from the stack copy of the edge
1805*c0909341SAndroid Build Coastguard Worker    movu                 m4, [rsp+16*12+4]
; NOTE(review): the z2_upsample_l table is not visible here; presumably it
; produces the remaining outer tap with height-dependent padding - confirm.
1806*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, [base+z2_upsample_l+r3*4]
1807*c0909341SAndroid Build Coastguard Worker    movd                 m6, bdmaxm
1808*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
; m3 = inner pair sum, m4 = outer pair sum
1809*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
1810*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
1811*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3, m4
; odd dwords of z2_dy_offset (the setup code stored the even ones to [rsp+8*0])
1812*c0909341SAndroid Build Coastguard Worker    movshdup             m4, [base+z2_dy_offset]
1813*c0909341SAndroid Build Coastguard Worker    psraw                m1, 3
; broadcast the low word of bdmax; relies on m0 still holding pw_256
1814*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
1815*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
; clamp at 0, (x + 1) >> 1 via pavgw with zero, clamp to bdmax
1816*c0909341SAndroid Build Coastguard Worker    pmaxsw               m3, m5
1817*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m5
1818*c0909341SAndroid Build Coastguard Worker    pminsw               m3, m6
; Switch to left shuffle variant B and double dy. On x86-32, dy lives in the
; dword at [rsp+8*3] and the shuffle mask at [rsp+16*25].
1819*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1820*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+z2_left_shufB]
1821*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
1822*c0909341SAndroid Build Coastguard Worker%else
1823*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+z2_left_shufB]
1824*c0909341SAndroid Build Coastguard Worker    shl     dword [rsp+8*3], 1
1825*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*25], m1
1826*c0909341SAndroid Build Coastguard Worker%endif
; interleave original pixels (first) with the interpolated samples
1827*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
1828*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
1829*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*0], m4
1830*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m1
1831*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m2
; .w4_main: builds the position vectors for the prediction loops. Also the
; target of the early-out in .w4 and .w8, so every width is processed as
; groups of 4 columns.
; Produces:
;   m6        = x positions for rows 0 and 1 (4 words each)
;   m7        = 2*dx in every word (added to get rows 2/3 and the next rows)
;   m4        = y positions for the 4 columns (low qword)
;   [rsp+8*3] = word 1 of the dy register, replicated into 4 words
;   [rsp+8*5] = word 3 of dy*pw_m1to4, replicated into 4 words
;   r2d       = scalar x position (copy of r10d), r5 = dst of this column group
; NOTE(review): `pshufb m6, m0` assumes m0 still holds pw_256 on every path
; into this label, including those through code outside this view - confirm.
1832*c0909341SAndroid Build Coastguard Worker.w4_main:
1833*c0909341SAndroid Build Coastguard Worker    movd                 m6, dxd
1834*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1835*c0909341SAndroid Build Coastguard Worker    movd                 m3, dyd
1836*c0909341SAndroid Build Coastguard Worker%else
1837*c0909341SAndroid Build Coastguard Worker    movd                 m3, [rsp+8*3]
1838*c0909341SAndroid Build Coastguard Worker%endif
; broadcast the low word of dx to all 8 words
1839*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
; base increment saved at [rsp+8*2] (doubled there by .upsample_above)
1840*c0909341SAndroid Build Coastguard Worker    movddup              m0, [rsp+8*2]
1841*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6
1842*c0909341SAndroid Build Coastguard Worker    movq                 m5, [base+pw_m1to4]
; low word of the dy register (the derivative) in 4 words
1843*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m3, q0000
; m6 = { dx x4 | 2*dx x4 }
1844*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7
; per-column multiples of dy
1845*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m5
; Word 1 of the dy register is the constant from `or dyd, 4<<16` (doubled if
; .upsample_left ran); the loops subtract it from base_y every 4 rows.
1846*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q1111
1847*c0909341SAndroid Build Coastguard Worker    paddw                m6, m0
1848*c0909341SAndroid Build Coastguard Worker    mov                 r2d, r10d
1849*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m4, q3333
; subtract the dy offset stored at [rsp+8*0]
1850*c0909341SAndroid Build Coastguard Worker    psubw                m4, [rsp+8*0]
1851*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*3], m3
1852*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*5], m0 ; dy*4
; remember the start of this 4-column group; .w4_end advances it by 8 bytes
1853*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
; .w4_loop0: per-column-group setup, run once for every group of 4 columns.
; Saves the starting x/y position vectors (so .w4_end can derive the next
; group's values) and splits the y positions into base index and fraction.
1854*c0909341SAndroid Build Coastguard Worker.w4_loop0:
1855*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m6
1856*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*4], m4
; m0 = ypos & 62 (m8 holds pw_62 on x86-64)
1857*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1858*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8, m4
1859*c0909341SAndroid Build Coastguard Worker%else
1860*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+pw_62]
1861*c0909341SAndroid Build Coastguard Worker    pand                 m0, m4
1862*c0909341SAndroid Build Coastguard Worker%endif
1863*c0909341SAndroid Build Coastguard Worker    psraw                m4, 6
; pre-shifted so that pmulhrsw later computes (diff * frac + 32) >> 6
1864*c0909341SAndroid Build Coastguard Worker    psllw                m0, 9 ; frac_y << 9
1865*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*7], m0
; |ypos >> 6| per column; the loops read the low byte of each word as base_y
1866*c0909341SAndroid Build Coastguard Worker    pabsw                m4, m4
1867*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*6], m4
; Row counter = low byte of r9d, which was packed as ((w-4)<<6) | h.
1868*c0909341SAndroid Build Coastguard Worker    movzx                hd, r9b
; .w4_loop: predicts one 4x4 tile (4 rows of the current 4-column group).
; Always interpolates from the top edge; if the last row's base_x falls
; below the top-left position it also interpolates from the left edge and
; selects per pixel by the sign of the x position.
; r2d = scalar x position of row 0, m6 = x positions of rows 0/1,
; m7 = 2*dx. The edge pixels live in a word-indexed buffer on the stack,
; addressed as [rsp+index*2]. Falls through into .w4_toponly.
1869*c0909341SAndroid Build Coastguard Worker.w4_loop:
; Walk r2/r3 through the four row positions (one dxq step per row) and load
; 8 words of the edge at each row's base_x.
1870*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
1871*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6        ; base_x0
1872*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r2*2]
1873*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
1874*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; base_x1
1875*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r3*2]
1876*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
1877*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6        ; base_x2
1878*c0909341SAndroid Build Coastguard Worker    movu                 m3, [rsp+r2*2]
; r2d now holds the x position for the first row of the next tile
1879*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
1880*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; base_x3
1881*c0909341SAndroid Build Coastguard Worker    movu                 m4, [rsp+r3*2]
; Apply the top shuffle mask (m9 on x86-64, [rsp+16*24] on x86-32) to each row.
1882*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1883*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m9}, m2, m1, m3, m4
1884*c0909341SAndroid Build Coastguard Worker%else
1885*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*24]
1886*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m0}, m2, m1, m3, m4
1887*c0909341SAndroid Build Coastguard Worker%endif
; Pair up rows: m0/m1 = low qwords (first tap) of rows 0,1 / 2,3 and
; m2/m3 = high qwords (second tap) of the same rows.
1888*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2, m1
1889*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m1
1890*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m3, m4
1891*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
; frac_x of rows 0/1 = xpos & 62
1892*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1893*c0909341SAndroid Build Coastguard Worker    pand                 m5, m8, m6
1894*c0909341SAndroid Build Coastguard Worker%else
1895*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pw_62]
1896*c0909341SAndroid Build Coastguard Worker    pand                 m5, m6
1897*c0909341SAndroid Build Coastguard Worker%endif
; rows 0/1: a + (((b - a) * frac + 32) >> 6), via pmulhrsw with frac << 9
1898*c0909341SAndroid Build Coastguard Worker    psllw                m5, 9
1899*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1900*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
; m5 = x positions of rows 2/3
1901*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6, m7
1902*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
1903*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1904*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1905*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5
1906*c0909341SAndroid Build Coastguard Worker%else
1907*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_62]
1908*c0909341SAndroid Build Coastguard Worker    pand                 m2, m5
1909*c0909341SAndroid Build Coastguard Worker%endif
; rows 2/3: same interpolation
1910*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
1911*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
1912*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
; r3d = base_x3, the smallest index of the four rows. Word index 112 is
; rsp+16*14, where the top row was stored, so 111 is the top-left slot.
1913*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 111 ; topleft
1914*c0909341SAndroid Build Coastguard Worker    jge .w4_toponly
; Mixed tile: park the top-based result and compute the left-based one.
1915*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*22], m0
1916*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*23], m1
; One load per column, indexed by the low byte of each base_y word.
1917*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*6+0] ; base_y0
1918*c0909341SAndroid Build Coastguard Worker    movu                 m3, [rsp+r3*2]
1919*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*6+2] ; base_y1
1920*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r3*2]
1921*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*6+4] ; base_y2
1922*c0909341SAndroid Build Coastguard Worker    movu                 m4, [rsp+r3*2]
1923*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*6+6] ; base_y3
1924*c0909341SAndroid Build Coastguard Worker    movu                 m0, [rsp+r3*2]
; Apply the left shuffle mask (m10 on x86-64, [rsp+16*25] on x86-32).
1925*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1926*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m10}, m3, m2, m4, m0
1927*c0909341SAndroid Build Coastguard Worker%else
1928*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*25]
1929*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m1}, m3, m2, m4, m0
1930*c0909341SAndroid Build Coastguard Worker%endif
; Transpose the per-column vectors into per-row order with word/dword
; unpacks: m0/m1 = first tap for rows 0,1 / 2,3 and m2/m3 = second tap.
1931*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1932*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2     ; 01
1933*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0
1934*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0     ; 23
1935*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2 ; y0 y1
1936*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2     ; y2 y3
1937*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1938*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
; frac_y << 9 for the 4 columns, duplicated into both qwords
1939*c0909341SAndroid Build Coastguard Worker    movddup              m4, [rsp+8*7]
1940*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1941*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
1942*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
1943*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
; Sign masks of the x positions (m6 rows 0/1, m5 rows 2/3): all ones where
; xpos is negative, i.e. where the left-based value must be used.
1944*c0909341SAndroid Build Coastguard Worker    psraw                m6, 15       ; base_x < topleft
1945*c0909341SAndroid Build Coastguard Worker    psraw                m4, m5, 15
1946*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1947*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
; blend: (left & mask) | (top & ~mask)
1948*c0909341SAndroid Build Coastguard Worker    pand                 m0, m6
1949*c0909341SAndroid Build Coastguard Worker    pandn                m6, [rsp+16*22]
1950*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4
1951*c0909341SAndroid Build Coastguard Worker    pandn                m4, [rsp+16*23]
1952*c0909341SAndroid Build Coastguard Worker    por                  m0, m6
1953*c0909341SAndroid Build Coastguard Worker    por                  m1, m4
; .w4_toponly: stores the finished 4x4 tile (m0 = rows 0/1, m1 = rows 2/3,
; 4 words per row) and advances the row state. Jumps back to .w4_loop while
; the next x position is still at or above the threshold in r11d; otherwise
; falls through into .w4_leftonly_loop.
1954*c0909341SAndroid Build Coastguard Worker.w4_toponly:
; On x86-32, strideq is r3 and stridemp is a stack slot, so this reloads the
; stride. NOTE(review): presumably a no-op on x86-64 - confirm against x86inc.
1955*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
1956*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
1957*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
1958*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1959*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
1960*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
1961*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1962*c0909341SAndroid Build Coastguard Worker    jz .w4_end
1963*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+8*6]
; m6 is rebuilt from m5 (rows 2/3) because the mixed path overwrote it with
; a sign mask.
1964*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m5, m7   ; xpos += dx
; step base_y by the per-4-rows amount saved at [rsp+8*3]
1965*c0909341SAndroid Build Coastguard Worker    movq                 m5, [rsp+8*3]
1966*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
1967*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1968*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*6], m4
; r2d = scalar x position of the next row (left unshifted by .w4_loop)
1969*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r11d
1970*c0909341SAndroid Build Coastguard Worker    jge .w4_loop
; .w4_leftonly_loop: remaining rows of the column group, predicted from the
; left edge only (no top interpolation and no blending).
; Each iteration produces a 4x4 tile. On entry m4 = current base_y words
; (also stored at [rsp+8*6]) and m5 = per-4-rows base_y step; neither m5 nor
; strideq is modified inside the loop.
1971*c0909341SAndroid Build Coastguard Worker.w4_leftonly_loop:
; one load per column, indexed by the low byte of each base_y word
1972*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*6+0] ; base_y0
1973*c0909341SAndroid Build Coastguard Worker    movu                 m3, [rsp+r2*2]
1974*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*6+2] ; base_y1
1975*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+r2*2]
1976*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*6+4] ; base_y2
1977*c0909341SAndroid Build Coastguard Worker    movu                 m6, [rsp+r2*2]
1978*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*6+6] ; base_y3
1979*c0909341SAndroid Build Coastguard Worker    movu                 m0, [rsp+r2*2]
; advance base_y for the next iteration (stored back below)
1980*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
1981*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1982*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m10}, m3, m2, m6, m0
1983*c0909341SAndroid Build Coastguard Worker%else
1984*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*25]
1985*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m1}, m3, m2, m6, m0
1986*c0909341SAndroid Build Coastguard Worker%endif
1987*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*6], m4
; transpose the per-column vectors into per-row order:
; m0/m1 = first tap for rows 0,1 / 2,3 and m2/m3 = second tap
1988*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1989*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1990*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m0
1991*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0
1992*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1993*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1994*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m6
1995*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m6
; a + (((b - a) * frac_y + 32) >> 6), frac_y << 9 read from [rsp+8*7]
1996*c0909341SAndroid Build Coastguard Worker    movddup              m6, [rsp+8*7]
1997*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
1998*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
1999*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
2000*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
2001*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2002*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2003*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
2004*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
2005*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2006*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
2007*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
2008*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2009*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2010*c0909341SAndroid Build Coastguard Worker    jg .w4_leftonly_loop
; .w4_end: finished one 4-column group. r9d was packed as ((w-4)<<6) | h, so
; each 4 columns account for 1<<8; the function returns once r9d goes
; negative. Otherwise dst, the y positions, the x positions and the scalar
; x position in r10d are advanced for the next group and control returns to
; .w4_loop0.
2011*c0909341SAndroid Build Coastguard Worker.w4_end:
2012*c0909341SAndroid Build Coastguard Worker    sub                 r9d, 1<<8
2013*c0909341SAndroid Build Coastguard Worker    jl .w4_ret
2014*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+8*5]
; 4 pixels of 2 bytes each to the right of the previous group's start
2015*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
2016*c0909341SAndroid Build Coastguard Worker    mov                dstq, r5
2017*c0909341SAndroid Build Coastguard Worker    paddw                m4, [rsp+8*4] ; base_y += 4*dy
; [rsp+8*1] holds the x advance per group: 4<<6 from the setup code, or 8<<6
; after .w8 stored pw_512 there.
2018*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [rsp+8*1]
2019*c0909341SAndroid Build Coastguard Worker    movddup              m6, [rsp+8*1]
2020*c0909341SAndroid Build Coastguard Worker    paddw                m6, [rsp+16*4] ; base_x += (4 << upsample_above)
; keep the scalar x position in step with the vector and remember it in r10d
2021*c0909341SAndroid Build Coastguard Worker    add                 r2d, r10d
2022*c0909341SAndroid Build Coastguard Worker    mov                r10d, r2d
2023*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop0
2024*c0909341SAndroid Build Coastguard Worker.w4_ret:
2025*c0909341SAndroid Build Coastguard Worker    RET
; .w8: entry for 8-pixel-wide blocks (presumably a `jmp wq` dispatch
; target). Same structure as .w4: optional top-edge upsampling or filtering,
; then the shared left-edge code. The prediction itself reuses the w4 loops,
; which run once per 4-column group.
; On entry m5 = 8 words loaded from [tlq+2], m3 = 0, m0 = pw_256.
2026*c0909341SAndroid Build Coastguard Worker.w8:
; bit 0x400 set (after the `xor angled, 0x400` before the dispatch) skips all
; edge preprocessing
2027*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2028*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
; Bits 8 and up of angle+126 are non-zero iff angle >= 130 or is_sm is set;
; the low byte is then replaced by h, so a single unsigned compare with 8
; covers all three conditions.
2029*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+126]
; replicate word 7 of the top row into the high qword of m1
2030*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m5, q3333
2031*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2032*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2033*c0909341SAndroid Build Coastguard Worker%else
2034*c0909341SAndroid Build Coastguard Worker    xor                 r3b, r3b
2035*c0909341SAndroid Build Coastguard Worker    or                  r3d, hd
2036*c0909341SAndroid Build Coastguard Worker%endif
; pad the stack copy: the 4 words at rsp+16*15 become copies of top word 7
2037*c0909341SAndroid Build Coastguard Worker    movhps      [rsp+16*15], m1
2038*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2039*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
2040*c0909341SAndroid Build Coastguard Worker    call .upsample_above
2041*c0909341SAndroid Build Coastguard Worker    sub              angled, 53
2042*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2043*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
2044*c0909341SAndroid Build Coastguard Worker    movu                 m1, [base+z_filt_wh8]
2045*c0909341SAndroid Build Coastguard Worker    movd                 m2, r3d
2046*c0909341SAndroid Build Coastguard Worker    movd                 m7, angled
2047*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
; same table as in .w4, with the first 4 bytes shifted out
2048*c0909341SAndroid Build Coastguard Worker    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
; m3 is still zero, so pshufb broadcasts byte 0 of m2 (h+7) and m7 (180-angle)
2049*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
2050*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m3
; Build a per-byte mask in m7 from the tables.
; NOTE(review): presumably the filter-strength selector consumed by
; .w8_filter_left, which is outside this view - confirm there.
2051*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m2, m1
2052*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+pw_512]
2053*c0909341SAndroid Build Coastguard Worker    pand                 m7, m2
2054*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m7, m4
; with the top edge upsampled, each 4-column group advances base_x by 8<<6
2055*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*1], m1 ; 8<<6
2056*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
2057*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above: ; w8 path without top-edge upsampling: filter the top edge via .w8_filter_top, then pick upsample vs. filter for the left edge
2058*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7] ; r3d = h + 7
2059*c0909341SAndroid Build Coastguard Worker    mov          [rsp+16*4], angled ; save angle on the stack; reloaded after the call below
2060*c0909341SAndroid Build Coastguard Worker    sub              angled, 90 ; angle - 90
2061*c0909341SAndroid Build Coastguard Worker    movd                 m2, r3d ; m2 = h + 7 (compared against m4 in .w8_filter_top)
2062*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 90
2063*c0909341SAndroid Build Coastguard Worker    movd                 m1, angled ; m1 = angle - 90 (top-edge operand of the threshold compare)
2064*c0909341SAndroid Build Coastguard Worker    sub                 r3d, angled ; 180 - angle
2065*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2066*c0909341SAndroid Build Coastguard Worker    movu                 m4, [base+z_filt_wh8] ; table matched against h + 7 in .w8_filter_top
2067*c0909341SAndroid Build Coastguard Worker    movd                 m7, r3d ; m7 = 180 - angle (left-edge operand, consumed at .w8_filter_left)
2068*c0909341SAndroid Build Coastguard Worker    psrldq               m5, [base+z_filt_t_w48+angleq*8], 4 ; threshold row selected by is_sm, shifted right by 4 bytes
2069*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8 ; presumably the edge length expected by filter_edge (callee not visible here)
2070*c0909341SAndroid Build Coastguard Worker    call .w8_filter_top
2071*c0909341SAndroid Build Coastguard Worker    mov                 r3d, [rsp+16*4] ; reload the saved angle
2072*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 141 ; upper bits become nonzero unless the saved value minus 141 fits in the low byte
2073*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2074*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb ; replace the low byte with h so a single compare tests both conditions
2075*c0909341SAndroid Build Coastguard Worker%else
2076*c0909341SAndroid Build Coastguard Worker    xor                 r3b, r3b ; same merge as the x86-64 branch: clear the low byte ...
2077*c0909341SAndroid Build Coastguard Worker    or                  r3d, hd ; ... then OR in h
2078*c0909341SAndroid Build Coastguard Worker%endif
2079*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2080*c0909341SAndroid Build Coastguard Worker    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm; otherwise fall through to .w8_filter_left
2081*c0909341SAndroid Build Coastguard Worker.w8_filter_left: ; w8 path: filter the left edge if the compare mask in m7 is non-empty
2082*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m7 ; r5d = byte sign-bit mask of m7
2083*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2084*c0909341SAndroid Build Coastguard Worker    jz .w4_main ; empty mask: no left-edge filtering
2085*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555 ; together with the shr below, reduces the mask to a 2-bit value
2086*c0909341SAndroid Build Coastguard Worker    neg                  hq ; hq = -h
2087*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq ; keep the original edge pointer in r3 (read again at .filter_left_end)
2088*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+hq*2] ; load the 16-bit entry at index -h relative to tlq
2089*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
2090*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*13-2] ; switch tlq to the rsp-relative edge buffer
2091*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000 ; replicate that entry across the low 4 words
2092*c0909341SAndroid Build Coastguard Worker    movq       [tlq+hq*2-6], m1 ; store the 4 copies at tlq - 2*h - 6 in the stack buffer
2093*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge ; reuse the edge filter defined inside ipred_z3
2094*c0909341SAndroid Build Coastguard Worker    jmp .filter_left_end
2095*c0909341SAndroid Build Coastguard Worker.w8_filter_top: ; subroutine (reached via call, returns via ret): builds the top/left compare masks and filters the top edge when needed
2096*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m3}, m2, m1, m7
2097*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m2, m4 ; m3 is set before this section (presumably all-zero, making the pshufb above a byte broadcast - confirm); match m2 against the table in m4
2098*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2 ; keep the top-edge value only in matching lanes
2099*c0909341SAndroid Build Coastguard Worker    pand                 m7, m2 ; same for the left-edge value
2100*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m5 ; top mask: value > threshold
2101*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m7, m5 ; left mask, left in m7 for the caller
2102*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1 ; r5d = byte sign-bit mask of the top compare
2103*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2104*c0909341SAndroid Build Coastguard Worker    jz .w8_filter_top_end ; filter_strength == 0
2105*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555 ; together with the shr below, reduces the mask to a 2-bit value
2106*c0909341SAndroid Build Coastguard Worker    mov              [dstq], tlq ; stash the edge pointer in memory at dstq
2107*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*14+gprsize] ; stack edge buffer; +gprsize skips the return address pushed by the call
2108*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
2109*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge ; reuse the edge filter defined inside ipred_z1
2110*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2111*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r7m ; maxw, offset due to call
2112*c0909341SAndroid Build Coastguard Worker%else
2113*c0909341SAndroid Build Coastguard Worker    mov                 r3d, [rsp+16*2+4*1] ; presumably the same maxw value, read from a stack slot on x86-32 - confirm against the function prologue
2114*c0909341SAndroid Build Coastguard Worker%endif
2115*c0909341SAndroid Build Coastguard Worker    mov                 tlq, [dstq] ; restore the stashed edge pointer
2116*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2117*c0909341SAndroid Build Coastguard Worker    jge .w8_filter_top_end ; r3d >= 8: keep the filtered buffer as is
2118*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*0+2] ; otherwise reload 32 bytes of the original edge starting at byte offset r3*2+2 ...
2119*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*1+2]
2120*c0909341SAndroid Build Coastguard Worker    movu [rsp+r3*2+16*14+gprsize], m1 ; ... and write them over the stack buffer from byte offset r3*2
2121*c0909341SAndroid Build Coastguard Worker    movu [rsp+r3*2+16*15+gprsize], m2
2122*c0909341SAndroid Build Coastguard Worker.w8_filter_top_end:
2123*c0909341SAndroid Build Coastguard Worker    ret
2124*c0909341SAndroid Build Coastguard Worker.w16: ; w16 path: decide per edge whether to filter, filter the top edge here, then continue at .w16_filter_left
2125*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2126*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; bit 0x400 of angle set: skip all edge filtering
2127*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15] ; r3d = h + 15
2128*c0909341SAndroid Build Coastguard Worker    sub              angled, 90 ; angle - 90
2129*c0909341SAndroid Build Coastguard Worker    movd                 m2, r3d ; m2 = h + 15 (matched against z_filt_wh16 below)
2130*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 90
2131*c0909341SAndroid Build Coastguard Worker    movd                 m1, angled ; m1 = angle - 90 (top-edge operand)
2132*c0909341SAndroid Build Coastguard Worker    sub                 r3d, angled ; 180 - angle
2133*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2134*c0909341SAndroid Build Coastguard Worker    movd                 m7, r3d ; m7 = 180 - angle (left-edge operand)
2135*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m3}, m2, m1, m7
2136*c0909341SAndroid Build Coastguard Worker    movq                 m4, [base+z_filt_t_w16+angleq*4] ; threshold row selected by is_sm
2137*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m2, [base+z_filt_wh16] ; lanes where h + 15 matches the table
2138*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
2139*c0909341SAndroid Build Coastguard Worker    pand                 m7, m2
2140*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4 ; top mask: value > threshold
2141*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m7, m4 ; left mask, consumed at .w16_filter_left
2142*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
2143*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2144*c0909341SAndroid Build Coastguard Worker    jz .w16_filter_left ; filter_strength == 0
2145*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924 ; with the shr/adc below, converts the mask into the strength value
2146*c0909341SAndroid Build Coastguard Worker    pshufhw              m6, m6, q3333 ; replicate the highest word of m6 (m6 is loaded before this section)
2147*c0909341SAndroid Build Coastguard Worker    mov              [dstq], tlq ; stash the edge pointer in memory at dstq
2148*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*14] ; switch tlq to the stack edge buffer
2149*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; CF = last bit shifted out, used by the adc below
2150*c0909341SAndroid Build Coastguard Worker    movhps       [tlq+16*2], m6 ; store the 4 replicated words 32 bytes into the buffer
2151*c0909341SAndroid Build Coastguard Worker    adc                 r5d, -1 ; filter_strength
2152*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16 ; presumably the edge length expected by filter_edge (callee not visible here)
2153*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge ; reuse the edge filter defined inside ipred_z1
2154*c0909341SAndroid Build Coastguard Worker    mov                 r3d, maxwm ; r3d = maxw argument
2155*c0909341SAndroid Build Coastguard Worker    mov                 tlq, [dstq] ; restore the stashed edge pointer
2156*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 16
2157*c0909341SAndroid Build Coastguard Worker    jge .w16_filter_left ; maxw >= 16: keep the filtered buffer as is
2158*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*0+2] ; otherwise reload 32 bytes of the original edge starting at byte offset maxw*2+2 ...
2159*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*1+2]
2160*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*14], m1 ; ... and write them over the stack buffer from byte offset maxw*2
2161*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*15], m2
2162*c0909341SAndroid Build Coastguard Worker.w16_filter_left: ; w16 path: filter the left edge if the compare mask in m7 is non-empty
2163*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m7 ; r5d = byte sign-bit mask of m7
2164*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2165*c0909341SAndroid Build Coastguard Worker    jz .w4_main ; empty mask: no left-edge filtering
2166*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924 ; with the shr/adc below, converts the mask into the strength value
2167*c0909341SAndroid Build Coastguard Worker    neg                  hq ; hq = -h
2168*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq ; keep the original edge pointer in r3 (read again at .filter_left_end)
2169*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+hq*2] ; load the 16-bit entry at index -h relative to tlq
2170*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; CF = last bit shifted out; movd/lea/pshuflw below leave flags untouched
2171*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*13-2] ; switch tlq to the rsp-relative edge buffer
2172*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000 ; replicate that entry across the low 4 words
2173*c0909341SAndroid Build Coastguard Worker    adc                 r5d, -1 ; filter_strength
2174*c0909341SAndroid Build Coastguard Worker    movq       [tlq+hq*2-6], m1 ; store the 4 copies at tlq - 2*h - 6 in the stack buffer
2175*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge ; reuse the edge filter defined inside ipred_z3
2176*c0909341SAndroid Build Coastguard Worker    jmp .filter_left_end
2177*c0909341SAndroid Build Coastguard Worker.w32: ; w32 path: extend the stack copy of the top edge, filter it with filter_edge_s3, then fall into .filter_left
2178*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*2+2] ; copy 32 more bytes of the top edge ...
2179*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*3+2]
2180*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*16], m1 ; ... into the stack edge buffer
2181*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*17], m2
2182*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2183*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; bit 0x400 of angle set: skip all edge filtering
2184*c0909341SAndroid Build Coastguard Worker    mov              [dstq], tlq ; stash the edge pointer in memory at dstq
2185*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*14] ; switch tlq to the stack edge buffer
2186*c0909341SAndroid Build Coastguard Worker    pshufhw              m2, m2, q3333 ; replicate the last loaded word
2187*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 32 ; presumably the edge length expected by filter_edge_s3 (callee not visible here)
2188*c0909341SAndroid Build Coastguard Worker    movhps       [tlq+16*4], m2 ; store the 4 replicated words right after the 64 copied bytes
2189*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 ; no strength computed here; presumably a fixed-strength entry point - confirm
2190*c0909341SAndroid Build Coastguard Worker    mov                 r3d, maxwm ; r3d = maxw argument
2191*c0909341SAndroid Build Coastguard Worker    mov                 tlq, [dstq] ; restore the stashed edge pointer
2192*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 32
2193*c0909341SAndroid Build Coastguard Worker    jge .filter_left ; maxw >= 32: keep the filtered buffer as is
2194*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*0+2] ; otherwise write original edge values back from index maxw on, 32 bytes per step
2195*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*1+2]
2196*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*14], m1
2197*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*15], m2
2198*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 16
2199*c0909341SAndroid Build Coastguard Worker    jge .filter_left ; maxw >= 16: one 32-byte step was enough
2200*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*2+2] ; second 32-byte step
2201*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*3+2]
2202*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*16], m1
2203*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*17], m2 ; falls through into .filter_left
2204*c0909341SAndroid Build Coastguard Worker.filter_left: ; left-edge filtering shared by the w32 and w64 paths; always uses filter_edge_s3
2205*c0909341SAndroid Build Coastguard Worker    neg                  hq ; hq = -h
2206*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq ; keep the original edge pointer in r3 (read again at .filter_left_end)
2207*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq+hq*2], q0000 ; replicate the entry at index -h across the low 4 words
2208*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*13-2] ; switch tlq to the rsp-relative edge buffer
2209*c0909341SAndroid Build Coastguard Worker    movq       [tlq+hq*2-6], m1 ; store the 4 copies at tlq - 2*h - 6 in the stack buffer
2210*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3 ; defined inside ipred_z3; falls through into .filter_left_end on return
2211*c0909341SAndroid Build Coastguard Worker.filter_left_end: ; after left-edge filtering: write original (unfiltered) values back into the stack buffer below index maxh
2212*c0909341SAndroid Build Coastguard Worker    mov                 r2d, maxhm ; r2d = maxh argument
2213*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, hd ; NOTE(review): hq was negated before the filter call; this relies on the value the callee leaves in hd - confirm against .filter_edge
2214*c0909341SAndroid Build Coastguard Worker    jge .w4_main ; maxh >= hd: nothing to restore
2215*c0909341SAndroid Build Coastguard Worker    neg                  r2 ; r2 = -maxh
2216*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2*2-16*1] ; r3 = original edge pointer saved before the filter call
2217*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r3+r2*2-16*2]
2218*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16*12], m1 ; overwrite the 32 bytes of the stack buffer that lie just below index -maxh
2219*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16*11], m2
2220*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, -48
2221*c0909341SAndroid Build Coastguard Worker    jle .w4_main ; maxh >= 48: done after one 32-byte step
2222*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2*2-16*3] ; second 32-byte step
2223*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r3+r2*2-16*4]
2224*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16*10], m1
2225*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16* 9], m2
2226*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, -32
2227*c0909341SAndroid Build Coastguard Worker    jle .w4_main ; maxh >= 32: done
2228*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2*2-16*5] ; third 32-byte step
2229*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r3+r2*2-16*6]
2230*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16* 8], m1
2231*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16* 7], m2
2232*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, -16
2233*c0909341SAndroid Build Coastguard Worker    jle .w4_main ; maxh >= 16: done
2234*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2*2-16*7] ; fourth and last 32-byte step
2235*c0909341SAndroid Build Coastguard Worker    movu                 m2, [r3+r2*2-16*8]
2236*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16* 6], m1
2237*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r2*2+16* 5], m2
2238*c0909341SAndroid Build Coastguard Worker    jmp .w4_main ; continue with the main prediction code
2239*c0909341SAndroid Build Coastguard Worker.w64: ; w64 path: extend the stack copy of the top edge, filter it with filter_edge_s3, then jump to .filter_left
2240*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*2+2] ; load 96 more bytes of the top edge
2241*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*3+2]
2242*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+16*4+2]
2243*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+16*5+2]
2244*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+16*6+2]
2245*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+16*7+2]
2246*c0909341SAndroid Build Coastguard Worker    mov              [dstq], tlq ; stash the edge pointer in memory at dstq
2247*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*14] ; switch tlq to the stack edge buffer
2248*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*2], m1 ; append the loaded vectors to the stack buffer
2249*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*3], m2
2250*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*4], m3
2251*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*5], m4
2252*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*6], m5
2253*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*7], m6
2254*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2255*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; bit 0x400 of angle set: skip all edge filtering
2256*c0909341SAndroid Build Coastguard Worker    pshufhw              m6, m6, q3333 ; replicate the last loaded word
2257*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 64 ; presumably the edge length expected by filter_edge_s3 (callee not visible here)
2258*c0909341SAndroid Build Coastguard Worker    movhps       [tlq+16*8], m6 ; store the 4 replicated words right after the copied data
2259*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 ; no strength computed here; presumably a fixed-strength entry point - confirm
2260*c0909341SAndroid Build Coastguard Worker    mov                 r3d, maxwm ; r3d = maxw argument
2261*c0909341SAndroid Build Coastguard Worker    mov                 tlq, [dstq] ; restore the stashed edge pointer
2262*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64
2263*c0909341SAndroid Build Coastguard Worker    jge .filter_left ; maxw >= 64: keep the filtered buffer as is
2264*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*0+2] ; otherwise write original edge values back from index maxw on, 32 bytes per step
2265*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*1+2]
2266*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*14], m1
2267*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*15], m2
2268*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 48
2269*c0909341SAndroid Build Coastguard Worker    jge .filter_left ; maxw >= 48: one step was enough
2270*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*2+2] ; second 32-byte step
2271*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*3+2]
2272*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*16], m1
2273*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*17], m2
2274*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 32
2275*c0909341SAndroid Build Coastguard Worker    jge .filter_left ; maxw >= 32: done
2276*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*4+2] ; third 32-byte step
2277*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*5+2]
2278*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*18], m1
2279*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*19], m2
2280*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 16
2281*c0909341SAndroid Build Coastguard Worker    jge .filter_left ; maxw >= 16: done
2282*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3*2+16*6+2] ; fourth and last 32-byte step
2283*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3*2+16*7+2]
2284*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*20], m1
2285*c0909341SAndroid Build Coastguard Worker    movu   [rsp+r3*2+16*21], m2
2286*c0909341SAndroid Build Coastguard Worker    jmp .filter_left ; continue with the left edge
2287*c0909341SAndroid Build Coastguard Worker
2288*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2289*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w
2290*c0909341SAndroid Build Coastguard Worker    %define            base  r7-$$
2291*c0909341SAndroid Build Coastguard Worker    lea                  r7, [$$] ; r7 = section start; constants are addressed as base+symbol
2292*c0909341SAndroid Build Coastguard Worker    mov              org_wd, wd ; keep an unmodified copy of w (wd is rescaled by the per-height code)
2293*c0909341SAndroid Build Coastguard Worker%else
2294*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy
2295*c0909341SAndroid Build Coastguard Worker    %define            base  r1-$$
2296*c0909341SAndroid Build Coastguard Worker    %define          org_wd  r5
2297*c0909341SAndroid Build Coastguard Worker    %define          org_wq  r5
2298*c0909341SAndroid Build Coastguard Worker    movd                 m6, r8m ; pixel_max
2299*c0909341SAndroid Build Coastguard Worker    mov          [dstq+4*0], strideq ; x86-32: stash stride in memory at dstq (r1 is reused as the base register below)
2300*c0909341SAndroid Build Coastguard Worker    LEA                  r1, $$ ; r1 = section start; constants are addressed as base+symbol
2301*c0909341SAndroid Build Coastguard Worker    mov          [dstq+4*1], wd ; x86-32: stash w next to the stride
2302*c0909341SAndroid Build Coastguard Worker%endif
2303*c0909341SAndroid Build Coastguard Worker    tzcnt                hd, hm ; hd = trailing-zero count of the h argument, used as the jump-table index
2304*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem ; load angle unless it is already in its register
2305*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2 ; move tlq back by one 16-bit entry
2306*c0909341SAndroid Build Coastguard Worker    movsxd               hq, [base+ipred_z3_16bpc_ssse3_table+hq*4] ; signed 32-bit table entry (offset relative to the table, see lea below)
2307*c0909341SAndroid Build Coastguard Worker    sub              angled, 180 ; angle - 180
2308*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pw_256] ; m0 = pw_256 in both halves; used as a pshufb mask by the per-height code
2309*c0909341SAndroid Build Coastguard Worker    mov                 dyd, angled
2310*c0909341SAndroid Build Coastguard Worker    neg                 dyd ; dyd = 180 - angle
2311*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400 ; flip bit 0x400 (tested as !enable_intra_edge_filter at .h4_no_upsample)
2312*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+pw_62] ; m7 = pw_62 in both halves; ANDed with ypos in the prediction loops
2313*c0909341SAndroid Build Coastguard Worker    or                  dyq, ~0x7e ; set every bit outside 0x7e, yielding a negative even index for the table lookup below
2314*c0909341SAndroid Build Coastguard Worker    lea                  hq, [base+ipred_z3_16bpc_ssse3_table+hq] ; hq = absolute address of the per-height entry point
2315*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq] ; dy = 16-bit entry of dr_intra_derivative
2316*c0909341SAndroid Build Coastguard Worker    jmp                  hq ; dispatch to the .hN label selected by the table
2317*c0909341SAndroid Build Coastguard Worker.h4: ; h == 4 entry: use the upsampled-edge path when allowed, otherwise branch to .h4_no_upsample
2318*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq+88]
2319*c0909341SAndroid Build Coastguard Worker    test                r4d, 0x480
2320*c0909341SAndroid Build Coastguard Worker    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
2321*c0909341SAndroid Build Coastguard Worker    sar                 r4d, 9
2322*c0909341SAndroid Build Coastguard Worker    add                 r4d, wd
2323*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, 8
2324*c0909341SAndroid Build Coastguard Worker    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
2325*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-14]   ; 7 6 5 4 3 2 1 0
2326*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-12]   ; 8 7 6 5 4 3 2 1
2327*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2328*c0909341SAndroid Build Coastguard Worker    movd                 m6, r8m ; pixel_max (x86-32 already loaded it in the prologue)
2329*c0909341SAndroid Build Coastguard Worker%endif
2330*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m0 ; m0 = pw_256 from the prologue; as a byte-shuffle mask it presumably broadcasts word 0 of m2 - confirm pw_256 contents
2331*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; tlq = base of the stack scratch area that will hold the upsampled edge
2332*c0909341SAndroid Build Coastguard Worker    palignr              m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2
2333*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd ; dy *= 2
2334*c0909341SAndroid Build Coastguard Worker    palignr              m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3
2335*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; sum of the two centre taps
2336*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5 ; sum of the two outer taps
2337*c0909341SAndroid Build Coastguard Worker    psubw                m5, m1, m3 ; centre sum - outer sum
2338*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_upsample] ; shuffle mask applied to every load in the loop below
2339*c0909341SAndroid Build Coastguard Worker    mova           [tlq+ 0], m4 ; first 16 bytes of the buffer: the broadcast pixel
2340*c0909341SAndroid Build Coastguard Worker    movd                 m4, dyd
2341*c0909341SAndroid Build Coastguard Worker    psraw                m5, 3 ; (centre - outer) >> 3
2342*c0909341SAndroid Build Coastguard Worker    neg                 dyd ; dyd = -dy, the per-column step added to ypos
2343*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5 ; centre + ((centre - outer) >> 3)
2344*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5 ; m5 = 0
2345*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [dyq+(16<<6)+63] ; ypos
2346*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m5 ; clamp at 0
2347*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0 ; spread pixel_max to every word (same mask assumption as above)
2348*c0909341SAndroid Build Coastguard Worker    shl                  wd, 3 ; wd = w * 8 = byte size of the w x 4 block of 16-bit pixels
2349*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m5 ; (x + 1) >> 1
2350*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0 ; spread dy to every word
2351*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m6 ; clamp at pixel_max
2352*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq ; reserve w*8 bytes of stack for the prediction output
2353*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1, m2 ; interleave interpolated and original pixels (high half)
2354*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, m4 ; m5 = 2 * dy (per-iteration ypos step)
2355*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2 ; interleave interpolated and original pixels (low half)
2356*c0909341SAndroid Build Coastguard Worker    mova           [tlq+32], m0
2357*c0909341SAndroid Build Coastguard Worker    movsd                m4, m5 ; low 64 bits = 2*dy, high 64 bits stay dy
2358*c0909341SAndroid Build Coastguard Worker    mova           [tlq+16], m1
2359*c0909341SAndroid Build Coastguard Worker.h4_upsample_loop: ; each iteration produces 16 bytes of output from two consecutive ypos values
2360*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r5+dyq] ; r4d = next ypos
2361*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 6 ; integer part of ypos
2362*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2] ; 16 bytes of the upsampled edge at that position
2363*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r4+dyq] ; ypos for the next iteration
2364*c0909341SAndroid Build Coastguard Worker    sar                 r4d, 6
2365*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2]
2366*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3 ; rearrange with the z_upsample mask
2367*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
2368*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m1, m2 ; high qwords of both loads
2369*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2 ; low qwords of both loads
2370*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4 ; fractional part: ypos & pw_62 (m7 is set in the prologue)
2371*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; scale the fraction for pmulhrsw
2372*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; difference between the two taps
2373*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2 ; difference * fraction, rounded
2374*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; advance the ypos vector
2375*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; interpolated result
2376*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m0 ; output buffer is filled from its end towards rsp
2377*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2378*c0909341SAndroid Build Coastguard Worker    jg .h4_upsample_loop
2379*c0909341SAndroid Build Coastguard Worker    or                  r3d, 4*2 ; r3d |= 8 before entering the shared transpose code
2380*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose
2381*c0909341SAndroid Build Coastguard Worker.h4_no_upsample: ; h == 4 without upsampling: optionally filter a stack copy of the edge, then fall into .h4_main
2382*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 7 ; r4 = 7 is the value .h4_main receives when no filtering is done
2383*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2384*c0909341SAndroid Build Coastguard Worker    jnz .h4_main
2385*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+3] ; r4d = w + 3
2386*c0909341SAndroid Build Coastguard Worker    movd                 m1, r4d
2387*c0909341SAndroid Build Coastguard Worker    movd                 m3, angled
2388*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2389*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2 ; zero mask: the pshufb pair below broadcasts byte 0
2390*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2
2391*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2
2392*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, [base+z_filt_wh4] ; lanes where w + 3 matches the table
2393*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3 ; keep the angle byte only in matching lanes
2394*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, [base+z_filt_t_w48+angleq*8] ; compare against the threshold row selected by is_sm
2395*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1 ; r5d = byte sign-bit mask of the compare
2396*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 7 ; restore r4 = 7 (it was used as scratch above)
2397*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2398*c0909341SAndroid Build Coastguard Worker    jz .h4_main ; filter_strength == 0
2399*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq+2], q0000 ; replicate the 16-bit entry at tlq+2
2400*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555 ; together with the shr below, reduces the mask to a 2-bit value
2401*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-14] ; the 8 edge entries ending at tlq
2402*c0909341SAndroid Build Coastguard Worker    neg                  r4 ; r4 = -7
2403*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq+r4*2] ; entry at index -7 relative to tlq
2404*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; 2-bit result, passed to .filter_edge in r5d
2405*c0909341SAndroid Build Coastguard Worker    movd        [rsp+16*17], m1 ; build the stack copy: two copies of the entry at tlq+2 after the edge ...
2406*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q0000 ; replicate the entry at index -7
2407*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*16], m2 ; ... the 8 edge entries themselves ...
2408*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r4-2] ; r2 = -9, alternative value for r4 (selected below)
2409*c0909341SAndroid Build Coastguard Worker    movq [rsp+16*17+r4*2-10], m3 ; ... and 4 replicated copies just below them
2410*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2411*c0909341SAndroid Build Coastguard Worker    cmovae               r4, r2 ; w >= 8: r4 = -9 instead of -7
2412*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*17-2] ; point tlq at the stack copy
2413*c0909341SAndroid Build Coastguard Worker    call .filter_edge ; local edge filter (defined elsewhere in this function); falls through into .h4_main on return
2414*c0909341SAndroid Build Coastguard Worker.h4_main: ; h == 4 prediction: interpolate along the edge into a stack buffer, two ypos values per iteration, then transpose
2415*c0909341SAndroid Build Coastguard Worker    movd                 m4, dyd
2416*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4 ; the two subtractions move tlq by r4 16-bit entries
2417*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+z_base_inc_z2+8] ; base_inc << 6
2418*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2419*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6 ; r4 scaled by 64, matching the 6 fractional bits of ypos
2420*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq] ; entry at the new tlq; used as the fill value once ypos passes max_base_y
2421*c0909341SAndroid Build Coastguard Worker    movd                 m3, r4d
2422*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0 ; spread dy to every word (m0 = pw_256 from the prologue, used as shuffle mask)
2423*c0909341SAndroid Build Coastguard Worker    neg                 dyq ; dyq = -dy, the per-column step added to ypos
2424*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0 ; spread the fill pixel to every word
2425*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63] ; ypos
2426*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0 ; spread r4 << 6 to every word
2427*c0909341SAndroid Build Coastguard Worker    shl                  wd, 3 ; wd = w * 8 = byte size of the w x 4 block of 16-bit pixels
2428*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, m4 ; m5 = 2 * dy (per-iteration ypos step)
2429*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq ; reserve w*8 bytes of stack for the prediction output
2430*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1 ; max_base_y
2431*c0909341SAndroid Build Coastguard Worker    movsd                m4, m5 ; ypos1 ypos0
2432*c0909341SAndroid Build Coastguard Worker.h4_loop:
2433*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; r4 = next ypos
2434*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; integer part of ypos
2435*c0909341SAndroid Build Coastguard Worker    movddup              m0, [tlq+r5*2-6] ; 4 entries for the first ypos (both halves for now)
2436*c0909341SAndroid Build Coastguard Worker    movddup              m1, [tlq+r5*2-8] ; same 4 entries shifted by one position
2437*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq] ; ypos for the next iteration
2438*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
2439*c0909341SAndroid Build Coastguard Worker    movlps               m0, [tlq+r4*2-6] ; low halves replaced with the entries for the second ypos
2440*c0909341SAndroid Build Coastguard Worker    movlps               m1, [tlq+r4*2-8]
2441*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4 ; fractional part: ypos & pw_62 (m7 is set in the prologue)
2442*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9 ; scale the fraction for pmulhrsw
2443*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0 ; difference between neighbouring entries
2444*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2 ; difference * fraction, rounded
2445*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m3, m4 ; mask = max_base_y > ypos (lanes still inside the edge)
2446*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5 ; advance the ypos vector
2447*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1 ; interpolated result
2448*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2 ; keep interpolated values where the mask is set ...
2449*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6 ; ... and the fill pixel elsewhere
2450*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
2451*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m0 ; output buffer is filled from its end towards rsp
2452*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2453*c0909341SAndroid Build Coastguard Worker    jz .h4_transpose ; whole buffer written
2454*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2455*c0909341SAndroid Build Coastguard Worker    jg .h4_loop ; keep interpolating while ypos is positive
2456*c0909341SAndroid Build Coastguard Worker.h4_end_loop: ; fill the rest of the buffer with the fill pixel
2457*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m6
2458*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2459*c0909341SAndroid Build Coastguard Worker    jg .h4_end_loop
2460*c0909341SAndroid Build Coastguard Worker.h4_transpose:
2461*c0909341SAndroid Build Coastguard Worker    or                  r3d, 4*2 ; r3d |= 8 before entering the shared transpose code
2462*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose
; .h8: entry of the .h8 path (presumably block height 8 -- the label and the
; 8*2 constant suggest it; the dispatch is outside this section).
; The first four instructions fold the angle bits and w into r4d; any value
; above 8 (unsigned) means "no upsampling" for one of the reasons listed on
; the `ja` line. Otherwise a 2x upsampled copy of the edge is built on the
; stack and .h8_upsample_loop interpolates from it without any out-of-range
; masking.
2463*c0909341SAndroid Build Coastguard Worker.h8:
2464*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq+88]
2465*c0909341SAndroid Build Coastguard Worker    and                 r4d, ~0x7f
2466*c0909341SAndroid Build Coastguard Worker    or                  r4d, wd
2467*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, 8
2468*c0909341SAndroid Build Coastguard Worker    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
; Upsample filter, first vector: m1 = sums of adjacent (inner) pixel pairs,
; m4 = sums of the pixels one step further out (outer pairs).
; The lane diagrams list lane 0 (lowest address) first.
2469*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-30]  ; g f e d c b a 9
2470*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq-32]  ; _ g f e d c b a
2471*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-16]  ; 9 8 7 6 5 4 3 2
2472*c0909341SAndroid Build Coastguard Worker    paddw                m3, [tlq-14]  ; 8 7 6 5 4 3 2 1
2473*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m2, q2100 ; _ _ g f e d c b
2474*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2475*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq-28]  ; f e d c b a 9 8
; the upsampled edge has twice the resolution, so the step is doubled
2476*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
; w is 4 or 8 here (w > 8 left via the `ja` above); for w != 8 the upper
; lanes of m4 are rebuilt with a replicated pixel instead
2477*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2478*c0909341SAndroid Build Coastguard Worker    je .h8_upsample_w8
2479*c0909341SAndroid Build Coastguard Worker    pshufhw              m4, m2, q1000 ; _ _ _ _ c c c b
2480*c0909341SAndroid Build Coastguard Worker.h8_upsample_w8:
; m1 = inner + ((inner - outer) >> 3); together with the clamp at 0 and the
; rounding halve (pavgw against zero) further down this has the form of
; (9*inner - outer + 8) >> 4
2481*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2482*c0909341SAndroid Build Coastguard Worker    psubw                m5, m1, m4
2483*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq-18]  ; a 9 8 7 6 5 4 3
2484*c0909341SAndroid Build Coastguard Worker    psraw                m5, 3
2485*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2486*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq-12]  ; 7 6 5 4 3 2 1 0
; NOTE(review): only the 64-bit build loads pixel_max here; on x86-32 m6 is
; presumably already loaded before this section -- confirm
2487*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2488*c0909341SAndroid Build Coastguard Worker    movd                 m6, r8m ; pixel_max
2489*c0909341SAndroid Build Coastguard Worker%endif
; second vector: m3 already holds its inner sums, m4 becomes its outer sums
2490*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
; wd *= 16: bytes of stack output, one 16-byte row per loop pass
2491*c0909341SAndroid Build Coastguard Worker    shl                  wd, 4
2492*c0909341SAndroid Build Coastguard Worker    psubw                m5, m3, m4
; m4 = doubled dy, broadcast below via the shuffle mask still held in m0
2493*c0909341SAndroid Build Coastguard Worker    movd                 m4, dyd
2494*c0909341SAndroid Build Coastguard Worker    psraw                m5, 3
2495*c0909341SAndroid Build Coastguard Worker    neg                 dyd
2496*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
2497*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
2498*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq-14]
2499*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0
; m0 is no longer needed as a shuffle mask: zero it for the clamp and halve
2500*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2501*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m0
2502*c0909341SAndroid Build Coastguard Worker    pmaxsw               m3, m0
; tlq now points at the old rsp, where the upsampled edge is stored below;
; the output rows are allocated underneath it
2503*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
2504*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m0
2505*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m0
2506*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq
; clamp to pixel_max; the second clamped vector ends up in m6
2507*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m6
2508*c0909341SAndroid Build Coastguard Worker    pminsw               m6, m3
2509*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z_upsample]
2510*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [dyq+(16<<6)+63] ; ypos
; interleave the filtered pixels with the original ones and store the 64-byte
; upsampled edge at [tlq+16*0 .. tlq+16*4)
2511*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
2512*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m0
2513*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
2514*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*1], m1
2515*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m6, m5
2516*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*2], m0
2517*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5
2518*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*3], m6
; m5 = per-pass increment of the position accumulator m4
2519*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
; .h8_upsample_loop: one 16-byte output row per pass. r5d is the position
; (>> 6 = index into the upsampled edge) and steps by the negated, doubled dy.
2520*c0909341SAndroid Build Coastguard Worker.h8_upsample_loop:
2521*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r5d
2522*c0909341SAndroid Build Coastguard Worker    sar                 r4d, 6
2523*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2+16*0]
2524*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r4*2+16*1]
2525*c0909341SAndroid Build Coastguard Worker    add                 r5d, dyd
; z_upsample shuffle followed by the qword unpacks splits the 16 loaded
; pixels into the two operand vectors of the blend (presumably a
; de-interleave of alternating pixels -- the table is defined elsewhere)
2526*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
2527*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
2528*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m1, m2
2529*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2
; weight = (m4 & m7) << 9; pmulhrsw by it is a rounded (diff * weight) >> 6
2530*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4
2531*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2532*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2533*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
2534*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2535*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2536*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m0
2537*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2538*c0909341SAndroid Build Coastguard Worker    jg .h8_upsample_loop
; all rows written: OR 8*2 into r3d and continue at .end_transpose, which is
; defined outside this section
2539*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8*2
2540*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose
; .h8_no_upsample: decides whether the edge is smoothed before prediction
; and, if so, copies it to the stack and calls .filter_edge on the copy.
; Leaves r4 = imin(w+7, 15) for .h8_main when no filtering happens.
2541*c0909341SAndroid Build Coastguard Worker.h8_no_upsample:
2542*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+7]
; m1 keeps the unclamped w+7 for the table compare below
2543*c0909341SAndroid Build Coastguard Worker    movd                 m1, r4d
2544*c0909341SAndroid Build Coastguard Worker    and                 r4d, 7
2545*c0909341SAndroid Build Coastguard Worker    or                  r4d, 8 ; imin(w+7, 15)
; bit 0x400 of angle set -> skip edge filtering entirely (the .h32 path
; labels the same test as !enable_intra_edge_filter)
2546*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2547*c0909341SAndroid Build Coastguard Worker    jnz .h8_main
2548*c0909341SAndroid Build Coastguard Worker    movd                 m3, angled
2549*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
; pshufb with an all-zero mask broadcasts byte 0: m1 = (w+7) bytes,
; m3 = low byte of angle in every byte
2550*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2551*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2
2552*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2
; table lookup: keep the angle byte where z_filt_wh8 matches w+7, then
; compare it against the z_filt_t_w48 row selected by angleq (shifted right
; by 4 bytes); any set mask bit means a non-zero filter strength
2553*c0909341SAndroid Build Coastguard Worker    movu                 m2, [base+z_filt_wh8]
2554*c0909341SAndroid Build Coastguard Worker    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
2555*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m2, m1
2556*c0909341SAndroid Build Coastguard Worker    pand                 m2, m3
2557*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, m4
2558*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m2
2559*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2560*c0909341SAndroid Build Coastguard Worker    jz .h8_main ; filter_strength == 0
; Filtering needed: copy the edge into [rsp+16*15 .. rsp+16*17] so that it can
; be modified. The imul/shr pair reduces the byte mask to a 2-bit value in
; r5d (presumably the strength consumed by .filter_edge -- TODO confirm).
2561*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq+2], q0000
2562*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
2563*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*1+2]
; r4 = -imin(w+7, 15), so [tlq+r4*2] addresses the last edge pixel used
2564*c0909341SAndroid Build Coastguard Worker    neg                  r4
2565*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*2+2]
2566*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
2567*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq+r4*2]
2568*c0909341SAndroid Build Coastguard Worker    movd        [rsp+16*17], m1
2569*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*16], m2
2570*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m4, q0000
2571*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*15], m3
2572*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r4-2]
; replicate the last edge pixel into the four slots just past it in the copy
2573*c0909341SAndroid Build Coastguard Worker    movq [rsp+16*17+r4*2-10], m4
; for w >= 16 continue with r4-2 instead of r4
2574*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
2575*c0909341SAndroid Build Coastguard Worker    cmovae               r4, r2
; tlq now refers to the stack copy (same pixel the original tlq pointed at)
2576*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*17-2]
; NOTE(review): r4 is negative here while .h8_main subtracts it as a positive
; count, so .filter_edge (defined elsewhere) presumably flips it back -- confirm
2577*c0909341SAndroid Build Coastguard Worker    call .filter_edge
; .h8_main: prediction for the non-upsampled .h8 path.
; In: r4 = number of edge pixels available (imin(w+7, 15) on the unfiltered
; path), tlq = edge pointer, dy = step, m0 = pshufb mask used to broadcast the
; low word (set up outside this section), m7 = mask ANDed with the position
; to form the blend weight (presumably the fraction bits -- TODO confirm).
; Writes w rows of 16 bytes below rsp, then continues at .end_transpose.
2578*c0909341SAndroid Build Coastguard Worker.h8_main:
; tlq -= 2*r4 bytes: tlq now addresses the last usable edge pixel
2579*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2580*c0909341SAndroid Build Coastguard Worker    movd                 m4, dyd
2581*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
; r4 <<= 6: the pixel count expressed in the same 1/64 units as the position
2582*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6
; m6 = last usable pixel (broadcast below); it replaces out-of-range lanes
2583*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq]
2584*c0909341SAndroid Build Coastguard Worker    movd                 m3, r4d
2585*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0
; from here on dyq is negative: the position in r5 counts down towards zero
2586*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2587*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
2588*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63]
2589*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0
; wd *= 16: bytes of stack output, one 16-byte row per pass
2590*c0909341SAndroid Build Coastguard Worker    shl                  wd, 4
; m4 = running per-lane position (starts at dy), m5 = its per-pass increment
2591*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
2592*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq
; m3 = per-lane limit: (count << 6) minus the lane offsets in z_base_inc_z2
2593*c0909341SAndroid Build Coastguard Worker    psubw                m3, [base+z_base_inc_z2]
2594*c0909341SAndroid Build Coastguard Worker.h8_loop:
; r4 = r5 >> 6: integer offset (in pixels) relative to tlq
2595*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
2596*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
; two 8-pixel vectors one pixel apart: the operands of the linear blend
2597*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2-14]
2598*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2-16]
; weight = (m4 & m7) << 9; pmulhrsw by it is a rounded (diff * weight) >> 6
2599*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4
2600*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2601*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
2602*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
; m2 = lanes where limit > position (signed words); other lanes take m6
2603*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m3, m4
2604*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2605*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2606*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
2607*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
2608*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
2609*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m0
2610*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8*2
2611*c0909341SAndroid Build Coastguard Worker    jz .h8_transpose
; step the position; the flags of this add decide whether any lane can still
; be in range (loop while the result is > 0)
2612*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
2613*c0909341SAndroid Build Coastguard Worker    jg .h8_loop
; .h8_end_loop: every remaining row is entirely out of range -> fill with m6
2614*c0909341SAndroid Build Coastguard Worker.h8_end_loop:
2615*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m6
2616*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8*2
2617*c0909341SAndroid Build Coastguard Worker    jg .h8_end_loop
; .h8_transpose: OR 8*2 into r3d and continue at .end_transpose, which is
; defined outside this section
2618*c0909341SAndroid Build Coastguard Worker.h8_transpose:
2619*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8*2
2620*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose
; .h16: entry of the .h16 path (presumably block height 16). Decides whether
; the edge is smoothed and, if so, copies it to the stack and calls
; .filter_edge on the copy. Leaves r4 = imin(w+15, 31) for .h16_main when no
; filtering happens.
2621*c0909341SAndroid Build Coastguard Worker.h16:
2622*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+15]
; m1 keeps the unclamped w+15 for the table compare below
2623*c0909341SAndroid Build Coastguard Worker    movd                 m1, r4d
2624*c0909341SAndroid Build Coastguard Worker    and                 r4d, 15
2625*c0909341SAndroid Build Coastguard Worker    or                  r4d, 16 ; imin(w+15, 31)
; bit 0x400 of angle set -> skip edge filtering entirely (the .h32 path
; labels the same test as !enable_intra_edge_filter)
2626*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2627*c0909341SAndroid Build Coastguard Worker    jnz .h16_main
2628*c0909341SAndroid Build Coastguard Worker    movd                 m3, angled
2629*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
; pshufb with an all-zero mask broadcasts byte 0 of m1 and m3 to every byte
2630*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2631*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2
2632*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2
; table lookup: keep the angle byte where z_filt_wh16 matches w+15, then
; compare it against the z_filt_t_w16 row selected by angleq; any set mask
; bit means a non-zero filter strength
2633*c0909341SAndroid Build Coastguard Worker    movq                 m4, [base+z_filt_t_w16+angleq*4]
2634*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, [base+z_filt_wh16]
2635*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2636*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4
2637*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
2638*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2639*c0909341SAndroid Build Coastguard Worker    jz .h16_main ; filter_strength == 0
; Filtering needed: copy 64 bytes of edge plus one broadcast pixel into
; [rsp+16*13 .. rsp+16*17] so the edge can be modified.
2640*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, [tlq+2], q0000
2641*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*1+2]
2642*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924
2643*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*2+2]
; r4 = -imin(w+15, 31), so [tlq+r4*2] addresses the last edge pixel used
2644*c0909341SAndroid Build Coastguard Worker    neg                  r4
2645*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tlq-16*3+2]
2646*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
2647*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq-16*4+2]
2648*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq+r4*2]
; adc consumes the carry left by the shr above; the SIMD moves in between do
; not modify flags
2649*c0909341SAndroid Build Coastguard Worker    adc                 r5d, -1 ; filter_strength
2650*c0909341SAndroid Build Coastguard Worker    movd        [rsp+16*17], m1
2651*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*16], m2
2652*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*15], m3
2653*c0909341SAndroid Build Coastguard Worker    pshuflw              m6, m6, q0000
2654*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*14], m4
2655*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*13], m5
2656*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r4-2]
; replicate the last edge pixel into the four slots just past it in the copy
2657*c0909341SAndroid Build Coastguard Worker    movq [rsp+16*17+r4*2-10], m6
; for w >= 32 continue with r4-2 instead of r4
2658*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
2659*c0909341SAndroid Build Coastguard Worker    cmovae               r4, r2
; tlq now refers to the stack copy (same pixel the original tlq pointed at)
2660*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*17-2]
; NOTE(review): r4 is negative here while .h16_main subtracts it as a positive
; count, so .filter_edge (defined elsewhere) presumably flips it back -- confirm
2661*c0909341SAndroid Build Coastguard Worker    call .filter_edge
; .h16_main: prediction for the .h16 path; each pass produces one 32-byte row
; (two 8-pixel vectors).
; In: r4 = number of edge pixels available, tlq = edge pointer, dy = step,
; m0 = pshufb mask used to broadcast the low word (set up outside this
; section), m7 = mask ANDed with the position to form the blend weight
; (presumably the fraction bits -- TODO confirm).
; Unlike .h8_main the limit is folded into the position itself:
; m4 = dy + z_base_inc_z2 - (count << 6), so range checks become compares
; against fixed constants.
2662*c0909341SAndroid Build Coastguard Worker.h16_main:
; tlq -= 2*r4 bytes: tlq now addresses the last usable edge pixel
2663*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2664*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2665*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2666*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6
; m6 = last usable pixel (broadcast below); it replaces out-of-range lanes
2667*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq]
2668*c0909341SAndroid Build Coastguard Worker    movd                 m3, r4d
; m5 = dy broadcast: per-pass increment of m4
2669*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
; from here on dyq is negative: the position in r5 counts down towards zero
2670*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2671*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
2672*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63]
2673*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0
; wd *= 32: bytes of stack output, one 32-byte row per pass
2674*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
2675*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5, [base+z_base_inc_z2]
2676*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq
2677*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3
2678*c0909341SAndroid Build Coastguard Worker.h16_loop:
; r4 = r5 >> 6: integer offset (in pixels) relative to tlq
2679*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
2680*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
; first 8 pixels: vectors one pixel apart, blended by the weight in m3
2681*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2-14]
2682*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r4*2-16]
; weight = (m4 & m7) << 9; pmulhrsw by it is a rounded (diff * weight) >> 6
2683*c0909341SAndroid Build Coastguard Worker    pand                 m3, m7, m4
2684*c0909341SAndroid Build Coastguard Worker    psllw                m3, 9
2685*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
2686*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
; second 8 pixels, 16 bytes further down the edge, same weight
2687*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2-30]
2688*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2689*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r4*2-32]
2690*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
2691*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
; range masks: m2 = sign of m4 (lanes with m4 < 0 are valid for the first
; vector); m3 = pw_m512 > m4 for the second vector (constant defined
; elsewhere; the name suggests -512, i.e. 8 lanes * 64 -- confirm)
2692*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m512]
2693*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2694*c0909341SAndroid Build Coastguard Worker    psraw                m2, m4, 15
2695*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2696*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
; lanes that fail the range test are replaced with m6
2697*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
2698*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
2699*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2700*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2701*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
2702*c0909341SAndroid Build Coastguard Worker    mova      [rsp+wq-16*1], m0
2703*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2704*c0909341SAndroid Build Coastguard Worker    mova      [rsp+wq-16*2], m1
2705*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16*2
2706*c0909341SAndroid Build Coastguard Worker    jz .h16_transpose
; step the position; loop while the result of this add is > 0
2707*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
2708*c0909341SAndroid Build Coastguard Worker    jg .h16_loop
; .h16_end_loop: every remaining row is entirely out of range -> fill with m6
2709*c0909341SAndroid Build Coastguard Worker.h16_end_loop:
2710*c0909341SAndroid Build Coastguard Worker    mova      [rsp+wq-16*1], m6
2711*c0909341SAndroid Build Coastguard Worker    mova      [rsp+wq-16*2], m6
2712*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16*2
2713*c0909341SAndroid Build Coastguard Worker    jg .h16_end_loop
; .h16_transpose: OR 16*2 into r3d and continue at .end_transpose, which is
; defined outside this section
2714*c0909341SAndroid Build Coastguard Worker.h16_transpose:
2715*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16*2
2716*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose
; .h32: entry of the .h32 path (presumably block height 32). Computes
; r4 = imin(w+31, 63) and, unless edge filtering is disabled, runs
; .filter_copy followed by .filter_edge_s3 (both defined outside this
; section) before falling through to .h32_main. There is no strength lookup
; on this path; the _s3 suffix suggests a fixed strength of 3 -- confirm.
2717*c0909341SAndroid Build Coastguard Worker.h32:
2718*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+31]
2719*c0909341SAndroid Build Coastguard Worker    and                 r4d, 31
2720*c0909341SAndroid Build Coastguard Worker    or                  r4d, 32 ; imin(w+31, 63)
2721*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2722*c0909341SAndroid Build Coastguard Worker    jnz .h32_main
2723*c0909341SAndroid Build Coastguard Worker    call .filter_copy
; only for w == 64: continue with r4-2 instead of r4
2724*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4-2]
2725*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
2726*c0909341SAndroid Build Coastguard Worker    cmove                r4, r5
2727*c0909341SAndroid Build Coastguard Worker    call .filter_edge_s3
; .h32_main: prediction for the .h32 path; each pass produces one 64-byte row
; (four 8-pixel vectors).
; In: r4 = number of edge pixels available, tlq = edge pointer, dy = step,
; m0 = pshufb mask used to broadcast the low word (set up outside this
; section), m7 = mask ANDed with the position to form the blend weight
; (presumably the fraction bits -- TODO confirm).
; Unlike the smaller paths, wd is not scaled to bytes: rsp is lowered by 64
; inside the loop and wd is simply decremented once per row.
2728*c0909341SAndroid Build Coastguard Worker.h32_main:
; tlq -= 2*r4 bytes: tlq now addresses the last usable edge pixel
2729*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2730*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2731*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2732*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6
; m6 = last usable pixel (broadcast below); it replaces out-of-range lanes
2733*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq]
2734*c0909341SAndroid Build Coastguard Worker    movd                 m3, r4d
; m5 = dy broadcast: per-pass increment of m4
2735*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
; from here on dyq is negative: the position in r5 counts down towards zero
2736*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2737*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
2738*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63]
2739*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0
; m4 = dy + z_base_inc_z2 - (count << 6): position relative to the limit
2740*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5, [base+z_base_inc_z2]
2741*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3
2742*c0909341SAndroid Build Coastguard Worker.h32_loop:
; r4 = r5 >> 6: integer offset (in pixels) relative to tlq
2743*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
2744*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
; pixels 0-7 of the row
2745*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2-14]
2746*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2-16]
; weight = (m4 & m7) << 9, kept in m2 for all four vectors of this row;
; pmulhrsw by it is a rounded (diff * weight) >> 6
2747*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4
2748*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2749*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
2750*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
; pixels 8-15
2751*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2-30]
2752*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2753*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2-32]
2754*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
2755*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
; allocate this row's 64 bytes
2756*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*4
2757*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
; range masks per 8-pixel group: sign of m4 for the first, then compares
; against pw_m512 / pw_m1024 / pw_m1536 (constants defined elsewhere; the
; names suggest steps of -512 = 8 lanes * 64 -- confirm). Failing lanes are
; replaced with m6.
2758*c0909341SAndroid Build Coastguard Worker    psraw                m3, m4, 15
2759*c0909341SAndroid Build Coastguard Worker    pand                 m0, m3
2760*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2761*c0909341SAndroid Build Coastguard Worker    por                  m0, m3
2762*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m512]
2763*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2764*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2765*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2766*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0
2767*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2768*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m1
; pixels 16-23
2769*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2-46]
2770*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2-48]
2771*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
2772*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
; pixels 24-31
2773*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2-62]
2774*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2775*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2-64]
2776*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
2777*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
; the weight in m2 is no longer needed; m2/m3 become the last two range masks
2778*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m1024]
2779*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2780*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m1536]
2781*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4
2782*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
; advance the per-lane position for the next row
2783*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2784*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
2785*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
2786*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2787*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2788*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
2789*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0
2790*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2791*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
; one row done; wd counts rows here
2792*c0909341SAndroid Build Coastguard Worker    dec                  wd
2793*c0909341SAndroid Build Coastguard Worker    jz .h32_transpose
; step the position; loop while the result of this add is > 0
2794*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
2795*c0909341SAndroid Build Coastguard Worker    jg .h32_loop
; .h32_end_loop: every remaining row is entirely out of range -> allocate it
; and fill all four vectors with m6
2796*c0909341SAndroid Build Coastguard Worker.h32_end_loop:
2797*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*4
2798*c0909341SAndroid Build Coastguard Worker    REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0
2799*c0909341SAndroid Build Coastguard Worker    dec                  wd
2800*c0909341SAndroid Build Coastguard Worker    jg .h32_end_loop
; .h32_transpose: OR 32*2 into r3d and continue at .end_transpose, which is
; defined outside this section
2801*c0909341SAndroid Build Coastguard Worker.h32_transpose:
2802*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32*2
2803*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose
; .h64: entry of the .h64 path (presumably block height 64). r4 = w+63 with
; no clamping, unlike the imin() forms used by the smaller paths. Unless edge
; filtering is disabled, the edge goes through .filter_copy and
; .filter_edge_s3 (both defined outside this section).
2804*c0909341SAndroid Build Coastguard Worker.h64:
2805*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+63]
2806*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2807*c0909341SAndroid Build Coastguard Worker    jnz .h64_main
2808*c0909341SAndroid Build Coastguard Worker    call .filter_copy
2809*c0909341SAndroid Build Coastguard Worker    call .filter_edge_s3
; .h64_main: same setup as the other *_main paths. m0 is used as the pshufb
; broadcast mask and is set up outside this section.
2810*c0909341SAndroid Build Coastguard Worker.h64_main:
; tlq -= 2*r4 bytes: tlq now addresses the last usable edge pixel
2811*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2812*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2813*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
2814*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6
; m6 = last usable pixel (broadcast below)
2815*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq]
2816*c0909341SAndroid Build Coastguard Worker    movd                 m3, r4d
; m5 = dy broadcast
2817*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
; from here on dyq is negative; r5 is the starting position for .h64_loop
2818*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2819*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m0
2820*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63]
2821*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0
; m4 = dy + z_base_inc_z2 - (count << 6): position relative to the limit
2822*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5, [base+z_base_inc_z2]
2823*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3
2824*c0909341SAndroid Build Coastguard Worker.h64_loop:
2825*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
2826*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
2827*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2- 14]
2828*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2- 16]
2829*c0909341SAndroid Build Coastguard Worker    pand                 m2, m7, m4
2830*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
2831*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
2832*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2833*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2- 30]
2834*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2835*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2- 32]
2836*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
2837*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2838*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*8
2839*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2840*c0909341SAndroid Build Coastguard Worker    psraw                m3, m4, 15
2841*c0909341SAndroid Build Coastguard Worker    pand                 m0, m3
2842*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2843*c0909341SAndroid Build Coastguard Worker    por                  m0, m3
2844*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m512]
2845*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2846*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2847*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2848*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*7], m0
2849*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2850*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*6], m1
2851*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2- 46]
2852*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2- 48]
2853*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
2854*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2855*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2- 62]
2856*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2857*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2- 64]
2858*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
2859*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2860*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2861*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m1024]
2862*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2863*c0909341SAndroid Build Coastguard Worker    pand                 m0, m3
2864*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2865*c0909341SAndroid Build Coastguard Worker    por                  m0, m3
2866*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m1536]
2867*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2868*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2869*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2870*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m0
2871*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2872*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m1
2873*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2- 78]
2874*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2- 80]
2875*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
2876*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2877*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2- 94]
2878*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2879*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2- 96]
2880*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
2881*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2882*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2883*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m2048]
2884*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2885*c0909341SAndroid Build Coastguard Worker    pand                 m0, m3
2886*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2887*c0909341SAndroid Build Coastguard Worker    por                  m0, m3
2888*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m2560]
2889*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2890*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2891*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2892*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0
2893*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2894*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m1
2895*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4*2-110]
2896*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2-112]
2897*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
2898*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2899*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4*2-126]
2900*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2901*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r4*2-128]
2902*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
2903*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2
2904*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pw_m3072]
2905*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2906*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_m3584]
2907*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m4
2908*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m3, m4
2909*c0909341SAndroid Build Coastguard Worker    paddw                m4, m5
2910*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
2911*c0909341SAndroid Build Coastguard Worker    pandn                m2, m6
2912*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
2913*c0909341SAndroid Build Coastguard Worker    pandn                m3, m6
2914*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
2915*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0
2916*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
2917*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
2918*c0909341SAndroid Build Coastguard Worker    dec                  wd
2919*c0909341SAndroid Build Coastguard Worker    jz .h64_transpose
2920*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
2921*c0909341SAndroid Build Coastguard Worker    jg .h64_loop
2922*c0909341SAndroid Build Coastguard Worker.h64_end_loop:
2923*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*8
2924*c0909341SAndroid Build Coastguard Worker    REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0
2925*c0909341SAndroid Build Coastguard Worker    dec                  wd
2926*c0909341SAndroid Build Coastguard Worker    jg .h64_end_loop
2927*c0909341SAndroid Build Coastguard Worker.h64_transpose:
2928*c0909341SAndroid Build Coastguard Worker    add                 r3d, 64*2
2929*c0909341SAndroid Build Coastguard Worker.end_transpose:
2930*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2931*c0909341SAndroid Build Coastguard Worker    lea                  r7, [strideq*3]
2932*c0909341SAndroid Build Coastguard Worker%else
2933*c0909341SAndroid Build Coastguard Worker    mov             strideq, [dstq+4*0]
2934*c0909341SAndroid Build Coastguard Worker    mov              org_wd, [dstq+4*1]
2935*c0909341SAndroid Build Coastguard Worker%endif
2936*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r3*3]
2937*c0909341SAndroid Build Coastguard Worker.end_transpose_loop:
2938*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+r3-8]
2939*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+org_wq*2-8]
2940*c0909341SAndroid Build Coastguard Worker.end_transpose_loop_y:
2941*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r2+r4  ]
2942*c0909341SAndroid Build Coastguard Worker    movq                 m1, [r2+r3*2]
2943*c0909341SAndroid Build Coastguard Worker    movq                 m2, [r2+r3*1]
2944*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r2+r3*0]
2945*c0909341SAndroid Build Coastguard Worker    sub                  r2, 8
2946*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
2947*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
2948*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
2949*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
2950*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*0], m1
2951*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*1], m1
2952*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2953*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*2], m0
2954*c0909341SAndroid Build Coastguard Worker    movq     [r6+r7       ], m0
2955*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*4]
2956*c0909341SAndroid Build Coastguard Worker%else
2957*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
2958*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*0], m0
2959*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*1], m0
2960*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
2961*c0909341SAndroid Build Coastguard Worker%endif
2962*c0909341SAndroid Build Coastguard Worker    cmp                  r2, rsp
2963*c0909341SAndroid Build Coastguard Worker    jae .end_transpose_loop_y
2964*c0909341SAndroid Build Coastguard Worker    lea                 rsp, [rsp+r3*4]
2965*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 4
2966*c0909341SAndroid Build Coastguard Worker    jg .end_transpose_loop
2967*c0909341SAndroid Build Coastguard Worker    RET
; .filter_copy: local subroutine (returns with `ret`) that copies the edge
; pixels below tlq into a scratch area on the stack and repoints tlq at it.
;   in:  tlq = edge pointer, r4 = positive count in 16-bit pixels
;   out: tlq = rsp+gprsize+16*17-2 (the copy keeps every pixel at the same
;        offset from tlq), r4 = negated count, r5 = last loop index (<= r4)
;   clobbers m1, m2, m3 and flags
; The word at the old [tlq+2] is replicated into the 4 words starting at the
; new [tlq+2]; the word at the old [tlq+r4*2] (r4 already negated) is
; replicated into the 4 words ending just below the new [tlq+r4*2].
; NOTE(review): the `mova` loads imply the old tlq+2 is 16-byte aligned, and
; the gprsize term presumably skips the return address pushed by `call`;
; confirm both against the caller, which is outside this view.
2968*c0909341SAndroid Build Coastguard Worker.filter_copy:
2969*c0909341SAndroid Build Coastguard Worker    neg                  r4
2970*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, [tlq+2], q0000
2971*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
2972*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, [tlq+r4*2], q0000
2973*c0909341SAndroid Build Coastguard Worker    movq [rsp+gprsize+16*17], m2
2974*c0909341SAndroid Build Coastguard Worker.filter_copy_loop:
    ; copy 32 bytes (16 pixels) per iteration, walking towards lower addresses
2975*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq+r5*2-16*1+2]
2976*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq+r5*2-16*2+2]
2977*c0909341SAndroid Build Coastguard Worker    sub                  r5, 16
2978*c0909341SAndroid Build Coastguard Worker    mova [rsp+r5*2+gprsize+16*18], m1
2979*c0909341SAndroid Build Coastguard Worker    mova [rsp+r5*2+gprsize+16*17], m2
2980*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, r4d
2981*c0909341SAndroid Build Coastguard Worker    jg .filter_copy_loop
2982*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+gprsize+16*17-2]
2983*c0909341SAndroid Build Coastguard Worker    movq       [tlq+r4*2-8], m3
2984*c0909341SAndroid Build Coastguard Worker    ret
; .filter_edge: local subroutine (returns with `ret`) that smooths the edge
; pixels below tlq in place, 8 pixels per iteration, walking towards lower
; addresses.
;   in:  tlq = edge pointer (filtered data is stored with `mova`, so tlq+2
;              must be 16-byte aligned)
;        r5  = filter selector: 3 takes the 5-tap path at .filter_edge_s3,
;              any other value indexes the z_filt_k coefficient table
;        r4  = negative loop bound in pixels (presumably as left by
;              .filter_copy; verify against the caller)
;   out: r4d negated back, r5 = last loop index (<= the negative bound)
;   clobbers m1-m6 and flags
; 3-tap path, with c the sample being produced:
;   out = (k0*c + k1*(prev + next) + pw_8) >> 4
; where k0 = m4, k1 = m5, and prev/next are the samples 2 bytes below/above c.
; Each result is stored one iteration late, after the next iteration has
; already loaded its overlapping unfiltered input, so that filtering in place
; never feeds filtered samples back into the filter.
2985*c0909341SAndroid Build Coastguard Worker.filter_edge:
2986*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, 3
2987*c0909341SAndroid Build Coastguard Worker    je .filter_edge_s3
2988*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+z_filt_k+r5*8-8]
2989*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+z_filt_k+r5*8+8]
2990*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
2991*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+pw_8]
2992*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-12]
2993*c0909341SAndroid Build Coastguard Worker    jmp .filter_edge_start
2994*c0909341SAndroid Build Coastguard Worker.filter_edge_loop:
    ; load the unfiltered `next` samples first: their top word overlaps the
    ; block that the deferred store below overwrites
2995*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2-12]
2996*c0909341SAndroid Build Coastguard Worker    mova       [tlq+r5*2+2], m1
2997*c0909341SAndroid Build Coastguard Worker.filter_edge_start:
2998*c0909341SAndroid Build Coastguard Worker    pmullw               m1, m4, [tlq+r5*2-14]
2999*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r5*2-16]
3000*c0909341SAndroid Build Coastguard Worker    sub                  r5, 8
3001*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
3002*c0909341SAndroid Build Coastguard Worker    pmullw               m2, m5
3003*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
3004*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3005*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
3006*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, r4d
3007*c0909341SAndroid Build Coastguard Worker    jg .filter_edge_loop
    ; flush the result of the final iteration
3008*c0909341SAndroid Build Coastguard Worker    mova       [tlq+r5*2+2], m1
3009*c0909341SAndroid Build Coastguard Worker    neg                 r4d
3010*c0909341SAndroid Build Coastguard Worker    ret
; 5-tap path (r5 == 3 on entry). With c the sample being produced and
; a1/b1 the samples 2 bytes above/below it, a2/b2 those 4 bytes above/below:
;   out = (a1 + c + b1 + ((a2 + b2 + 4) >> 1)) >> 2
; The inner term comes from pavgw(a2 + pw_3, b2), which adds 1 before halving.
; Same deferred-store scheme as the 3-tap loop above.
3011*c0909341SAndroid Build Coastguard Worker.filter_edge_s3:
3012*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pw_3]
3013*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
3014*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-12]
3015*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-10]
3016*c0909341SAndroid Build Coastguard Worker    jmp .filter_edge_s3_start
3017*c0909341SAndroid Build Coastguard Worker.filter_edge_s3_loop:
3018*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r5*2-12]
3019*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+r5*2-10]
3020*c0909341SAndroid Build Coastguard Worker    mova       [tlq+r5*2+2], m1
3021*c0909341SAndroid Build Coastguard Worker.filter_edge_s3_start:
3022*c0909341SAndroid Build Coastguard Worker    paddw                m2, [tlq+r5*2-14]
3023*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
3024*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5*2-16]
3025*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+r5*2-18]
3026*c0909341SAndroid Build Coastguard Worker    sub                  r5, 8
3027*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3028*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m4
3029*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
3030*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
3031*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, r4d
3032*c0909341SAndroid Build Coastguard Worker    jg .filter_edge_s3_loop
3033*c0909341SAndroid Build Coastguard Worker    mova       [tlq+r5*2+2], m1
3034*c0909341SAndroid Build Coastguard Worker    neg                 r4d
3035*c0909341SAndroid Build Coastguard Worker    ret
3036*c0909341SAndroid Build Coastguard Worker
; ipred_filter_16bpc(dst, stride, tl, w, h, filter, ...)
; Filter intra prediction for 16-bit pixels. bitdepth_max is read from
; argument slot r8m.
;
; The block is produced in 4x2 tiles (4 pixels on each of 2 rows per loop
; iteration). Every tile is a set of dot products (pmaddwd) between the tile's
; neighbours (top-left, four top pixels and two left pixels) and the taps of
; the selected filter. .left_loop handles the leftmost 4 columns, whose left
; neighbours come from the edge buffer below tl. .right_loop handles each
; further 4-column strip, whose left neighbours are read back from dst.
;
; Register roles after setup:
;   m8-m15  filter taps, each byte tap placed in the high byte of a word
;           (i.e. scaled by 256); stack slots on x86-32
;   m5      zero
;   m6      bitdepth_max in all 8 words
;   m7      filter_shuf byte-shuffle mask (left loop only)
3037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3038*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
3039*c0909341SAndroid Build Coastguard Worker%else
3040*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
; Only 8 XMM registers are requested on x86-32, so m8-m15 alias the eight
; 16-byte stack slots reserved by the cglobal line above.
3041*c0909341SAndroid Build Coastguard Worker%define  m8 [esp+16*0]
3042*c0909341SAndroid Build Coastguard Worker%define  m9 [esp+16*1]
3043*c0909341SAndroid Build Coastguard Worker%define m10 [esp+16*2]
3044*c0909341SAndroid Build Coastguard Worker%define m11 [esp+16*3]
3045*c0909341SAndroid Build Coastguard Worker%define m12 [esp+16*4]
3046*c0909341SAndroid Build Coastguard Worker%define m13 [esp+16*5]
3047*c0909341SAndroid Build Coastguard Worker%define m14 [esp+16*6]
3048*c0909341SAndroid Build Coastguard Worker%define m15 [esp+16*7]
3049*c0909341SAndroid Build Coastguard Worker%endif
; Constants are addressed relative to r6, which is only valid as `base` until
; r6 is reused as a copy of tl further down.
3050*c0909341SAndroid Build Coastguard Worker%define base r6-$$
3051*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3052*c0909341SAndroid Build Coastguard Worker    movd                 m6, r8m     ; bitdepth_max
3053*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm
3054*c0909341SAndroid Build Coastguard Worker    movzx           filterd, filterb
3055*c0909341SAndroid Build Coastguard Worker%else
3056*c0909341SAndroid Build Coastguard Worker    movzx           filterd, byte filterm
3057*c0909341SAndroid Build Coastguard Worker%endif
3058*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
    ; 64 bytes of taps per filter: the four 16-byte rows loaded below
3059*c0909341SAndroid Build Coastguard Worker    shl             filterd, 6
3060*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
3061*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+filter_intra_taps+filterq+16*0]
3062*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+filter_intra_taps+filterq+16*1]
3063*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+filter_intra_taps+filterq+16*2]
3064*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+filter_intra_taps+filterq+16*3]
3065*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
3066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3067*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m5, m1  ; place 8-bit coefficients in the upper
3068*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m5, m1  ; half of each 16-bit word to avoid
3069*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m5, m2  ; having to perform sign-extension.
3070*c0909341SAndroid Build Coastguard Worker    punpckhbw           m11, m5, m2
3071*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m5, m3
3072*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m5, m3
3073*c0909341SAndroid Build Coastguard Worker    punpcklbw           m14, m5, m4
3074*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m5, m4
3075*c0909341SAndroid Build Coastguard Worker%else
    ; same unpacking as above, staged through m7 because m8-m15 are memory
3076*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m5, m1
3077*c0909341SAndroid Build Coastguard Worker    mova                 m8, m7
3078*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m5, m1
3079*c0909341SAndroid Build Coastguard Worker    mova                 m9, m7
3080*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m5, m2
3081*c0909341SAndroid Build Coastguard Worker    mova                m10, m7
3082*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m5, m2
3083*c0909341SAndroid Build Coastguard Worker    mova                m11, m7
3084*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m5, m3
3085*c0909341SAndroid Build Coastguard Worker    mova                m12, m7
3086*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m5, m3
3087*c0909341SAndroid Build Coastguard Worker    mova                m13, m7
3088*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m5, m4
3089*c0909341SAndroid Build Coastguard Worker    mova                m14, m7
3090*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m5, m4
3091*c0909341SAndroid Build Coastguard Worker    mova                m15, m7
3092*c0909341SAndroid Build Coastguard Worker%endif
    ; last use of `base`: r6 is overwritten with tl three instructions later
3093*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+filter_shuf]
    ; hd = h*2, the height in bytes of 16-bit pixels
3094*c0909341SAndroid Build Coastguard Worker    add                  hd, hd
    ; r5 = dst of the current 4-column strip, r6 = original tl
3095*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
3096*c0909341SAndroid Build Coastguard Worker    pshuflw              m6, m6, q0000
3097*c0909341SAndroid Build Coastguard Worker    mov                  r6, tlq
3098*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6
    ; tlq = tl - h*2, so [tlq+hq] walks down the left edge as hd shrinks
3099*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
3100*c0909341SAndroid Build Coastguard Worker.left_loop:
3101*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7      ; tl t0 t1 t2 t3 l0 l1 __
    ; each pshufd broadcasts one word pair of neighbours; m2 accumulates the
    ; products with m8/m10/m12/m14, m1 those with m9/m11/m13/m15
3102*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q0000
3103*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m8, m1
3104*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m9
3105*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1111
3106*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10, m4
3107*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m11
3108*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
3109*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
3110*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q2222
3111*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m4
3112*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m13
3113*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
3114*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
    ; NOTE(review): the last pair is (l1, __); the tap multiplied with the
    ; unused word is presumably zero - confirm in filter_intra_taps
3115*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m0, q3333
3116*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m14, m3
3117*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m15
3118*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
3119*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
    ; shift by 8 to undo the high-byte placement of the taps, plus 3 more
3120*c0909341SAndroid Build Coastguard Worker    psrad                m0, 11     ; x >> 3
3121*c0909341SAndroid Build Coastguard Worker    psrad                m1, 11
3122*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
    ; clamp negatives to 0, then pavgw against zero does the final rounded
    ; halving; pminsw clamps to bitdepth_max
3123*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m5
3124*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m5     ; (x + 8) >> 4
3125*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m6
3126*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
3127*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
    ; low half <- next 4 left-edge words; the high half still holds the row
    ; just written to strideq*1, which is the top of the next tile
3128*c0909341SAndroid Build Coastguard Worker    movlps               m0, [tlq+hq-10]
3129*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
    ; two rows of 2-byte pixels consumed
3130*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2*2
3131*c0909341SAndroid Build Coastguard Worker    jg .left_loop
3132*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3133*c0909341SAndroid Build Coastguard Worker    jz .end
3134*c0909341SAndroid Build Coastguard Worker    sub                 tld, r6d     ; -h*2
3135*c0909341SAndroid Build Coastguard Worker    sub                  r6, r5      ; tl-dst
3136*c0909341SAndroid Build Coastguard Worker.right_loop0:
    ; advance to the next strip of 4 pixels (8 bytes); hd counts up from -h*2
3137*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
3138*c0909341SAndroid Build Coastguard Worker    mov                  hd, tld
3139*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
3140*c0909341SAndroid Build Coastguard Worker    mov                dstq, r5
3141*c0909341SAndroid Build Coastguard Worker.right_loop:
3142*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
3143*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m8, m2
3144*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9
3145*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1111
3146*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10, m4
3147*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m11
    ; word 5 <- the pixel just left of this tile on its first row, read back
    ; from the previously written strip
3148*c0909341SAndroid Build Coastguard Worker    pinsrw               m0, [dstq+strideq*0-2], 5
3149*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3150*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
3151*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q2222
    ; m4 = the 4 pixels left of this tile on its second row, in both halves
3152*c0909341SAndroid Build Coastguard Worker    movddup              m4, [dstq+strideq*1-8]
3153*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12, m0
3154*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m13
3155*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3156*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
    ; broadcast the nearest second-row left pixel and pair it with zero
3157*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, m4, q3333
3158*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5
3159*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m14, m2
3160*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m15
3161*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
3162*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
3163*c0909341SAndroid Build Coastguard Worker    psrad                m1, 11
3164*c0909341SAndroid Build Coastguard Worker    psrad                m0, 11
    ; the accumulators are packed in the opposite order to .left_loop, so
    ; the first row ends up in the high half and the second in the low half
3165*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3166*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m5
3167*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m5
3168*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m6
3169*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], m0
3170*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m0
    ; next tile's neighbours: word 0 <- top word of m4 (the pixel left of the
    ; second row), words 1-4 <- the second row just produced
3171*c0909341SAndroid Build Coastguard Worker    palignr              m0, m4, 14
3172*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3173*c0909341SAndroid Build Coastguard Worker    add                  hd, 2*2
3174*c0909341SAndroid Build Coastguard Worker    jl .right_loop
3175*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3176*c0909341SAndroid Build Coastguard Worker    jg .right_loop0
3177*c0909341SAndroid Build Coastguard Worker.end:
3178*c0909341SAndroid Build Coastguard Worker    RET
3179*c0909341SAndroid Build Coastguard Worker
; Choose the register behind the temporary name t0 used by the ipred_cfl_*
; functions that follow: r7 on UNIX64, r5 on every other target.
; NOTE(review): presumably r5 cannot be used on UNIX64 because the sixth
; argument (ac) already lives there on entry - confirm against x86inc.
3180*c0909341SAndroid Build Coastguard Worker%if UNIX64
3181*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
3182*c0909341SAndroid Build Coastguard Worker%else
3183*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
3184*c0909341SAndroid Build Coastguard Worker%endif
3185*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_top_16bpc(dst, stride, tl, w, h, ac, ...)
; CfL prediction whose DC term is the average of the w pixels starting at
; tl+2 (the first 16-bit pixel after tl). It only sets up the state expected
; by ipred_cfl_left_16bpc's .start and then jumps there, reusing that
; function's summation code with w in the role of the pixel count:
;   m4 = w (pixel count, turned into the rounding bias at .start)
;   m5 = log2(w) (shift applied to the sum)
;   wd = log2(w) (indexes the splat table at .start)
;   r6 = offset read from the left table at index log2(w)
3186*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
3187*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
3188*c0909341SAndroid Build Coastguard Worker    movd                 m4, wd
3189*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
3190*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3191*c0909341SAndroid Build Coastguard Worker    add                 tlq, 2
3192*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+wq*4]
3193*c0909341SAndroid Build Coastguard Worker    movd                 m5, wd
3194*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
3195*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_left_16bpc(dst, stride, tl, w, h, ac, alpha, ...)
; CfL prediction whose DC term is the rounded average of the h pixels stored
; immediately below tl (tl-h*2 .. tl-2). The .start label is also entered
; from ipred_cfl_top_16bpc with the same register contract:
;   tlq = first pixel to sum        m4 = pixel count n (power of two)
;   m5  = log2(n)                   wd = log2(w)
;   r6  = table offset of the .hN summation entry for n, relative to t0
; On leaving through `jmp wq`: m0 = DC in all 8 words, m6 = 0,
; m7 = r7m in all 8 words, m3 = all-ones words, acq = ac.
; NOTE(review): r7m is presumably bitdepth_max and the wq target presumably
; one of the .sN splat labels of ipred_cfl_16bpc; the tables are not visible.
3196*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
3197*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3198*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
3199*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3200*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq*2]
3201*c0909341SAndroid Build Coastguard Worker    movd                 m4, hd
    ; tlq = tl - h*2, the lowest-addressed left pixel
3202*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r6
3203*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
3204*c0909341SAndroid Build Coastguard Worker    movd                 m5, r6d
3205*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]
3206*c0909341SAndroid Build Coastguard Worker.start:
3207*c0909341SAndroid Build Coastguard Worker    movd                 m7, r7m
3208*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
    ; table entries are 32-bit offsets relative to their table base
3209*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
3210*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
3211*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
3212*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
3213*c0909341SAndroid Build Coastguard Worker    pshuflw              m7, m7, q0000
    ; m3 = -1 in every word, used as the pmaddwd multiplier at .h4
3214*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m3, m3
3215*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
3216*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
    ; m4 = (n + 1) >> 1, the rounding bias for the division by n
3217*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m6
3218*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m7
3219*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; The .hN entries fall through into each other: each one folds more pixels
; into the 16-bit partial sums held in m0.
3220*c0909341SAndroid Build Coastguard Worker.h32:
3221*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+48]
3222*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+32]
3223*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3224*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3225*c0909341SAndroid Build Coastguard Worker.h16:
3226*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16]
3227*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3228*c0909341SAndroid Build Coastguard Worker.h8:
    ; add the upper 64 bits onto the lower 64 bits
3229*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q1032
3230*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3231*c0909341SAndroid Build Coastguard Worker.h4:
    ; pmaddwd by -1 yields the negated sum of each word pair as a dword;
    ; subtracting that from the bias gives bias + sum
3232*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
3233*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0
    ; swap the two low dwords and add, so dword 0 = bias + sum of 4 words
3234*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m4, q1032
3235*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
    ; divide by n, then broadcast the DC value to all 8 words
3236*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5
3237*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3238*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
3239*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3240*c0909341SAndroid Build Coastguard Worker
; IPRED_CFL dst, src
; Produces 8 predicted pixels in m<dst> from 8 words in m<src>:
;   dst = clamp(m0 + sign(src, m1) * pmulhrsw(|src|, m2), m6, m7)
; pmulhrsw is the rounded high multiply, (a*b + 0x4000) >> 15, applied to the
; magnitude only; the sign is re-applied afterwards. m<src> is clobbered: it
; ends up holding src with the sign of m1 applied (zero where m1 is zero).
; Expected register state, set up by the callers: m0 = DC value, m6 = lower
; clamp (zero in the cfl functions of this file), m7 = upper clamp.
; NOTE(review): m1 and m2 presumably hold alpha (for its sign) and a scaled
; |alpha|; their setup is outside the visible part of the file.
3241*c0909341SAndroid Build Coastguard Worker%macro IPRED_CFL 2 ; dst, src
3242*c0909341SAndroid Build Coastguard Worker    pabsw               m%1, m%2
3243*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m2
3244*c0909341SAndroid Build Coastguard Worker    psignw              m%2, m1
3245*c0909341SAndroid Build Coastguard Worker    psignw              m%1, m%2
3246*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m0
3247*c0909341SAndroid Build Coastguard Worker    pmaxsw              m%1, m6
3248*c0909341SAndroid Build Coastguard Worker    pminsw              m%1, m7
3249*c0909341SAndroid Build Coastguard Worker%endmacro
3250*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_16bpc(dst, stride, tl, w, h, ac, alpha, <one more argument read as r7m>)
;
; Control flow, as visible in this block:
;   1. Two jump-table lookups pick a height handler (entry ctz(h)) and a width
;      handler (entry ctz(w)+4).
;   2. .hN sums the h 16-bit words stored directly before tl, then jumps to wq.
;   3. .wN adds the w words stored from tl+2 onwards, reduces everything to a
;      single sum with a (w+h)>>1 rounding bias and divides it by w+h
;      (power-of-two shift, followed by a x1/3 or x1/5 fixed-point multiply
;      when w != h). The result is broadcast into every word of m0.
;   4. .sN loads alpha, then loops over the ac buffer running IPRED_CFL and
;      storing rows to dst.
; Register state handed to the .sN loops: m0 = broadcast DC, m6 = 0,
; m7 = broadcast low word of r7m.
; NOTE(review): IPRED_CFL and ipred_cfl_16bpc_ssse3_table are defined outside
; this chunk; the words before/after tl are presumably the left/top edge
; pixels and r7m presumably bitdepth_max -- confirm against the C prototype.
3251*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
3252*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3253*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd  ; r6d = ctz(h), used as jump table index
3254*c0909341SAndroid Build Coastguard Worker    lea                 t0d, [wq+hq]  ; t0d = w + h
3255*c0909341SAndroid Build Coastguard Worker    movd                 m4, t0d  ; m4 dword 0 = w + h (turned into the rounding bias below)
3256*c0909341SAndroid Build Coastguard Worker    tzcnt               t0d, t0d
3257*c0909341SAndroid Build Coastguard Worker    movd                 m5, t0d  ; m5 = ctz(w + h), shift count for .w8/.w16/.w32
3258*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_16bpc_ssse3_table
3259*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd  ; wd = ctz(w)
3260*c0909341SAndroid Build Coastguard Worker    movd                 m7, r7m  ; argument following alpha; NOTE(review): presumably bitdepth_max -- confirm
3261*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]  ; signed 32-bit table entry for the height
3262*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4+4*4]  ; width entries sit 4 slots after the height entries
3263*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 1  ; m4 = (w + h) >> 1
3264*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6  ; m6 = 0
3265*c0909341SAndroid Build Coastguard Worker    pshuflw              m7, m7, q0000
3266*c0909341SAndroid Build Coastguard Worker    add                  r6, t0  ; table entries are offsets from the table base
3267*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
3268*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
3269*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m3, m3  ; m3 = -1 in every word (pmaddwd multiplier)
3270*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m7  ; m7 = low word of r7m in all 8 lanes
3271*c0909341SAndroid Build Coastguard Worker    jmp                  r6  ; NOTE(review): presumably lands on one of .h4/.h8/.h16/.h32 -- table not visible here
3272*c0909341SAndroid Build Coastguard Worker.h4:
3273*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq-8]  ; the 4 words directly before tl
3274*c0909341SAndroid Build Coastguard Worker    jmp                  wq  ; continue in the width handler
3275*c0909341SAndroid Build Coastguard Worker.w4:
3276*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tlq+2]  ; the 4 words following tl
3277*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3278*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3  ; each dword = -(sum of a word pair)
3279*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0  ; m4 = bias + pair sums (bias lives in dword 0 only)
3280*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m4, q1032  ; swap the 64-bit halves
3281*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3282*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m0, q1032  ; swap the two low dwords
3283*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4  ; dword 0 = sum of all edge words + ((w+h) >> 1)
3284*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3285*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
3286*c0909341SAndroid Build Coastguard Worker    psrld                m0, 3  ; h == 4: w + h = 8, plain shift
3287*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
3288*c0909341SAndroid Build Coastguard Worker.w4_mul:
3289*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB  ; 0xAAAB / 2^17 ~= 1/3
3290*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667  ; 0x6667 / 2^17 ~= 1/5; r2 is not read as tl again on this path
3291*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
3292*c0909341SAndroid Build Coastguard Worker    cmove               r6d, r2d  ; h == 16 (w+h = 20) -> 1/5, otherwise 1/3
3293*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3294*c0909341SAndroid Build Coastguard Worker    psrld                m0, 2  ; divide by the power-of-two factor 4
3295*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1  ; high 16 bits of the product; only word 0 is meaningful
3296*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1  ; total right shift of the product is 17
3297*c0909341SAndroid Build Coastguard Worker.w4_end:
3298*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3299*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0  ; m0 = DC value in all 8 words
3300*c0909341SAndroid Build Coastguard Worker.s4:
3301*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3302*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]  ; r6 = 3 * stride, address of the 4th row
3303*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3304*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1  ; m1 = alpha in all 8 words
3305*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3306*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
3307*c0909341SAndroid Build Coastguard Worker.s4_loop:
3308*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+16*0]  ; 16 ac words = 4 rows of 4
3309*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16*1]
3310*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3311*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             3, 4  ; macro defined elsewhere; NOTE(review): appears to leave its result in the first operand (m3 is stored below) -- confirm
3312*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4, 5
3313*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m3  ; low 4 words -> row 0
3314*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m3  ; high 4 words -> row 1
3315*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m4
3316*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r6       ], m4
3317*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3318*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4  ; 4 rows written per iteration
3319*c0909341SAndroid Build Coastguard Worker    jg .s4_loop
3320*c0909341SAndroid Build Coastguard Worker    RET
3321*c0909341SAndroid Build Coastguard Worker.h8:
3322*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16]  ; the 8 words directly before tl
3323*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3324*c0909341SAndroid Build Coastguard Worker.w8:
3325*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+2]  ; the 8 words following tl
3326*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3327*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3  ; negated pair sums, same reduction as .w4
3328*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0
3329*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m4, q1032
3330*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3331*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m0, q1032
3332*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3333*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5  ; shift by ctz(w + h)
3334*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
3335*c0909341SAndroid Build Coastguard Worker    je .w8_end  ; w == h: w + h is a power of two, shift alone suffices
3336*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB  ; ~1/3 in 1.17 fixed point
3337*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667  ; ~1/5 in 1.17 fixed point
3338*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
3339*c0909341SAndroid Build Coastguard Worker    cmove               r6d, r2d  ; h == 32 (w+h = 40 = 8*5) -> 1/5, otherwise 1/3
3340*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3341*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
3342*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
3343*c0909341SAndroid Build Coastguard Worker.w8_end:
3344*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3345*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0  ; m0 = DC value in all 8 words
3346*c0909341SAndroid Build Coastguard Worker.s8:
3347*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3348*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3349*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1  ; m1 = alpha in all 8 words
3350*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3351*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
3352*c0909341SAndroid Build Coastguard Worker.s8_loop:
3353*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+16*0]  ; 16 ac words = 2 rows of 8
3354*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16*1]
3355*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3356*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             3, 4
3357*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4, 5
3358*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m3
3359*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m4
3360*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3361*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2  ; 2 rows written per iteration
3362*c0909341SAndroid Build Coastguard Worker    jg .s8_loop
3363*c0909341SAndroid Build Coastguard Worker    RET
3364*c0909341SAndroid Build Coastguard Worker.h16:
3365*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]  ; the 16 words before tl, folded into 8 lanes
3366*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-16]
3367*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3368*c0909341SAndroid Build Coastguard Worker.w16:
3369*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 2]  ; the 16 words following tl
3370*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+18]
3371*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3372*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3373*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3  ; negated pair sums, same reduction as .w4
3374*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0
3375*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m4, q1032
3376*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3377*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m0, q1032
3378*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3379*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5  ; shift by ctz(w + h)
3380*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
3381*c0909341SAndroid Build Coastguard Worker    je .w16_end  ; w == h: shift alone suffices
3382*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB  ; ~1/3 in 1.17 fixed point
3383*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667  ; ~1/5 in 1.17 fixed point
3384*c0909341SAndroid Build Coastguard Worker    test                 hd, 8|32
3385*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d  ; h is neither 8 nor 32 (e.g. 4 or 64: w+h = 20 or 80) -> 1/5
3386*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3387*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
3388*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
3389*c0909341SAndroid Build Coastguard Worker.w16_end:
3390*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3391*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0  ; m0 = DC value in all 8 words
3392*c0909341SAndroid Build Coastguard Worker.s16:
3393*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3394*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3395*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1  ; m1 = alpha in all 8 words
3396*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3397*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
3398*c0909341SAndroid Build Coastguard Worker.s16_loop:
3399*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+16*0]  ; 16 ac words = 1 row of 16
3400*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16*1]
3401*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3402*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             3, 4
3403*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4, 5
3404*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m3
3405*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m4
3406*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3407*c0909341SAndroid Build Coastguard Worker    dec                  hd  ; 1 row written per iteration
3408*c0909341SAndroid Build Coastguard Worker    jg .s16_loop
3409*c0909341SAndroid Build Coastguard Worker    RET
3410*c0909341SAndroid Build Coastguard Worker.h32:
3411*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-64]  ; the 32 words before tl, folded into 8 lanes
3412*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-48]
3413*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-32]
3414*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tlq-16]
3415*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3416*c0909341SAndroid Build Coastguard Worker.w32:
3417*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 2]  ; the 32 words following tl
3418*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+18]
3419*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3420*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+34]
3421*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3422*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+50]
3423*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3424*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3425*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3  ; negated pair sums, same reduction as .w4
3426*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0
3427*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m4, q1032
3428*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3429*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m0, q1032
3430*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3431*c0909341SAndroid Build Coastguard Worker    psrld                m0, m5  ; shift by ctz(w + h)
3432*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
3433*c0909341SAndroid Build Coastguard Worker    je .w32_end  ; w == h: shift alone suffices
3434*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xAAAB  ; ~1/3 in 1.17 fixed point
3435*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x6667  ; ~1/5 in 1.17 fixed point
3436*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
3437*c0909341SAndroid Build Coastguard Worker    cmove               r6d, r2d  ; h == 8 (w+h = 40 = 8*5) -> 1/5, otherwise 1/3
3438*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3439*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
3440*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 1
3441*c0909341SAndroid Build Coastguard Worker.w32_end:
3442*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3443*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0  ; m0 = DC value in all 8 words
3444*c0909341SAndroid Build Coastguard Worker.s32:
3445*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3446*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3447*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1  ; m1 = alpha in all 8 words
3448*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3449*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
3450*c0909341SAndroid Build Coastguard Worker.s32_loop:
3451*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+16*0]  ; first half of a 32-word row
3452*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16*1]
3453*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             3, 4
3454*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4, 5
3455*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m3
3456*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m4
3457*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+16*2]  ; second half of the same row
3458*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16*3]
3459*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3460*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             3, 4
3461*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4, 5
3462*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m3
3463*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m4
3464*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3465*c0909341SAndroid Build Coastguard Worker    dec                  hd  ; 1 row written per iteration
3466*c0909341SAndroid Build Coastguard Worker    jg .s32_loop
3467*c0909341SAndroid Build Coastguard Worker    RET
3468*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_128_16bpc(dst, stride, tl, w, h, ac, <alpha and one more argument read as r7m>)
;
; Setup stub with no code of its own after the final jump: it loads a
; constant into m0 (selected from the array starting at pw_512 by r7m >> 11),
; zeroes m6, broadcasts the low word of r7m into m7, and tail-jumps through
; ipred_cfl_splat_16bpc_ssse3_table using ctz(w) as the index.
; NOTE(review): the splat table is defined outside this chunk; it presumably
; points at the .s4/.s8/.s16/.s32 loops of ipred_cfl_16bpc, which consume
; exactly this m0/m6/m7 state -- confirm. r7m is presumably bitdepth_max.
3469*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
3470*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; wd = ctz(w), read from the argument's memory form
3471*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_splat_16bpc_ssse3_table
3472*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m
3473*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3474*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 11  ; r6 = r7m >> 11, index of the 8-byte constant to load
3475*c0909341SAndroid Build Coastguard Worker    movd                 m7, r7m
3476*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]  ; signed 32-bit table entry for this width
3477*c0909341SAndroid Build Coastguard Worker    movddup              m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]  ; = [pw_512 + r6*8] addressed relative to t0, duplicated into both halves
3478*c0909341SAndroid Build Coastguard Worker    pshuflw              m7, m7, q0000
3479*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6  ; m6 = 0
3480*c0909341SAndroid Build Coastguard Worker    add                  wq, t0  ; table entries are offsets from the table base
3481*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
3482*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m7  ; m7 = low word of r7m in all 8 lanes
3483*c0909341SAndroid Build Coastguard Worker    jmp                  wq  ; tail-jump into the selected handler; it performs the RET
3484*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_ac_420_16bpc(ac, ypx, stride, wpad, hpad, w, h)
;
; Fills the ac buffer from ypx and then removes the buffer's mean:
;   * Every output word is 2 * (sum of a 2x2 block of input words): pmaddwd
;     with the constant 2 sums a horizontal pair, and two input rows are added.
;   * wpad selects how much of the right side is filled by replicating the
;     last computed column; hpad*4 output rows at the bottom are filled by
;     replicating the last computed row.
;   * m4 accumulates the dword sum of everything written (padding included).
;   * .dc divides that sum by the number of words written (with rounding) and
;     subtracts the result from every word of the buffer.
; Branches: w < 8 -> .w4_loop, w == 8 -> .w8, w > 8 -> .w16.
; r5 keeps the start of ac so .dc can derive the buffer size.
; The .dc label is also a jump target for ipred_cfl_ac_422_16bpc.
; NOTE(review): presumably 4:2:0 luma subsampling for chroma-from-luma --
; inferred from the function name, confirm against the C reference.
3485*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
3486*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
3487*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 && PIC
3488*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m5, m5  ; build the constant 2 in registers: -1 ...
3489*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m5  ; ... 1 ...
3490*c0909341SAndroid Build Coastguard Worker    paddw                m5, m5  ; ... 2 in every word
3491*c0909341SAndroid Build Coastguard Worker%else
3492*c0909341SAndroid Build Coastguard Worker    movddup              m5, [pw_2]
3493*c0909341SAndroid Build Coastguard Worker%endif
3494*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
3495*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2  ; hpad *= 4 (number of padded output rows)
3496*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4  ; m4 = running sum of all output values
3497*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd  ; hd = rows that are actually computed
3498*c0909341SAndroid Build Coastguard Worker    cmp            dword wm, 8
3499*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq  ; remember the start of the buffer for .dc (mov leaves the flags intact)
3500*c0909341SAndroid Build Coastguard Worker    jg .w16
3501*c0909341SAndroid Build Coastguard Worker    je .w8
3502*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]  ; r3 = 3 * stride
3503*c0909341SAndroid Build Coastguard Worker.w4_loop:
3504*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0]  ; each dword = 2 * (horizontal pair sum)
3505*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*1]
3506*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+strideq*2]
3507*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+r3       ]
3508*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
3509*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1  ; output row 0 = input rows 0 + 1
3510*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3  ; output row 1 = input rows 2 + 3
3511*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3512*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2  ; two output rows of 4 words each
3513*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3514*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
3515*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
3516*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2  ; 2 output rows per iteration
3517*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
3518*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3519*c0909341SAndroid Build Coastguard Worker    jz .dc
3520*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0  ; replicate the last output row into both halves
3521*c0909341SAndroid Build Coastguard Worker    pslld                m2, 2  ; 4 * (dword sums of the last row), one step pads 4 rows
3522*c0909341SAndroid Build Coastguard Worker.w4_hpad:
3523*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3524*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3525*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m0
3526*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3527*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4  ; 4 padded rows written per iteration
3528*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad
3529*c0909341SAndroid Build Coastguard Worker    jmp .dc
3530*c0909341SAndroid Build Coastguard Worker.w8:
3531*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3532*c0909341SAndroid Build Coastguard Worker    cmp         dword wpadm, 0  ; wpad is tested in its memory form on x86-32
3533*c0909341SAndroid Build Coastguard Worker%else
3534*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
3535*c0909341SAndroid Build Coastguard Worker%endif
3536*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad1
3537*c0909341SAndroid Build Coastguard Worker.w8_loop:
3538*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
3539*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+strideq*1+16*0]
3540*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*0+16*1]
3541*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
3542*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3543*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2  ; left 4 outputs of the row
3544*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3  ; right 4 outputs of the row
3545*c0909341SAndroid Build Coastguard Worker    paddd                m2, m0, m1  ; m2 = dword sums of this row (reused for hpad)
3546*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3547*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3548*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
3549*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
3550*c0909341SAndroid Build Coastguard Worker    dec                  hd  ; 1 output row per iteration
3551*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
3552*c0909341SAndroid Build Coastguard Worker.w8_hpad:
3553*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3554*c0909341SAndroid Build Coastguard Worker    jz .dc
3555*c0909341SAndroid Build Coastguard Worker    pslld                m2, 2  ; .hpad writes 4 rows of 8 per step, so add 4x the last row sum
3556*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0  ; .hpad stores m0/m1 alternately; both hold the last row here
3557*c0909341SAndroid Build Coastguard Worker    jmp .hpad
3558*c0909341SAndroid Build Coastguard Worker.w8_wpad1:
3559*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0]  ; only the first 16 bytes of each input row are read
3560*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*1]
3561*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3562*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
3563*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q3333  ; right half = last computed value replicated
3564*c0909341SAndroid Build Coastguard Worker    paddd                m2, m0, m1
3565*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3566*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3567*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
3568*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
3569*c0909341SAndroid Build Coastguard Worker    dec                  hd
3570*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad1
3571*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
3572*c0909341SAndroid Build Coastguard Worker.w16_wpad3:
3573*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m0, q3333  ; wpad > 2: only the first quarter is real, replicate its last value
3574*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3575*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
3576*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
3577*c0909341SAndroid Build Coastguard Worker.w16_wpad2:
3578*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q3333  ; wpad == 2: first two quarters are real
3579*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
3580*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
3581*c0909341SAndroid Build Coastguard Worker.w16_wpad1:
3582*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q3333  ; first three quarters are real
3583*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
3584*c0909341SAndroid Build Coastguard Worker.w16:
3585*c0909341SAndroid Build Coastguard Worker    movifnidn         wpadd, wpadm
3586*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7  ; this path additionally uses m6
3587*c0909341SAndroid Build Coastguard Worker.w16_loop:
3588*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
3589*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m5, [ypxq+strideq*1+16*0]
3590*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6  ; quarter 0 of the output row
3591*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2  ; flags from this compare feed jg, je and jp below; SIMD ops leave EFLAGS alone
3592*c0909341SAndroid Build Coastguard Worker    jg .w16_wpad3
3593*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+strideq*0+16*1]
3594*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m5, [ypxq+strideq*1+16*1]
3595*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6  ; quarter 1
3596*c0909341SAndroid Build Coastguard Worker    je .w16_wpad2
3597*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*0+16*2]
3598*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m5, [ypxq+strideq*1+16*2]
3599*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6  ; quarter 2
3600*c0909341SAndroid Build Coastguard Worker    jp .w16_wpad1  ; parity trick: wpad-2 has low byte 0xFF (PF=1) for wpad == 1 and 0xFE (PF=0) for wpad == 0
3601*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+strideq*0+16*3]
3602*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m5, [ypxq+strideq*1+16*3]
3603*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6  ; quarter 3
3604*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
3605*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3606*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0, m3
3607*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3  ; left 8 output words
3608*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1
3609*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3610*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2  ; right 8 output words
3611*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6  ; m2 = dword sums of the whole row (reused for hpad)
3612*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3613*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3614*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3615*c0909341SAndroid Build Coastguard Worker    dec                  hd  ; 1 output row per iteration
3616*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
3617*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
3618*c0909341SAndroid Build Coastguard Worker    add               hpadd, hpadd  ; double the counter (.hpad pads only 2 rows of 16 per step) and set ZF
3619*c0909341SAndroid Build Coastguard Worker    jz .dc
3620*c0909341SAndroid Build Coastguard Worker    paddd                m2, m2  ; 2x the last row sum per .hpad step
3621*c0909341SAndroid Build Coastguard Worker.hpad:
3622*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0  ; shared by w8 (4 rows per step) and w16 (2 rows per step)
3623*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3624*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3625*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*2], m0
3626*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*3], m1
3627*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3628*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
3629*c0909341SAndroid Build Coastguard Worker    jg .hpad
3630*c0909341SAndroid Build Coastguard Worker.dc:
3631*c0909341SAndroid Build Coastguard Worker    sub                  r5, acq ; -w*h*2 (start minus end = minus the number of bytes written)
3632*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m4, q1032
3633*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, r5d  ; ctz of the byte count; r1 is free, ypx is not read again
3634*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
3635*c0909341SAndroid Build Coastguard Worker    sub                 r1d, 2  ; shift = log2(bytes) - 2; pavgw below supplies the last halving
3636*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m2, q2301
3637*c0909341SAndroid Build Coastguard Worker    movd                 m0, r1d
3638*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4  ; every dword of m2 = total sum
3639*c0909341SAndroid Build Coastguard Worker    psrld                m2, m0
3640*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
3641*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m0  ; (x + 1) >> 1: rounded division, net divisor = number of words written
3642*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m2  ; m2 = mean in all 8 words
3643*c0909341SAndroid Build Coastguard Worker.dc_loop:
3644*c0909341SAndroid Build Coastguard Worker    mova                 m0, [acq+r5+16*0]  ; acq is the buffer end, r5 a negative offset counting up to 0
3645*c0909341SAndroid Build Coastguard Worker    mova                 m1, [acq+r5+16*1]
3646*c0909341SAndroid Build Coastguard Worker    psubw                m0, m2  ; subtract the mean
3647*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2
3648*c0909341SAndroid Build Coastguard Worker    mova      [acq+r5+16*0], m0
3649*c0909341SAndroid Build Coastguard Worker    mova      [acq+r5+16*1], m1
3650*c0909341SAndroid Build Coastguard Worker    add                  r5, 16*2
3651*c0909341SAndroid Build Coastguard Worker    jl .dc_loop  ; loop until the offset reaches 0
3652*c0909341SAndroid Build Coastguard Worker    RET
3653*c0909341SAndroid Build Coastguard Worker
3654*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
3655*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
3656*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 && PIC
3657*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m5, m5
3658*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m5
3659*c0909341SAndroid Build Coastguard Worker    psllw                m5, 2
3660*c0909341SAndroid Build Coastguard Worker%else
3661*c0909341SAndroid Build Coastguard Worker    movddup              m5, [pw_4]
3662*c0909341SAndroid Build Coastguard Worker%endif
3663*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
3664*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2
3665*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
3666*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd
3667*c0909341SAndroid Build Coastguard Worker    cmp            dword wm, 8
3668*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
3669*c0909341SAndroid Build Coastguard Worker    jg .w16
3670*c0909341SAndroid Build Coastguard Worker    je .w8
3671*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3672*c0909341SAndroid Build Coastguard Worker.w4_loop:
3673*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0]
3674*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+strideq*1]
3675*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*2]
3676*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+r3       ]
3677*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
3678*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3679*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3
3680*c0909341SAndroid Build Coastguard Worker    paddd                m3, m1
3681*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
3682*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3683*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
3684*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3685*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3686*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3687*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3688*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
3689*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3690*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3691*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
3692*c0909341SAndroid Build Coastguard Worker    pslld                m2, 3
3693*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m1
3694*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3695*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3696*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*2], m1
3697*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*3], m1
3698*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3699*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3700*c0909341SAndroid Build Coastguard Worker.w8:
3701*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3702*c0909341SAndroid Build Coastguard Worker    cmp         dword wpadm, 0
3703*c0909341SAndroid Build Coastguard Worker%else
3704*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
3705*c0909341SAndroid Build Coastguard Worker%endif
3706*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad1
3707*c0909341SAndroid Build Coastguard Worker.w8_loop:
3708*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
3709*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+strideq*0+16*1]
3710*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*1+16*0]
3711*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
3712*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3713*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3714*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
3715*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3716*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3717*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, m3
3718*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
3719*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3720*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3721*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3722*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3723*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
3724*c0909341SAndroid Build Coastguard Worker.w8_hpad:
3725*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3726*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3727*c0909341SAndroid Build Coastguard Worker    pslld                m2, 2
3728*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
3729*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
3730*c0909341SAndroid Build Coastguard Worker.w8_wpad1:
3731*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+strideq*0]
3732*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+strideq*1]
3733*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3734*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q3333
3735*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m1, q3333
3736*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3737*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
3738*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3739*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, m3
3740*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
3741*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3742*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3743*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3744*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3745*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3746*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad1
3747*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
3748*c0909341SAndroid Build Coastguard Worker.w16_wpad3:
3749*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m0, q3333
3750*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3751*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
3752*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
3753*c0909341SAndroid Build Coastguard Worker.w16_wpad2:
3754*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q3333
3755*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
3756*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
3757*c0909341SAndroid Build Coastguard Worker.w16_wpad1:
3758*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q3333
3759*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
3760*c0909341SAndroid Build Coastguard Worker.w16:
3761*c0909341SAndroid Build Coastguard Worker    movifnidn         wpadd, wpadm
3762*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
3763*c0909341SAndroid Build Coastguard Worker.w16_loop:
3764*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5, [ypxq+16*0]
3765*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
3766*c0909341SAndroid Build Coastguard Worker    jg .w16_wpad3
3767*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, [ypxq+16*1]
3768*c0909341SAndroid Build Coastguard Worker    je .w16_wpad2
3769*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, [ypxq+16*2]
3770*c0909341SAndroid Build Coastguard Worker    jp .w16_wpad1
3771*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, [ypxq+16*3]
3772*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
3773*c0909341SAndroid Build Coastguard Worker    add                ypxq, strideq
3774*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0, m3
3775*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m3
3776*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3777*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1
3778*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2
3779*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
3780*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3781*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3782*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3783*c0909341SAndroid Build Coastguard Worker    dec                  hd
3784*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
3785*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
3786*c0909341SAndroid Build Coastguard Worker    add               hpadd, hpadd
3787*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3788*c0909341SAndroid Build Coastguard Worker    paddd                m2, m2
3789*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
3790*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; ipred_cfl_ac_444_16bpc(ac, ypx, stride, wpad, hpad, w, h)
;
; Writes the block at ypx into the ac buffer with every 16-bit sample shifted
; left by 3, and keeps a running dword sum of the written values in m4
; (pmaddwd against m5 = pw_1, which presumably holds 16-bit ones - it is
; defined outside this chunk, confirm).
;
; cglobal arguments (x86inc convention): 3 arguments are loaded into registers
; on entry, 7 GPRs and 6 XMM registers are requested. The .w32 path asks for
; 8 XMM registers on its own through WIN64_SPILL_XMM.
;
; Register roles established below:
;   r5    = copy of the initial ac pointer (never read again in this
;           function; presumably consumed by the shared tail - confirm)
;   r6    = address of the width jump table
;   hd    = h - hpad*4, the number of rows actually read from ypx
;   hpadd = hpad*4 (doubled once more before leaving the .w16 path)
;   m4    = dword accumulator, m5 = pmaddwd multiplier
;
; Every path ends by jumping to the .dc or .hpad label of
; ipred_cfl_ac_420_16bpc_ssse3. Those labels are not visible here; presumably
; they finish the bottom padding and the DC handling - confirm there.
; ---------------------------------------------------------------------------
3791*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
3792*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
    ; Dispatch on tzcnt(w): the table holds 32-bit entries that are added to
    ; the table address (r6) to form the jump target. w is presumably a power
    ; of two so that tzcnt yields log2(w).
3793*c0909341SAndroid Build Coastguard Worker    LEA                  r6, ipred_cfl_ac_444_16bpc_ssse3_table
3794*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3795*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
3796*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4  ; clear the running sum
3797*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
3798*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pw_1]  ; 8 bytes of pw_1 duplicated into both halves
3799*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
3800*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
3801*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2  ; hpadd = hpad*4
3802*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd  ; hd = rows to read from ypx
3803*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: four rows of 8 bytes (4 words) per iteration ----
3804*c0909341SAndroid Build Coastguard Worker.w4:
3805*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3806*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
3807*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; m0 = rows 0|1, m1 = rows 2|3 (movq/movhps: no alignment requirement).
3808*c0909341SAndroid Build Coastguard Worker    movq                 m0, [ypxq+strideq*0]
3809*c0909341SAndroid Build Coastguard Worker    movhps               m0, [ypxq+strideq*1]
3810*c0909341SAndroid Build Coastguard Worker    movq                 m1, [ypxq+strideq*2]
3811*c0909341SAndroid Build Coastguard Worker    movhps               m1, [ypxq+r3       ]
3812*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*4]
3813*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
3814*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
3815*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3816*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5  ; fold adjacent words into dwords for the sum
3817*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3818*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, m1  ; m1 itself stays intact for the padding below
3819*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3820*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3821*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3822*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3823*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
3824*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3825*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    ; Bottom padding for w == 4: the last row read (high qword of m1) is
    ; duplicated and stored as 4 x 16 bytes, i.e. eight 4-word rows. For the
    ; sum, m2 (pair sums of rows 2|3) is scaled by 4 and reduced to its row-3
    ; half in both qwords, which adds eight times the last row to m4.
    ; NOTE(review): always pads eight rows regardless of the hpadd value -
    ; presumably the only non-zero case for this width; confirm with caller.
3826*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
3827*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m1
3828*c0909341SAndroid Build Coastguard Worker    pslld                m2, 2
3829*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3830*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2
3831*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*2], m1
3832*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3833*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*3], m1
3834*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3835*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    ; ---- w == 8: two rows of 16 bytes per iteration ----
    ; The row loads use mova, so ypx rows must be 16-byte aligned here.
3836*c0909341SAndroid Build Coastguard Worker.w8:
3837*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
3838*c0909341SAndroid Build Coastguard Worker.w8_loop:
3839*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+strideq*0]
3840*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ypxq+strideq*1]
3841*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3842*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
3843*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
3844*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3845*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m5
3846*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3847*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, m1  ; m2 = dword sums of the second row
3848*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*2
3849*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
3850*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3851*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3852*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
3853*c0909341SAndroid Build Coastguard Worker.w8_hpad:
3854*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3855*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    ; Hand over to the shared .hpad tail with m0 = last row written and
    ; m2 = four times that row's dword sums. The exact register contract of
    ; .hpad is defined in the 420 function - confirm there.
3856*c0909341SAndroid Build Coastguard Worker    pslld                m2, 2
3857*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
3858*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
    ; ---- w == 16, right padding: taken whenever wpad != 0 ----
    ; Builds the right 16 bytes of both rows by broadcasting the last word of
    ; the left 16 bytes (pshufhw q3333 fills the high four words with word 7,
    ; punpckhqdq copies that qword into the low half as well).
3859*c0909341SAndroid Build Coastguard Worker.w16_wpad2:
3860*c0909341SAndroid Build Coastguard Worker    pshufhw              m3, m2, q3333
3861*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m0, q3333
3862*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3
3863*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
3864*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
    ; ---- w == 16: two rows of 32 bytes per iteration ----
    ; m2|m3 = first row (left|right), m0|m1 = second row (left|right).
3865*c0909341SAndroid Build Coastguard Worker.w16:
3866*c0909341SAndroid Build Coastguard Worker    movifnidn         wpadd, wpadm
3867*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
3868*c0909341SAndroid Build Coastguard Worker.w16_loop:
3869*c0909341SAndroid Build Coastguard Worker    mova                 m2, [ypxq+strideq*0+16*0]
3870*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+strideq*1+16*0]
3871*c0909341SAndroid Build Coastguard Worker    psllw                m2, 3
3872*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
3873*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
3874*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad2
3875*c0909341SAndroid Build Coastguard Worker    mova                 m3, [ypxq+strideq*0+16*1]
3876*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ypxq+strideq*1+16*1]
3877*c0909341SAndroid Build Coastguard Worker    psllw                m3, 3
3878*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
3879*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
3880*c0909341SAndroid Build Coastguard Worker    lea                ypxq, [ypxq+strideq*2]
3881*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m2
3882*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
3883*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m3
3884*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
3885*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3886*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, m0  ; m2/m3 are reused for the second row's sums
3887*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*2], m0
3888*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
3889*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5, m1
3890*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*3], m1
3891*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3892*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3  ; m2 = dword sums of the whole second row
3893*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
3894*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3895*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
    ; add doubles hpadd and sets ZF when it was zero, so one instruction
    ; serves as both the scaling and the "no padding" test. m2 is doubled to
    ; match before entering .hpad with m0|m1 = last row written. The units
    ; .hpad expects are defined in the 420 function - confirm there.
3896*c0909341SAndroid Build Coastguard Worker    add               hpadd, hpadd
3897*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3898*c0909341SAndroid Build Coastguard Worker    paddd                m2, m2
3899*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
    ; ---- w == 32, right padding ----
    ; Each entry broadcasts the last word of the last 16-byte group that was
    ; loaded into every remaining group of the row (m0..m3 = groups 0..3).
3900*c0909341SAndroid Build Coastguard Worker.w32_wpad6:
3901*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m0, q3333
3902*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
3903*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
3904*c0909341SAndroid Build Coastguard Worker    mova                 m3, m1
3905*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
3906*c0909341SAndroid Build Coastguard Worker.w32_wpad4:
3907*c0909341SAndroid Build Coastguard Worker    pshufhw              m2, m1, q3333
3908*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2
3909*c0909341SAndroid Build Coastguard Worker    mova                 m3, m2
3910*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
3911*c0909341SAndroid Build Coastguard Worker.w32_wpad2:
3912*c0909341SAndroid Build Coastguard Worker    pshufhw              m3, m2, q3333
3913*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3
3914*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
    ; ---- w == 32: one row of 64 bytes per iteration ----
    ; m6/m7 are used as temporaries, hence the request for 8 XMM registers
    ; (WIN64_SPILL_XMM is an x86inc macro; it only matters on Win64).
3915*c0909341SAndroid Build Coastguard Worker.w32:
3916*c0909341SAndroid Build Coastguard Worker    movifnidn         wpadd, wpadm
3917*c0909341SAndroid Build Coastguard Worker    mov                  r5, acq
3918*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
3919*c0909341SAndroid Build Coastguard Worker.w32_loop:
    ; A single cmp feeds all three branches below: mova/psllw do not modify
    ; EFLAGS. jg: wpad > 4, je: wpad == 4. jnp looks at the parity of the low
    ; byte of wpad-4: 2-4 = 0xFE has odd parity (PF=0, branch taken) while
    ; 0-4 = 0xFC has even parity (PF=1, fall through to the full-width load).
    ; This relies on wpad being one of 0/2/4/6, as the label names suggest -
    ; confirm with the caller.
3920*c0909341SAndroid Build Coastguard Worker    mova                 m0, [ypxq+16*0]
3921*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
3922*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 4
3923*c0909341SAndroid Build Coastguard Worker    jg .w32_wpad6
3924*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ypxq+16*1]
3925*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
3926*c0909341SAndroid Build Coastguard Worker    je .w32_wpad4
3927*c0909341SAndroid Build Coastguard Worker    mova                 m2, [ypxq+16*2]
3928*c0909341SAndroid Build Coastguard Worker    psllw                m2, 3
3929*c0909341SAndroid Build Coastguard Worker    jnp .w32_wpad2
3930*c0909341SAndroid Build Coastguard Worker    mova                 m3, [ypxq+16*3]
3931*c0909341SAndroid Build Coastguard Worker    psllw                m3, 3
3932*c0909341SAndroid Build Coastguard Worker.w32_wpad_end:
    ; Store the row and collect its dword sums in m6 (m7 is scratch); m6 is
    ; kept after the loop as the per-row sum used for bottom padding.
3933*c0909341SAndroid Build Coastguard Worker    add                ypxq, strideq
3934*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m5, m0
3935*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3936*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, m1
3937*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3938*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
3939*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, m2
3940*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*2], m2
3941*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
3942*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m5, m3
3943*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*3], m3
3944*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3945*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
3946*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
3947*c0909341SAndroid Build Coastguard Worker    dec                  hd
3948*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
    ; Win64 only: the saved xmm registers are about to be restored, so the
    ; last row's sums are first copied into m5 (the multiplier is no longer
    ; needed). SWAP is an assembly-time rename in x86inc, after which the
    ; name m6 refers to that copy and the code below is shared by all ABIs.
3949*c0909341SAndroid Build Coastguard Worker%if WIN64
3950*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
3951*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
3952*c0909341SAndroid Build Coastguard Worker    SWAP                  5, 6
3953*c0909341SAndroid Build Coastguard Worker%endif
3954*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3955*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    ; Bottom padding for w == 32 is done locally: rewrite the last row
    ; (m0..m3) and add its sums (m6) once per padded row, hpadd times.
3956*c0909341SAndroid Build Coastguard Worker.w32_hpad_loop:
3957*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*0], m0
3958*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*1], m1
3959*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
3960*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*2], m2
3961*c0909341SAndroid Build Coastguard Worker    mova         [acq+16*3], m3
3962*c0909341SAndroid Build Coastguard Worker    add                 acq, 16*4
3963*c0909341SAndroid Build Coastguard Worker    dec               hpadd
3964*c0909341SAndroid Build Coastguard Worker    jg .w32_hpad_loop
3965*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
3966*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; pal_pred_16bpc(dst, stride, pal, idx, w, h)
;
; Expands packed palette indices read from idx into 16-bit pixels at dst.
;
; Index format as consumed below: every idx byte supplies two lookups, the
; low nibble first and the high nibble second (psrlw by 4 followed by a byte
; interleave of the original and the shifted copy). pshufb only uses the low
; four bits of a control byte and zeroes the lane when bit 7 is set, so this
; works only while every nibble is below 8 - consistent with the 16-byte
; palette load, presumably eight 16-bit colours; confirm with the caller.
;
; Palette setup: m4 = the 16 palette bytes permuted by pal_pred_shuf, m5 = m4
; with its two qwords swapped. Each output pixel is assembled by interleaving
; a byte looked up in m4 with a byte looked up in m5, so pal_pred_shuf
; presumably groups all low bytes in one qword and all high bytes in the
; other (the constant is defined outside this chunk - confirm).
;
; r2 first holds pal, then the jump table address. On x86-32 it is reused a
; third time for h (hd is redefined as r2d), which is why h is loaded only
; after the jump target in wq has been computed.
;
; Stores to dst use mova for w >= 8 and therefore need 16-byte alignment;
; idx is read with movq/movu and has no alignment requirement.
; ---------------------------------------------------------------------------
3967*c0909341SAndroid Build Coastguard Workercglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
3968*c0909341SAndroid Build Coastguard Worker%define base r2-pal_pred_16bpc_ssse3_table
3969*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3970*c0909341SAndroid Build Coastguard Worker    %define              hd  r2d
3971*c0909341SAndroid Build Coastguard Worker%endif
3972*c0909341SAndroid Build Coastguard Worker    mova                 m4, [palq]  ; must happen before r2 is overwritten below
3973*c0909341SAndroid Build Coastguard Worker    LEA                  r2, pal_pred_16bpc_ssse3_table
3974*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3975*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pal_pred_shuf]
3976*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r2+wq*4]  ; 32-bit table entry, relative to the table address
3977*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q1032  ; swap the two qwords of m4
3978*c0909341SAndroid Build Coastguard Worker    add                  wq, r2
3979*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3980*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: 8 index bytes -> 16 pixels -> four rows per iteration ----
3981*c0909341SAndroid Build Coastguard Worker.w4:
3982*c0909341SAndroid Build Coastguard Worker    movq                 m0, [idxq]
3983*c0909341SAndroid Build Coastguard Worker    add                idxq, 8
3984*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m0, 4
3985*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1  ; one control byte per pixel, in pixel order
3986*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m0
3987*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m0
3988*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2  ; pixels 0-7  (rows 0 and 1)
3989*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2  ; pixels 8-15 (rows 2 and 3)
3990*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
3991*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
3992*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3993*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
3994*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
3995*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3996*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3997*c0909341SAndroid Build Coastguard Worker    jg .w4
3998*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 8: 16 index bytes -> 32 pixels -> four rows per iteration ----
    ; m0 = control bytes for pixels 0-15, m3 = control bytes for pixels 16-31.
3999*c0909341SAndroid Build Coastguard Worker.w8:
4000*c0909341SAndroid Build Coastguard Worker    movu                 m3, [idxq]
4001*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
4002*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4003*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m3, m1
4004*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4005*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m0
4006*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m0
4007*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4008*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4009*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
4010*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
4011*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4012*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4013*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m3
4014*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4015*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4016*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
4017*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
4018*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4019*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4020*c0909341SAndroid Build Coastguard Worker    jg .w8
4021*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 16: 16 index bytes -> 32 pixels -> two rows per iteration ----
4022*c0909341SAndroid Build Coastguard Worker.w16:
4023*c0909341SAndroid Build Coastguard Worker    movu                 m3, [idxq]
4024*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
4025*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4026*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m3, m1
4027*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4028*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m0
4029*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m0
4030*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4031*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4032*c0909341SAndroid Build Coastguard Worker    mova          [dstq+ 0], m0
4033*c0909341SAndroid Build Coastguard Worker    mova          [dstq+16], m1
4034*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4035*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m3
4036*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4037*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4038*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq+ 0], m0
4039*c0909341SAndroid Build Coastguard Worker    mova  [dstq+strideq+16], m1
4040*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4041*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4042*c0909341SAndroid Build Coastguard Worker    jg .w16
4043*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 32: 16 index bytes -> 32 pixels -> one 64-byte row ----
4044*c0909341SAndroid Build Coastguard Worker.w32:
4045*c0909341SAndroid Build Coastguard Worker    movu                 m3, [idxq]
4046*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
4047*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4048*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m3, m1
4049*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4050*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m0
4051*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m0
4052*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4053*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4054*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
4055*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
4056*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4057*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m3
4058*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4059*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4060*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
4061*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
4062*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4063*c0909341SAndroid Build Coastguard Worker    dec                  hd
4064*c0909341SAndroid Build Coastguard Worker    jg .w32
4065*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 64: 32 index bytes -> 64 pixels -> one 128-byte row ----
    ; The second 16 index bytes are loaded in the middle of the first half's
    ; processing, as soon as m3 is free again.
4066*c0909341SAndroid Build Coastguard Worker.w64:
4067*c0909341SAndroid Build Coastguard Worker    movu                 m3, [idxq+16*0]
4068*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4069*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m3, m1
4070*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4071*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m0
4072*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m0
4073*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4074*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4075*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
4076*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
4077*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4078*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m3
4079*c0909341SAndroid Build Coastguard Worker    movu                 m3, [idxq+16*1]
4080*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
4081*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4082*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4083*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
4084*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m1
4085*c0909341SAndroid Build Coastguard Worker    psrlw                m1, m3, 4
4086*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m3, m1
4087*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
4088*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m0
4089*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m0
4090*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4091*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4092*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
4093*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m1
4094*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m3
4095*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m3
4096*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
4097*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
4098*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
4099*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m1
4100*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4101*c0909341SAndroid Build Coastguard Worker    dec                  hd
4102*c0909341SAndroid Build Coastguard Worker    jg .w64
4103*c0909341SAndroid Build Coastguard Worker    RET
4104