xref: /aosp_15_r20/external/libdav1d/src/x86/ipred_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2018-2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2018, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
32*c0909341SAndroid Build Coastguard Worker
; SMOOTH_WEIGHT_TABLE w0, w1, ...
; Emits two bytes per argument w, in argument order: (w - 128), then (127 - w).
; %rep %0 combined with %rotate 1 visits every argument exactly once.
33*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_WEIGHT_TABLE 1-*
34*c0909341SAndroid Build Coastguard Worker    %rep %0
35*c0909341SAndroid Build Coastguard Worker        db %1-128, 127-%1
36*c0909341SAndroid Build Coastguard Worker        %rotate 1
37*c0909341SAndroid Build Coastguard Worker    %endrep
38*c0909341SAndroid Build Coastguard Worker%endmacro
39*c0909341SAndroid Build Coastguard Worker
40*c0909341SAndroid Build Coastguard Worker; sm_weights[], but modified to precalculate x and 256-x with offsets to
41*c0909341SAndroid Build Coastguard Worker; enable efficient use of pmaddubsw (which requires signed values)
; Layout: 128 weights are passed to the macro (8 + 8 + 16 + 32 + 64), and the
; macro emits 2 bytes per weight, so the table occupies 256 bytes.
; The sequence restarts at 255 at entries 2, 4, 8, 16, 32 and 64.
; NOTE(review): presumably the weights for an N-pixel edge start at entry N
; (byte offset 2*N) -- confirm against the users of this table.
; The whole table is a single continued macro invocation; do not insert
; comments or blank lines between the rows below.
42*c0909341SAndroid Build Coastguard Workersmooth_weights: SMOOTH_WEIGHT_TABLE         \
43*c0909341SAndroid Build Coastguard Worker      0,   0, 255, 128, 255, 149,  85,  64, \
44*c0909341SAndroid Build Coastguard Worker    255, 197, 146, 105,  73,  50,  37,  32, \
45*c0909341SAndroid Build Coastguard Worker    255, 225, 196, 170, 145, 123, 102,  84, \
46*c0909341SAndroid Build Coastguard Worker     68,  54,  43,  33,  26,  20,  17,  16, \
47*c0909341SAndroid Build Coastguard Worker    255, 240, 225, 210, 196, 182, 169, 157, \
48*c0909341SAndroid Build Coastguard Worker    145, 133, 122, 111, 101,  92,  83,  74, \
49*c0909341SAndroid Build Coastguard Worker     66,  59,  52,  45,  39,  34,  29,  25, \
50*c0909341SAndroid Build Coastguard Worker     21,  17,  14,  12,  10,   9,   8,   8, \
51*c0909341SAndroid Build Coastguard Worker    255, 248, 240, 233, 225, 218, 210, 203, \
52*c0909341SAndroid Build Coastguard Worker    196, 189, 182, 176, 169, 163, 156, 150, \
53*c0909341SAndroid Build Coastguard Worker    144, 138, 133, 127, 121, 116, 111, 106, \
54*c0909341SAndroid Build Coastguard Worker    101,  96,  91,  86,  82,  77,  73,  69, \
55*c0909341SAndroid Build Coastguard Worker     65,  61,  57,  54,  50,  47,  44,  41, \
56*c0909341SAndroid Build Coastguard Worker     38,  35,  32,  29,  27,  25,  22,  20, \
57*c0909341SAndroid Build Coastguard Worker     18,  16,  15,  13,  12,  10,   9,   8, \
58*c0909341SAndroid Build Coastguard Worker      7,   6,   6,   5,   5,   4,   4,   4
59*c0909341SAndroid Build Coastguard Worker
; Constant pool. The labels below are emitted back to back with no padding
; between them, and the layout itself is relied upon:
;   * the pb_*/pw_* %defines later in this file alias 4-byte runs inside these
;     tables by byte offset;
;   * pb_32to1 holds only 32..17 and is directly followed by pb_16to1 (16..1);
;   * filter_shuf3 and ipred_h_shuf have their last four bytes commented out,
;     so the constant that follows each of them occupies those positions.
; Do not reorder, resize or insert entries without re-checking all of these.
60*c0909341SAndroid Build Coastguard Workerpb_1to32:     db  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
61*c0909341SAndroid Build Coastguard Worker              db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
62*c0909341SAndroid Build Coastguard Workerpb_32to1:     db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
63*c0909341SAndroid Build Coastguard Workerpb_16to1:     db 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
64*c0909341SAndroid Build Coastguard Workerz_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
65*c0909341SAndroid Build Coastguard Worker              db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
; z_filter_k: three rows of 12 bytes; bytes 32..35 (8, 0, 8, 0) double as pw_8.
66*c0909341SAndroid Build Coastguard Workerz_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
67*c0909341SAndroid Build Coastguard Worker              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
68*c0909341SAndroid Build Coastguard Worker              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
; z_filter_s is declared through the x86inc `const` macro instead of a plain
; label. NOTE(review): presumably so that other object files can reference
; it -- confirm. Bytes 32..39 (all 15) double as pb_15.
69*c0909341SAndroid Build Coastguard Workerconst \
70*c0909341SAndroid Build Coastguard Workerz_filter_s,   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
71*c0909341SAndroid Build Coastguard Worker              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
72*c0909341SAndroid Build Coastguard Worker              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
73*c0909341SAndroid Build Coastguard Workerpb_128:       times 4 db 128 ; those are just placed here for alignment.
74*c0909341SAndroid Build Coastguard Workerpb_36_m4:     times 2 db 36, -4
75*c0909341SAndroid Build Coastguard Workerz3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
76*c0909341SAndroid Build Coastguard Workerz_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
77*c0909341SAndroid Build Coastguard Workerz_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
78*c0909341SAndroid Build Coastguard Workerz_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
; z_upsample2: bytes 12..15 (8, 8, 8, 8) double as pb_8.
79*c0909341SAndroid Build Coastguard Workerz_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
80*c0909341SAndroid Build Coastguard Workerz2_upsample:  db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
81*c0909341SAndroid Build Coastguard Workerz1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
82*c0909341SAndroid Build Coastguard Workerz2_shuf_h2:   db  3,  2,  7,  6, 11, 10, 15, 14,  2,  1,  6,  5, 10,  9, 14, 13
83*c0909341SAndroid Build Coastguard Workerz2_shuf_h4:   db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
84*c0909341SAndroid Build Coastguard Workerz3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
85*c0909341SAndroid Build Coastguard Workerz_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
86*c0909341SAndroid Build Coastguard Workerz_base_inc:   dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
87*c0909341SAndroid Build Coastguard Worker              dw  16*64,  17*64,  18*64,  19*64,  20*64,  21*64,  22*64,  23*64
88*c0909341SAndroid Build Coastguard Workerz2_base_inc:  dw   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64,   8*64
89*c0909341SAndroid Build Coastguard Worker              dw   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64,  16*64
90*c0909341SAndroid Build Coastguard Workerz2_ymul:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
; z2_y_shuf_h4: its 4-byte groups double as pb_90, pb_14, pb_27, pb_31 (first
; row) and pb_32, pb_12, pw_1 (second row; the bytes 1, 0, 1, 0 are two words of 1).
91*c0909341SAndroid Build Coastguard Workerz2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
92*c0909341SAndroid Build Coastguard Worker              db 32, 32, 32, 32, 12, 12, 12, 12,  1,  0,  1,  0,  5, -1, -1, -1 ; 0, 4, 1, 5
93*c0909341SAndroid Build Coastguard Worker; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
94*c0909341SAndroid Build Coastguard Workerfilter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
95*c0909341SAndroid Build Coastguard Worker              db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
96*c0909341SAndroid Build Coastguard Workerfilter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
97*c0909341SAndroid Build Coastguard Workerfilter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
; NOTE(review): only 12 bytes of filter_shuf3 are emitted; a 16-byte load reads
; pb_127_m127 (127, -127, 127, -127) as bytes 12..15. Presumably equivalent to
; the commented-out 15, -1, 15, -1 when used as a pshufb mask (same low 4 bits,
; same bit 7) -- confirm at the use site.
98*c0909341SAndroid Build Coastguard Workerpb_127_m127:  times 2 db 127, -127
99*c0909341SAndroid Build Coastguard Workeripred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
100*c0909341SAndroid Build Coastguard Worker              db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
; ipred_h_shuf: its 4-byte groups double as pb_7, pb_3, pb_5, pb_1 (first row)
; and pb_2, pb_4 (second row, offsets 20 and 24).
101*c0909341SAndroid Build Coastguard Workeripred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
102*c0909341SAndroid Build Coastguard Worker              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
; NOTE(review): the second row of ipred_h_shuf is 12 bytes; pw_64 (bytes
; 64, 0, 64, 0) sits where the commented-out 0, 0, 0, 0 would be. Presumably
; equivalent as a pshufb mask (low 4 bits are 0, bit 7 clear) -- confirm.
103*c0909341SAndroid Build Coastguard Workerpw_64:        times 2 dw 64
104*c0909341SAndroid Build Coastguard Worker
; Byte-shuffle tables for the cfl_ac padding paths, plus pb_15to0.
; Sizes as emitted:
;   cfl_ac_444_w16_pad1_shuffle : 14 + 9*2 = 32 bytes
;   cfl_ac_w16_pad_shuffle      : three consecutive 32-byte entries, in the
;                                 order w_pad=1, w_pad=2, w_pad=3
;   cfl_ac_w8_pad1_shuffle      : 6 + 5*2 = 16 bytes, located at
;                                 cfl_ac_w16_pad_shuffle+16 (it is the second
;                                 half of the w_pad=1 entry)
; The first 16 bytes of cfl_ac_w16_pad_shuffle (0..15) are also used as
; pb_0to15 via a %define later in this file.
105*c0909341SAndroid Build Coastguard Workercfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
106*c0909341SAndroid Build Coastguard Worker                             times 9 db 7, -1
107*c0909341SAndroid Build Coastguard Workercfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
108*c0909341SAndroid Build Coastguard Worker                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
109*c0909341SAndroid Build Coastguard Worker                        ; w=8, w_pad=1 as well as second half of previous one
110*c0909341SAndroid Build Coastguard Workercfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
111*c0909341SAndroid Build Coastguard Worker                        times 5 db 6, 7
112*c0909341SAndroid Build Coastguard Worker                        ; w=16,w_pad=2
113*c0909341SAndroid Build Coastguard Worker                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
114*c0909341SAndroid Build Coastguard Worker                        times 8 db 14, 15
115*c0909341SAndroid Build Coastguard Worker                        ; w=16,w_pad=3
116*c0909341SAndroid Build Coastguard Worker                        db 0, 1, 2, 3, 4, 5
117*c0909341SAndroid Build Coastguard Worker                        times 13 db 6, 7
118*c0909341SAndroid Build Coastguard Workerpb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
119*c0909341SAndroid Build Coastguard Worker
; Aliases for small constants that already exist inside the tables above, so
; no extra data has to be emitted for them. Each alias is an address
; expression (table label + byte offset):
;   pb_N  -> four consecutive bytes equal to N, e.g. ipred_h_shuf+12 is
;            1, 1, 1, 1 and z2_y_shuf_h4+0 is 90, 90, 90, 90
;   pw_1  -> bytes 1, 0, 1, 0 at z2_y_shuf_h4+24 (two words of 1)
;   pw_8  -> bytes 8, 0, 8, 0 at z_filter_k+32   (two words of 8)
;   pb_0to15 -> the bytes 0..15 at the start of cfl_ac_w16_pad_shuffle
; These offsets are only valid for the current table contents; any edit to
; ipred_h_shuf, z_upsample2, z2_y_shuf_h4, z_filter_s or z_filter_k must be
; re-checked against this list.
120*c0909341SAndroid Build Coastguard Worker%define pb_0to15 cfl_ac_w16_pad_shuffle
121*c0909341SAndroid Build Coastguard Worker%define pb_1  (ipred_h_shuf+12)
122*c0909341SAndroid Build Coastguard Worker%define pb_2  (ipred_h_shuf+20)
123*c0909341SAndroid Build Coastguard Worker%define pb_3  (ipred_h_shuf+ 4)
124*c0909341SAndroid Build Coastguard Worker%define pb_4  (ipred_h_shuf+24)
125*c0909341SAndroid Build Coastguard Worker%define pb_5  (ipred_h_shuf+ 8)
126*c0909341SAndroid Build Coastguard Worker%define pb_7  (ipred_h_shuf+ 0)
127*c0909341SAndroid Build Coastguard Worker%define pb_8  (z_upsample2 +12)
128*c0909341SAndroid Build Coastguard Worker%define pb_12 (z2_y_shuf_h4+20)
129*c0909341SAndroid Build Coastguard Worker%define pb_14 (z2_y_shuf_h4+ 4)
130*c0909341SAndroid Build Coastguard Worker%define pb_15 (z_filter_s  +32)
131*c0909341SAndroid Build Coastguard Worker%define pb_27 (z2_y_shuf_h4+ 8)
132*c0909341SAndroid Build Coastguard Worker%define pb_31 (z2_y_shuf_h4+12)
133*c0909341SAndroid Build Coastguard Worker%define pb_32 (z2_y_shuf_h4+16)
134*c0909341SAndroid Build Coastguard Worker%define pb_90 (z2_y_shuf_h4+ 0)
135*c0909341SAndroid Build Coastguard Worker%define pw_1  (z2_y_shuf_h4+24)
136*c0909341SAndroid Build Coastguard Worker%define pw_8  (z_filter_k  +32)
137*c0909341SAndroid Build Coastguard Worker
; Stand-alone word constants; each label covers two identical words (4 bytes).
138*c0909341SAndroid Build Coastguard Workerpw_62:    times 2 dw 62
139*c0909341SAndroid Build Coastguard Workerpw_128:   times 2 dw 128
140*c0909341SAndroid Build Coastguard Workerpw_255:   times 2 dw 255
141*c0909341SAndroid Build Coastguard Workerpw_512:   times 2 dw 512
142*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, suffix, label0, label1, ...
; Defines the symbol <name>_<suffix>_table and emits one dword per label.
; Each dword is the distance from that table symbol to the local label
; .<labelN> of the function <private_prefix>_<name>_8bpc_<suffix>, so
; (table symbol + entry) is the absolute address of the label.
; The table symbol is biased 2*4 bytes below the first emitted entry: entry
; index 2 therefore selects label0. The functions in this file index the
; tables with tzcnt of a width/height, and tzcnt(4) == 2 maps to the
; first (w4/h4) label.
; A label argument may carry an extra constant (e.g. "s4-10*4"); it is pasted
; into the dd expression unchanged and shifts the stored offset.
143*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 3-*
144*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*4)
145*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
146*c0909341SAndroid Build Coastguard Worker    %%table:
147*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
148*c0909341SAndroid Build Coastguard Worker        dd %%base %+ .%3 - (%%table - 2*4)
149*c0909341SAndroid Build Coastguard Worker        %rotate 1
150*c0909341SAndroid Build Coastguard Worker    %endrep
151*c0909341SAndroid Build Coastguard Worker%endmacro
152*c0909341SAndroid Build Coastguard Worker
; The ipred_dc table lists 5 h-labels, 5 w-labels and then 5 s-labels; the
; ipred_cfl table lists 4 + 4 + 4. The *_splat_* symbols point at the s-label
; tail (10 resp. 8 dwords in). The s-label arguments below carry a matching
; "-10*4" / "-8*4", which re-bases those entries so that
; (splat table symbol + entry) is the address of .sN. This lets
; ipred_dc_top/ipred_dc_left index the splat table with tzcnt(w) directly.
153*c0909341SAndroid Build Coastguard Worker%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
154*c0909341SAndroid Build Coastguard Worker%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
155*c0909341SAndroid Build Coastguard Worker
; Jump tables, one per function, built with the JMP_TABLE macro. The label
; lists must stay in ascending size order because they are indexed by
; tzcnt(size). Two of the invocations are continued with a trailing
; backslash; do not append comments to those lines.
156*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth,     avx2, w4, w8, w16, w32, w64
157*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_v,   avx2, w4, w8, w16, w32, w64
158*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_h,   avx2, w4, w8, w16, w32, w64
159*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_paeth,      avx2, w4, w8, w16, w32, w64
160*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_filter,     avx2, w4, w8, w16, w32
161*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
162*c0909341SAndroid Build Coastguard Worker                                  s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
163*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_left,    avx2, h4, h8, h16, h32, h64
164*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_h,          avx2, w4, w8, w16, w32, w64
165*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z1,         avx2, w4, w8, w16, w32, w64
166*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z2,         avx2, w4, w8, w16, w32, w64
167*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z3,         avx2, h4, h8, h16, h32, h64
168*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
169*c0909341SAndroid Build Coastguard Worker                                  s4-8*4, s8-8*4, s16-8*4, s32-8*4
170*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_left,   avx2, h4, h8, h16, h32
171*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
172*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
173*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
174*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_pred,         avx2, w4, w8, w16, w32, w64
175*c0909341SAndroid Build Coastguard Worker
; Tables defined outside this file and imported here.
176*c0909341SAndroid Build Coastguard Workercextern dr_intra_derivative
177*c0909341SAndroid Build Coastguard Workercextern filter_intra_taps
178*c0909341SAndroid Build Coastguard Worker
179*c0909341SAndroid Build Coastguard WorkerSECTION .text
180*c0909341SAndroid Build Coastguard Worker
181*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
; ipred_dc_top_8bpc(dst, stride, tl, w, h)
; Produces the rounded average of the w bytes starting at tl+1 (the "top"
; edge, going by the function name). This function only sets up state and
; then tail-jumps into ipred_dc_left_8bpc's .h<w> reduction chain, which in
; turn jumps to ipred_dc_8bpc's .s<w> store loop.
; State handed over at the jump:
;   m0  = words holding -(b[2i] + b[2i+1]) for the 32 bytes at tl+1
;   m2  = all-ones (-1), reused by the reduction code
;   xm3 = 0x8000 >> log2(w); with pmulhrsw this is a rounded shift by log2(w)
;   wq  = address of the .s<w> store loop
;   hd  = h, tlq = tl+1
182*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
183*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_left_avx2_table]
184*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; wd = log2(w), used as the table index
185*c0909341SAndroid Build Coastguard Worker    inc                 tlq ; tlq = tl+1
186*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq] ; 32 bytes from tl+1; the .hN entry point decides how many are summed
187*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
188*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x8000
189*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, wd ; 0x8000 >> log2(w)
190*c0909341SAndroid Build Coastguard Worker    movd                xm3, r6d
191*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+wq*4] ; dc_left table entry for width w
192*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2 ; m2 = all-ones
193*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2 ; unsigned bytes * (-1), adjacent pairs added
194*c0909341SAndroid Build Coastguard Worker    add                  r6, r5 ; r6 = address of ipred_dc_left .h<w>
195*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table ; r5 = splat table
196*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
197*c0909341SAndroid Build Coastguard Worker    add                  wq, r5 ; wq = address of ipred_dc .s<w>
198*c0909341SAndroid Build Coastguard Worker    jmp                  r6
199*c0909341SAndroid Build Coastguard Worker
; ipred_dc_left_8bpc(dst, stride, tl, w, h)
; Produces the rounded average of the h bytes at tl-h .. tl-1 (the "left"
; edge, going by the function name), broadcasts it, and tail-jumps to
; ipred_dc_8bpc's .s<w> store loop.
; The .h64 .. .h4 labels form a fall-through reduction chain: entering at .hN
; sums the first N bytes of the loaded data. ipred_dc_top_8bpc also jumps into
; these labels through ipred_dc_left_avx2_table, so on entry to any .hN the
; following must hold:
;   m0  = words of negated byte-pair sums (pmaddubsw with all-ones)
;   m2  = all-ones
;   xm3 = 0x8000 >> log2(N)
;   wq  = address of the store loop to jump to
;   tlq = address the first 32 bytes were loaded from (used by .h64)
200*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
201*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm ; zero upper half
202*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd ; r6d = log2(h)
203*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq ; tlq = tl-h (64-bit use of h, hence the zero-extending mov above)
204*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; wd = log2(w), index into the splat table
205*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq] ; 32 bytes from tl-h
206*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 0x8000
207*c0909341SAndroid Build Coastguard Worker    shrx                r5d, r5d, r6d ; 0x8000 >> log2(h)
208*c0909341SAndroid Build Coastguard Worker    movd                xm3, r5d
209*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_left_avx2_table]
210*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4] ; table entry for height h
211*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2 ; m2 = all-ones
212*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2 ; unsigned bytes * (-1), adjacent pairs added
213*c0909341SAndroid Build Coastguard Worker    add                  r6, r5 ; r6 = address of .h<h>
214*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table ; r5 = splat table
215*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
216*c0909341SAndroid Build Coastguard Worker    add                  wq, r5 ; wq = address of ipred_dc .s<w>
217*c0909341SAndroid Build Coastguard Worker    jmp                  r6
218*c0909341SAndroid Build Coastguard Worker.h64: ; add the pair sums of the second 32 bytes
219*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
220*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
221*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
222*c0909341SAndroid Build Coastguard Worker.h32: ; fold the upper 128-bit lane into the lower one
223*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
224*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
225*c0909341SAndroid Build Coastguard Worker.h16: ; fold the upper 64 bits into the lower 64
226*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
227*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
228*c0909341SAndroid Build Coastguard Worker.h8: ; fold words 2..3 into words 0..1
229*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
230*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
231*c0909341SAndroid Build Coastguard Worker.h4:
232*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm2 ; words * (-1), pairs added: dword 0 = positive total
233*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3 ; rounded shift by log2(N) -> average
234*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
235*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0 ; replicate the low byte (the DC value) across m0
236*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; NOTE(review): second copy presumably for the 64-wide store loop -- not visible here, confirm
237*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; continue in ipred_dc .s<w>
238*c0909341SAndroid Build Coastguard Worker
239*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
240*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
241*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
242*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
243*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq+hq]
244*c0909341SAndroid Build Coastguard Worker    movd                xm4, r5d
245*c0909341SAndroid Build Coastguard Worker    tzcnt               r5d, r5d
246*c0909341SAndroid Build Coastguard Worker    movd                xm5, r5d
247*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_avx2_table]
248*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
249*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
250*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4+5*4]
251*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m3, m3
252*c0909341SAndroid Build Coastguard Worker    psrlw               xm4, 1
253*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
254*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
255*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
256*c0909341SAndroid Build Coastguard Worker    jmp                  r6
257*c0909341SAndroid Build Coastguard Worker.h4:
258*c0909341SAndroid Build Coastguard Worker    movd                xm0, [tlq-4]
259*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
260*c0909341SAndroid Build Coastguard Worker    jmp                  wq
261*c0909341SAndroid Build Coastguard Worker.w4:
262*c0909341SAndroid Build Coastguard Worker    movd                xm1, [tlq+1]
263*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
264*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4
265*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
266*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3
267*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
268*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
269*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 3
270*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
271*c0909341SAndroid Build Coastguard Worker.w4_mul:
272*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
273*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
274*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x55563334
275*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
276*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
277*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
278*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
279*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
280*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2
281*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
282*c0909341SAndroid Build Coastguard Worker.w4_end:
283*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, xm0
284*c0909341SAndroid Build Coastguard Worker.s4:
285*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
286*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm0
287*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm0
288*c0909341SAndroid Build Coastguard Worker    movd   [dstq+stride3q ], xm0
289*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
290*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
291*c0909341SAndroid Build Coastguard Worker    jg .s4
292*c0909341SAndroid Build Coastguard Worker    RET
293*c0909341SAndroid Build Coastguard WorkerALIGN function_align
294*c0909341SAndroid Build Coastguard Worker.h8:
295*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq-8]
296*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
297*c0909341SAndroid Build Coastguard Worker    jmp                  wq
298*c0909341SAndroid Build Coastguard Worker.w8:
299*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tlq+1]
300*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
301*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
302*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4
303*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
304*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm2, xm0, xm0
305*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
306*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
307*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
308*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
309*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3
310*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5
311*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
312*c0909341SAndroid Build Coastguard Worker    je .w8_end
313*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
314*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
315*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
316*c0909341SAndroid Build Coastguard Worker    cmove               r6d, r2d
317*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
318*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
319*c0909341SAndroid Build Coastguard Worker.w8_end:
320*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, xm0
321*c0909341SAndroid Build Coastguard Worker.s8:
322*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
323*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm0
324*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm0
325*c0909341SAndroid Build Coastguard Worker    movq   [dstq+stride3q ], xm0
326*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
327*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
328*c0909341SAndroid Build Coastguard Worker    jg .s8
329*c0909341SAndroid Build Coastguard Worker    RET
330*c0909341SAndroid Build Coastguard WorkerALIGN function_align
331*c0909341SAndroid Build Coastguard Worker.h16:
332*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-16]
333*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
334*c0909341SAndroid Build Coastguard Worker    jmp                  wq
335*c0909341SAndroid Build Coastguard Worker.w16:
336*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+1]
337*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
338*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
339*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4
340*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
341*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
342*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
343*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
344*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
345*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
346*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3
347*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5
348*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
349*c0909341SAndroid Build Coastguard Worker    je .w16_end
350*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
351*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
352*c0909341SAndroid Build Coastguard Worker    test                 hb, 8|32
353*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
354*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
355*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
356*c0909341SAndroid Build Coastguard Worker.w16_end:
357*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm0, xm0
358*c0909341SAndroid Build Coastguard Worker.s16:
359*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm0
360*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm0
361*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], xm0
362*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], xm0
363*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
364*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
365*c0909341SAndroid Build Coastguard Worker    jg .s16
366*c0909341SAndroid Build Coastguard Worker    RET
367*c0909341SAndroid Build Coastguard WorkerALIGN function_align
368*c0909341SAndroid Build Coastguard Worker.h32:
369*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]
370*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
371*c0909341SAndroid Build Coastguard Worker    jmp                  wq
372*c0909341SAndroid Build Coastguard Worker.w32:
373*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]
374*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
375*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
376*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
377*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4
378*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
379*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
380*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
381*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
382*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
383*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3
384*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5
385*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
386*c0909341SAndroid Build Coastguard Worker    je .w32_end
387*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]
388*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x33345556
389*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d
390*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
391*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
392*c0909341SAndroid Build Coastguard Worker.w32_end:
393*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
394*c0909341SAndroid Build Coastguard Worker.s32:
395*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
396*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
397*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m0
398*c0909341SAndroid Build Coastguard Worker    mova   [dstq+stride3q ], m0
399*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
400*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
401*c0909341SAndroid Build Coastguard Worker    jg .s32
402*c0909341SAndroid Build Coastguard Worker    RET
403*c0909341SAndroid Build Coastguard WorkerALIGN function_align
404*c0909341SAndroid Build Coastguard Worker.h64:
405*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-64]
406*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-32]
407*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
408*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
409*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
410*c0909341SAndroid Build Coastguard Worker    jmp                  wq
411*c0909341SAndroid Build Coastguard Worker.w64:
412*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+ 1]
413*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+33]
414*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
415*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3
416*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
417*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
418*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
419*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4
420*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
421*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
422*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
423*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
424*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
425*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3
426*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5
427*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
428*c0909341SAndroid Build Coastguard Worker    je .w64_end
429*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x33345556
430*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, hd
431*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
432*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1
433*c0909341SAndroid Build Coastguard Worker.w64_end:
434*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
435*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
436*c0909341SAndroid Build Coastguard Worker.s64:
437*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
438*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m1
439*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m0
440*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
441*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*0], m0
442*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*1], m1
443*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*0], m0
444*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*1], m1
445*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
446*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
447*c0909341SAndroid Build Coastguard Worker    jg .s64
448*c0909341SAndroid Build Coastguard Worker    RET
449*c0909341SAndroid Build Coastguard Worker
; ipred_dc_128_8bpc(dst, stride, tl, w, h)
; Fills the block with a constant: the dword at pb_128 is broadcast into m0/m1
; and control is transferred (jmp, no return here) to a store loop selected
; from ipred_dc_splat_avx2_table by tzcnt(w).
; pb_128 and the jump table are defined outside this chunk; presumably pb_128
; holds bytes of value 128 and the table targets are the .sN store loops that
; consume m0/m1, hd and stride3q -- TODO confirm.
450*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
451*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_splat_avx2_table]
452*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; index of the lowest set bit of w
453*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
454*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; signed 32-bit table entry, relative to r5
455*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128] ; pb_128 addressed via r5
456*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; second copy of the fill value for the jump target
457*c0909341SAndroid Build Coastguard Worker    add                  wq, r5 ; relative entry -> absolute address
458*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
459*c0909341SAndroid Build Coastguard Worker    jmp                  wq
460*c0909341SAndroid Build Coastguard Worker
; ipred_v_8bpc(dst, stride, tl, w, h)
; Vertical prediction: loads the pixels that follow tl (tl+1 ...) into m0/m1
; and jumps to a store loop selected from ipred_dc_splat_avx2_table by
; tzcnt(w), the same table used by ipred_dc_128_8bpc above.
; NOTE(review): both 32-byte loads are issued before the width is dispatched,
; so 64 bytes starting at tl+1 are read for every block width; this relies on
; the caller's edge buffer being readable that far -- confirm.
461*c0909341SAndroid Build Coastguard Workercglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
462*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_dc_splat_avx2_table]
463*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; index of the lowest set bit of w
464*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+ 1] ; bytes tl[1..32]
465*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+33] ; bytes tl[33..64]
466*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
467*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; signed 32-bit table entry, relative to r5
468*c0909341SAndroid Build Coastguard Worker    add                  wq, r5 ; relative entry -> absolute address
469*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
470*c0909341SAndroid Build Coastguard Worker    jmp                  wq
471*c0909341SAndroid Build Coastguard Worker
; IPRED_H w, store_type
; Loop body for horizontal prediction, four rows per iteration.
;   %1 - block width; only used to form the loop label .w%1, so the macro must
;        be expanded directly after that label.
;   %2 - suffix appended to "mov" for the row stores (d, q or a in the
;        expansions below).
; Reads the four bytes tl[-1] .. tl[-4], broadcasts each one across a whole
; register (m0..m3) and stores one register per row. tlq moves down by 4 and
; dstq moves forward by four strides on every pass; the loop runs while the
; remaining height in hd is > 0. Expects stride3q = strideq*3.
; Ends with RET plus an ALIGN for whatever label follows the expansion.
472*c0909341SAndroid Build Coastguard Worker%macro IPRED_H 2 ; w, store_type
473*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, [tlq-1]
474*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, [tlq-2]
475*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m2, [tlq-3]
476*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
477*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, [tlq+0] ; fourth byte, read after tlq was moved
478*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+strideq*0], m0
479*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+strideq*1], m1
480*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+strideq*2], m2
481*c0909341SAndroid Build Coastguard Worker    mov%2  [dstq+stride3q ], m3
482*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
483*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
484*c0909341SAndroid Build Coastguard Worker    jg .w%1
485*c0909341SAndroid Build Coastguard Worker    RET
486*c0909341SAndroid Build Coastguard WorkerALIGN function_align
487*c0909341SAndroid Build Coastguard Worker%endmacro
488*c0909341SAndroid Build Coastguard Worker
; ipred_h_8bpc(dst, stride, tl, w, h)
; Horizontal prediction: every output row is filled with one byte read from
; below tl (tl[-1] for the first row, tl[-2] for the second, ...).
; Dispatch is via ipred_h_avx2_table indexed by tzcnt(w); the table itself is
; defined outside this chunk.
; The function starts under INIT_XMM and switches to INIT_YMM before .w32;
; with x86inc this presumably makes mN name xmmN for the 4/8/16-wide paths and
; ymmN for the 32/64-wide paths -- confirm against x86inc.asm.
489*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
490*c0909341SAndroid Build Coastguard Workercglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
491*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_h_avx2_table]
492*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; index of the lowest set bit of w
493*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
494*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4] ; signed 32-bit table entry, relative to r5
495*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
496*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3] ; required by the IPRED_H stores
497*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; Each IPRED_H expansion below is a complete loop that ends in RET.
498*c0909341SAndroid Build Coastguard Worker.w4:
499*c0909341SAndroid Build Coastguard Worker    IPRED_H               4, d
500*c0909341SAndroid Build Coastguard Worker.w8:
501*c0909341SAndroid Build Coastguard Worker    IPRED_H               8, q
502*c0909341SAndroid Build Coastguard Worker.w16:
503*c0909341SAndroid Build Coastguard Worker    IPRED_H              16, a
504*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
505*c0909341SAndroid Build Coastguard Worker.w32:
506*c0909341SAndroid Build Coastguard Worker    IPRED_H              32, a
; 64-wide rows need two 32-byte stores each, so the loop is written out by
; hand instead of using IPRED_H: same four broadcasts, each stored twice.
507*c0909341SAndroid Build Coastguard Worker.w64:
508*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, [tlq-1]
509*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, [tlq-2]
510*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m2, [tlq-3]
511*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
512*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, [tlq+0]
513*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*0], m0
514*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+32*1], m0
515*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*0], m1
516*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+32*1], m1
517*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*0], m2
518*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*2+32*1], m2
519*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*0], m3
520*c0909341SAndroid Build Coastguard Worker    mova [dstq+stride3q +32*1], m3
521*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
522*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
523*c0909341SAndroid Build Coastguard Worker    jg .w64
524*c0909341SAndroid Build Coastguard Worker    RET
525*c0909341SAndroid Build Coastguard Worker
; PAETH top, ldiff
; Per-byte Paeth selection between left, top and topleft.
;   %1  - register number holding the top pixels
;   %2  - register number holding ldiff, which the callers in this file
;         compute as |topleft - top|
; Fixed inputs: m3 = left pixel(s), m4 = byte mask (callers load pb_1),
;               m5 = topleft.
; Output: m0. Clobbers m1 and m2; m3..m5 and the argument registers are only
; read.
; Selection, as implemented by the two blends at the end:
;   inner = (ldiff <= tdiff) ? left : top
;   m0    = (min(ldiff, tdiff) <= tldiff) ? inner : topleft
526*c0909341SAndroid Build Coastguard Worker%macro PAETH 2 ; top, ldiff
527*c0909341SAndroid Build Coastguard Worker    pavgb                m1, m%1, m3 ; Calculating tldiff normally requires
528*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m%1, m3 ; 10-bit intermediates, but we can do it
529*c0909341SAndroid Build Coastguard Worker    pand                 m0, m4      ; in 8-bit with some tricks which avoids
530*c0909341SAndroid Build Coastguard Worker    psubusb              m2, m5, m1  ; having to unpack everything to 16-bit.
531*c0909341SAndroid Build Coastguard Worker    psubb                m1, m0
532*c0909341SAndroid Build Coastguard Worker    psubusb              m1, m5
533*c0909341SAndroid Build Coastguard Worker    por                  m1, m2
534*c0909341SAndroid Build Coastguard Worker    paddusb              m1, m1
535*c0909341SAndroid Build Coastguard Worker    por                  m1, m0      ; min(tldiff, 255)
    ; tdiff = |topleft - left|, built from two saturating subtractions
536*c0909341SAndroid Build Coastguard Worker    psubusb              m2, m5, m3
537*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m3, m5
538*c0909341SAndroid Build Coastguard Worker    por                  m2, m0      ; tdiff
    ; m2 = min(tdiff, ldiff); comparing it with ldiff yields the mask
539*c0909341SAndroid Build Coastguard Worker    pminub               m2, m%2
540*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, m%2, m2 ; ldiff <= tdiff
    ; mask set -> left (m3), mask clear -> top (m%1)
541*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m%1, m3, m0
542*c0909341SAndroid Build Coastguard Worker    pminub               m1, m2
543*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m2      ; ldiff <= tldiff || tdiff <= tldiff
    ; mask set -> keep the left/top choice, mask clear -> topleft (m5)
544*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m5, m0, m1
545*c0909341SAndroid Build Coastguard Worker%endmacro
546*c0909341SAndroid Build Coastguard Worker
; ipred_paeth_8bpc(dst, stride, tl, w, h)
; Paeth prediction. Common setup: m5 = topleft byte (tl[0]) broadcast,
; m4 = pb_1 broadcast (mask used inside PAETH). Each width path loads the top
; row from tl+1, precomputes ldiff = |topleft - top| once, then walks
; downwards through the bytes below tl (the left pixels), calling PAETH per
; group of rows. Dispatch is via ipred_paeth_avx2_table indexed by tzcnt(w).
; ipred_h_shuf, pb_1 and the table are defined outside this chunk.
547*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
548*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_paeth_avx2_table
549*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_paeth_avx2_table]
550*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
551*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, [tlq]   ; topleft
552*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
553*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
554*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pb_1]
555*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
556*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 4: one PAETH call produces eight 4-pixel rows. Eight left bytes are
; loaded per iteration (also when h == 4) and spread over the register by the
; ipred_h_shuf pattern.
557*c0909341SAndroid Build Coastguard Worker.w4:
558*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [tlq+1] ; top
559*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+ipred_h_shuf]
560*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
561*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
562*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
563*c0909341SAndroid Build Coastguard Worker    por                  m7, m0      ; ldiff
564*c0909341SAndroid Build Coastguard Worker.w4_loop:
565*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
566*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m3, [tlq]
567*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m8      ; left
568*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
569*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1 ; xm1 = upper 128-bit lane of the result
    ; first four rows: dwords 0 and 2 of each lane
570*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
571*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
572*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
573*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 2
    ; h == 4 is only possible on the first pass; skip the second four rows
574*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
575*c0909341SAndroid Build Coastguard Worker    je .ret
576*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; next four rows: dwords 1 and 3 of each lane
577*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 1
578*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
579*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
580*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 3
581*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
582*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
583*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
584*c0909341SAndroid Build Coastguard Worker.ret:
585*c0909341SAndroid Build Coastguard Worker    RET
586*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 8: four rows per iteration (four left bytes, same shuffle pattern).
587*c0909341SAndroid Build Coastguard Worker.w8:
588*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [tlq+1]
589*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+ipred_h_shuf]
590*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
591*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
592*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
593*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
594*c0909341SAndroid Build Coastguard Worker.w8_loop:
595*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
596*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [tlq]
597*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m8
598*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
599*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
600*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
601*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
602*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
603*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
604*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
605*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
606*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
607*c0909341SAndroid Build Coastguard Worker    RET
608*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 16: two rows per iteration, one per 128-bit lane. m8 is used as the
; pshufb control: byte index 1 everywhere in the low lane and 0 in the high
; lane, so the low lane (first row) receives tl[1] and the high lane (second
; row) receives tl[0] of the just-decremented tlq.
609*c0909341SAndroid Build Coastguard Worker.w16:
610*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [tlq+1]
611*c0909341SAndroid Build Coastguard Worker    mova                xm8, xm4 ; lower half = 1, upper half = 0
612*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
613*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
614*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
615*c0909341SAndroid Build Coastguard Worker.w16_loop:
616*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
617*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [tlq]
618*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m8
619*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
620*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
621*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
622*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
623*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
624*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
625*c0909341SAndroid Build Coastguard Worker    RET
626*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 32: one row per iteration, a single left byte broadcast to all lanes.
627*c0909341SAndroid Build Coastguard Worker.w32:
628*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+1]
629*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
630*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
631*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
632*c0909341SAndroid Build Coastguard Worker.w32_loop:
633*c0909341SAndroid Build Coastguard Worker    dec                 tlq
634*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, [tlq]
635*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
636*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
637*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
638*c0909341SAndroid Build Coastguard Worker    dec                  hd
639*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
640*c0909341SAndroid Build Coastguard Worker    RET
641*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 64: one row per iteration, two PAETH calls (left and right 32 pixels)
; sharing the same left byte. top lives in m6/m7 and ldiff in m8/m9.
; m9 is one register beyond the 9 declared in cglobal, so on WIN64 xmm9 is
; saved by hand into r4m and restored before RET.
; NOTE(review): r4m is presumably the stack home of the h argument, which has
; already been copied into hd at this point -- confirm.
642*c0909341SAndroid Build Coastguard Worker.w64:
643*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+ 1]
644*c0909341SAndroid Build Coastguard Worker    movu                 m7, [tlq+33]
645*c0909341SAndroid Build Coastguard Worker%if WIN64
646*c0909341SAndroid Build Coastguard Worker    movaps              r4m, xmm9
647*c0909341SAndroid Build Coastguard Worker%endif
648*c0909341SAndroid Build Coastguard Worker    psubusb              m8, m5, m6
649*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
650*c0909341SAndroid Build Coastguard Worker    psubusb              m9, m5, m7
651*c0909341SAndroid Build Coastguard Worker    psubusb              m1, m7, m5
652*c0909341SAndroid Build Coastguard Worker    por                  m8, m0 ; ldiff for pixels 0..31
653*c0909341SAndroid Build Coastguard Worker    por                  m9, m1 ; ldiff for pixels 32..63
654*c0909341SAndroid Build Coastguard Worker.w64_loop:
655*c0909341SAndroid Build Coastguard Worker    dec                 tlq
656*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, [tlq]
657*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 8
658*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
659*c0909341SAndroid Build Coastguard Worker    PAETH                 7, 9
660*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
661*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
662*c0909341SAndroid Build Coastguard Worker    dec                  hd
663*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
664*c0909341SAndroid Build Coastguard Worker%if WIN64
665*c0909341SAndroid Build Coastguard Worker    movaps             xmm9, r4m
666*c0909341SAndroid Build Coastguard Worker%endif
667*c0909341SAndroid Build Coastguard Worker    RET
668*c0909341SAndroid Build Coastguard Worker
; SMOOTH src1, src2, mul1, mul2, add1, add2   (all arguments are register
; numbers)
; Computes two weighted sums and packs them to bytes:
;   m0 = (pmaddubsw(m%3, m%1) + m%5) >> 8
;   m1 = (pmaddubsw(m%4, m%2) + m%6) >> 8
;   m0 = packuswb(m0, m1)
; In pmaddubsw the mul operand (m%3 / m%4) supplies the unsigned bytes and the
; src operand (m%1 / m%2) the signed bytes; the identities in the comments
; below explain how the callers fold the remaining 128*a + 129*b part into the
; add terms.
; Result in m0; m1 is clobbered. Because m0 and m1 are written in sequence,
; %2 and %4 must not be 0, %5 must not be 1 and %6 must be neither 0 nor 1.
669*c0909341SAndroid Build Coastguard Worker%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
670*c0909341SAndroid Build Coastguard Worker    ; w * a         = (w - 128) * a + 128 * a
671*c0909341SAndroid Build Coastguard Worker    ; (256 - w) * b = (127 - w) * b + 129 * b
672*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m%3, m%1
673*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m%4, m%2
674*c0909341SAndroid Build Coastguard Worker    paddw                m0, m%5
675*c0909341SAndroid Build Coastguard Worker    paddw                m1, m%6
676*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
677*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
678*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
679*c0909341SAndroid Build Coastguard Worker%endmacro
680*c0909341SAndroid Build Coastguard Worker
; ipred_smooth_v_8bpc(dst, stride, tl, w, h)
; Vertical smooth prediction: every output pixel is a weighted blend of the
; top pixel of its column (tl[1 + x]) and the "bottom" pixel tl[-h], with a
; per-row weight pair read from smooth_weights.
; Common setup:
;   m0 = pb_127_m127, m1 = pw_128 (constants defined outside this chunk)
;   m5 = bottom pixel broadcast
;   hq = -h, used as a negative row counter that runs up to 0
;   weightsq = smooth_weights + h*4, so [weightsq+hq*2] starts at
;              smooth_weights + h*2 and advances two bytes per row
; cglobal declares 0 vector registers; the paths that need more than m0..m5
; (w16/w32/w64) call WIN64_SPILL_XMM themselves.
; Dispatch is via ipred_smooth_v_avx2_table indexed by tzcnt(w).
681*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
682*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_v_avx2_table
683*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_smooth_v_avx2_table]
684*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
685*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
686*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
687*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pb_127_m127]
688*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_128]
689*c0909341SAndroid Build Coastguard Worker    lea            weightsq, [base+smooth_weights+hq*4]
690*c0909341SAndroid Build Coastguard Worker    neg                  hq
691*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, [tlq+hq] ; bottom
692*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
693*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 4: eight rows per iteration. 16 weight bytes are loaded per pass (also
; when h == 4) and distributed to the rows with two shuffle masks derived
; from ipred_v_shuf.
694*c0909341SAndroid Build Coastguard Worker.w4:
695*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+1]
696*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5 ; top, bottom
697*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+ipred_v_shuf]
698*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
699*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5, m5
700*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m5
701*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m0
702*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ;   1 * top + 256 * bottom + 128, overflow is ok
703*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
704*c0909341SAndroid Build Coastguard Worker.w4_loop:
705*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [weightsq+hq*2]
706*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m4
707*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
708*c0909341SAndroid Build Coastguard Worker    SMOOTH                0, 1, 2, 2, 3, 3
709*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
    ; first four rows: dwords 0 and 1 of each lane
710*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
711*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
712*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 1
713*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 1
    ; hd == -4 only on the first pass of an h == 4 block; nothing more to store
714*c0909341SAndroid Build Coastguard Worker    cmp                  hd, -4
715*c0909341SAndroid Build Coastguard Worker    je .ret
716*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; next four rows: dwords 2 and 3 of each lane
717*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 2
718*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 2
719*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
720*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 3
721*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
722*c0909341SAndroid Build Coastguard Worker    add                  hq, 8
723*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
724*c0909341SAndroid Build Coastguard Worker.ret:
725*c0909341SAndroid Build Coastguard Worker    RET
726*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 8: four rows per iteration (8 weight bytes per pass); same precomputed
; sums as .w4, different shuffle masks.
727*c0909341SAndroid Build Coastguard Worker.w8:
728*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [tlq+1]
729*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5
730*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+ipred_v_shuf]
731*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
732*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m5, q0000
733*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q1111
734*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m0
735*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
736*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
737*c0909341SAndroid Build Coastguard Worker.w8_loop:
738*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [weightsq+hq*2]
739*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m4
740*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
741*c0909341SAndroid Build Coastguard Worker    SMOOTH                0, 1, 2, 2, 3, 3
742*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
743*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
744*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
745*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
746*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
747*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
748*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
749*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
750*c0909341SAndroid Build Coastguard Worker    RET
751*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 16: two rows per iteration, one per 128-bit lane. The top row is split
; into low/high byte halves (m2/m3), each interleaved with bottom; m4/m5 hold
; the matching precomputed add terms. m6 keeps ipred_v_shuf for the weights.
752*c0909341SAndroid Build Coastguard Worker.w16:
753*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
754*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [tlq+1]
755*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+ipred_v_shuf]
756*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m5
757*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m5
758*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
759*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
760*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
761*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
762*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
763*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
764*c0909341SAndroid Build Coastguard Worker.w16_loop:
765*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [weightsq+hq*2] ; 4 bytes = weights for two rows
766*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
767*c0909341SAndroid Build Coastguard Worker    SMOOTH                1, 1, 2, 3, 4, 5
768*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
769*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
770*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
771*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
772*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
773*c0909341SAndroid Build Coastguard Worker    RET
774*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 32: one row per iteration; the row's 2-byte weight pair is broadcast
; to every word, so no shuffle is needed.
775*c0909341SAndroid Build Coastguard Worker.w32:
776*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       6
777*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+1]
778*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m5
779*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m5
780*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
781*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
782*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
783*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
784*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
785*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
786*c0909341SAndroid Build Coastguard Worker.w32_loop:
787*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [weightsq+hq*2]
788*c0909341SAndroid Build Coastguard Worker    SMOOTH                1, 1, 2, 3, 4, 5
789*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
790*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
791*c0909341SAndroid Build Coastguard Worker    inc                  hq
792*c0909341SAndroid Build Coastguard Worker    jl .w32_loop
793*c0909341SAndroid Build Coastguard Worker    RET
794*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; w == 64: one row per iteration, two SMOOTH calls. Pixels 0..31 use
; m3/m4 (interleaved top,bottom) with add terms m5/m6; pixels 32..63 use
; m7/m8 with add terms m9/m10. m2 carries the broadcast weight pair.
795*c0909341SAndroid Build Coastguard Worker.w64:
796*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
797*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq+ 1]
798*c0909341SAndroid Build Coastguard Worker    movu                 m8, [tlq+33]
799*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m5
800*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m5
801*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m8, m5
802*c0909341SAndroid Build Coastguard Worker    punpckhbw            m8, m5
803*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
804*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m4, m0
805*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m7, m0
806*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m8, m0
807*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1, m3
808*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
809*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1, m4
810*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2
811*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m7
812*c0909341SAndroid Build Coastguard Worker    paddw                m9, m0
813*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
814*c0909341SAndroid Build Coastguard Worker    paddw               m10, m1
815*c0909341SAndroid Build Coastguard Worker.w64_loop:
816*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [weightsq+hq*2]
817*c0909341SAndroid Build Coastguard Worker    SMOOTH                2, 2, 3, 4, 5, 6
818*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
819*c0909341SAndroid Build Coastguard Worker    SMOOTH                2, 2, 7, 8, 9, 10
820*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
821*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
822*c0909341SAndroid Build Coastguard Worker    inc                  hq
823*c0909341SAndroid Build Coastguard Worker    jl .w64_loop
824*c0909341SAndroid Build Coastguard Worker    RET
825*c0909341SAndroid Build Coastguard Worker
; ipred_smooth_h_8bpc(dst, stride, tl, w, h)
;
; Horizontal "smooth" intra predictor for 8-bit pixels (AVX2).
; Every output pixel is a weighted blend of the left-edge pixel of its row
; and the single "right" pixel read from [tl+w]. The weights come from
; smooth_weights at an offset selected by the block width. Left-edge pixels
; are read from addresses below tl.
;
; Setup common to all widths (done before the jump):
;   m3 = right pixel broadcast to every byte
;   m4 = pb_127_m127 (byte pairs +127, -127, per the comments below)
;   m5 = pw_128      (rounding term added before the final >> 8)
;   r5 = address of ipred_smooth_h_avx2_table; `base` addresses constants
;        relative to it
; Dispatch: tzcnt(w) indexes ipred_smooth_h_avx2_table (32-bit entries that are
; added to r5). The table is defined outside this chunk; presumably its entries
; map to the .w4/.w8/.w16/.w32/.w64 labels below - confirm against the table.
;
; NOTE: SMOOTH is a macro defined elsewhere in the file; here it is only known
; to leave its result in m0 (that is what gets stored after each invocation).
826*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
827*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_smooth_h_avx2_table
828*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_smooth_h_avx2_table]
829*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
830*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, [tlq+wq] ; right
831*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
832*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
833*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
834*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pb_127_m127]
835*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pw_128]
836*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
837*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 4: up to 8 rows per iteration ----------------------------------
; m6 = 8 weight bytes from smooth_weights+4*2, m7 = ipred_h_shuf.
; tlq is rebased to tl-8-h so that [tlq+hq] addresses the next 8 left pixels
; while hd counts down by 8.
838*c0909341SAndroid Build Coastguard Worker.w4:
839*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
840*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [base+smooth_weights+4*2]
841*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_h_shuf]
842*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
843*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
844*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
845*c0909341SAndroid Build Coastguard Worker.w4_loop:
846*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [tlq+hq]
847*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
848*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3 ; left, right
849*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
850*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
851*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1     ; 128 * left + 129 * right
; weighted (left, right) pairs plus the pw_128 rounding term, summed into m0
852*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
853*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
854*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
; same computation for the high-unpacked half, accumulated in m1
855*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
856*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
857*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
858*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
859*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
; scale the word sums back to bytes
860*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
861*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
862*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
863*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
; rows 0-3 come from dwords 0 and 2 of the low (xm0) and high (xm1) lanes
864*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
865*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
866*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
867*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 2
; a 4-row block is finished here; the remaining four rows are not written
868*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
869*c0909341SAndroid Build Coastguard Worker    je .ret
; rows 4-7 come from dwords 1 and 3
870*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
871*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 1
872*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
873*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
874*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 3
875*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
876*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
877*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
878*c0909341SAndroid Build Coastguard Worker.ret:
879*c0909341SAndroid Build Coastguard Worker    RET
880*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 8: 4 rows per iteration -----------------------------------------
; Same arithmetic as .w4, but with 16 weight bytes from smooth_weights+8*2 and
; only 4 left pixels loaded per iteration (tlq rebased to tl-4-h).
881*c0909341SAndroid Build Coastguard Worker.w8:
882*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8
883*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+smooth_weights+8*2]
884*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_h_shuf]
885*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
886*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
887*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
888*c0909341SAndroid Build Coastguard Worker.w8_loop:
889*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [tlq+hq]
890*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
891*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3
892*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
893*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4
894*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
895*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
896*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
897*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
898*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
899*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
900*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
901*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
902*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
903*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
904*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
905*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
906*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
; rows 0/1 are the low qwords of the two lanes, rows 2/3 the high qwords
907*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
908*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
909*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
910*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
911*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
912*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
913*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
914*c0909341SAndroid Build Coastguard Worker    RET
915*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 16: 2 rows per iteration ----------------------------------------
; Reserves a 32*4-byte stack table that .prep fills with one precomputed word
; per left pixel. r3 is pointed at the last dword of that table before the
; call; .prep then rebases r3 and tlq by h so [r3+hq*2] / [tlq+hq] can be
; indexed with the down-counting hd.
916*c0909341SAndroid Build Coastguard Worker.w16:
917*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        32*4, 8
918*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*2-4]
919*c0909341SAndroid Build Coastguard Worker    call .prep ; only worthwhile for w16 and above
920*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
; pb_1 is written to xm6 only; the VEX xmm write leaves the high lane zero, so
; the pshufb below picks byte 1 in the low lane and byte 0 in the high lane
921*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm6, [base+pb_1]
922*c0909341SAndroid Build Coastguard Worker    mova                xm7, [base+ipred_v_shuf+16]
923*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [base+ipred_v_shuf+ 0], 1
; m4/m5 are reused for weights from here on; .prep has already consumed the
; pb_127_m127 / pw_128 constants they held
924*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+smooth_weights+16*2]
925*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [base+smooth_weights+16*3]
926*c0909341SAndroid Build Coastguard Worker.w16_loop:
927*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [tlq+hq]
928*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [r3+hq*2]
929*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6
930*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3
931*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
932*c0909341SAndroid Build Coastguard Worker    SMOOTH                4, 5, 1, 1, 2, 2
; low lane of m0 -> first row, high lane -> second row
933*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
934*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
935*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
936*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
937*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
938*c0909341SAndroid Build Coastguard Worker    RET
939*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 32: 1 row per iteration -----------------------------------------
; Only m0-m5 are used on this path, hence ALLOC_STACK without an xmm count.
; r3 points at the last word of the stack table before .prep rebases it.
940*c0909341SAndroid Build Coastguard Worker.w32:
941*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        32*4
942*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*2-2]
943*c0909341SAndroid Build Coastguard Worker    call .prep
944*c0909341SAndroid Build Coastguard Worker    dec                 tlq
; weights: each ymm combines two 16-byte rows of smooth_weights
; (m4 = rows 4|6, m5 = rows 5|7 counted in 16-byte units)
945*c0909341SAndroid Build Coastguard Worker    mova                xm4, [base+smooth_weights+16*4]
946*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [base+smooth_weights+16*6], 1
947*c0909341SAndroid Build Coastguard Worker    mova                xm5, [base+smooth_weights+16*5]
948*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [base+smooth_weights+16*7], 1
949*c0909341SAndroid Build Coastguard Worker.w32_loop:
; one left pixel interleaved with right, plus its precomputed word from the
; stack table
950*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, [tlq+hq]
951*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3
952*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, [r3+hq*2]
953*c0909341SAndroid Build Coastguard Worker    SMOOTH                4, 5, 1, 1, 2, 2
954*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
955*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
956*c0909341SAndroid Build Coastguard Worker    dec                  hd
957*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
958*c0909341SAndroid Build Coastguard Worker    RET
959*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 64: 1 row per iteration, two 32-byte halves ---------------------
960*c0909341SAndroid Build Coastguard Worker.w64:
961*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        32*4, 9
962*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*2-2]
963*c0909341SAndroid Build Coastguard Worker    call .prep
; r5 is rebased to smooth_weights+16*15, so `base` must not be used after this
; point; the weight rows below are addressed relative to the new r5
964*c0909341SAndroid Build Coastguard Worker    add                  r5, smooth_weights+16*15-ipred_smooth_h_avx2_table
965*c0909341SAndroid Build Coastguard Worker    dec                 tlq
966*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r5-16*7]
967*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r5-16*5], 1
968*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r5-16*6]
969*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r5-16*4], 1
970*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r5-16*3]
971*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r5-16*1], 1
972*c0909341SAndroid Build Coastguard Worker    mova                xm8, [r5-16*2]
973*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [r5-16*0], 1
974*c0909341SAndroid Build Coastguard Worker.w64_loop:
975*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m2, [tlq+hq]
976*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3
977*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, [r3+hq*2]
; first SMOOTH uses weights m5/m6 for bytes 0-31 of the row, second uses
; m7/m8 for bytes 32-63
978*c0909341SAndroid Build Coastguard Worker    SMOOTH                5, 6, 2, 2, 4, 4
979*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
980*c0909341SAndroid Build Coastguard Worker    SMOOTH                7, 8, 2, 2, 4, 4
981*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
982*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
983*c0909341SAndroid Build Coastguard Worker    dec                  hd
984*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
985*c0909341SAndroid Build Coastguard Worker    RET
986*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- .prep: local subroutine, reached with `call` from .w16/.w32/.w64 -----
; In:   tlq = tl, m3 = right (bytes), m4 = pb_127_m127, m5 = pw_128,
;       r3 = pointer near the end of the stack table, hq = h
; Out:  64 words of (128*left + 129*right + 128), one per left pixel, stored
;       in the 32*4-byte stack area in the same order as the pixels in memory;
;       tlq -= h; r3 -= 2*h
; Clobbers m0-m2.
; Always reads the 64 bytes below tl regardless of h.
; NOTE(review): presumably the caller's edge buffer guarantees those bytes are
; readable - confirm.
; The stores use rsp+gprsize because this code runs inside a `call`;
; presumably gprsize skips the return address on the stack - confirm against
; x86inc.
987*c0909341SAndroid Build Coastguard Worker.prep:
; vpermq q3120 reorders the qwords so the in-lane unpacks below each yield 16
; consecutive pixels: punpckhbw -> bytes 16-31, punpcklbw -> bytes 0-15
988*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [tlq-32*1], q3120
989*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2, m3
990*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3
991*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
992*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5     ;   1 * left + 256 * right + 128
993*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1     ; 128 * left + 129 * right + 128
994*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
995*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
996*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
; start loading the next 32 left pixels before storing the first results
997*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [tlq-32*2], q3120
998*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*3], m0
999*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m1
1000*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2, m3
1001*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3
1002*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4
1003*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
1004*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1005*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
1006*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
1007*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1008*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m0
1009*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m1
; rebase both pointers so the callers can index them with the down-counting h:
; r3 steps in words (2*h), tlq in bytes (h)
1010*c0909341SAndroid Build Coastguard Worker    sub                  r3, hq
1011*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1012*c0909341SAndroid Build Coastguard Worker    sub                  r3, hq
1013*c0909341SAndroid Build Coastguard Worker    ret
1014*c0909341SAndroid Build Coastguard Worker
; SMOOTH_2D_END %1, %2, %3, %4, %5, %6
;
; Final stage shared by the loops that invoke it: combines two word
; accumulators already held in m2/m3 with a second pair of products and packs
; the result to bytes.
;   %1, %2 : register numbers used as the second pmaddubsw operand
;   %3, %4 : register numbers used as the first pmaddubsw operand
;   %5, %6 : addends; a bare number is taken as a register number (m%5 / m%6),
;            anything else is passed through as the operand itself
; Implicit inputs:  m2, m3 (word accumulators supplied by the caller)
; Output:           m0 = packuswb of the two halves
; Clobbers:         m0, m1
; Operation, per half:
;   t  = pmaddubsw(m%3, m%1) + add
;   t  = pavgw(t, acc)      ; (t + acc + 1) >> 1, unsigned words
;   t >>= 8
1015*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
1016*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m%3, m%1
1017*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m%4, m%2
; %ifnum chooses between a register number and a literal operand for %5
1018*c0909341SAndroid Build Coastguard Worker%ifnum %5
1019*c0909341SAndroid Build Coastguard Worker    paddw                m0, m%5
1020*c0909341SAndroid Build Coastguard Worker%else
1021*c0909341SAndroid Build Coastguard Worker    paddw                m0, %5
1022*c0909341SAndroid Build Coastguard Worker%endif
; same selection for %6
1023*c0909341SAndroid Build Coastguard Worker%ifnum %6
1024*c0909341SAndroid Build Coastguard Worker    paddw                m1, m%6
1025*c0909341SAndroid Build Coastguard Worker%else
1026*c0909341SAndroid Build Coastguard Worker    paddw                m1, %6
1027*c0909341SAndroid Build Coastguard Worker%endif
; rounded average with the caller's accumulators, then scale back to bytes
1028*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m2
1029*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m3
1030*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
1031*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
1032*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1033*c0909341SAndroid Build Coastguard Worker%endmacro
1034*c0909341SAndroid Build Coastguard Worker
1035*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
1036*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_avx2_table
1037*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_smooth_avx2_table]
1038*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
1039*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m4, [tlq+wq] ; right
1040*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1041*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
1042*c0909341SAndroid Build Coastguard Worker    mov                  r5, tlq
1043*c0909341SAndroid Build Coastguard Worker    sub                  r5, hq
1044*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
1045*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+pb_127_m127]
1046*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, [r5] ; bottom
1047*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+pw_255]
1048*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1049*c0909341SAndroid Build Coastguard Worker    lea          v_weightsq, [base+smooth_weights+hq*2]
1050*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1051*c0909341SAndroid Build Coastguard Worker.w4:
1052*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12
1053*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+ipred_h_shuf]
1054*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m11, [base+smooth_weights+4*2]
1055*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1056*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [tlq+1]
1057*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
1058*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1059*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1060*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m0 ; top, bottom
1061*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q2200
1062*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3311
1063*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m8, m5
1064*c0909341SAndroid Build Coastguard Worker    paddw                m3, m8 ;   1 * top + 255 * bottom + 255
1065*c0909341SAndroid Build Coastguard Worker    paddw                m9, m3 ; 128 * top + 129 * bottom + 255
1066*c0909341SAndroid Build Coastguard Worker.w4_loop:
1067*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [tlq+hq]
1068*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
1069*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4 ; left, right
1070*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
1071*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m0, m5 ; 127 * left - 127 * right
1072*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m5
1073*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0     ; 128 * left + 129 * right
1074*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1075*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11
1076*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m11
1077*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1078*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1079*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [v_weightsq]
1080*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 16
1081*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m6
1082*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
1083*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
1084*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1085*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
1086*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm1
1087*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
1088*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 2
1089*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1090*c0909341SAndroid Build Coastguard Worker    je .ret
1091*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1092*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*0], xm0, 1
1093*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
1094*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 3
1095*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 3
1096*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1097*c0909341SAndroid Build Coastguard Worker    sub                  hd, 8
1098*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
1099*c0909341SAndroid Build Coastguard Worker.ret:
1100*c0909341SAndroid Build Coastguard Worker    RET
1101*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1102*c0909341SAndroid Build Coastguard Worker.w8:
1103*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12
1104*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+ipred_h_shuf]
1105*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [base+smooth_weights+8*2]
1106*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1107*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m8, [tlq+1]
1108*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
1109*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1110*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1111*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m0
1112*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q0000
1113*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q1111
1114*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m8, m5
1115*c0909341SAndroid Build Coastguard Worker    paddw                m3, m8
1116*c0909341SAndroid Build Coastguard Worker    paddw                m9, m3
1117*c0909341SAndroid Build Coastguard Worker.w8_loop:
1118*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [tlq+hq]
1119*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
1120*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4
1121*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
1122*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m0, m5
1123*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m5
1124*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1125*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1126*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m11
1127*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m11
1128*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1129*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1130*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [v_weightsq]
1131*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 8
1132*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m6
1133*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
1134*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
1135*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1136*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1137*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
1138*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
1139*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm1
1140*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1141*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1142*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
1143*c0909341SAndroid Build Coastguard Worker    RET
1144*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1145*c0909341SAndroid Build Coastguard Worker.w16:
1146*c0909341SAndroid Build Coastguard Worker    %assign regs_used 4
1147*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -32*4, 14
1148*c0909341SAndroid Build Coastguard Worker    %assign regs_used 7
1149*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [tlq+1]
1150*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*2-4]
1151*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m11, m0 ; top, bottom
1152*c0909341SAndroid Build Coastguard Worker    punpckhbw           m11, m0
1153*c0909341SAndroid Build Coastguard Worker    call .prep_v
1154*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2
1155*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m10, m5
1156*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m11, m5
1157*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [base+pb_1]
1158*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+ipred_v_shuf]
1159*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+smooth_weights+16*2]
1160*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+smooth_weights+16*3]
1161*c0909341SAndroid Build Coastguard Worker    vperm2i128           m8, m9, m9, 0x01
1162*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10, m3
1163*c0909341SAndroid Build Coastguard Worker    paddw                m3, m11
1164*c0909341SAndroid Build Coastguard Worker    paddw               m12, m0
1165*c0909341SAndroid Build Coastguard Worker    paddw               m13, m3
1166*c0909341SAndroid Build Coastguard Worker.w16_loop:
1167*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [tlq+hq]
1168*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [r3+hq*2]
1169*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [v_weightsq]
1170*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 4
1171*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5
1172*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4 ; left, right
1173*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m6
1174*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
1175*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
1176*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9
1177*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1178*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1179*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         1, 1, 10, 11, 12, 13
1180*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
1181*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
1182*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1183*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1184*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
1185*c0909341SAndroid Build Coastguard Worker    RET
1186*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1187*c0909341SAndroid Build Coastguard Worker.w32:
1188*c0909341SAndroid Build Coastguard Worker    %assign regs_used 4
1189*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -32*4, 11
1190*c0909341SAndroid Build Coastguard Worker    %assign regs_used 7
1191*c0909341SAndroid Build Coastguard Worker    movu                 m8, [tlq+1]
1192*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*2-2]
1193*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m8, m0
1194*c0909341SAndroid Build Coastguard Worker    punpckhbw            m8, m0
1195*c0909341SAndroid Build Coastguard Worker    call .prep_v
1196*c0909341SAndroid Build Coastguard Worker    dec                 tlq
1197*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m7, m5
1198*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m8, m5
1199*c0909341SAndroid Build Coastguard Worker    mova                xm5, [base+smooth_weights+16*4]
1200*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [base+smooth_weights+16*6], 1
1201*c0909341SAndroid Build Coastguard Worker    mova                xm6, [base+smooth_weights+16*5]
1202*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [base+smooth_weights+16*7], 1
1203*c0909341SAndroid Build Coastguard Worker    paddw                m0, m7, m3
1204*c0909341SAndroid Build Coastguard Worker    paddw                m3, m8
1205*c0909341SAndroid Build Coastguard Worker    paddw                m9, m0
1206*c0909341SAndroid Build Coastguard Worker    paddw               m10, m3
1207*c0909341SAndroid Build Coastguard Worker.w32_loop:
1208*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, [tlq+hq]
1209*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4
1210*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [r3+hq*2]
1211*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, [v_weightsq]
1212*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 2
1213*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3, m5
1214*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1215*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1216*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1217*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         1, 1, 7, 8, 9, 10
1218*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1219*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1220*c0909341SAndroid Build Coastguard Worker    dec                  hd
1221*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
1222*c0909341SAndroid Build Coastguard Worker    RET
1223*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1224*c0909341SAndroid Build Coastguard Worker.w64:
    ; 64-pixel-wide branch: every .w64_loop iteration writes one 64-byte row.
    ; m0, m3, m4, m5, r6, hq and v_weightsq are initialised before this label,
    ; outside the visible part of the file.
    ; The 32*8 bytes of stack are used as two halves: the first 128 bytes are
    ; filled by .prep_v, the second 128 bytes (addressed through r6 below) hold
    ; four precomputed vectors.
    ; NOTE(review): regs_used is overridden around ALLOC_STACK, presumably to
    ; steer what the x86inc macro saves - confirm against x86inc.asm.
1225*c0909341SAndroid Build Coastguard Worker    %assign regs_used 4
1226*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -32*8, 16
1227*c0909341SAndroid Build Coastguard Worker    %assign regs_used 7
    ; Load the 64 edge bytes starting at tlq+1; they are byte-interleaved with
    ; m0 into m12-m15 below.
1228*c0909341SAndroid Build Coastguard Worker    movu                m13, [tlq+1 ]
1229*c0909341SAndroid Build Coastguard Worker    movu                m15, [tlq+33]
    ; Rebase r6 by the distance from ipred_smooth_avx2_table to
    ; smooth_weights+16*15 (assumes r6 holds the table address - TODO confirm).
1230*c0909341SAndroid Build Coastguard Worker    add                  r6, smooth_weights+16*15-ipred_smooth_avx2_table
    ; r3 = last word of the 128-byte area that .prep_v fills. .prep_v then
    ; subtracts 2*h, so [r3+hq*2] walks that area downwards as hq counts down.
1231*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*2-2]
1232*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m13, m0
1233*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m0
1234*c0909341SAndroid Build Coastguard Worker    punpcklbw           m14, m15, m0
1235*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m0
1236*c0909341SAndroid Build Coastguard Worker    call .prep_v
    ; .prep_v already subtracted h from tlq; after this decrement the first
    ; [tlq+hq] access is one byte below the tlq value held on entry to .w64.
1237*c0909341SAndroid Build Coastguard Worker    dec                 tlq
    ; Row-independent terms for the four 16-column groups: pmaddubsw with m5,
    ; plus m3, plus the interleaved vector itself (the paddw lines below).
1238*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m12, m5
1239*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m13, m5
1240*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m14, m5
1241*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15, m5
    ; m8-m11 = the 128 bytes in [r6-16*7, r6+16). The 16-byte chunks are spread
    ; over the ymm lanes as m8 = {0,2}, m9 = {1,3}, m10 = {4,6}, m11 = {5,7}.
    ; NOTE(review): presumably the per-column weights - confirm.
1242*c0909341SAndroid Build Coastguard Worker    mova                xm8, [r6-16*7]
1243*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [r6-16*5], 1
1244*c0909341SAndroid Build Coastguard Worker    mova                xm9, [r6-16*6]
1245*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [r6-16*4], 1
1246*c0909341SAndroid Build Coastguard Worker    mova               xm10, [r6-16*3]
1247*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [r6-16*1], 1
1248*c0909341SAndroid Build Coastguard Worker    mova               xm11, [r6-16*2]
1249*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [r6-16*0], 1
    ; r6 is reused as a pointer to the second half of the stack area.
1250*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+32*4]
1251*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
1252*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1253*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
1254*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
1255*c0909341SAndroid Build Coastguard Worker    paddw                m0, m12
1256*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
1257*c0909341SAndroid Build Coastguard Worker    paddw                m2, m14
1258*c0909341SAndroid Build Coastguard Worker    paddw                m3, m15
    ; Spill the four row-independent vectors; the loop reads them back as
    ; memory operands of SMOOTH_2D_END.
1259*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*0], m0
1260*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*1], m1
1261*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*2], m2
1262*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*3], m3
1263*c0909341SAndroid Build Coastguard Worker.w64_loop:
    ; Per row: m5 = edge byte at [tlq+hq] paired with m4 in every word,
    ; m6 = this row's word from the .prep_v area, m7 = next word at v_weightsq.
1264*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m5, [tlq+hq]
1265*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m4
1266*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, [r3+hq*2]
1267*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, [v_weightsq]
1268*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 2
    ; First 32 output bytes. SMOOTH_2D_END is defined elsewhere in the file;
    ; its result is taken from m0 here.
1269*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5, m8
1270*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5, m9
1271*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
1272*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
1273*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         7, 7, 12, 13, [r6+32*0], [r6+32*1]
1274*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
    ; Second 32 output bytes, same scheme with m10/m11 and m14/m15.
1275*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5, m10
1276*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5, m11
1277*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
1278*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
1279*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         7, 7, 14, 15, [r6+32*2], [r6+32*3]
1280*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m0
1281*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
    ; hd doubles as the row counter and as the index into the edge/word tables.
1282*c0909341SAndroid Build Coastguard Worker    dec                  hd
1283*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
1284*c0909341SAndroid Build Coastguard Worker    RET
1285*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1286*c0909341SAndroid Build Coastguard Worker.prep_v:
    ; Local helper, reached via call from the .w32 and .w64 branches.
    ; Reads the 64 bytes in [tlq-64, tlq), pairs every byte with a byte of m4,
    ; turns each pair into a 16-bit term (pmaddubsw with m5 plus the pair
    ; itself taken as a word) and stores the 64 words in ascending source order.
    ; The stores use rsp+gprsize because call pushed the return address, so the
    ; caller sees the data at its own rsp+0 .. rsp+127.
    ; On return tlq has been lowered by h and r3 by 2*h.
    ; Overwrites m0-m2; reads m4 and m5, which are set up outside this helper.
    ; q3120 swaps the two middle qwords so that the in-lane unpacks below keep
    ; the bytes in source order (low unpack = bytes 0-15, high = bytes 16-31).
1287*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [tlq-32*1], q3120
1288*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2, m4
1289*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
1290*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m5 ; 127 * left - 127 * right
1291*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1     ; 128 * left + 129 * right
1292*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m5
1293*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
    ; The load for the second half is issued before the first results are stored.
1294*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [tlq-32*2], q3120
    ; Bytes tlq-16..tlq-1 go to slot 3, bytes tlq-32..tlq-17 to slot 2.
1295*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*3], m0
1296*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m1
1297*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2, m4
1298*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
1299*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m5
1300*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1301*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m5
1302*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
    ; Bytes tlq-48..tlq-33 go to slot 1, bytes tlq-64..tlq-49 to slot 0.
1303*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m0
1304*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m1
    ; r3 -= 2*h (word index base) and tlq -= h (byte index base), so the
    ; callers can index both with the down-counting hq.
1305*c0909341SAndroid Build Coastguard Worker    sub                  r3, hq
1306*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1307*c0909341SAndroid Build Coastguard Worker    sub                  r3, hq
1308*c0909341SAndroid Build Coastguard Worker    ret
1309*c0909341SAndroid Build Coastguard Worker
1310*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
    ; Entry point of the z1 predictor. Loads the constants shared by all width
    ; branches and jumps through ipred_z1_avx2_table, indexed by tzcnt(w), to
    ; the width-specific code.
    ; NOTE(review): following x86inc cglobal conventions the three numbers are
    ; presumably args loaded / GPRs used / vector regs used, followed by the
    ; register names - confirm against x86inc.asm.
1311*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_z1_avx2_table]
1312*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
1313*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1314*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1315*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dr_intra_derivative]
    ; All later edge accesses are expressed relative to tl+1.
1316*c0909341SAndroid Build Coastguard Worker    inc                 tlq
    ; Table entries are signed 32-bit offsets relative to the table base.
1317*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
1318*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; dx = 16-bit entry of dr_intra_derivative at byte offset (angle & 0x7e).
1319*c0909341SAndroid Build Coastguard Worker    mov                 dxd, angled
1320*c0909341SAndroid Build Coastguard Worker    and                 dxd, 0x7e
1321*c0909341SAndroid Build Coastguard Worker    add              angled, 165 ; ~90
1322*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [r7+dxq]
1323*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x4ff ; d = 90 - angle
    ; m3 = pw_512 (pmulhrsw rounding multiplier), m4 = pw_62 (frac mask),
    ; m5 = pw_64 (minuend for 64-frac); all three are used by the row loops.
1324*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_512]
1325*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_62]
1326*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_64]
1327*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1328*c0909341SAndroid Build Coastguard Worker.w4:
    ; 4-wide branch. Falls through to the upsampled-edge variant when both
    ; checks below pass; otherwise continues at .w4_no_upsample.
1329*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
1330*c0909341SAndroid Build Coastguard Worker    jae .w4_no_upsample
    ; r3d = ((angle - 1024) >> 7, arithmetic) + h; a positive result rules
    ; upsampling out.
1331*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-1024]
1332*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 7
1333*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
1334*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    ; Stack layout: [rsp] = 16-byte upsampled edge, [rsp+16] = 4 further bytes.
1335*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -32, 8
    ; Interpolated samples: two shuffles of the 16 bytes at tlq-1, each fed to
    ; pmaddubsw with pb_36_m4, summed and rounded with pmulhrsw by m3.
1336*c0909341SAndroid Build Coastguard Worker    mova                xm1, [tlq-1]
1337*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm1, [z_upsample1]
1338*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, [z_upsample2]
1339*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
1340*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd        ; pw_512 (which is already in m3)
1341*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
1342*c0909341SAndroid Build Coastguard Worker    pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
1343*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm2
    ; dx was doubled above; m7 = dx in every word, r3d = scalar xpos of row 0.
1344*c0909341SAndroid Build Coastguard Worker    movd                xm7, dxd
1345*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd ; xpos
1346*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, xm7
1347*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0
1348*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq]
1349*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm3
    ; Build m6 = per-row xpos words: each lane becomes {dx, 2*dx}, then the
    ; low lane gets another 2*dx (xm2 has a zero upper half).
1350*c0909341SAndroid Build Coastguard Worker    pslldq               m6, m7, 8
1351*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm7, xm7
    ; r2 (the tl argument register, no longer needed) becomes stride*3.
1352*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
1353*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7
1354*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1
1355*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2 ; xpos2 xpos3 xpos0 xpos1
    ; Interleave the 8 source bytes at tlq with the interpolated bytes.
1356*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1
    ; m7 = 4*dx: xpos step for the four rows handled per loop iteration.
1357*c0909341SAndroid Build Coastguard Worker    psllw                m7, 2
1358*c0909341SAndroid Build Coastguard Worker    mova              [rsp], xm0
1359*c0909341SAndroid Build Coastguard Worker.w4_upsample_loop:
    ; Four rows per iteration. r3d and r5d alternate as the running scalar
    ; xpos; xpos >> 6 is the byte offset into the edge at rsp, and 8 bytes are
    ; fetched per row. m6 carries the same xpos values for the frac math.
1360*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1361*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1362*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [rsp+r3]
1363*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1364*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1365*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [rsp+r5]
1366*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1367*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base2
1368*c0909341SAndroid Build Coastguard Worker    movq                xm0, [rsp+r3]
1369*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1370*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base3
1371*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [rsp+r5]
    ; Assemble m0 = {row2, row3 | row0, row1}, matching the m6 layout.
1372*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xc0
1373*c0909341SAndroid Build Coastguard Worker    pand                 m2, m4, m6 ; frac
1374*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0
1375*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5, m2 ; 64-frac
1376*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
1377*c0909341SAndroid Build Coastguard Worker    por                  m1, m2     ; 64-frac, frac
1378*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
1379*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7     ; xpos += dx
1380*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
1381*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
    ; Low lane holds rows 2-3, high lane rows 0-1; 4 bytes are stored per row.
1382*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1383*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm0
1384*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 1
1385*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
1386*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
1387*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1388*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1389*c0909341SAndroid Build Coastguard Worker    jg .w4_upsample_loop
1390*c0909341SAndroid Build Coastguard Worker    RET
1391*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1392*c0909341SAndroid Build Coastguard Worker.filter_strength: ; w4/w8/w16
1393*c0909341SAndroid Build Coastguard Worker    ; The C version uses a lot of branches, but we can do all the comparisons
1394*c0909341SAndroid Build Coastguard Worker    ; in parallel and use popcnt to get the final filter strength value.
    ; Local helper, reached via call.
    ; In:  maxbased and angled (low byte of each is broadcast and compared).
    ; Out: r5d = byte mask of the comparison results; the callers test it for
    ;      zero and popcnt it,
    ;      r3  = address of z_filter_t0, which the base define refers to,
    ;      angled shifted right by 8,
    ;      m0  = low byte of maxbased broadcast to every byte.
    ; Overwrites m1 and m2.
1395*c0909341SAndroid Build Coastguard Worker%define base r3-z_filter_t0
1396*c0909341SAndroid Build Coastguard Worker    lea                  r3, [z_filter_t0]
1397*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
1398*c0909341SAndroid Build Coastguard Worker    movd                xm2, angled
1399*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1400*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
1401*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m2, xm2
    ; m1 = angle byte wherever the maxbase byte equals the z_filter_wh entry,
    ; zero elsewhere.
1402*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0, [base+z_filter_wh]
1403*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
    ; Thresholds: 16 bytes of z_filter_t0 selected by the shifted angle value.
1404*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
    ; Signed byte compare, then one mask bit per byte is collected into r5d.
1405*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m2
1406*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
1407*c0909341SAndroid Build Coastguard Worker    ret
1408*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
    ; 4-wide branch without upsampling: optionally smooths the edge into a
    ; 16-byte stack copy, then interpolates four rows per .w4_loop iteration
    ; and fills any rows that remain with the top[max_base_x] byte.
1409*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -16, 11
1410*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7
1411*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1412*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
    ; maxbased is only borrowed as the h+3 argument for .filter_strength and
    ; is set back to 7 right after the call.
1413*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+3]
1414*c0909341SAndroid Build Coastguard Worker    call .filter_strength
1415*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7
1416*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1417*c0909341SAndroid Build Coastguard Worker    jz .w4_main ; filter_strength == 0
    ; r5d = filter strength (number of set mask bits); it selects one dword
    ; from each of three z_filter_k rows spaced 12 bytes apart.
1418*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d
    ; Shuffle indices come from z_filter_s, clamped to at most 8 by pminub.
1419*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pb_8]
1420*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [tlq-1]
1421*c0909341SAndroid Build Coastguard Worker    pminub               m1, m7, [base+z_filter_s]
1422*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
1423*c0909341SAndroid Build Coastguard Worker    pminub               m7, [base+z_filter_s+8]
1424*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
1425*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
    ; Three shuffled views of the edge, each multiplied by its coefficient
    ; pair, are summed and rounded with pmulhrsw by m3.
1426*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m1
1427*c0909341SAndroid Build Coastguard Worker    shufps               m1, m7, q2121
1428*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8
1429*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
1430*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
1431*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
1432*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m10
1433*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1434*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1435*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
    ; The filtered edge is written to the 16 bytes at rsp and tlq is pointed
    ; at it; maxbase becomes 9 unless h == 4.
1436*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 9
1437*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
1438*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1439*c0909341SAndroid Build Coastguard Worker    cmovne         maxbased, r3d
1440*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1441*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
1442*c0909341SAndroid Build Coastguard Worker    mova              [tlq], xm0
1443*c0909341SAndroid Build Coastguard Worker.w4_main:
    ; Loop setup. m6 = per-row xpos words, m7 = fill value, m9 = per-word
    ; limit, m10 = 4*dx step, r3d = scalar xpos, maxbased = maxbase << 6.
1444*c0909341SAndroid Build Coastguard Worker    movd                xm6, dxd
1445*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
1446*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [tlq+maxbaseq]
1447*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
1448*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
1449*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd ; xpos
1450*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
1451*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
    ; NOTE(review): z1_shuf_w4 presumably arranges neighbouring edge bytes
    ; into pairs for pmaddubsw - confirm against the table definition.
1452*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z1_shuf_w4]
1453*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 8  ; top[max_base_x]
1454*c0909341SAndroid Build Coastguard Worker    paddw               m10, m6, m6
1455*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0 ; max_base_x
    ; Each lane becomes {dx, 2*dx}; the low lane then gets another 2*dx
    ; because the xmm move leaves the upper half of m0 zero.
1456*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m10, 0xcc
1457*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm10
1458*c0909341SAndroid Build Coastguard Worker    paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
1459*c0909341SAndroid Build Coastguard Worker    paddw               m10, m10
1460*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; Four rows per iteration. r3d and r5d alternate as the running scalar
    ; xpos; xpos >> 6 is the byte offset from tlq, 8 bytes fetched per row.
1461*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1462*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1463*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [tlq+r3]
1464*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1465*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1466*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [tlq+r5]
1467*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1468*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base2
1469*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq+r3]
1470*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1471*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base3
1472*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [tlq+r5]
    ; Assemble m0 = {row2, row3 | row0, row1}, matching the m6 layout.
1473*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xc0
1474*c0909341SAndroid Build Coastguard Worker    pand                 m2, m4, m6 ; frac
1475*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0
1476*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5, m2 ; 64-frac
1477*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
1478*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
1479*c0909341SAndroid Build Coastguard Worker    por                  m1, m2     ; 64-frac, frac
1480*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
1481*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m6 ; base < max_base_x
1482*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
1483*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10    ; xpos += dx
1484*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*2]
    ; Keep the interpolated word where the mask is set, otherwise take m7.
1485*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1
1486*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
    ; Low lane holds rows 2-3 (stored via r5), high lane rows 0-1.
1487*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1488*c0909341SAndroid Build Coastguard Worker    movd   [r5  +strideq*0], xm0
1489*c0909341SAndroid Build Coastguard Worker    pextrd [r5  +strideq*1], xm0, 1
1490*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
1491*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
1492*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1493*c0909341SAndroid Build Coastguard Worker    jz .w4_end
1494*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; Continue interpolating only while the scalar xpos is below maxbase << 6
    ; (unsigned compare); past that point every remaining row is a plain fill.
1495*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, maxbased
1496*c0909341SAndroid Build Coastguard Worker    jb .w4_loop
1497*c0909341SAndroid Build Coastguard Worker    packuswb            xm7, xm7
    ; r6 is reused as stride*3 for the fill loop.
1498*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1499*c0909341SAndroid Build Coastguard Worker.w4_end_loop:
    ; Fill four rows per iteration with the packed top[max_base_x] value.
1500*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm7
1501*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm7
1502*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm7
1503*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r6       ], xm7
1504*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1505*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1506*c0909341SAndroid Build Coastguard Worker    jg .w4_end_loop
1507*c0909341SAndroid Build Coastguard Worker.w4_end:
1508*c0909341SAndroid Build Coastguard Worker    RET
1509*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1510*c0909341SAndroid Build Coastguard Worker.w8:
    ; 8-wide branch. r3d = angle + 216 with its low byte replaced by h, so a
    ; single unsigned compare against 8 chooses between the upsampled variant
    ; (fall through) and .w8_no_upsample.
1511*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+216]
1512*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
1513*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1514*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    ; The 32 bytes at rsp receive the upsampled edge.
1515*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -32, 8
    ; m0 = edge bytes: 16 from tlq-1 in the low lane, 16 from tlq+7 in the
    ; high lane. m1 and m2 are the shuffle masks; for the high lane of m2 the
    ; z_filter_s+6 indices are clamped to h by pminub.
1516*c0909341SAndroid Build Coastguard Worker    movu                xm2, [z_filter_s+6]
1517*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-1]
1518*c0909341SAndroid Build Coastguard Worker    movd                xm6, hd
1519*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+7], 1
1520*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm6, xm6
1521*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [z_upsample1]
1522*c0909341SAndroid Build Coastguard Worker    pminub              xm6, xm2
1523*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pb_36_m4]
1524*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm6, 1
    ; dx is doubled for this variant.
1525*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd
1526*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m1
1527*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m2
1528*c0909341SAndroid Build Coastguard Worker    movd                xm6, dxd
    ; Both shuffled views are multiplied by pb_36_m4, summed and rounded with
    ; pmulhrsw by m3 to give the interpolated samples.
1529*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
1530*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
1531*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
1532*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd
1533*c0909341SAndroid Build Coastguard Worker    psrldq               m0, 1
    ; r2 (the tl argument register, no longer needed) becomes stride*3.
1534*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
    ; Build m6 = per-row xpos words laid out as {row0, row2 | row1, row3} and
    ; m7 = 4*dx, the step for four rows.
1535*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6
1536*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1537*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m7, 0xf0
1538*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1539*c0909341SAndroid Build Coastguard Worker    pslldq               m2, m7, 8
1540*c0909341SAndroid Build Coastguard Worker    paddw                m7, m7
1541*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2
1542*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
    ; Interleave the source bytes with the interpolated bytes and store them.
1543*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1
1544*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0
1545*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop:
    ; Four rows per iteration. r3d and r5d alternate as the running scalar
    ; xpos; xpos >> 6 is the byte offset into the edge at rsp and 16 bytes are
    ; fetched per row (rows 0/1 into m0, rows 2/3 into m1).
1546*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1547*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1548*c0909341SAndroid Build Coastguard Worker    movu                xm0, [rsp+r3]
1549*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1550*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1551*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [rsp+r5], 1
1552*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1553*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base2
    ; m1 = frac (xpos masked with m4), m2 = byte pairs (64-frac, frac).
1554*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4, m6
1555*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m1
1556*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
1557*c0909341SAndroid Build Coastguard Worker    por                  m2, m1
1558*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2, m2 ; frac0 frac1
1559*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
1560*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r3]
1561*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1562*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base3
1563*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r5], 1
1564*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2 ; frac2 frac3
1565*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1566*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
    ; Advance the vector xpos by 4*dx for the next group of four rows.
1567*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7
1568*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
    ; After packing: low lane = {row0, row2}, high lane = {row1, row3}.
1569*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1570*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1571*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1572*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
1573*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
1574*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm1
1575*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1576*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1577*c0909341SAndroid Build Coastguard Worker    jg .w8_upsample_loop
1578*c0909341SAndroid Build Coastguard Worker    RET
1579*c0909341SAndroid Build Coastguard Worker.w8_no_intra_edge_filter:
1580*c0909341SAndroid Build Coastguard Worker    and            maxbased, 7
1581*c0909341SAndroid Build Coastguard Worker    or             maxbased, 8 ; imin(h+7, 15)
1582*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
1583*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
1584*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -32, 10
1585*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+7]
1586*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1587*c0909341SAndroid Build Coastguard Worker    jnz .w8_no_intra_edge_filter
1588*c0909341SAndroid Build Coastguard Worker    call .filter_strength
1589*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1590*c0909341SAndroid Build Coastguard Worker    jz .w8_main ; filter_strength == 0
1591*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d
1592*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq]
1593*c0909341SAndroid Build Coastguard Worker    pminub              xm1, xm0, [base+z_filter_s+14]
1594*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq-1], 1
1595*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [base+z_filter_s+ 0], 1
1596*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
1597*c0909341SAndroid Build Coastguard Worker    pminub              xm0, [base+z_filter_s+22]
1598*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [base+z_filter_s+ 8], 1
1599*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m2, m1
1600*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m7
1601*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
1602*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [tlq+15]
1603*c0909341SAndroid Build Coastguard Worker    shufps               m1, m0, q2121
1604*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
1605*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
1606*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
1607*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 3
1608*c0909341SAndroid Build Coastguard Worker    jnz .w8_3tap
1609*c0909341SAndroid Build Coastguard Worker    ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
1610*c0909341SAndroid Build Coastguard Worker    ; which also results in an awkward edge case where out[w*2] is
1611*c0909341SAndroid Build Coastguard Worker    ; slightly different from out[max_base_x] when h > w.
1612*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [z_filter_k+4*8]
1613*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [tlq+14]
1614*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0
1615*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
1616*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d
1617*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4]
1618*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
1619*c0909341SAndroid Build Coastguard Worker    mov            [rsp+16], r2b
1620*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1621*c0909341SAndroid Build Coastguard Worker.w8_3tap:
1622*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1623*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 1
1624*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
1625*c0909341SAndroid Build Coastguard Worker    add                 r5d, 17 ; w*2 + (filter_strength == 3)
1626*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
1627*c0909341SAndroid Build Coastguard Worker    cmovns         maxbased, r5d
1628*c0909341SAndroid Build Coastguard Worker    mov            [tlq+r5], r3b
1629*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m1, 1
1630*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
1631*c0909341SAndroid Build Coastguard Worker    mova              [tlq], xm0
1632*c0909341SAndroid Build Coastguard Worker.w8_main:
1633*c0909341SAndroid Build Coastguard Worker    movd                xm2, dxd
1634*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [z_base_inc]
1635*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, xm2
1636*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [tlq+maxbaseq]
1637*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
1638*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
1639*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z_filter_s+2]
1640*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
1641*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 8
1642*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0
1643*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd
1644*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2, m2
1645*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m6, 0xf0
1646*c0909341SAndroid Build Coastguard Worker.w8_loop:
1647*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq]
1648*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6
1649*c0909341SAndroid Build Coastguard Worker    pand                 m0, m4, m2
1650*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5, m0
1651*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
1652*c0909341SAndroid Build Coastguard Worker    por                  m1, m0
1653*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+r3]
1654*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq]
1655*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1656*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5], 1
1657*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
1658*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
1659*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m2
1660*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
1661*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
1662*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1
1663*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
1664*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
1665*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
1666*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
1667*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1668*c0909341SAndroid Build Coastguard Worker    jz .w8_end
1669*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1670*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, maxbased
1671*c0909341SAndroid Build Coastguard Worker    jb .w8_loop
1672*c0909341SAndroid Build Coastguard Worker    packuswb            xm7, xm7
1673*c0909341SAndroid Build Coastguard Worker.w8_end_loop:
1674*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm7
1675*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm7
1676*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1677*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1678*c0909341SAndroid Build Coastguard Worker    jg .w8_end_loop
1679*c0909341SAndroid Build Coastguard Worker.w8_end:
1680*c0909341SAndroid Build Coastguard Worker    RET
1681*c0909341SAndroid Build Coastguard Worker.w16_no_intra_edge_filter: ; entered from .w16 when angle bit 0x400 is set: the edge is used unfiltered
1682*c0909341SAndroid Build Coastguard Worker    and            maxbased, 15 ; maxbase holds h+15 here (set in .w16 before the branch)
1683*c0909341SAndroid Build Coastguard Worker    or             maxbased, 16 ; imin(h+15, 31)
1684*c0909341SAndroid Build Coastguard Worker    jmp .w16_main ; tlq still points at the caller-provided edge
1685*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1686*c0909341SAndroid Build Coastguard Worker.w16: ; 16-pixel-wide path: optionally filter the edge into stack scratch, then predict 2 rows per loop iteration
1687*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -64, 12 ; stack scratch; the filtered edge is written at rsp below
1688*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+15] ; maxbase = h + 15
1689*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1690*c0909341SAndroid Build Coastguard Worker    jnz .w16_no_intra_edge_filter
1691*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; helper defined outside this chunk; its result is read from r5d
1692*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1693*c0909341SAndroid Build Coastguard Worker    jz .w16_main ; filter_strength == 0
1694*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; r5d = number of set bits; used below as the filter strength (3 selects the 5-tap path)
1695*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pb_12]
1696*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+z_filter_s+8]
1697*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m6, [base+z_filter_s], 0 ; m2 lanes = z_filter_s+0 / z_filter_s+8
1698*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [base+z_filter_s+16], 1 ; m6 lanes = z_filter_s+8 / z_filter_s+16
1699*c0909341SAndroid Build Coastguard Worker    mova               xm10, [tlq-1] ; m10 = edge bytes that produce filtered pixels 0-15 (low lane from tlq-1)
1700*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [tlq+3], 1 ; high lane from tlq+3
1701*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0] ; first coefficient pair for this strength
1702*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+z_filter_s+14]
1703*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m7, [base+z_filter_s+6], 0 ; m8 lanes = z_filter_s+6 / z_filter_s+14
1704*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [base+z_filter_s+22], 1 ; m7 lanes = z_filter_s+14 / z_filter_s+22
1705*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1 ; m0 is live-in (not written earlier in this block); subtract pb_12 from it - NOTE(review): presumably set up by .filter_strength, confirm
1706*c0909341SAndroid Build Coastguard Worker    movu               xm11, [tlq+12] ; m11 = edge bytes that produce filtered pixels 16-31 (low lane from tlq+12)
1707*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq+16], 1 ; high lane from tlq+16
1708*c0909341SAndroid Build Coastguard Worker    pminub               m8, m0 ; clamp the shuffle indices used on m11 to m0
1709*c0909341SAndroid Build Coastguard Worker    pminub               m7, m0
1710*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10, m2 ; gather the first byte pairs for pixels 0-15
1711*c0909341SAndroid Build Coastguard Worker    shufps               m2, m6, q2121 ; m2 = shuffle for the second byte pairs (dwords 1-2 of m2 and of m6)
1712*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9 ; weighted sum of each byte pair
1713*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m11, m8 ; same for pixels 16-31
1714*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q2121
1715*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
1716*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1] ; second coefficient pair for this strength
1717*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [tlq+31] ; r3d = unfiltered edge byte 31, appended after the filtered edge below
1718*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10, m2
1719*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
1720*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m11, m8
1721*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m8, m9
1722*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; accumulate the second pair into each sum
1723*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
1724*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 3 ; r5d = strength - 3, zero only for strength 3
1725*c0909341SAndroid Build Coastguard Worker    jnz .w16_3tap ; strength != 3: no third coefficient pair
1726*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*8] ; third coefficient pair, only used by the 5-tap filter
1727*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [tlq+30] ; r2d = edge byte 30
1728*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m6
1729*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1730*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m7
1731*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
1732*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d
1733*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4] ; r2d = tlq[30] + tlq[31]*7 + 4
1734*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3 ; (tlq[30] + tlq[31]*7 + 4) >> 3
1735*c0909341SAndroid Build Coastguard Worker    mov            [rsp+32], r2b ; becomes byte 32 of the filtered edge
1736*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
1737*c0909341SAndroid Build Coastguard Worker    paddw                m1, m11
1738*c0909341SAndroid Build Coastguard Worker.w16_3tap: ; common tail of the filter: scale, pack and store the filtered edge
1739*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by live-in m3 (set outside this chunk)
1740*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1741*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 1 ; 0 when strength == 3, -1 for strength 1 or 2
1742*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; from here on tlq points at the filtered edge on the stack
1743*c0909341SAndroid Build Coastguard Worker    add                 r5d, 33 ; r5d = 32 + (filter_strength == 3)
1744*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1745*c0909341SAndroid Build Coastguard Worker    cmovns         maxbased, r5d ; h >= 32: maxbase = r5d, otherwise keep h + 15
1746*c0909341SAndroid Build Coastguard Worker    mov            [tlq+r5], r3b ; unfiltered edge byte 31 is placed at index r5
1747*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1748*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120 ; reorder qwords so the 32 bytes are in pixel order after the per-lane pack
1749*c0909341SAndroid Build Coastguard Worker    mova              [tlq], m0 ; store filtered edge bytes 0-31
1750*c0909341SAndroid Build Coastguard Worker.w16_main: ; prediction setup; tlq = edge to read, maxbase = index of the fill pixel
1751*c0909341SAndroid Build Coastguard Worker    movd                xm6, dxd
1752*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [z_base_inc] ; table defined outside this chunk
1753*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [tlq+maxbaseq] ; m7 = edge[maxbase] in every byte, used as the fill value
1754*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; same scale as xpos (the loop derives base as xpos >> 6)
1755*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6 ; m6 = dx in every word
1756*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
1757*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z_filter_s+2] ; shuffle that forms the byte pairs consumed by pmaddubsw in the loop
1758*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
1759*c0909341SAndroid Build Coastguard Worker    mov                 r3d, dxd ; r3d = xpos of the first row
1760*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0 ; m9 = (maxbase << 6) - z_base_inc, threshold for pixels 0-7
1761*c0909341SAndroid Build Coastguard Worker    paddw               m11, m6, m6 ; m11 = 2*dx, the xpos step for two rows
1762*c0909341SAndroid Build Coastguard Worker    psubw               m10, m9, m3 ; 64*8
1763*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m11, 0xf0 ; low lane = dx (first row), high lane = 2*dx (second row)
1764*c0909341SAndroid Build Coastguard Worker.w16_loop: ; each iteration writes two rows
1765*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r3+dxq] ; xpos of the second row
1766*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base0
1767*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4, m6 ; frac = xpos & m4 (m4 is live-in)
1768*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m1 ; m5 - frac (m5 is live-in)
1769*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
1770*c0909341SAndroid Build Coastguard Worker    por                  m2, m1 ; per word: low byte = m5 - frac, high byte = frac
1771*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+r3+0] ; first row: edge bytes starting at base0
1772*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r3+8] ; first row: edge bytes starting at base0 + 8
1773*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+dxq] ; xpos of the first row of the next iteration
1774*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base1
1775*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5+0], 1 ; second row goes in the high lanes
1776*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r5+8], 1
1777*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8 ; arrange the edge bytes into pairs
1778*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
1779*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2 ; pair[0]*(m5 - frac) + pair[1]*frac
1780*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1781*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by live-in m3
1782*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1783*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; low lane = 16 pixels of the first row, high lane = second row
1784*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m6 ; pixels 0-7: mask set where the threshold m9 is above xpos
1785*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m10, m6 ; pixels 8-15: same test against m10
1786*c0909341SAndroid Build Coastguard Worker    packsswb             m1, m2 ; word masks -> byte masks in the same layout as m0
1787*c0909341SAndroid Build Coastguard Worker    paddw                m6, m11 ; advance both rows by 2*dx
1788*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1 ; keep the prediction where the mask is set, else use the fill value m7
1789*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
1790*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
1791*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1792*c0909341SAndroid Build Coastguard Worker    jz .w16_end
1793*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1794*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, maxbased ; next row xpos against maxbase << 6
1795*c0909341SAndroid Build Coastguard Worker    jb .w16_loop
1796*c0909341SAndroid Build Coastguard Worker.w16_end_loop: ; xpos reached maxbase: the remaining rows are written as the fill value only
1797*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm7
1798*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm7
1799*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1800*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1801*c0909341SAndroid Build Coastguard Worker    jg .w16_end_loop
1802*c0909341SAndroid Build Coastguard Worker.w16_end:
1803*c0909341SAndroid Build Coastguard Worker    RET
1804*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1805*c0909341SAndroid Build Coastguard Worker.w32: ; 32-pixel-wide path: optionally filter the edge into stack scratch, then predict 1 row per loop iteration
1806*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -96, 15 ; stack scratch; filtered edge bytes 0-65 are written at rsp below
1807*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+31] ; r3d = h + 31
1808*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 63
1809*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1810*c0909341SAndroid Build Coastguard Worker    cmovs          maxbased, r3d ; h < 32: maxbase = h + 31, otherwise 63
1811*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1812*c0909341SAndroid Build Coastguard Worker    jnz .w32_main ; filter disabled: predict from the caller-provided edge
1813*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [pb_0to15] ; table defined outside this chunk; used as the upper bound for the index clamps below
1814*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 29 ; h+2
1815*c0909341SAndroid Build Coastguard Worker    movu               xm13, [tlq+29]    ; 32-39
1816*c0909341SAndroid Build Coastguard Worker    movd                xm1, r3d ; h+2
1817*c0909341SAndroid Build Coastguard Worker    movu               xm14, [tlq+37]    ; 40-47
1818*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 8 ; h-6
1819*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tlq+51], 1 ; 56-63
1820*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm1, xm1 ; h+2 in every byte
1821*c0909341SAndroid Build Coastguard Worker    mova               xm11, [tlq- 1]    ;  0- 7
1822*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq+13], 1 ; 16-23
1823*c0909341SAndroid Build Coastguard Worker    movd                xm2, r3d ; h-6
1824*c0909341SAndroid Build Coastguard Worker    movu               xm12, [tlq+ 5]    ;  8-15
1825*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tlq+19], 1 ; 24-31
1826*c0909341SAndroid Build Coastguard Worker    pminub              xm1, xm0 ; clip 32x8
1827*c0909341SAndroid Build Coastguard Worker    mova                 m7, [z_filter_s+0]
1828*c0909341SAndroid Build Coastguard Worker    pshufb             xm13, xm1 ; re-index the source bytes for pixels 32-39 with the clamped indices
1829*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [pb_12]
1830*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm2, xm2 ; h-6 in every byte
1831*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tlq+43], 1 ; 48-55
1832*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m7, [z_filter_s+4], 1 ; m8 = m7 with the high lane replaced by z_filter_s+4
1833*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m1, 0xf0 ; low lane = h-6, high lane = 12
1834*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [z_filter_s+12], 0 ; m7 low lane replaced by z_filter_s+12
1835*c0909341SAndroid Build Coastguard Worker    pminub               m2, m0 ; clip 32x16 and 32x(32|64)
1836*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*0] ; first coefficient pair; same table entry .w16 reads when its strength is 3
1837*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m2 ; re-index the source bytes for pixels 40-47 / 56-63 with the clamped indices
1838*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11, m8 ; first byte pairs for pixels 0-7 / 16-23
1839*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q1021 ; rebuild m8 from dwords of m8 and m7 for the remaining first-pair gathers
1840*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9 ; weighted sum of each byte pair
1841*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12, m8 ; pixels 8-15 / 24-31
1842*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
1843*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m13, m8 ; pixels 32-39 / 48-55
1844*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
1845*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14, m8 ; pixels 40-47 / 56-63
1846*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m9
1847*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*1] ; second coefficient pair
1848*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m11, m8
1849*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q2121 ; shuffle for the remaining second-pair gathers
1850*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1851*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10 ; accumulate into the pixel 0-7 / 16-23 sums
1852*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m12, m8
1853*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1854*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
1855*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m13, m8
1856*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1857*c0909341SAndroid Build Coastguard Worker    paddw                m1, m10
1858*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14, m8
1859*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1860*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
1861*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*2] ; third coefficient pair (same address as z_filter_k+4*8 used by .w16)
1862*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m8
1863*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
1864*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m7
1865*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m9
1866*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [tlq+63] ; r3d = unfiltered edge byte 63
1867*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [tlq+62] ; r2d = unfiltered edge byte 62
1868*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
1869*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
1870*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m7
1871*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9
1872*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m7
1873*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
1874*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
1875*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
1876*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r3d
1877*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r3*8+4] ; edge case for 32x64
1878*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by live-in m3 (set outside this chunk)
1879*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1880*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1881*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
1882*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3 ; (tlq[62] + tlq[63]*7 + 4) >> 3
1883*c0909341SAndroid Build Coastguard Worker    mov            [rsp+64], r2b ; becomes byte 64 of the filtered edge
1884*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; from here on tlq points at the filtered edge on the stack
1885*c0909341SAndroid Build Coastguard Worker    mov            [tlq+65], r3b ; unfiltered edge byte 63 is placed at index 65
1886*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 65
1887*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1888*c0909341SAndroid Build Coastguard Worker    cmove          maxbased, r3d ; h == 64: maxbase = 65
1889*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2 ; bytes for pixels 0-15 (low lane) and 16-31 (high lane)
1890*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6 ; bytes for pixels 32-47 (low lane) and 48-63 (high lane)
1891*c0909341SAndroid Build Coastguard Worker    mova           [tlq+ 0], m0
1892*c0909341SAndroid Build Coastguard Worker    mova           [tlq+32], m1
1893*c0909341SAndroid Build Coastguard Worker.w32_main: ; prediction setup; tlq = edge to read, maxbase = index of the fill pixel
1894*c0909341SAndroid Build Coastguard Worker    movd                xm6, dxd
1895*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [tlq+maxbaseq] ; m7 = edge[maxbase] in every byte, used as the fill value
1896*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6 ; same scale as xpos (the loop derives base as xpos >> 6)
1897*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6 ; m6 = dx in every word = xpos of the first row
1898*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
1899*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z_filter_s+2] ; shuffle that forms the byte pairs consumed by pmaddubsw in the loop
1900*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
1901*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; r5d = scalar xpos of the current row
1902*c0909341SAndroid Build Coastguard Worker    psubw                m9, [z_base_inc] ; m9 = (maxbase << 6) - z_base_inc (full 32-byte table read)
1903*c0909341SAndroid Build Coastguard Worker    mova                m11, m6 ; m11 = dx, the xpos step per row
1904*c0909341SAndroid Build Coastguard Worker    psubw               m10, m9, m3 ; 64*8
1905*c0909341SAndroid Build Coastguard Worker.w32_loop: ; each iteration writes one row
1906*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r5d
1907*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6 ; base = xpos >> 6
1908*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4, m6 ; frac = xpos & m4 (m4 is live-in)
1909*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m1 ; m5 - frac (m5 is live-in)
1910*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
1911*c0909341SAndroid Build Coastguard Worker    por                  m2, m1 ; per word: low byte = m5 - frac, high byte = frac
1912*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3+0] ; 32 edge bytes starting at base
1913*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+8] ; 32 edge bytes starting at base + 8
1914*c0909341SAndroid Build Coastguard Worker    add                 r5d, dxd ; xpos of the next row
1915*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8 ; arrange the edge bytes into pairs
1916*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
1917*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2 ; pair[0]*(m5 - frac) + pair[1]*frac
1918*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1919*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by live-in m3
1920*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1921*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; 32 predicted bytes for this row
1922*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m6 ; mask set where the threshold m9 is above xpos
1923*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m10, m6 ; same test against m10 (m9 lowered by m3)
1924*c0909341SAndroid Build Coastguard Worker    packsswb             m1, m2 ; word masks -> byte masks in the same layout as m0
1925*c0909341SAndroid Build Coastguard Worker    paddw                m6, m11 ; advance the vector xpos by dx
1926*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1 ; keep the prediction where the mask is set, else use the fill value m7
1927*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1928*c0909341SAndroid Build Coastguard Worker    dec                  hd
1929*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1930*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1931*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, maxbased ; next row xpos against maxbase << 6
1932*c0909341SAndroid Build Coastguard Worker    jb .w32_loop
1933*c0909341SAndroid Build Coastguard Worker    test                 hb, 1 ; is the number of rows left odd?
1934*c0909341SAndroid Build Coastguard Worker    jz .w32_end_loop
1935*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m7 ; write a single fill row so the two-row loop below sees an even count
1936*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1937*c0909341SAndroid Build Coastguard Worker    dec                  hd
1938*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1939*c0909341SAndroid Build Coastguard Worker.w32_end_loop: ; the remaining rows are written as the fill value only, two per iteration
1940*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m7
1941*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m7
1942*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1943*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1944*c0909341SAndroid Build Coastguard Worker    jg .w32_end_loop
1945*c0909341SAndroid Build Coastguard Worker.w32_end:
1946*c0909341SAndroid Build Coastguard Worker    RET
1947*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1948*c0909341SAndroid Build Coastguard Worker.w64:
1949*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        -128, 16
1950*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [hq+63]
1951*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1952*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
1953*c0909341SAndroid Build Coastguard Worker    mova               xm11, [tlq- 1]    ;  0- 7
1954*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq+13], 1 ; 16-23
1955*c0909341SAndroid Build Coastguard Worker    movu               xm12, [tlq+ 5]    ;  8-15
1956*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tlq+19], 1 ; 24-31
1957*c0909341SAndroid Build Coastguard Worker    mova                 m7, [z_filter_s+0]
1958*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m7, [z_filter_s+4], 1
1959*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [z_filter_s+12], 0
1960*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
1961*c0909341SAndroid Build Coastguard Worker    movu               xm13, [tlq+29]    ; 32-39
1962*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tlq+43], 1 ; 48-55
1963*c0909341SAndroid Build Coastguard Worker    movu               xm14, [tlq+37]    ; 40-47
1964*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tlq+51], 1 ; 56-63
1965*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11, m8
1966*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q1021
1967*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
1968*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12, m8
1969*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
1970*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m13, m8
1971*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
1972*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14, m8
1973*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m9
1974*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
1975*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m11, m8
1976*c0909341SAndroid Build Coastguard Worker    shufps              m15, m8, m7, q2121
1977*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1978*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
1979*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m12, m15
1980*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1981*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
1982*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m13, m15
1983*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1984*c0909341SAndroid Build Coastguard Worker    paddw                m1, m10
1985*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14, m15
1986*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
1987*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
1988*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [z_filter_k+4*2+12*2]
1989*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m15
1990*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m10
1991*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m7
1992*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m10
1993*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m7
1994*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m10
1995*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m7
1996*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m10
1997*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
1998*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
1999*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
2000*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
2001*c0909341SAndroid Build Coastguard Worker    movu               xm11, [tlq+ 61]    ;  64- 71
2002*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq+ 75], 1 ;  80- 87
2003*c0909341SAndroid Build Coastguard Worker    movu               xm12, [tlq+ 69]    ;  72- 79
2004*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tlq+ 83], 1 ;  88- 95
2005*c0909341SAndroid Build Coastguard Worker    movu               xm13, [tlq+ 93]    ;  96-103
2006*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tlq+107], 1 ; 112-119
2007*c0909341SAndroid Build Coastguard Worker    movu               xm14, [tlq+101]    ; 104-111
2008*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tlq+115], 1 ; 120-127
2009*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
2010*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
2011*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
2012*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
2013*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq-20]
2014*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
2015*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
2016*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6
2017*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [pb_14]
2018*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [pb_0to15]
2019*c0909341SAndroid Build Coastguard Worker    mova         [tlq+32*0], m0
2020*c0909341SAndroid Build Coastguard Worker    mova         [tlq+32*1], m1
2021*c0909341SAndroid Build Coastguard Worker    movd                xm0, r3d
2022*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [pb_12]
2023*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
2024*c0909341SAndroid Build Coastguard Worker    paddb                m0, m2
2025*c0909341SAndroid Build Coastguard Worker    pminub               m0, m6 ; clip 64x16 and 64x32
2026*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m0
2027*c0909341SAndroid Build Coastguard Worker    pminub               m1, m6 ; clip 64x64
2028*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m1
2029*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11, m7
2030*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m10
2031*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12, m7
2032*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m10
2033*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m13, m7
2034*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m10
2035*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14, m7
2036*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m10
2037*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m11, m15
2038*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m9
2039*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m12, m15
2040*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
2041*c0909341SAndroid Build Coastguard Worker    paddw                m0, m7
2042*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m13, m15
2043*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m9
2044*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
2045*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14, m15
2046*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
2047*c0909341SAndroid Build Coastguard Worker    paddw                m1, m7
2048*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
2049*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
2050*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m8
2051*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
2052*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m8
2053*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m9
2054*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m8
2055*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9
2056*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m8
2057*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
2058*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
2059*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
2060*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
2061*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
2062*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
2063*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
2064*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
2065*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
2066*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
2067*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6
2068*c0909341SAndroid Build Coastguard Worker    mova         [tlq+32*2], m0
2069*c0909341SAndroid Build Coastguard Worker    mova         [tlq+32*3], m1
2070*c0909341SAndroid Build Coastguard Worker.w64_main:
2071*c0909341SAndroid Build Coastguard Worker    movd               xm12, dxd
2072*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [tlq+maxbaseq]
2073*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [dxq-64]
2074*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
2075*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m12, xm12
2076*c0909341SAndroid Build Coastguard Worker    sub                 r3d, maxbased
2077*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z_filter_s+2]
2078*c0909341SAndroid Build Coastguard Worker    movd                xm6, r3d
2079*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
2080*c0909341SAndroid Build Coastguard Worker    mova                m10, [pb_1to32]
2081*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pb_32]
2082*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
2083*c0909341SAndroid Build Coastguard Worker.w64_loop:
2084*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r5d
2085*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6
2086*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3+ 0]
2087*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+ 8]
2088*c0909341SAndroid Build Coastguard Worker    pand                 m2, m4, m6
2089*c0909341SAndroid Build Coastguard Worker    psubw                m9, m5, m2
2090*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2091*c0909341SAndroid Build Coastguard Worker    por                  m9, m2
2092*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
2093*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
2094*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
2095*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
2096*c0909341SAndroid Build Coastguard Worker    psraw                m2, m6, 6
2097*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
2098*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
2099*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
2100*c0909341SAndroid Build Coastguard Worker    paddb                m2, m10
2101*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2102*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m2
2103*c0909341SAndroid Build Coastguard Worker    mova          [dstq+ 0], m0
2104*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3+32]
2105*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+40]
2106*c0909341SAndroid Build Coastguard Worker    add                 r5d, dxd
2107*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
2108*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
2109*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
2110*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
2111*c0909341SAndroid Build Coastguard Worker    paddb                m2, m11
2112*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
2113*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
2114*c0909341SAndroid Build Coastguard Worker    paddw                m6, m12
2115*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2116*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m2
2117*c0909341SAndroid Build Coastguard Worker    mova          [dstq+32], m0
2118*c0909341SAndroid Build Coastguard Worker    dec                  hd
2119*c0909341SAndroid Build Coastguard Worker    jz .w64_end
2120*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2121*c0909341SAndroid Build Coastguard Worker    cmp                 r5d, maxbased
2122*c0909341SAndroid Build Coastguard Worker    jb .w64_loop
2123*c0909341SAndroid Build Coastguard Worker.w64_end_loop:
2124*c0909341SAndroid Build Coastguard Worker    mova          [dstq+ 0], m7
2125*c0909341SAndroid Build Coastguard Worker    mova          [dstq+32], m7
2126*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2127*c0909341SAndroid Build Coastguard Worker    dec                  hd
2128*c0909341SAndroid Build Coastguard Worker    jg .w64_end_loop
2129*c0909341SAndroid Build Coastguard Worker.w64_end:
2130*c0909341SAndroid Build Coastguard Worker    RET
2131*c0909341SAndroid Build Coastguard Worker
2132*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
2133*c0909341SAndroid Build Coastguard Worker%define base r9-z_filter_t0
2134*c0909341SAndroid Build Coastguard Worker    lea                  r9, [ipred_z2_avx2_table]
2135*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
2136*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
2137*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2138*c0909341SAndroid Build Coastguard Worker    lea                 dxq, [dr_intra_derivative-90]
2139*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r9+wq*4]
2140*c0909341SAndroid Build Coastguard Worker    movzx               dyd, angleb
2141*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
2142*c0909341SAndroid Build Coastguard Worker    mov                  r8, dxq
2143*c0909341SAndroid Build Coastguard Worker    sub                 dxq, dyq
2144*c0909341SAndroid Build Coastguard Worker    add                  wq, r9
2145*c0909341SAndroid Build Coastguard Worker    add                  r9, z_filter_t0-ipred_z2_avx2_table
2146*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-64]
2147*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]
2148*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq]
2149*c0909341SAndroid Build Coastguard Worker    and                 dyd, ~1
2150*c0909341SAndroid Build Coastguard Worker    and                 dxq, ~1
2151*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [r8+dyq]  ; angle - 90
2152*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [dxq+270] ; 180 - angle
2153*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [base+pw_512]
2154*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [base+pw_62]
2155*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [base+pw_64]
2156*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], m2
2157*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m0
2158*c0909341SAndroid Build Coastguard Worker    mova           [rsp+64], m1
2159*c0909341SAndroid Build Coastguard Worker    neg                 dxd
2160*c0909341SAndroid Build Coastguard Worker    neg                 dyd
2161*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2162*c0909341SAndroid Build Coastguard Worker.w4:
2163*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, [base+z2_base_inc] ; base_inc << 6
2164*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z1_shuf_w4]
2165*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [base+z2_shuf_h4]
2166*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [dxq+(65<<6)] ; xpos
2167*c0909341SAndroid Build Coastguard Worker    movd                xm5, dyd
2168*c0909341SAndroid Build Coastguard Worker    mov                 r8d, (63-4)<<6
2169*c0909341SAndroid Build Coastguard Worker    mov                 dyq, -4
2170*c0909341SAndroid Build Coastguard Worker    pshuflw             xm5, xm5, q0000
2171*c0909341SAndroid Build Coastguard Worker    pmullw              xm5, [base+z2_ymul]
2172*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2173*c0909341SAndroid Build Coastguard Worker    jnz .w4_main ; !enable_intra_edge_filter
2174*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
2175*c0909341SAndroid Build Coastguard Worker    add              angled, 1022
2176*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
2177*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
2178*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
2179*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+pb_4]
2180*c0909341SAndroid Build Coastguard Worker    call .upsample_above
2181*c0909341SAndroid Build Coastguard Worker    sub              angled, 1075 ; angle - 53
2182*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
2183*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
2184*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2185*c0909341SAndroid Build Coastguard Worker    jmp .w4_filter_left
2186*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .filter_strength: local helper of ipred_z2_8bpc, entered via call / left via ret.
; Builds a byte mask telling the caller whether (and how strongly) an edge
; should be filtered.
;   In:  r3d    - only the low byte is used; it is broadcast and matched
;                 against the z_filter_wh table (the w4 call sites pass h+3)
;        angled - low byte is broadcast into m7; angled >> 8 picks the 16-byte
;                 threshold row at [r9 + (angled >> 8) * 8]
;        r9     - table pointer set up in the function prologue (z_filter_t0)
;   Out: r3d    - pmovmskb mask of the compare; callers test it for zero and
;                 popcnt it to index z_filter_k
;        m7, m8, xm9 - left live on purpose: .w4_no_filter_above reuses them
;   Clobbers: m0, flags
2187*c0909341SAndroid Build Coastguard Worker.filter_strength:
2188*c0909341SAndroid Build Coastguard Worker    movd                xm8, r3d
2189*c0909341SAndroid Build Coastguard Worker    mov                 r3d, angled
2190*c0909341SAndroid Build Coastguard Worker    movd                xm7, angled
2191*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m8, xm8 ; every byte = low byte of the incoming r3d
2192*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 8 ; is_sm << 1
2193*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, xm7 ; every byte = low byte of angled
2194*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m8, [base+z_filter_wh] ; 0xff where the table entry equals the size byte
2195*c0909341SAndroid Build Coastguard Worker    mova                xm9, [r9+r3*8] ; threshold row selected by angled >> 8
2196*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8, m7 ; angle byte kept only in the matching positions
2197*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m9 ; signed compare: masked angle > threshold
2198*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m0 ; one result bit per byte lane
2199*c0909341SAndroid Build Coastguard Worker    ret
2200*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .upsample_above: local helper of ipred_z2_8bpc, entered via call / left via ret.
; Interleaves the top-edge bytes held in xm1 with interpolated bytes and
; rescales the caller's x-stepping state to match.
;   In:  xm1  - top edge bytes (the prologue loads m1 from [tlq])
;        xm3  - per-byte upper bound for the second shuffle's indices
;               (callers broadcast pb_4 for w4, pb_8 for w8)
;        m13  - rounding multiplier for pmulhrsw (loaded from pw_512)
;        r2d, dxd, r8d, m6 - caller's x position, x step, loop threshold
;               and per-lane position offsets
;   Out: 16 bytes stored at [rsp+gprsize+64]; dxd and m6 doubled;
;        r2d += old dx + (1<<6); r8d -= 3<<6; m10 = pb_0to15 in both lanes
;   Clobbers: xm1, xm2, xm3, xm4, flags
; Stack offsets carry +gprsize because call pushed a return address, so
; [rsp+gprsize+64] here is the slot the caller addresses as [rsp+64].
2201*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8
2202*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, xm1, [base+z_upsample1-2] ; first tap-pair arrangement of the edge
2203*c0909341SAndroid Build Coastguard Worker    pminub              xm3, [base+z_filter_s+4] ; shuffle indices limited to the caller's bound
2204*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+pb_36_m4] ; byte coefficient pairs for pmaddubsw
2205*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+pb_0to15] ; replaces the caller's m10 shuffle
2206*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm1, xm3 ; second tap-pair arrangement, index-clamped
2207*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm4
2208*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm4
2209*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+dxq+(1<<6)] ; uses dx before it is doubled below
2210*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd ; dx *= 2
2211*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm3 ; sum of both tap pairs
2212*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm13 ; rounded downscale of the filter sum
2213*c0909341SAndroid Build Coastguard Worker    sub                 r8d, 3<<6
2214*c0909341SAndroid Build Coastguard Worker    paddw                m6, m6 ; per-lane position offsets *= 2
2215*c0909341SAndroid Build Coastguard Worker    packuswb            xm2, xm2 ; words -> unsigned-saturated bytes
2216*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2 ; interleave: original byte, interpolated byte
2217*c0909341SAndroid Build Coastguard Worker    mova   [rsp+gprsize+64], xm1
2218*c0909341SAndroid Build Coastguard Worker    ret
2219*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .upsample_left: local helper of ipred_z2_8bpc, entered via call / left via ret.
; Interleaves the left-edge bytes kept on the stack with interpolated bytes
; and rescales the caller's y-stepping state to match.
;   In:  hd   - block height; only (h & 4) is used, as a lower bound on the
;               shuffle indices
;        8 bytes at [rsp+gprsize+56] and 4 bytes at [rsp+gprsize+64]
;               (the caller's [rsp+56] / [rsp+64] edge buffer)
;        m13  - rounding multiplier for pmulhrsw (loaded from pw_512)
;        xm15 - word constant added to xm5 (loaded from pw_64)
;        xm5, dyq - caller's per-column y positions and row step
;   Out: 16 bytes stored at [rsp+gprsize+48]; xm5 = 2*xm5 + xm15;
;        dyq doubled; m11 = z2_upsample shuffle in both lanes
;   Clobbers: r3d, xm0, xm1, xm2, xm3, flags
; Stack offsets carry +gprsize because call pushed a return address.
2220*c0909341SAndroid Build Coastguard Worker.upsample_left: ; h4/h8
2221*c0909341SAndroid Build Coastguard Worker    mov                 r3d, hd
2222*c0909341SAndroid Build Coastguard Worker    and                 r3d, 4 ; 4 when bit 2 of h is set (h4), else 0
2223*c0909341SAndroid Build Coastguard Worker    movd                xm2, [rsp+gprsize+64]
2224*c0909341SAndroid Build Coastguard Worker    movddup             xm0, [rsp+gprsize+56] ; same 8 bytes in both qwords
2225*c0909341SAndroid Build Coastguard Worker    movd                xm1, r3d
2226*c0909341SAndroid Build Coastguard Worker    palignr             xm2, xm0, 1 ; xm2:xm0 byte stream shifted down by one byte
2227*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm1, xm1
2228*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, [base+z_filter_s+18] ; first tap-pair arrangement
2229*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+pb_36_m4] ; byte coefficient pairs for pmaddubsw
2230*c0909341SAndroid Build Coastguard Worker    pmaxub              xm1, [base+z_upsample1-2] ; shuffle indices raised to at least (h & 4)
2231*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm0, xm1 ; second tap-pair arrangement, index-clamped
2232*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm3
2233*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
2234*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm5 ; y positions *= 2
2235*c0909341SAndroid Build Coastguard Worker    add                 dyq, dyq ; row step *= 2
2236*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm2 ; sum of both tap pairs
2237*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm13 ; rounded downscale of the filter sum
2238*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [base+z2_upsample] ; replaces the caller's m11 shuffle
2239*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm15
2240*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1 ; words -> unsigned-saturated bytes
2241*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1 ; interleave: original byte, interpolated byte
2242*c0909341SAndroid Build Coastguard Worker    mova   [rsp+gprsize+48], xm0
2243*c0909341SAndroid Build Coastguard Worker    ret
2244*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above:
2245*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
2246*c0909341SAndroid Build Coastguard Worker    sub              angled, 1112 ; angle - 90
2247*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2248*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2249*c0909341SAndroid Build Coastguard Worker    jz .w4_no_filter_above
2250*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2251*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [base+pb_4]
2252*c0909341SAndroid Build Coastguard Worker    pminub              xm2, [base+z_filter_s]
2253*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
2254*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
2255*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm1, xm2 ; 00 01 12 23
2256*c0909341SAndroid Build Coastguard Worker    pshufd              xm2, xm2, q0321
2257*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3, xm0
2258*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, xm1, xm2 ; 12 23 34 44
2259*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm4
2260*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
2261*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm3, xm3      ; 34 44 44 44
2262*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm4
2263*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, r6m      ; max_width
2264*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
2265*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
2266*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
2267*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm13
2268*c0909341SAndroid Build Coastguard Worker    packsswb            xm4, xm4
2269*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, 8
2270*c0909341SAndroid Build Coastguard Worker    psubb               xm4, [base+pb_1to32]
2271*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
2272*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm0, xm1, xm4
2273*c0909341SAndroid Build Coastguard Worker    movd           [rsp+65], xm0
2274*c0909341SAndroid Build Coastguard Worker.w4_no_filter_above:
2275*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
2276*c0909341SAndroid Build Coastguard Worker    add              angled, 973 ; angle + 883
2277*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
2278*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
2279*c0909341SAndroid Build Coastguard Worker    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
2280*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [base+pb_90]
2281*c0909341SAndroid Build Coastguard Worker    psubb               xm0, xm7 ; 180 - angle
2282*c0909341SAndroid Build Coastguard Worker    pand                xm0, xm8 ; reuse from previous filter_strength call
2283*c0909341SAndroid Build Coastguard Worker    pcmpgtb             xm0, xm9
2284*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, xm0
2285*c0909341SAndroid Build Coastguard Worker.w4_filter_left:
2286*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2287*c0909341SAndroid Build Coastguard Worker    jz .w4_main
2288*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2289*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 10
2290*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
2291*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+49]
2292*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+43], 1
2293*c0909341SAndroid Build Coastguard Worker    cmovs               r5d, hd
2294*c0909341SAndroid Build Coastguard Worker    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
2295*c0909341SAndroid Build Coastguard Worker    movd                xm0, r5d
2296*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+z_filter_s+12]
2297*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+z_filter_s+16]
2298*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m1, [z_filter_s+8], 1   ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
2299*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x0f                ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
2300*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
2301*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
2302*c0909341SAndroid Build Coastguard Worker    pmaxub               m0, m3
2303*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
2304*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m0
2305*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
2306*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*1]
2307*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
2308*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
2309*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*2]
2310*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
2311*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3
2312*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, r7m ; max_height
2313*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
2314*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
2315*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2316*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
2317*c0909341SAndroid Build Coastguard Worker    packsswb            xm4, xm4
2318*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m1, 1
2319*c0909341SAndroid Build Coastguard Worker    psubb               xm4, [base+pb_16to1]
2320*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
2321*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm0, [rsp+48], xm4
2322*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], xm0
2323*c0909341SAndroid Build Coastguard Worker    jmp .w4_main
2324*c0909341SAndroid Build Coastguard Worker.w4_upsample_left:
2325*c0909341SAndroid Build Coastguard Worker    call .upsample_left
2326*c0909341SAndroid Build Coastguard Worker.w4_main:
2327*c0909341SAndroid Build Coastguard Worker    movd                xm0, dxd
2328*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+z2_y_shuf_h4]
2329*c0909341SAndroid Build Coastguard Worker    lea                  r5, [rsp+56]  ; left-7
2330*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
2331*c0909341SAndroid Build Coastguard Worker    lea                  r9, [strideq*3]
2332*c0909341SAndroid Build Coastguard Worker    psraw               xm1, xm5, 6
2333*c0909341SAndroid Build Coastguard Worker    pand                xm5, xm14      ; frac_y
2334*c0909341SAndroid Build Coastguard Worker    pxor                xm2, xm2
2335*c0909341SAndroid Build Coastguard Worker    paddw                m7, m0, m0
2336*c0909341SAndroid Build Coastguard Worker    psubw               xm4, xm2, xm1  ; base_y
2337*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0xcc
2338*c0909341SAndroid Build Coastguard Worker    mova                xm1, xm7
2339*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm4, xm2
2340*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1        ; xpos2 xpos3 xpos0 xpos1
2341*c0909341SAndroid Build Coastguard Worker    psubw               xm1, xm15, xm5 ; 64-frac_y
2342*c0909341SAndroid Build Coastguard Worker    psllw               xm5, 8
2343*c0909341SAndroid Build Coastguard Worker    paddw                m7, m7
2344*c0909341SAndroid Build Coastguard Worker    paddw                m6, m0
2345*c0909341SAndroid Build Coastguard Worker    por                 xm5, xm1       ; 64-frac_y, frac_y
2346*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m5, xm5
2347*c0909341SAndroid Build Coastguard Worker.w4_loop:
2348*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2349*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6         ; base_x0
2350*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [rsp+r2]
2351*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2352*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6         ; base_x1
2353*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [rsp+r3]
2354*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2355*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6         ; base_x2
2356*c0909341SAndroid Build Coastguard Worker    movq                xm0, [rsp+r2]
2357*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2358*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6         ; base_x3
2359*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [rsp+r3]
2360*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xc0
2361*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m6   ; frac_x
2362*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0
2363*c0909341SAndroid Build Coastguard Worker    psubw                m1, m15, m2   ; 64-frac_x
2364*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2365*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10
2366*c0909341SAndroid Build Coastguard Worker    por                  m1, m2        ; 64-frac_x, frac_x
2367*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
2368*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64
2369*c0909341SAndroid Build Coastguard Worker    jge .w4_toponly
2370*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7        ; arbitrary negative value
2371*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m3, [r5+xm4], m1
2372*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m11
2373*c0909341SAndroid Build Coastguard Worker    vpermd               m1, m12, m1
2374*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
2375*c0909341SAndroid Build Coastguard Worker    psraw                m2, m6, 15    ; base_x < topleft
2376*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m1, m2
2377*c0909341SAndroid Build Coastguard Worker.w4_toponly:
2378*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2379*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7        ; xpos += dx
2380*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
2381*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
2382*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2383*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm0
2384*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r9       ], xm0, 1
2385*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
2386*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
2387*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2388*c0909341SAndroid Build Coastguard Worker    jz .w4_end
2389*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2390*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r8d
2391*c0909341SAndroid Build Coastguard Worker    jge .w4_loop
; .w4_leftonly_loop: 4-wide loop that produces four rows per iteration using
; only bytes gathered relative to r5 (no loads from the top row are done
; here). Reached by fall-through from .w4_loop once `cmp r2d, r8d` fails.
; Registers m5, m7, m11, m12, m13, xm4 and r9 are set up outside this view.
2392*c0909341SAndroid Build Coastguard Worker.w4_leftonly_loop:
; vpgatherdq clears its mask operand, so a scratch copy of m7 is used as mask.
2393*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
2394*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m2, [r5+xm4], m1
; Step the gather base for the next group of four rows.
2395*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
; Reorder the gathered bytes (m11 byte shuffle, m12 dword permute), apply the
; byte-pair weights in m5, then round with m13 and pack to bytes.
2396*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m11
2397*c0909341SAndroid Build Coastguard Worker    vpermd               m0, m12, m0
2398*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
2399*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2400*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
; Store 4 bytes per row: the low lane supplies the rows at stride*2 and r9,
; the high lane the rows at stride*0 and stride*1.
; NOTE(review): r9 is presumably stride*3 (as in .w8_main) - set outside view.
2401*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2402*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm0
2403*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r9       ], xm0, 1
2404*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
2405*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
; Advance dst by four rows and loop while rows remain (h -= 4, signed > 0).
2406*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2407*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2408*c0909341SAndroid Build Coastguard Worker    jg .w4_leftonly_loop
2409*c0909341SAndroid Build Coastguard Worker.w4_end:
2410*c0909341SAndroid Build Coastguard Worker    RET
; .w8: entry point for the 8-pixel-wide case.
; Sets up the state consumed by .w8_main/.w8_loop:
;   m6  = z2_base_inc table (both lanes)
;   m10 = byte shuffle from z_filter_s+2, m11 = z2_shuf_h4 shuffle
;   r2d = dx + (65<<6)   running x position
;   r8d = (63-8)<<6      threshold compared against r2d at the loop bottom
;   xm5 = dy (broadcast to words) * z2_ymul
;   dyq = -4             per-iteration step later added to r5
; Then decides whether the above edge is upsampled or filtered.
2411*c0909341SAndroid Build Coastguard Worker.w8:
2412*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+z2_base_inc] ; base_inc << 6
; dy is copied into xm5 here, before dyq is overwritten with -4 below.
2413*c0909341SAndroid Build Coastguard Worker    movd                xm5, dyd
2414*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z_filter_s+2]
2415*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [base+z2_shuf_h4]
2416*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [dxq+(65<<6)] ; xpos
2417*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        xm5, xm5
2418*c0909341SAndroid Build Coastguard Worker    mov                 r8d, (63-8)<<6
2419*c0909341SAndroid Build Coastguard Worker    mov                 dyq, -4
2420*c0909341SAndroid Build Coastguard Worker    pmullw              xm5, [base+z2_ymul]
; If bit 0x400 of angle is set, skip all edge upsampling/filtering.
; NOTE(review): presumably the "edge filter disabled" flag - confirm where the
; bit is produced (outside this view).
2421*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2422*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
; Combined test: r3d = angle+126 with its low byte replaced by h, so a single
; unsigned compare against 8 checks the upper bits and h <= 8 together.
2423*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+126]
2424*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2425*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2426*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
; Upsample path: xm3 = bytes of 8, high half of xm1 is stored at rsp+80, then
; the shared .upsample_above helper (defined outside this view) is called.
2427*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+pb_8]
2428*c0909341SAndroid Build Coastguard Worker    movhps         [rsp+80], xm1
2429*c0909341SAndroid Build Coastguard Worker    call .upsample_above
; (angle - 53) ^ 0x7f == 180 - angle when angle-53 is within 0..127; r3d = h+7
; is the other input handed to .filter_strength (helper outside this view).
2430*c0909341SAndroid Build Coastguard Worker    sub              angled, 53 ; angle - 53
2431*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2432*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
2433*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2434*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
; .w8_no_upsample_above: above edge is not upsampled. Computes the filter
; strength for the above edge and, if non-zero, applies a 3-stage
; pmaddubsw filter to the 8 top pixels and stores the result at rsp+65.
; r3d is tested right after the call, so .filter_strength (outside this view)
; presumably returns its result mask there.
2435*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above:
2436*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2437*c0909341SAndroid Build Coastguard Worker    sub              angled, 90 ; angle - 90
2438*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2439*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2440*c0909341SAndroid Build Coastguard Worker    jz .w8_no_filter_above
; popcnt turns the mask into a count k; z_filter_k-4+k*4 selects the 4-byte
; tap entry k-1 in each of three rows spaced 12 bytes apart.
2441*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
; Shuffle indices from z_filter_s+8 are clamped to at most 8 so that reads
; past the last pixel replicate it (see the "88 88 88" pattern below).
2442*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [base+pb_8]
2443*c0909341SAndroid Build Coastguard Worker    pminub              xm3, [base+z_filter_s+8]
2444*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
2445*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
2446*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
2447*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2, xm0
2448*c0909341SAndroid Build Coastguard Worker    pshufb              xm3, xm1, xm3               ; 34 45 56 67 78 88 88 88
2449*c0909341SAndroid Build Coastguard Worker    shufps              xm2, xm3, q2121             ; 12 23 34 45 56 67 78 88
2450*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm2, xm4
2451*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
2452*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm3, xm4
; Build a per-byte mask from max_width: narrow it to bytes, then subtract
; pb_1to32; bytes that end up negative select the unfiltered pixel below.
; NOTE(review): assumes pb_1to32 holds the byte sequence 1..32 - confirm.
2453*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, r6m ; max_width
2454*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
; Sum the three partial products and round with xm13 (set outside this view).
2455*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
2456*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm3
2457*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm13
2458*c0909341SAndroid Build Coastguard Worker    packsswb            xm4, xm4
; Shift xm1 right by one byte so it lines up with the filtered output.
2459*c0909341SAndroid Build Coastguard Worker    psrldq              xm1, 1
2460*c0909341SAndroid Build Coastguard Worker    psubb               xm4, [base+pb_1to32]
2461*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
; Keep filtered bytes where the mask is non-negative, original bytes elsewhere.
2462*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm0, xm1, xm4
2463*c0909341SAndroid Build Coastguard Worker    movq           [rsp+65], xm0
; .w8_no_filter_above: decide how the left edge is handled. Same combined
; angle/h compare trick as at .w8: low byte of r3d is replaced by h.
2464*c0909341SAndroid Build Coastguard Worker.w8_no_filter_above:
2465*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq-51]
2466*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2467*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2468*c0909341SAndroid Build Coastguard Worker    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
; Inline strength computation for the left edge: (pb_90 - m7) & m8 compared
; against m9 bytewise, collected into r3d as a bit mask. m7/m8/m9 are
; expected to still hold values produced outside this view.
2469*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pb_90]
2470*c0909341SAndroid Build Coastguard Worker    psubb                m0, m7
2471*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8
2472*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m9
2473*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m0
; .w8_filter_left: filter the left-edge pixels held in the stack buffer.
; On entry r3d is the strength mask; zero means no filtering. The three tap
; sets are loaded into m7/m8/m9 (indexed by popcnt of the mask). The result
; is blended with the unfiltered bytes and written back to [rsp+32].
2474*c0909341SAndroid Build Coastguard Worker.w8_filter_left:
2475*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2476*c0909341SAndroid Build Coastguard Worker    jz .w8_main
2477*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2478*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
2479*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
2480*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
2481*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
2482*c0909341SAndroid Build Coastguard Worker    jne .w8_filter_left_h16
; h == 32: filter 16 more pixels first. Two overlapping 16-byte loads from
; rsp+27 / rsp+35 fill the two lanes of m2; the filtered words end up in m3.
2483*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+27]
2484*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+35], 1
; Only the low 128 bits of m0 are written here (x86inc emits a VEX-encoded
; instruction under AVX2, which zeroes the upper lane); .w8_filter_left_top16
; later reuses m0 in a pmaxub.
2485*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [base+pb_5]
2486*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [base+z_filter_s+ 8]
2487*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+z_filter_s+12]
2488*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+z_filter_s+16]
; Raise the first shuffle's indices to at least the value in m0.
2489*c0909341SAndroid Build Coastguard Worker    pmaxub               m3, m0
2490*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2, m3
2491*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
2492*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
2493*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8
2494*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
2495*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
2496*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
2497*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
2498*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13
2499*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left_top16
; .w8_filter_left_h16: h != 32. Build the minimum shuffle index in every
; byte of m0: r5d = (h < 16 ? h : 10) ^ 15.
; NOTE(review): m3 is not written on this path, so the part of the packed
; result that comes from m3 holds its prior contents; presumably those bytes
; are rejected by the max_height blend mask below - confirm.
2500*c0909341SAndroid Build Coastguard Worker.w8_filter_left_h16:
2501*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 10
2502*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
; cmovs fires when h - 16 is negative, i.e. h < 16.
2503*c0909341SAndroid Build Coastguard Worker    cmovs               r5d, hd
2504*c0909341SAndroid Build Coastguard Worker    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
2505*c0909341SAndroid Build Coastguard Worker    movd                xm0, r5d
2506*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
; .w8_filter_left_top16: filter the 16 pixels loaded from rsp+49 (low lane)
; and rsp+43 (high lane). The two lanes use differently offset shuffle
; tables, assembled below from z_filter_s+8/+12/+16/+20.
2507*c0909341SAndroid Build Coastguard Worker.w8_filter_left_top16:
2508*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+z_filter_s+12]
2509*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
2510*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+z_filter_s+16]
2511*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
2512*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
; Apply the minimum-index clamp from m0 to the first shuffle.
2513*c0909341SAndroid Build Coastguard Worker    pmaxub               m0, m2
2514*c0909341SAndroid Build Coastguard Worker    movu                xm2, [rsp+49]
2515*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [rsp+43], 1
2516*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m0
2517*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
; m7's taps are no longer needed after the pmaddubsw above, so m7 is reused
; to build the max_height blend mask.
2518*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, r7m ; max_height
2519*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
2520*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8
2521*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
2522*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
2523*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m7
2524*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
2525*c0909341SAndroid Build Coastguard Worker    packsswb             m7, m7
2526*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2527*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
; Mask bytes = max_height - pb_32to1; negative bytes keep the original pixel.
2528*c0909341SAndroid Build Coastguard Worker    psubb                m7, [base+pb_32to1]
; Pack both filtered halves to bytes and reorder the 64-bit quarters.
2529*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m1
2530*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q1320
; Blend with the unfiltered 32 bytes at rsp+32 and store back in place.
; mova is an aligned store, so rsp+32 must be 32-byte aligned here.
2531*c0909341SAndroid Build Coastguard Worker    vpblendvb            m3, [rsp+32], m7
2532*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m3
2533*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
; .w8_upsample_left: delegate to the shared helper (defined outside this
; view), then fall through into .w8_main.
2534*c0909341SAndroid Build Coastguard Worker.w8_upsample_left:
2535*c0909341SAndroid Build Coastguard Worker    call .upsample_left
; .w8_main: one-time setup for the 8-wide prediction loop.
; Inputs: xm5 = dy * z2_ymul and m6 = z2_base_inc (from .w8), dx in dxd;
; m14/m15 are constants loaded outside this view.
; Outputs used by the loops:
;   r5  = rsp+56             base address for the left-edge gathers
;   r9  = stride*3
;   m7  = 2*dx in every word (also used as gather mask in the loops)
;   m6  = base_inc + dx (low lane) / base_inc + 2*dx (high lane)
;   xm8/xm9 = zero-extended dword gather indices derived from -(ypos >> 6)
;   m12 = packed left-edge weights, same 128 bits in both lanes
2536*c0909341SAndroid Build Coastguard Worker.w8_main:
2537*c0909341SAndroid Build Coastguard Worker    movd                xm3, dxd
2538*c0909341SAndroid Build Coastguard Worker    lea                  r5, [rsp+56]  ; left-7
; xm1 gets a dword-reordered copy of ypos for the index computation, while
; xm5 is masked with xm14 to keep the fractional part.
2539*c0909341SAndroid Build Coastguard Worker    pshufd              xm1, xm5, q3120
2540*c0909341SAndroid Build Coastguard Worker    pand                xm5, xm14
2541*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, xm3
2542*c0909341SAndroid Build Coastguard Worker    pxor                xm0, xm0
; xm2 = xm15 - frac_y (the complementary weight).
2543*c0909341SAndroid Build Coastguard Worker    psubw               xm2, xm15, xm5
2544*c0909341SAndroid Build Coastguard Worker    psraw               xm1, 6
2545*c0909341SAndroid Build Coastguard Worker    lea                  r9, [strideq*3]
2546*c0909341SAndroid Build Coastguard Worker    paddw                m7, m3, m3
; Negate the integer part: xm9 = 0 - (ypos >> 6).
2547*c0909341SAndroid Build Coastguard Worker    psubw               xm9, xm0, xm1  ; base_y
2548*c0909341SAndroid Build Coastguard Worker    psllw               xm5, 8
2549*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm8, xm9, xm0  ; base_y 0, 1, 4, 5
; High lane takes 2*dx, low lane keeps dx, so one vector covers two rows.
2550*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m7, 0xf0  ; xpos0 xpos1
2551*c0909341SAndroid Build Coastguard Worker    por                 xm5, xm2       ; 64-frac_y, frac_y
2552*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm9, xm0       ; base_y 2, 3, 6, 7
2553*c0909341SAndroid Build Coastguard Worker    paddw                m6, m3
; Duplicate the 128-bit weight vector into both lanes of m12.
2554*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, m5, xm5, 1
; .w8_loop: produce four 8-pixel rows per iteration.
; r2d carries the running x position; for each of the four rows it is
; advanced by dx and shifted right by 6 to get a byte offset into the stack
; buffer at rsp. Rows 0/1 go to the lanes of m0, rows 2/3 to the lanes of m1.
; m6 holds the per-pixel x positions for rows 0/1, m4 = m6 + m7 for rows 2/3.
2555*c0909341SAndroid Build Coastguard Worker.w8_loop:
2556*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2557*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6         ; base_x0
2558*c0909341SAndroid Build Coastguard Worker    movu                xm0, [rsp+r2]
2559*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2560*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6         ; base_x1
2561*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [rsp+r3], 1
2562*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2563*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6         ; base_x2
2564*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r2]
; After this lea, r2d already holds the x position for the next iteration.
2565*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2566*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6         ; base_x3
2567*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r3], 1
; Build packed byte weights for rows 0/1: frac = m6 & m14, then each word
; becomes (frac << 8) | (m15 - frac) for pmaddubsw on adjacent pixel pairs
; (m10 arranges the loaded pixels into those pairs).
2568*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m6
2569*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m6, m7
2570*c0909341SAndroid Build Coastguard Worker    psubw                m5, m15, m2
2571*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2572*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10
2573*c0909341SAndroid Build Coastguard Worker    por                  m2, m5
2574*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
; Same weighting for rows 2/3 using m4.
2575*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m4
2576*c0909341SAndroid Build Coastguard Worker    psubw                m5, m15, m2
2577*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2578*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
2579*c0909341SAndroid Build Coastguard Worker    por                  m2, m5
2580*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
; r3d is base_x3, the offset of the last of the four rows. If it is still
; >= 64 the left-edge gather is skipped for this iteration.
2581*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64
2582*c0909341SAndroid Build Coastguard Worker    jge .w8_toponly
; vpgatherdq clears its mask register, so m7 (used as the mask) is saved in
; m5 and restored between the two gathers.
; NOTE(review): using m7 as mask relies on the top bit of every qword being
; set, i.e. presumably dx is negative here - confirm.
2583*c0909341SAndroid Build Coastguard Worker    mova                 m5, m7
2584*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m3, [r5+xm9], m7
2585*c0909341SAndroid Build Coastguard Worker    mova                 m7, m5
2586*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m2, [r5+xm8], m5
2587*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m11
2588*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m11
2589*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m2, m3    ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
2590*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m3        ; a2 b2 c2 d2 a3 b3 c3 d3   e2 f2 g2 h2 e3 f3 g3 h3
2591*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120 ; y0 y1
2592*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120 ; y2 y3
; Weight the gathered left-edge pairs with m12.
2593*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m12
2594*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m12
; Sign of the x position selects, per pixel, the left-edge result over the
; top-edge result.
2595*c0909341SAndroid Build Coastguard Worker    psraw                m6, 15        ; base_x < topleft
2596*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m5, m6
2597*c0909341SAndroid Build Coastguard Worker    psraw                m3, m4, 15
2598*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m2, m3
; .w8_toponly: round with m13, pack to bytes and store the four rows.
2599*c0909341SAndroid Build Coastguard Worker.w8_toponly:
2600*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2601*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
; m6 for the next iteration = rows 2/3 positions (m4) advanced by m7.
2602*c0909341SAndroid Build Coastguard Worker    paddw                m6, m4, m7     ; xpos += dx
; Step the left-edge gather base (dyq was set to -4 at .w8).
2603*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
2604*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
; After the in-lane pack: low lane = rows 0 and 2, high lane = rows 1 and 3.
2605*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2606*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2607*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
2608*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
2609*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r9       ], xm1
2610*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2611*c0909341SAndroid Build Coastguard Worker    jz .w8_end
2612*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
; Stay in the mixed top/left loop while the next x position is >= r8d
; ((63-8)<<6, signed compare); otherwise fall into the left-only loop.
2613*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r8d
2614*c0909341SAndroid Build Coastguard Worker    jge .w8_loop
; .w8_leftonly_loop: 8-wide loop that produces four rows per iteration from
; the left-edge gathers only (no loads from the top row are done here).
; Entered by fall-through from .w8_loop once `cmp r2d, r8d` fails.
2615*c0909341SAndroid Build Coastguard Worker.w8_leftonly_loop:
; vpgatherdq clears its mask register: m7 is saved in m0, consumed by the
; first gather, restored, and the m0 copy is consumed by the second gather.
2616*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
2617*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m5, [r5+xm9], m7
2618*c0909341SAndroid Build Coastguard Worker    mova                 m7, m0
2619*c0909341SAndroid Build Coastguard Worker    vpgatherdq           m3, [r5+xm8], m0
; Step the gather base for the next four rows.
2620*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
; Same shuffle / interleave / lane-permute sequence as the gather path in
; .w8_loop, then weight with m12 and round with m13.
2621*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m11
2622*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m11
2623*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
2624*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
2625*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
2626*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
2627*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m12
2628*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m12
2629*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2630*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
2631*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
; Low lane = rows 0 and 2, high lane = rows 1 and 3 (r9 = stride*3).
2632*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
2633*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
2634*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm0
2635*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm1
2636*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r9       ], xm1
; Advance dst by four rows and loop while rows remain (h -= 4, signed > 0).
2637*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2638*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2639*c0909341SAndroid Build Coastguard Worker    jg .w8_leftonly_loop
2640*c0909341SAndroid Build Coastguard Worker.w8_end:
2641*c0909341SAndroid Build Coastguard Worker    RET
; .w16: entry point for the 16-pixel-wide case.
; Copies h to r8d, then (unless bit 0x400 of angle is set) computes the
; above-edge filter strength and, if non-zero, filters 16 top pixels and
; stores them at rsp+65.
; NOTE(review): r8d is presumably consumed by the loop tail outside this view.
2642*c0909341SAndroid Build Coastguard Worker.w16:
2643*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hd
2644*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2645*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
; Inputs for .filter_strength (helper outside this view): r3d = h+15 and
; angle-90; r3d is tested after the call, so the result presumably returns
; there.
2646*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
2647*c0909341SAndroid Build Coastguard Worker    sub              angled, 90
2648*c0909341SAndroid Build Coastguard Worker    call .filter_strength
2649*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2650*c0909341SAndroid Build Coastguard Worker    jz .w16_no_filter_above
; popcnt of the mask indexes the three z_filter_k tap rows (12 bytes apart).
2651*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
; m6 = the 16 bytes at tlq+1 in both lanes (unfiltered top pixels).
2652*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [tlq+1]
; Per-lane shuffle tables: low lane handles the first 8 outputs, high lane
; the last 8 (see the index patterns in the trailing comments).
2653*c0909341SAndroid Build Coastguard Worker    mova                xm2, [base+z_filter_s]
2654*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67   67 78 89 9a ab bc cd de
2655*c0909341SAndroid Build Coastguard Worker    movu                xm3, [base+z_filter_s+8]
2656*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab   ab bc cd de ef ff ff ff
; High lane of m1 is taken from m6; the low lane keeps what m1 held on entry
; (loaded outside this view).
2657*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m6, 0xf0
2658*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+z_filter_k-4+r3*4+12*0]
2659*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1]
2660*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*2]
2661*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1, m2
2662*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
2663*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2, m0
2664*c0909341SAndroid Build Coastguard Worker    shufps               m2, m1, q2121                ; 12 23 34 45 56 67 78 89   89 9a ab bc cd de ef ff
2665*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
2666*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
; Build the per-byte max_width mask (narrowed to bytes, minus pb_1to32).
2667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, r6m ; max_width
2668*c0909341SAndroid Build Coastguard Worker    packssdw            xm4, xm4
; Sum the three partial products and round with m13.
2669*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2670*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2671*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2672*c0909341SAndroid Build Coastguard Worker    packsswb            xm4, xm4
; Bring the high-lane words down so both halves pack into one xmm register.
2673*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
2674*c0909341SAndroid Build Coastguard Worker    psubb               xm4, [base+pb_1to32]
2675*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm2
; Where the mask byte is negative, keep the unfiltered pixel from xm6.
2676*c0909341SAndroid Build Coastguard Worker    vpblendvb           xm0, xm6, xm4
2677*c0909341SAndroid Build Coastguard Worker    movu           [rsp+65], xm0
; .w16_no_filter_above: compute the left-edge filter strength inline:
; (pb_90 - m7) & m8 compared bytewise against m9, collected as a bit mask in
; r3d. m7/m8/m9 are expected to hold values produced outside this view.
; A zero mask skips left filtering; otherwise popcnt of the mask selects the
; three tap sets, which replace m7/m8/m9.
2678*c0909341SAndroid Build Coastguard Worker.w16_no_filter_above:
2679*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [base+pb_90]
2680*c0909341SAndroid Build Coastguard Worker    psubb                m0, m7
2681*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8
2682*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m9
2683*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m0
2684*c0909341SAndroid Build Coastguard Worker    test                r3d, r3d
2685*c0909341SAndroid Build Coastguard Worker    jz .w16_main
2686*c0909341SAndroid Build Coastguard Worker    popcnt              r3d, r3d
2687*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
2688*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
2689*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
; .w16_filter_left: filter the left edge with the taps in m7/m8/m9.
; m6 = max_height narrowed to bytes, used later to build blend masks.
; Dispatch on h: h < 32 -> .w16_filter_left_h16, h == 32 -> _h32, otherwise
; (h > 32) fall through and filter 32 extra pixels first.
2690*c0909341SAndroid Build Coastguard Worker.w16_filter_left:
2691*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, r7m ; max_height
2692*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m6
2693*c0909341SAndroid Build Coastguard Worker    packsswb             m6, m6
2694*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
2695*c0909341SAndroid Build Coastguard Worker    jl .w16_filter_left_h16
; The vector loads below do not modify EFLAGS, so the `je` that follows still
; tests the result of `cmp hd, 32` above.
2696*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [base+pb_5]
2697*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z_filter_s+ 8]
2698*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [base+z_filter_s+12]
2699*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [base+z_filter_s+16]
2700*c0909341SAndroid Build Coastguard Worker    je .w16_filter_left_h32
; h > 32: filter two overlapping 32-byte loads at tlq-69 and tlq-61.
; Only the first uses the pmaxub-clamped shuffle (indices raised to >= m0).
2701*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-69]
2702*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq-61]
2703*c0909341SAndroid Build Coastguard Worker    pmaxub               m1, m10, m0
2704*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m1
2705*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2706*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m11
2707*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8
2708*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m12
2709*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9
2710*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2711*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m10
2712*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2713*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m5, m11
2714*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m8
2715*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m12
2716*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m9
2717*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
; Blend mask for this 32-byte group: max_height - (32 + pb_32to1).
2718*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+pb_32]
2719*c0909341SAndroid Build Coastguard Worker    paddb                m3, [base+pb_32to1]
2720*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
2721*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
2722*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
2723*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13
2724*c0909341SAndroid Build Coastguard Worker    psubb                m3, m6, m3
2725*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
; Negative mask bytes keep the unfiltered pixels at tlq-64; the result goes
; to [rsp] with an aligned store.
2726*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, [tlq-64], m3
2727*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m1
2728*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_left_top32
; .w16_filter_left_h32: h == 32. Clamp the first shuffle in place (indices
; raised to at least m0) before filtering.
2729*c0909341SAndroid Build Coastguard Worker.w16_filter_left_h32:
2730*c0909341SAndroid Build Coastguard Worker    pmaxub              m10, m0
; .w16_filter_left_top32: filter the pixels loaded from tlq-37 (low lane) and
; tlq-29 (high lane); the rounded words are left in m3 for the pack in
; .w16_filter_left_top16.
2731*c0909341SAndroid Build Coastguard Worker.w16_filter_left_top32:
2732*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq-37]
2733*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq-29], 1
2734*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2, m10
2735*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m11
2736*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12
2737*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
2738*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8
2739*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
2740*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
2741*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
2742*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13
2743*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_left_top16
; .w16_filter_left_h16: h < 32. Build the minimum shuffle index in every byte
; of m0: r5d = (h < 16 ? h : 10) ^ 15.
; NOTE(review): m3 is not written on this path, so the part of the packed
; result that comes from m3 holds its prior contents; presumably those bytes
; are rejected by the max_height blend mask below - confirm.
2744*c0909341SAndroid Build Coastguard Worker.w16_filter_left_h16:
2745*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 10
2746*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
; cmovs fires when h - 16 is negative, i.e. h < 16.
2747*c0909341SAndroid Build Coastguard Worker    cmovs               r5d, hd
2748*c0909341SAndroid Build Coastguard Worker    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
2749*c0909341SAndroid Build Coastguard Worker    movd                xm0, r5d
2750*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
; .w16_filter_left_top16: filter the pixels loaded from tlq-15 (low lane) and
; tlq-21 (high lane). The two lanes use differently offset shuffle tables,
; assembled from z_filter_s+8/+12/+16/+20.
2751*c0909341SAndroid Build Coastguard Worker.w16_filter_left_top16:
2752*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq-15]
2753*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq-21], 1
2754*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [base+z_filter_s+12]
2755*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [base+z_filter_s+16]
2756*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   34 45 56 67 78 89 9a ab
2757*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
2758*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
; Apply the minimum-index clamp from m0 to the first shuffle.
2759*c0909341SAndroid Build Coastguard Worker    pmaxub               m0, m5
2760*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m0
2761*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2762*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
2763*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8
2764*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
2765*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
; Blend mask bytes = max_height - pb_32to1; negative keeps the original pixel.
2766*c0909341SAndroid Build Coastguard Worker    psubb                m6, [base+pb_32to1]
2767*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
2768*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2769*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
; Pack both filtered halves to bytes and reorder the 64-bit quarters.
2770*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m1
2771*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q1320
; Blend with the unfiltered 32 bytes at tlq-32 and store to [rsp+32]
; (aligned store), then fall through into .w16_main.
2772*c0909341SAndroid Build Coastguard Worker    vpblendvb            m3, [tlq-32], m6
2773*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m3
; .w16_main: one-time setup for the 16-wide prediction loops.
;   m10 = byte shuffle from z_filter_s+2, m11 = z2_shuf_h2 shuffle
;   m7  = dx in every word, m6 = dx + z2_base_inc
;   m0  = dy (every word) * z2_ymul
;   r7  = copy of dstq taken before the loops
;   r9d = dx + (65<<6), initial x position reloaded at .w16_loop0
;   [rsp+156] = low dword of (dy << 4) broadcast words
; NOTE(review): r7 and [rsp+156] are presumably used when advancing to the
; next 16-column strip, outside this view - confirm.
2774*c0909341SAndroid Build Coastguard Worker.w16_main:
2775*c0909341SAndroid Build Coastguard Worker    movd                xm1, dyd
2776*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m10, [base+z_filter_s+2]
2777*c0909341SAndroid Build Coastguard Worker    movd                xm7, dxd
2778*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m11, [base+z2_shuf_h2]
2779*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, xm1
2780*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, xm7
2781*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2782*c0909341SAndroid Build Coastguard Worker    pmullw               m0, m1, [base+z2_ymul]
2783*c0909341SAndroid Build Coastguard Worker    psllw               xm1, 4
2784*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7, [base+z2_base_inc]
2785*c0909341SAndroid Build Coastguard Worker    lea                 r9d, [dxq+(65<<6)] ; xpos
2786*c0909341SAndroid Build Coastguard Worker    movd          [rsp+156], xm1
; .w16_loop0: per-strip setup. Restores the x position from r9d, saves the
; current ypos (m0) and xpos (m6) vectors on the stack, resets the gather
; base r5, and derives from m0:
;   m8/m9 = zero-extended dword gather indices from -(ypos >> 6)
;   m12   = packed weights: low byte m15 - frac_y, high byte frac_y
2787*c0909341SAndroid Build Coastguard Worker.w16_loop0:
2788*c0909341SAndroid Build Coastguard Worker    mov                 r2d, r9d
2789*c0909341SAndroid Build Coastguard Worker    mova          [rsp+160], m0
2790*c0909341SAndroid Build Coastguard Worker    lea                  r5, [rsp+60] ; left-3
2791*c0909341SAndroid Build Coastguard Worker    mova          [rsp+192], m6
2792*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
2793*c0909341SAndroid Build Coastguard Worker    psraw                m2, m0, 6
; Keep only the fractional part of ypos in m0 (mask constant in m14).
2794*c0909341SAndroid Build Coastguard Worker    pand                 m0, m14
2795*c0909341SAndroid Build Coastguard Worker    psubw                m9, m1, m2   ; base_y
2796*c0909341SAndroid Build Coastguard Worker    psubw               m12, m15, m0
2797*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m1   ; base_y  0,  1,  2,  3,     8,  9, 10, 11
2798*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
2799*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m1       ; base_y  4,  5,  6,  7,    12, 13, 14, 15
2800*c0909341SAndroid Build Coastguard Worker    por                 m12, m0       ; 64-frac_y, frac_y
2801*c0909341SAndroid Build Coastguard Worker.w16_loop:
2802*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2803*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6        ; base_x0
2804*c0909341SAndroid Build Coastguard Worker    movu                xm0, [rsp+r2]
2805*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [rsp+r2+8], 1
2806*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2807*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; base_x1
2808*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r3]
2809*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r3+8], 1
2810*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m6
2811*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m6, m7
2812*c0909341SAndroid Build Coastguard Worker    psubw                m3, m15, m2
2813*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2814*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10
2815*c0909341SAndroid Build Coastguard Worker    por                  m2, m3
2816*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
2817*c0909341SAndroid Build Coastguard Worker    pand                 m2, m14, m5
2818*c0909341SAndroid Build Coastguard Worker    psubw                m3, m15, m2
2819*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2820*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10
2821*c0909341SAndroid Build Coastguard Worker    por                  m2, m3
2822*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
2823*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 64
2824*c0909341SAndroid Build Coastguard Worker    jge .w16_toponly
2825*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m5, m5   ; mask out unnecessary loads
2826*c0909341SAndroid Build Coastguard Worker    vpgatherdd           m4, [r5+m9], m2
2827*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m5
2828*c0909341SAndroid Build Coastguard Worker    vpgatherdd           m3, [r5+m8], m2
2829*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m11      ; e0 f0 g0 h0 e1 f1 g1 h1   m0 n0 o0 p0 m1 n1 o1 p1
2830*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m11      ; a0 b0 c0 d0 a1 b1 c1 d1   i0 j0 k0 l0 i1 j1 k1 l1
2831*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4   ; y0
2832*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4       ; y1
2833*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m12
2834*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m12
2835*c0909341SAndroid Build Coastguard Worker    psraw                m6, 15       ; base_x < topleft
2836*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m2, m6
2837*c0909341SAndroid Build Coastguard Worker    psraw                m6, m5, 15
2838*c0909341SAndroid Build Coastguard Worker    vpblendvb            m1, m3, m6
2839*c0909341SAndroid Build Coastguard Worker.w16_toponly:
2840*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2841*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
2842*c0909341SAndroid Build Coastguard Worker    paddw                m6, m5, m7   ; xpos += dx
2843*c0909341SAndroid Build Coastguard Worker    sub                  r5, 2
2844*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2845*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
2846*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
2847*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
2848*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2849*c0909341SAndroid Build Coastguard Worker    jz .w16_end
2850*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2851*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, (63-16)<<6
2852*c0909341SAndroid Build Coastguard Worker    jge .w16_loop
2853*c0909341SAndroid Build Coastguard Worker.w16_leftonly_loop:
2854*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
2855*c0909341SAndroid Build Coastguard Worker    vpgatherdd           m4, [r5+m9], m7
2856*c0909341SAndroid Build Coastguard Worker    mova                 m7, m0
2857*c0909341SAndroid Build Coastguard Worker    vpgatherdd           m3, [r5+m8], m0
2858*c0909341SAndroid Build Coastguard Worker    sub                  r5, 2
2859*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4, m11
2860*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m11
2861*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2
2862*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2
2863*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m12
2864*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m12
2865*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2866*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
2867*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2868*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
2869*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
2870*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m0, 1
2871*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2872*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2873*c0909341SAndroid Build Coastguard Worker    jg .w16_leftonly_loop
2874*c0909341SAndroid Build Coastguard Worker.w16_end:
2875*c0909341SAndroid Build Coastguard Worker    sub                 r8d, 1<<8
2876*c0909341SAndroid Build Coastguard Worker    jl .w16_ret
2877*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [rsp+156]
2878*c0909341SAndroid Build Coastguard Worker    paddw                m0, [rsp+160] ; base_y += 16*dy
2879*c0909341SAndroid Build Coastguard Worker    paddw                m6, m13, [rsp+192]
2880*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
2881*c0909341SAndroid Build Coastguard Worker    add                 r9d, 16<<6
2882*c0909341SAndroid Build Coastguard Worker    movzx                hd, r8b
2883*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
2884*c0909341SAndroid Build Coastguard Worker    paddw                m6, m13 ; base_x += 16*64
2885*c0909341SAndroid Build Coastguard Worker    jmp .w16_loop0
2886*c0909341SAndroid Build Coastguard Worker.w16_ret:
2887*c0909341SAndroid Build Coastguard Worker    RET
2888*c0909341SAndroid Build Coastguard Worker.w32:
2889*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq+32]
2890*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [hq+(1<<8)]
2891*c0909341SAndroid Build Coastguard Worker    mova           [rsp+96], m2
2892*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2893*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
2894*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
2895*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
2896*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
2897*c0909341SAndroid Build Coastguard Worker    mova                xm5, [base+z_filter_s]
2898*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67   45 56 67 78 89 9a ab bc
2899*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+11], 1
2900*c0909341SAndroid Build Coastguard Worker    movu                xm6, [base+z_filter_s+12]
2901*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd   ab bc cd de ef ff ff ff
2902*c0909341SAndroid Build Coastguard Worker    movu                xm3, [tlq+ 6]
2903*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tlq+17], 1
2904*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, r6m ; max_width
2905*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m10
2906*c0909341SAndroid Build Coastguard Worker    packsswb            m10, m10
2907*c0909341SAndroid Build Coastguard Worker.w32_filter_above:
2908*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m5
2909*c0909341SAndroid Build Coastguard Worker    shufps               m4, m5, m6, q1021           ; 12 23 34 45 56 67 78 89   67 78 89 9a ab bc cd de
2910*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2911*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1, m4
2912*c0909341SAndroid Build Coastguard Worker    shufps               m5, m6, q2132               ; 34 45 56 67 78 89 9a ab   89 9a ab bc cd de ef ff
2913*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8
2914*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
2915*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
2916*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2917*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2918*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m4
2919*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2920*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m5
2921*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8
2922*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6
2923*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9
2924*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2925*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2926*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2927*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13
2928*c0909341SAndroid Build Coastguard Worker    psubb               m10, [base+pb_1to32]
2929*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2930*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, [tlq+1], m10
2931*c0909341SAndroid Build Coastguard Worker    movu           [rsp+65], m0
2932*c0909341SAndroid Build Coastguard Worker    jmp .w16_filter_left
2933*c0909341SAndroid Build Coastguard Worker.w64:
2934*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq+32]
2935*c0909341SAndroid Build Coastguard Worker    mov                 r3d, [tlq+64]
2936*c0909341SAndroid Build Coastguard Worker    lea                 r8d, [hq+(3<<8)]
2937*c0909341SAndroid Build Coastguard Worker    mova          [rsp+ 96], m2
2938*c0909341SAndroid Build Coastguard Worker    mov           [rsp+128], r3d
2939*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2940*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
2941*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
2942*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
2943*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
2944*c0909341SAndroid Build Coastguard Worker    movu                xm6, [base+z_filter_s+ 4]
2945*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89   45 56 67 78 89 9a ab bc
2946*c0909341SAndroid Build Coastguard Worker    movu                xm3, [tlq+30]
2947*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tlq+43], 1
2948*c0909341SAndroid Build Coastguard Worker    movu                xm5, [base+z_filter_s+16]
2949*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef   ab bc cd de ef ff ff ff
2950*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, m6
2951*c0909341SAndroid Build Coastguard Worker    shufps               m4, m6, m5, q1021           ; 34 45 56 67 78 89 9a ab   67 78 89 9a ab bc cd de
2952*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2953*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m4
2954*c0909341SAndroid Build Coastguard Worker    shufps               m6, m5, q2132               ; 56 67 78 89 9a ab bc cd   89 9a ab bc cd de ef ff
2955*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m8
2956*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6
2957*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m9
2958*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2959*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2960*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq+36]
2961*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq+49], 1
2962*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, r6m ; max_width
2963*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2, m4
2964*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7
2965*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m2, m6
2966*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m8
2967*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
2968*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
2969*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m10
2970*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
2971*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
2972*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [base+pb_32]
2973*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13
2974*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13
2975*c0909341SAndroid Build Coastguard Worker    packsswb            m10, m10
2976*c0909341SAndroid Build Coastguard Worker    mova                xm5, [base+z_filter_s]
2977*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [base+z_filter_s+6], 1
2978*c0909341SAndroid Build Coastguard Worker    psubb                m3, m10, m3
2979*c0909341SAndroid Build Coastguard Worker    psubb                m3, [base+pb_1to32]
2980*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+13], 1
2981*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
2982*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, [tlq+33], m3
2983*c0909341SAndroid Build Coastguard Worker    movu                xm3, [tlq+ 6]
2984*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tlq+19], 1
2985*c0909341SAndroid Build Coastguard Worker    movu           [rsp+97], m0
2986*c0909341SAndroid Build Coastguard Worker    jmp .w32_filter_above
2987*c0909341SAndroid Build Coastguard Worker
; ipred_z3_8bpc entry point.
; Looks up dy from dr_intra_derivative based on the angle, broadcasts the
; constants shared by the per-height handlers into m3/m4/m5, and jumps to the
; handler selected from ipred_z3_avx2_table by tzcnt(h).
2988*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
2989*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_z3_avx2_table]
2990*c0909341SAndroid Build Coastguard Worker    tzcnt                hd, hm ; table index = number of trailing zero bits of h
2991*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
2992*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dr_intra_derivative+45*2-1]
2993*c0909341SAndroid Build Coastguard Worker    dec                 tlq ; tl -= 1; the handlers read the edge at negative offsets from tl
2994*c0909341SAndroid Build Coastguard Worker    movsxd               hq, [r6+hq*4] ; signed 32-bit table entry ...
2995*c0909341SAndroid Build Coastguard Worker    sub              angled, 180
2996*c0909341SAndroid Build Coastguard Worker    add                  hq, r6 ; ... is relative to the table base -> handler address
2997*c0909341SAndroid Build Coastguard Worker    mov                 dyd, angled
2998*c0909341SAndroid Build Coastguard Worker    neg                 dyd ; dy = -(angle - 180)
2999*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400 ; flip bit 10; handlers test it as "!enable_intra_edge_filter"
    ; Keep only bits 1..6 of dy; every other bit (including bit 0) is forced to
    ; 1, giving a small negative odd byte offset. NOTE(review): the -1 folded
    ; into r7 presumably cancels the forced bit 0 so the word load is aligned
    ; to a table entry - confirm against dr_intra_derivative's layout.
3000*c0909341SAndroid Build Coastguard Worker    or                  dyq, ~0x7e
3001*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [r7+dyq]
3002*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [pw_512] ; pmulhrsw by 512 == (x + 32) >> 6
3003*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [pw_62]  ; mask used to extract frac from ypos
3004*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_64]  ; used to form 64-frac
3005*c0909341SAndroid Build Coastguard Worker    mov              org_wd, wd ; keep a copy of w in org_w
3006*c0909341SAndroid Build Coastguard Worker    jmp                  hq ; dispatch to the height-specific handler
; .h4: handler for the h == 4 table entry.
; Branches to .h4_no_upsample unless the upsampling conditions hold. Otherwise
; it builds a 16-byte upsampled edge on the stack (filtered samples interleaved
; with the original ones) and interpolates from it with a doubled dy, writing
; one 4x4 block (4 rows of 4 bytes) per loop iteration and advancing dst by 4.
3007*c0909341SAndroid Build Coastguard Worker.h4:
3008*c0909341SAndroid Build Coastguard Worker    lea                  r7, [strideq*3]
3009*c0909341SAndroid Build Coastguard Worker    cmp              angleb, 40
3010*c0909341SAndroid Build Coastguard Worker    jae .h4_no_upsample
3011*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq-1024]
3012*c0909341SAndroid Build Coastguard Worker    sar                 r4d, 7
3013*c0909341SAndroid Build Coastguard Worker    add                 r4d, wd
3014*c0909341SAndroid Build Coastguard Worker    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
3015*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -32, 9
3016*c0909341SAndroid Build Coastguard Worker    movu                xm8, [tlq-7] ; 16 edge bytes starting at tl-7
3017*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm8, [z_upsample1-4]
3018*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm2, xm8 ; replicate the byte at tl-7
3019*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm8, [z_filter_s+2]
3020*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm2 ; top[max_base_y]
3021*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm2, [pb_36_m4] ; filter taps (name suggests 36, -4 - TODO confirm)
3022*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd ; dy *= 2
3023*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
3024*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm2
3025*c0909341SAndroid Build Coastguard Worker    movd                xm7, dyd
3026*c0909341SAndroid Build Coastguard Worker    mov                 r2d, dyd ; scalar ypos for the first column
3027*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m7, xm7
3028*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0 ; sum of both tap pairs
3029*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm3 ; (x + 32) >> 6
    ; Build per-column ypos in m6 (in units of the doubled dy):
    ; low lane = 3*dy, 4*dy; high lane = 1*dy, 2*dy (4 words each).
3030*c0909341SAndroid Build Coastguard Worker    pslldq               m6, m7, 8
3031*c0909341SAndroid Build Coastguard Worker    paddw               xm2, xm7, xm7
3032*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7
3033*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm1
3034*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2
3035*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm8 ; interleave filtered bytes with the original edge bytes
3036*c0909341SAndroid Build Coastguard Worker    mova                xm8, [z_transpose4]
3037*c0909341SAndroid Build Coastguard Worker    psllw                m7, 2 ; per-iteration ypos step = 4 columns
3038*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, [pb_15to0] ; reorder bytes (name suggests full reversal - TODO confirm)
3039*c0909341SAndroid Build Coastguard Worker    mova              [rsp], xm1 ; upsampled edge lives at [rsp], padding at [rsp+16]
3040*c0909341SAndroid Build Coastguard Worker.h4_upsample_loop:
    ; For each of 4 columns: advance the scalar ypos by dy, use ypos >> 6 as a
    ; byte offset into the stack buffer and fetch 8 bytes from there.
3041*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq]
3042*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6
3043*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [rsp+r2]
3044*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq]
3045*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
3046*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [rsp+r4]
3047*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq]
3048*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6
3049*c0909341SAndroid Build Coastguard Worker    movq                xm0, [rsp+r2]
3050*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq]
3051*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
3052*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [rsp+r4]
3053*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xc0
3054*c0909341SAndroid Build Coastguard Worker    pand                 m2, m4, m6 ; frac
3055*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0 ; m0 = col2 col3 | col0 col1
3056*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5, m2 ; 64-frac
3057*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
3058*c0909341SAndroid Build Coastguard Worker    por                  m1, m2 ; 64-frac, frac
3059*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
3060*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7 ; ypos += 4 columns
3061*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; (x + 32) >> 6
3062*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3063*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm0 ; bytes ordered col0 col1 col2 col3
3064*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm8 ; shuffle by z_transpose4 so that each dword is one output row
3065*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
3066*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
3067*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
3068*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r7       ], xm1, 3
3069*c0909341SAndroid Build Coastguard Worker    add                dstq, 4
3070*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3071*c0909341SAndroid Build Coastguard Worker    jg .h4_upsample_loop
3072*c0909341SAndroid Build Coastguard Worker    RET
3073*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .filter_strength: local subroutine, entered with `call` and left with `ret`.
; In:   maxbased - only the low byte is used (the .h4 caller passes w+3)
;       angled   - low byte is compared against thresholds; bits 8 and up
;                  select the threshold row
; Out:  r5d      - byte mask of the comparison results; callers treat 0 as
;                  "filter_strength == 0" and popcnt the mask otherwise
;       r4       - address of z_filter_t0, so `base+sym` addressing is usable
;                  by the caller afterwards
; Clobbers: m0, m1, m2, flags; angled is shifted right by 8.
3074*c0909341SAndroid Build Coastguard Worker.filter_strength: ; h4/h8/h16
3075*c0909341SAndroid Build Coastguard Worker%define base r4-z_filter_t0
3076*c0909341SAndroid Build Coastguard Worker    lea                  r4, [z_filter_t0]
3077*c0909341SAndroid Build Coastguard Worker    movd                xm0, maxbased
3078*c0909341SAndroid Build Coastguard Worker    movd                xm2, angled
3079*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
3080*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, xm0
3081*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m2, xm2
3082*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0, [base+z_filter_wh] ; 0xff where the maxbase byte matches a table entry
3083*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2 ; matching bytes now hold the low byte of angle, others 0
3084*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r4+angleq*8] ; thresholds from z_filter_t0, indexed by the shifted angle
3085*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m2 ; signed byte compare: angle byte > threshold
3086*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
3087*c0909341SAndroid Build Coastguard Worker    ret
; .h4_no_upsample: h == 4 path without edge upsampling.
; Sets maxbase = 7 and goes straight to .h4_main when the edge filter is
; disabled or the computed filter strength is 0. Otherwise it runs a 3-stage
; pmaddubsw filter over 16 edge bytes, stores the result at [rsp], repoints
; tl at the last byte of that buffer, and falls through into .h4_main.
3088*c0909341SAndroid Build Coastguard Worker.h4_no_upsample:
3089*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -16, 12
3090*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7
3091*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
3092*c0909341SAndroid Build Coastguard Worker    jnz .h4_main
3093*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+3] ; argument for .filter_strength
3094*c0909341SAndroid Build Coastguard Worker    call .filter_strength
3095*c0909341SAndroid Build Coastguard Worker    mov            maxbased, 7 ; restore the default after using it as an argument
3096*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
3097*c0909341SAndroid Build Coastguard Worker    jz .h4_main ; filter_strength == 0
3098*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; number of set mask bits selects the z_filter_k row below
3099*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+pb_7]
3100*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [tlq-14] ; 16 edge bytes starting at tl-14, in both lanes
3101*c0909341SAndroid Build Coastguard Worker    pmaxub               m1, m7, [base+z_filter_s-4] ; clamp shuffle indices to >= 7
3102*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
3103*c0909341SAndroid Build Coastguard Worker    pmaxub               m7, [base+z_filter_s+4]
3104*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
3105*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
3106*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m1
3107*c0909341SAndroid Build Coastguard Worker    shufps               m1, m7, q2121 ; derive the middle shuffle from the outer two
3108*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m8
3109*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
3110*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
3111*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
3112*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m10
3113*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3114*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; sum of the three tap-pair products
3115*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; (x + 32) >> 6
3116*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 9 ; overwrites r4: base-relative addressing is not used past this point
3117*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+15] ; tl = last byte of the filtered buffer stored below
3118*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3119*c0909341SAndroid Build Coastguard Worker    cmovne         maxbased, r4d ; maxbase = (w != 4) ? 9 : 7
3120*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3121*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1 ; 16 words -> 16 bytes
3122*c0909341SAndroid Build Coastguard Worker    mova              [rsp], xm0
3123*c0909341SAndroid Build Coastguard Worker.h4_main:
3124*c0909341SAndroid Build Coastguard Worker    movd                xm6, dyd
3125*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
3126*c0909341SAndroid Build Coastguard Worker    mov                  r4, tlq
3127*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
3128*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3129*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
3130*c0909341SAndroid Build Coastguard Worker    sub                  r4, maxbaseq
3131*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3132*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [r4]
3133*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63] ; ypos
3134*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
3135*c0909341SAndroid Build Coastguard Worker    not            maxbased
3136*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z3_shuf_w4]
3137*c0909341SAndroid Build Coastguard Worker    add            maxbased, 64
3138*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
3139*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 8  ; top[max_base_y]
3140*c0909341SAndroid Build Coastguard Worker    paddw               m10, m6, m6
3141*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0 ; max_base_y
3142*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m10, 0xcc
3143*c0909341SAndroid Build Coastguard Worker    mova                xm0, xm10
3144*c0909341SAndroid Build Coastguard Worker    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
3145*c0909341SAndroid Build Coastguard Worker    paddw               m10, m10
3146*c0909341SAndroid Build Coastguard Worker    mova               xm11, [z_transpose4]
3147*c0909341SAndroid Build Coastguard Worker.h4_loop:
3148*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3149*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base0
3150*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [tlq+r4]
3151*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq]
3152*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base1
3153*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m2, [tlq+r5]
3154*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3155*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base2
3156*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq+r4]
3157*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq]
3158*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base3
3159*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [tlq+r5]
3160*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0xc0
3161*c0909341SAndroid Build Coastguard Worker    pand                 m2, m4, m6 ; frac
3162*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xf0
3163*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5, m2 ; 64-frac
3164*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
3165*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
3166*c0909341SAndroid Build Coastguard Worker    por                  m1, m2     ; 64-frac, frac
3167*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
3168*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m6 ; base < max_base_y
3169*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3170*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10    ; ypos += dy
3171*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1
3172*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3173*c0909341SAndroid Build Coastguard Worker    packuswb            xm1, xm0
3174*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm11   ; transpose
3175*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
3176*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
3177*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
3178*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r7       ], xm1, 3
3179*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3180*c0909341SAndroid Build Coastguard Worker    jz .h4_end
3181*c0909341SAndroid Build Coastguard Worker    add                dstq, 4
3182*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3183*c0909341SAndroid Build Coastguard Worker    jg .h4_loop
3184*c0909341SAndroid Build Coastguard Worker    packuswb            xm7, xm7
; Fill tail of the h4 path: every pass stores the low dword of xm7 to four
; rows, then moves dst 4 bytes on and takes 4 off the w counter.
; .h4_end is also the direct exit used by the "jz .h4_end" in the loop above.
3185*c0909341SAndroid Build Coastguard Worker.h4_end_loop:
3186*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm7
3187*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], xm7
3188*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm7
3189*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r7       ], xm7 ; NOTE(review): r7 is presumably stride*3 (set outside this chunk) - confirm
3190*c0909341SAndroid Build Coastguard Worker    add                dstq, 4
3191*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3192*c0909341SAndroid Build Coastguard Worker    jg .h4_end_loop ; repeat while columns remain
3193*c0909341SAndroid Build Coastguard Worker.h4_end:
3194*c0909341SAndroid Build Coastguard Worker    RET
3195*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .h8 entry. Decides first whether the upsampled-edge path applies. When it
; does, an interleaved (filtered, original) copy of the edge is built on the
; stack and each .h8_upsample_loop pass writes a strip of 4 columns x 8 rows.
; m3, m4 and m5 are constants set up outside this chunk; m4/m5 are used in the
; same "frac" / "64-frac" pattern that the h4 loop comments name.
3196*c0909341SAndroid Build Coastguard Worker.h8:
3197*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq+216]
3198*c0909341SAndroid Build Coastguard Worker    mov                 r4b, wb ; low byte := w, so a single unsigned compare tests angle and w together
3199*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, 8
3200*c0909341SAndroid Build Coastguard Worker    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
3201*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -32, 8 ; NOTE(review): x86inc macro, presumably 32 bytes of aligned stack and 8 vector regs - confirm
3202*c0909341SAndroid Build Coastguard Worker    and                 r4d, 4
3203*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-15] ; aligned load: relies on tlq-15 being 16-byte aligned
3204*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq- 9], 1 ; high lane = the 16 edge bytes starting 6 bytes further on
3205*c0909341SAndroid Build Coastguard Worker    movd                xm1, r4d
3206*c0909341SAndroid Build Coastguard Worker    movu                xm2, [z_filter_s+2]
3207*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [z_filter_s+6], 1
3208*c0909341SAndroid Build Coastguard Worker    vpbroadcastb        xm1, xm1 ; w & 4
3209*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pb_36_m4] ; filter tap bytes for the two pmaddubsw below
3210*c0909341SAndroid Build Coastguard Worker    pmaxub              xm1, [z_upsample1-4] ; clip 4x8
3211*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [z_upsample1], 1
3212*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd ; dy *= 2
3213*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m1
3214*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m2
3215*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq-7], 1
3216*c0909341SAndroid Build Coastguard Worker    movd                xm6, dyd
3217*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
3218*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
3219*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6 ; m6 = (doubled) dy in every word
3220*c0909341SAndroid Build Coastguard Worker    mov                 r2d, dyd ; r2 = scalar ypos of column 0
3221*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3]
3222*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6 ; m7 = 2*dy
3223*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; add the two pmaddubsw partial sums
3224*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m7, 0xf0 ; low lane dy, high lane 2*dy
3225*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3 ; rounding scale by the constant in m3
3226*c0909341SAndroid Build Coastguard Worker    pslldq               m2, m7, 8 ; per lane: 2*dy in the upper qword, zero in the lower
3227*c0909341SAndroid Build Coastguard Worker    paddw                m7, m7 ; m7 = 4*dy, the ypos step per loop pass
3228*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2 ; m6 = ypos: low lane {dy, 3*dy}, high lane {2*dy, 4*dy}
3229*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [pb_15to0]
3230*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
3231*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0 ; interleave filtered bytes with the edge bytes held in m0
3232*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2 ; NOTE(review): presumably a per-lane byte reversal, going by the pb_15to0 name - confirm
3233*c0909341SAndroid Build Coastguard Worker    vextracti128   [rsp+ 0], m1, 1 ; high lane -> rsp+0
3234*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], xm1 ; low lane -> rsp+16
3235*c0909341SAndroid Build Coastguard Worker.h8_upsample_loop:
3236*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq] ; ypos of the next column
3237*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base0
3238*c0909341SAndroid Build Coastguard Worker    movu                xm0, [rsp+r2]
3239*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq]
3240*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; base1
3241*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [rsp+r4], 1
3242*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r2+dyq]
3243*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base2
3244*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4, m6 ; frac
3245*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m1 ; 64-frac
3246*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
3247*c0909341SAndroid Build Coastguard Worker    por                  m2, m1 ; 64-frac, frac
3248*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2, m2 ; frac0 frac1
3249*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
3250*c0909341SAndroid Build Coastguard Worker    movu                xm1, [rsp+r2]
3251*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+dyq] ; r2 = ypos of column 0 of the next pass
3252*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; base3
3253*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [rsp+r4], 1
3254*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2 ; frac2 frac3
3255*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3256*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3257*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7 ; ypos += 4*dy
3258*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3259*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*4] ; r4 = address of row 4
3260*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
3261*c0909341SAndroid Build Coastguard Worker    por                  m0, m1 ; columns 2/3 go to the high byte of each word, columns 0/1 stay in the low byte
3262*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3263*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm2, xm0, xm1 ; rows 0-3, one dword (4 pixels) per row
3264*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm0, xm1 ; rows 4-7
3265*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm2
3266*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm2, 1
3267*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm2, 2
3268*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r5       ], xm2, 3
3269*c0909341SAndroid Build Coastguard Worker    movd   [r4  +strideq*0], xm0
3270*c0909341SAndroid Build Coastguard Worker    pextrd [r4  +strideq*1], xm0, 1
3271*c0909341SAndroid Build Coastguard Worker    pextrd [r4  +strideq*2], xm0, 2
3272*c0909341SAndroid Build Coastguard Worker    pextrd [r4  +r5       ], xm0, 3
3273*c0909341SAndroid Build Coastguard Worker    add                dstq, 4 ; next 4-column strip
3274*c0909341SAndroid Build Coastguard Worker    sub                  wd, 4
3275*c0909341SAndroid Build Coastguard Worker    jg .h8_upsample_loop
3276*c0909341SAndroid Build Coastguard Worker    RET
; Taken from .h8_no_upsample when bit 0x400 of angled is set. maxbased holds
; w+7 on entry; the and/or pair clamps it without a branch.
; NOTE(review): equals imin(w+7, 15) only for the widths this path sees,
; presumably powers of two >= 4 - confirm.
3277*c0909341SAndroid Build Coastguard Worker.h8_no_intra_edge_filter:
3278*c0909341SAndroid Build Coastguard Worker    and            maxbased, 7
3279*c0909341SAndroid Build Coastguard Worker    or             maxbased, 8 ; imin(w+7, 15)
3280*c0909341SAndroid Build Coastguard Worker    jmp .h8_main
; Non-upsampled h8 path. Unless filtering is skipped (bit 0x400 of angled set,
; or .filter_strength returns 0 in r5d), the edge is filtered into a 16-byte
; stack buffer and tlq is repointed at that buffer. Execution then falls
; through into .h8_main.
3281*c0909341SAndroid Build Coastguard Worker.h8_no_upsample:
3282*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -32, 10 ; NOTE(review): x86inc macro, presumably 32 bytes of aligned stack and 10 vector regs - confirm
3283*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+7] ; provisional max_base = w + 7
3284*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
3285*c0909341SAndroid Build Coastguard Worker    jnz .h8_no_intra_edge_filter
3286*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; local routine defined outside this chunk; its result is read from r5d
3287*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
3288*c0909341SAndroid Build Coastguard Worker    jz .h8_main ; filter_strength == 0
3289*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; r5 = number of set bits in the returned value
3290*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm6, [base+pb_15]
3291*c0909341SAndroid Build Coastguard Worker    pcmpeqb             xm1, xm1 ; all ones (-1 in every byte)
3292*c0909341SAndroid Build Coastguard Worker    psubusb             xm6, xm0 ; NOTE(review): xm0 is not written in this block before this read, so it presumably comes from .filter_strength - confirm
3293*c0909341SAndroid Build Coastguard Worker    psubb               xm6, xm1 ; w == 4 ? 5 : 1
3294*c0909341SAndroid Build Coastguard Worker    movu                xm2, [tlq-16]
3295*c0909341SAndroid Build Coastguard Worker    pmaxub              xm1, xm6, [base+z_filter_s]
3296*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tlq-14], 1
3297*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [base+z_filter_s+12], 1
3298*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0] ; first tap pair, table row selected by r5
3299*c0909341SAndroid Build Coastguard Worker    pmaxub              xm6, [base+z_filter_s+ 8]
3300*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [base+z_filter_s+20], 1
3301*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m1
3302*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
3303*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
3304*c0909341SAndroid Build Coastguard Worker    movzx               r4d, byte [tlq-15] ; unfiltered edge byte, written back at .h8_3tap
3305*c0909341SAndroid Build Coastguard Worker    shufps               m1, m6, q2121
3306*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2, m1
3307*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
3308*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3309*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 3
3310*c0909341SAndroid Build Coastguard Worker    jnz .h8_3tap ; only a bit count of exactly 3 adds the third tap pair below
3311*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [z_filter_k+4*8]
3312*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [tlq-14]
3313*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
3314*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
3315*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r4d
3316*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r4*8+4]
3317*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3 ; r2 = (tlq[-14] + 7*tlq[-15] + 4) >> 3
3318*c0909341SAndroid Build Coastguard Worker    mov            [rsp+15], r2b
3319*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3320*c0909341SAndroid Build Coastguard Worker.h8_3tap:
3321*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by the constant in m3 (set up outside this chunk)
3322*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 1 ; arithmetic shift keeps the sign of (bit count - 3)
3323*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+31] ; from here on tlq addresses the stack copy
3324*c0909341SAndroid Build Coastguard Worker    add                 r5d, 17 ; r5 = ((bit count - 3) >> 1) + 17
3325*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
3326*c0909341SAndroid Build Coastguard Worker    cmovns         maxbased, r5d ; w - 16 not negative: max_base = r5
3327*c0909341SAndroid Build Coastguard Worker    neg                  r5
3328*c0909341SAndroid Build Coastguard Worker    mov            [tlq+r5], r4b ; saved edge byte goes to tlq - r5
3329*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3330*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
3331*c0909341SAndroid Build Coastguard Worker    mova           [tlq-15], xm0 ; 16 filtered bytes at rsp+16
; Main h8 interpolation. Each .h8_loop pass computes two columns of 8 pixels
; and pushes them as one 16-byte record (rsp is lowered by hand) for
; .h8_transpose to consume. Once the scalar position reaches the bound kept in
; maxbased, .h8_end_loop pushes records made only of the replacement pixel.
3332*c0909341SAndroid Build Coastguard Worker.h8_main:
3333*c0909341SAndroid Build Coastguard Worker    movd                xm2, dyd ; taken before dy is negated below
3334*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [z_base_inc]
3335*c0909341SAndroid Build Coastguard Worker    mov                  r4, tlq
3336*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
3337*c0909341SAndroid Build Coastguard Worker    neg                 dyq ; dy = -dy: the lea steps in the loop now move r4/r5 downwards
3338*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, xm2 ; m2 = dy in every word
3339*c0909341SAndroid Build Coastguard Worker    sub                  r4, maxbaseq ; r4 = original tlq - max_base
3340*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3341*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [r4] ; replacement pixel, blended in where the compare mask is clear
3342*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63] ; r4 = 63 - dy, scalar position of the first column
3343*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
3344*c0909341SAndroid Build Coastguard Worker    not            maxbased
3345*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z3_shuf]
3346*c0909341SAndroid Build Coastguard Worker    add            maxbased, 64 ; maxbased = 63 - (max_base << 6), the loop bound compared against r4d
3347*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
3348*c0909341SAndroid Build Coastguard Worker    psrlw                m7, 8 ; byte broadcast -> one pixel value per word
3349*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0 ; m9 = (max_base << 6) - z_base_inc
3350*c0909341SAndroid Build Coastguard Worker    paddw                m6, m2, m2 ; m6 = 2*dy, the ypos step per pass
3351*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m6, 0x0f ; low lane 2*dy, high lane dy
3352*c0909341SAndroid Build Coastguard Worker.h8_loop:
3353*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3354*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base of the first column of this pair
3355*c0909341SAndroid Build Coastguard Worker    pand                 m0, m4, m2 ; frac
3356*c0909341SAndroid Build Coastguard Worker    psubw                m1, m5, m0 ; 64-frac
3357*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
3358*c0909341SAndroid Build Coastguard Worker    por                  m1, m0 ; 64-frac, frac
3359*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [tlq+r4]
3360*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; position for the next pass
3361*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base of the second column
3362*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5], 0 ; low lane = second column, high lane keeps the first
3363*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 8*2 ; push a 16-byte record (undone by the add rsp in .h8_transpose)
3364*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
3365*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
3366*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m2 ; base < max_base_y
3367*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6 ; ypos += 2*dy
3368*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3369*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1 ; interpolated value where the mask is set, m7 elsewhere
3370*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
3371*c0909341SAndroid Build Coastguard Worker    psllw               xm0, 8
3372*c0909341SAndroid Build Coastguard Worker    por                 xm0, xm1 ; interleave rows (partial transpose)
3373*c0909341SAndroid Build Coastguard Worker    mova              [rsp], xm0
3374*c0909341SAndroid Build Coastguard Worker    sub                  wd, 2
3375*c0909341SAndroid Build Coastguard Worker    jz .h8_transpose
3376*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3377*c0909341SAndroid Build Coastguard Worker    jg .h8_loop ; keep interpolating while the next position is above the bound
3378*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm7, xm7 ; record made only of the replacement pixel
3379*c0909341SAndroid Build Coastguard Worker.h8_end_loop:
3380*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 8*2
3381*c0909341SAndroid Build Coastguard Worker    mova              [rsp], xm0
3382*c0909341SAndroid Build Coastguard Worker    sub                  wd, 2
3383*c0909341SAndroid Build Coastguard Worker    jg .h8_end_loop
; Turns the 16-byte records pushed by .h8_loop/.h8_end_loop (two columns each,
; newest at [rsp] and still live in xm0) into rows and stores them to dst,
; popping the records with "add rsp" as they are consumed.
; Statement order matters: the flags of "sub org_wd, 8" are read by both cmovns
; and jge, with only lea/punpck*/cmov instructions in between.
3384*c0909341SAndroid Build Coastguard Worker.h8_transpose:
3385*c0909341SAndroid Build Coastguard Worker    mova                xm2, [rsp+16*1] ; second-newest record
3386*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8
3387*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
3388*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+org_wq]
3389*c0909341SAndroid Build Coastguard Worker    cmovns             dstq, r6 ; org_w - 8 not negative: dst += org_w - 8
3390*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm0
3391*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm0
3392*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+strideq*4] ; r6 = address of row 4
3393*c0909341SAndroid Build Coastguard Worker    jge .h8_w8
3394*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16*2 ; pop the two records consumed by the narrow case
3395*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm1
3396*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm1, 1
3397*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm1, 2
3398*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm1, 3
3399*c0909341SAndroid Build Coastguard Worker    movd   [r6  +strideq*0], xm2
3400*c0909341SAndroid Build Coastguard Worker    pextrd [r6  +strideq*1], xm2, 1
3401*c0909341SAndroid Build Coastguard Worker    pextrd [r6  +strideq*2], xm2, 2
3402*c0909341SAndroid Build Coastguard Worker    pextrd [r6  +r2       ], xm2, 3
3403*c0909341SAndroid Build Coastguard Worker    jmp .h8_end
3404*c0909341SAndroid Build Coastguard Worker.h8_w8_loop:
3405*c0909341SAndroid Build Coastguard Worker    mova                xm0, [rsp+16*0] ; later groups reload their first two records from the stack
3406*c0909341SAndroid Build Coastguard Worker    mova                xm2, [rsp+16*1]
3407*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm1, xm2, xm0
3408*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm2, xm0
3409*c0909341SAndroid Build Coastguard Worker.h8_w8: ; w8/w16/w32
3410*c0909341SAndroid Build Coastguard Worker    mova                xm0, [rsp+16*2]
3411*c0909341SAndroid Build Coastguard Worker    mova                xm4, [rsp+16*3] ; xm3/xm4 are overwritten as scratch from here on
3412*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16*4 ; pop the four records (8 columns) of this group
3413*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm4, xm0
3414*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm4, xm0
3415*c0909341SAndroid Build Coastguard Worker    punpckldq           xm0, xm3, xm1
3416*c0909341SAndroid Build Coastguard Worker    punpckhdq           xm3, xm1
3417*c0909341SAndroid Build Coastguard Worker    punpckldq           xm1, xm4, xm2
3418*c0909341SAndroid Build Coastguard Worker    punpckhdq           xm4, xm2
3419*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
3420*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
3421*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm3
3422*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm3
3423*c0909341SAndroid Build Coastguard Worker    movq   [r6  +strideq*0], xm1
3424*c0909341SAndroid Build Coastguard Worker    movhps [r6  +strideq*1], xm1
3425*c0909341SAndroid Build Coastguard Worker    movq   [r6  +strideq*2], xm4
3426*c0909341SAndroid Build Coastguard Worker    movhps [r6  +r2       ], xm4
3427*c0909341SAndroid Build Coastguard Worker    sub                dstq, 8 ; next group sits 8 bytes lower in each row
3428*c0909341SAndroid Build Coastguard Worker    sub                  r6, 8
3429*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8
3430*c0909341SAndroid Build Coastguard Worker    jge .h8_w8_loop
3431*c0909341SAndroid Build Coastguard Worker.h8_end:
3432*c0909341SAndroid Build Coastguard Worker    RET
; Taken from .h16 when bit 0x400 of angled is set. maxbased holds w+15 on
; entry; the and/or pair clamps it without a branch.
; NOTE(review): equals imin(w+15, 31) only for the widths this path sees,
; presumably powers of two >= 4 - confirm.
3433*c0909341SAndroid Build Coastguard Worker.h16_no_intra_edge_filter:
3434*c0909341SAndroid Build Coastguard Worker    and            maxbased, 15
3435*c0909341SAndroid Build Coastguard Worker    or             maxbased, 16 ; imin(w+15, 31)
3436*c0909341SAndroid Build Coastguard Worker    jmp .h16_main
3437*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .h16 entry. Unless filtering is skipped (bit 0x400 of angled set, or
; .filter_strength returns 0 in r5d), two 32-byte-wide edge windows (m10, m11)
; are filtered into a 32-byte stack buffer and tlq is repointed at that buffer.
; Execution then falls through into .h16_main.
3438*c0909341SAndroid Build Coastguard Worker.h16:
3439*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -64, 12 ; NOTE(review): x86inc macro, presumably 64 bytes of aligned stack and 12 vector regs - confirm
3440*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+15] ; provisional max_base = w + 15
3441*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
3442*c0909341SAndroid Build Coastguard Worker    jnz .h16_no_intra_edge_filter
3443*c0909341SAndroid Build Coastguard Worker    call .filter_strength ; local routine defined outside this chunk; its result is read from r5d
3444*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
3445*c0909341SAndroid Build Coastguard Worker    jz .h16_main ; filter_strength == 0
3446*c0909341SAndroid Build Coastguard Worker    popcnt              r5d, r5d ; r5 = number of set bits in the returned value
3447*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [base+pb_27]
3448*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pb_1]
3449*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [base+z_filter_s+12]
3450*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m6, [base+z_filter_s+4], 0
3451*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [base+z_filter_s+20], 1
3452*c0909341SAndroid Build Coastguard Worker    movu               xm10, [tlq-18]
3453*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [tlq-14], 1 ; m10 = edge window nearer to tlq
3454*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0] ; first tap pair, table row selected by r5
3455*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m7, [base+z_filter_s+8]
3456*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m7, [base+z_filter_s+0], 0
3457*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [base+z_filter_s+16], 1
3458*c0909341SAndroid Build Coastguard Worker    psubusb             m11, m0 ; NOTE(review): m0 is not written in this block before this read, so it presumably comes from .filter_strength - confirm
3459*c0909341SAndroid Build Coastguard Worker    por                  m1, m11
3460*c0909341SAndroid Build Coastguard Worker    movu               xm11, [tlq-32] ; m11 is reused from here on as the farther edge window
3461*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq-28], 1
3462*c0909341SAndroid Build Coastguard Worker    pmaxub               m8, m1 ; raise the shuffle indices for the far window to at least m1
3463*c0909341SAndroid Build Coastguard Worker    pmaxub               m7, m1
3464*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10, m2
3465*c0909341SAndroid Build Coastguard Worker    shufps               m2, m6, q2121
3466*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
3467*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m11, m8
3468*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q2121
3469*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
3470*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
3471*c0909341SAndroid Build Coastguard Worker    movzx               r4d, byte [tlq-31] ; unfiltered edge byte, written back at .h16_3tap
3472*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m10, m2
3473*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
3474*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m11, m8
3475*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m8, m9
3476*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3477*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
3478*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 3
3479*c0909341SAndroid Build Coastguard Worker    jnz .h16_3tap ; only a bit count of exactly 3 adds the third tap pair below
3480*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*8]
3481*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [tlq-30]
3482*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m6
3483*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3484*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m7
3485*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
3486*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r4d
3487*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r4*8+4]
3488*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3 ; r2 = (tlq[-30] + 7*tlq[-31] + 4) >> 3
3489*c0909341SAndroid Build Coastguard Worker    mov            [rsp+31], r2b
3490*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
3491*c0909341SAndroid Build Coastguard Worker    paddw                m1, m11
3492*c0909341SAndroid Build Coastguard Worker.h16_3tap:
3493*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding scale by the constant in m3 (set up outside this chunk)
3494*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3495*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 1 ; arithmetic shift keeps the sign of (bit count - 3)
3496*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+63] ; from here on tlq addresses the stack copy
3497*c0909341SAndroid Build Coastguard Worker    add                 r5d, 33 ; r5 = ((bit count - 3) >> 1) + 33
3498*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
3499*c0909341SAndroid Build Coastguard Worker    cmovns         maxbased, r5d ; w - 32 not negative: max_base = r5
3500*c0909341SAndroid Build Coastguard Worker    neg                  r5
3501*c0909341SAndroid Build Coastguard Worker    mov            [tlq+r5], r4b ; saved edge byte goes to tlq - r5
3502*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3503*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q2031 ; reorder the qwords that packuswb interleaved per lane
3504*c0909341SAndroid Build Coastguard Worker    mova           [tlq-31], m0 ; 32 filtered bytes at rsp+32
; Main h16 interpolation. Each .h16_loop pass computes two columns of 16
; pixels and pushes them as one 32-byte record (rsp is lowered by hand) for
; .h16_transpose to consume. Once the scalar position reaches the bound kept
; in maxbased, .h16_end_loop pushes records made only of the replacement pixel.
3505*c0909341SAndroid Build Coastguard Worker.h16_main:
3506*c0909341SAndroid Build Coastguard Worker    movd                xm6, dyd ; taken before dy is negated below
3507*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [z_base_inc]
3508*c0909341SAndroid Build Coastguard Worker    mov                  r4, tlq
3509*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
3510*c0909341SAndroid Build Coastguard Worker    neg                 dyq ; dy = -dy: the lea steps in the loop now move r4/r5 downwards
3511*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6 ; m6 = dy in every word
3512*c0909341SAndroid Build Coastguard Worker    sub                  r4, maxbaseq ; r4 = original tlq - max_base
3513*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3514*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [r4] ; replacement pixel in every byte (blend below works on bytes)
3515*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63] ; r4 = 63 - dy, scalar position of the first column
3516*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
3517*c0909341SAndroid Build Coastguard Worker    not            maxbased
3518*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z3_shuf]
3519*c0909341SAndroid Build Coastguard Worker    add            maxbased, 64 ; maxbased = 63 - (max_base << 6), the loop bound compared against r4d
3520*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
3521*c0909341SAndroid Build Coastguard Worker    psubw                m9, m0 ; m9 = (max_base << 6) - z_base_inc
3522*c0909341SAndroid Build Coastguard Worker    paddw               m11, m6, m6 ; m11 = 2*dy, the ypos step per pass
3523*c0909341SAndroid Build Coastguard Worker    psubw               m10, m9, m3 ; 64*8
3524*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m11, 0xf0 ; low lane dy, high lane 2*dy
3525*c0909341SAndroid Build Coastguard Worker.h16_loop:
3526*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq]
3527*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; base of the first column of this pair
3528*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4, m6 ; frac
3529*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m1 ; 64-frac
3530*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
3531*c0909341SAndroid Build Coastguard Worker    por                  m2, m1 ; 64-frac, frac
3532*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+r4-0]
3533*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r4-8] ; second load sits 8 bytes lower on the edge
3534*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; position for the next pass
3535*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; base of the second column
3536*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5-0], 1 ; high lane = second column
3537*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r5-8], 1
3538*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 32 ; push a 32-byte record (undone by the add rsp in .h16_transpose)
3539*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
3540*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
3541*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3542*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3543*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3544*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3545*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; 16 result bytes per lane
3546*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m6 ; in-range mask for the m0 half
3547*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m10, m6 ; same test against the limit lowered by m3 (see "64*8" above)
3548*c0909341SAndroid Build Coastguard Worker    packsswb             m1, m2 ; word masks -> byte masks, laid out like the packuswb result
3549*c0909341SAndroid Build Coastguard Worker    paddw                m6, m11 ; ypos += 2*dy
3550*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1 ; interpolated value where the mask is set, m7 elsewhere
3551*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120 ; qword reorder before the store
3552*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0
3553*c0909341SAndroid Build Coastguard Worker    sub                  wd, 2
3554*c0909341SAndroid Build Coastguard Worker    jz .h16_transpose
3555*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3556*c0909341SAndroid Build Coastguard Worker    jg .h16_loop ; keep interpolating while the next position is above the bound
3557*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7 ; .h16_transpose reads the newest record from m0
3558*c0909341SAndroid Build Coastguard Worker.h16_end_loop:
3559*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 32
3560*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m7 ; record made only of the replacement pixel
3561*c0909341SAndroid Build Coastguard Worker    sub                  wd, 2
3562*c0909341SAndroid Build Coastguard Worker    jg .h16_end_loop
3563*c0909341SAndroid Build Coastguard Worker.h16_transpose:
; Transposes the 32-byte buffers stacked at rsp and stores them to dst as 16
; rows: either a single 4-byte-wide group, or one 8-byte-wide group per
; iteration of .h16_w8_loop, moving dst back by 8 bytes each time.
; m0 is live on entry. NOTE(review): it presumably mirrors [rsp+32*0], which is
; what .h16_w8_loop reloads into m0 on later iterations -- confirm against the
; code preceding this label.
3564*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+32*1]
3565*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8 ; org_w-8; these flags feed cmovns and jge below
3566*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
; r6 is only consumed when org_w-8 is non-negative (cmovns), so a wrapped
; address computed for a negative value is never used.
; NOTE(review): assumes org_wq is the 64-bit view of org_wd (x86inc naming).
3567*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+org_wq]
3568*c0909341SAndroid Build Coastguard Worker    cmovns             dstq, r6 ; dst += org_w-8 when that is >= 0
; Two rounds of byte interleaving; lea and the SIMD ops leave the flags intact.
3569*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m0
3570*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0
3571*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*5]
3572*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
3573*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
3574*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq+r2*2] ; stride*7
3575*c0909341SAndroid Build Coastguard Worker    jge .h16_w8 ; org_w-8 >= 0 (signed): 8-byte-wide store path
; 4-byte-wide path: release the two buffers consumed above, then store one
; dword per row for 16 rows (low lanes of m0/m1 first, then their high lanes).
3576*c0909341SAndroid Build Coastguard Worker    add                 rsp, 32*2
3577*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
3578*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
3579*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
3580*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 3
3581*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1 ; bring the high lane of m0 down for rows 8-11
3582*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*4], xm1
3583*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 1
3584*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2*2     ], xm1, 2
3585*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r4       ], xm1, 3
3586*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8] ; advance to row 8
3587*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1 ; high lane of m1 for rows 12-15
3588*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
3589*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
3590*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
3591*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 3
3592*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*4], xm1
3593*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm1, 1
3594*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2*2     ], xm1, 2
3595*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r4       ], xm1, 3
3596*c0909341SAndroid Build Coastguard Worker    jmp .h16_end
3597*c0909341SAndroid Build Coastguard Worker.h16_w8_loop:
; Later iterations redo the interleave that the entry code performed for the
; first group, this time on the next two buffers on the stack.
3598*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+32*0]
3599*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+32*1]
3600*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m0
3601*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0
3602*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
3603*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
3604*c0909341SAndroid Build Coastguard Worker.h16_w8:
; Interleave two further buffers the same way, then merge dwords with m0/m1 so
; that every 64-bit half of m4/m2/m0/m3 is one 8-byte output row.
3605*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+32*2]
3606*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+32*3]
3607*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+strideq*8] ; r6 = row 8 of this group
3608*c0909341SAndroid Build Coastguard Worker    add                 rsp, 32*4 ; release the four buffers used by this group
3609*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m2
3610*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m2
3611*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
3612*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
3613*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m2, m0
3614*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m0
3615*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m3, m1
3616*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m1
; Rows 0-7 come from the low 128-bit lanes; each vextracti128 moves the high
; lane down so the same register can later serve rows 8-15 through r6.
3617*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm4
3618*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm4
3619*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m4, 1
3620*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm2
3621*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm2
3622*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m2, 1
3623*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*4], xm0
3624*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm0
3625*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
3626*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r2*2     ], xm3
3627*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r4       ], xm3
3628*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m3, 1
3629*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*0], xm4
3630*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*1], xm4
3631*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*2], xm2
3632*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r2       ], xm2
3633*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*4], xm0
3634*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r3       ], xm0
3635*c0909341SAndroid Build Coastguard Worker    movq     [r6+r2*2     ], xm3
3636*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r4       ], xm3
3637*c0909341SAndroid Build Coastguard Worker    sub                dstq, 8 ; step back to the previous 8-byte column group
3638*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8
3639*c0909341SAndroid Build Coastguard Worker    jge .h16_w8_loop ; repeat while the remaining count is still >= 0
3640*c0909341SAndroid Build Coastguard Worker.h16_end:
3641*c0909341SAndroid Build Coastguard Worker    RET
3642*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3643*c0909341SAndroid Build Coastguard Worker.h32:
; 32-row path. Computes maxbase and, unless bit 0x400 of angle is set, filters
; the 64 edge bytes addressed at negative offsets from tlq into a stack buffer
; and redirects tlq to that copy before falling through into .h32_main.
; NOTE(review): ALLOC_STACK is an x86inc macro defined outside this view; the
; arguments presumably request 96 bytes of stack and 15 vector registers.
3644*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK         -96, 15
3645*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+31]
3646*c0909341SAndroid Build Coastguard Worker    and            maxbased, 31
3647*c0909341SAndroid Build Coastguard Worker    or             maxbased, 32 ; imin(w+31, 63)
3648*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
3649*c0909341SAndroid Build Coastguard Worker    jnz .h32_main
; Edge filter. m11-m14 each gather two 16-byte windows of the edge (index
; ranges are given in the per-line notes). r5d/r4d become clamp indices that
; depend on w; pmaxsb against the pb_0to15 table (presumably the byte values
; 0..15) turns them into pshufb masks that replace every lane index below the
; clamp index with the clamp index itself.
3650*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [pb_0to15]
3651*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 21
3652*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 3
3653*c0909341SAndroid Build Coastguard Worker    movu               xm11, [tlq-66]    ; 56-63
3654*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq-52], 1 ; 40-47
3655*c0909341SAndroid Build Coastguard Worker    sub                 r4d, wd ; 21-w
; r5d = 21-w when that is non-negative, otherwise it stays 3.
3656*c0909341SAndroid Build Coastguard Worker    cmovns              r5d, r4d
3657*c0909341SAndroid Build Coastguard Worker    movu               xm12, [tlq-58]    ; 48-55
3658*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tlq-44], 1 ; 32-39
3659*c0909341SAndroid Build Coastguard Worker    sub                 r4d, 8 ; 13-w
3660*c0909341SAndroid Build Coastguard Worker    movd                xm1, r5d
3661*c0909341SAndroid Build Coastguard Worker    movu               xm13, [tlq-34]    ; 24-31
3662*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tlq-20], 1 ;  8-15
3663*c0909341SAndroid Build Coastguard Worker    movd                xm2, r4d
3664*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, xm1
3665*c0909341SAndroid Build Coastguard Worker    movu               xm14, [tlq-28]    ; 16-23
3666*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tlq-14], 1 ;  0- 7
3667*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m2, xm2
3668*c0909341SAndroid Build Coastguard Worker    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
; m7/m8 hold per-lane shuffle patterns built from overlapping windows of the
; z_filter_s table; they are re-combined with shufps below for each tap pair.
3669*c0909341SAndroid Build Coastguard Worker    movu                 m7, [z_filter_s+4]
3670*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m1
3671*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m7, [z_filter_s+8], 1
3672*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [z_filter_s+16], 0
3673*c0909341SAndroid Build Coastguard Worker    pmaxsb               m2, m0 ; clip 8x32
; Pass 1: shuffle each source vector and multiply-add with the byte pairs at
; z_filter_k+4*2+12*0, producing 16-bit partial sums in m0, m2, m1 and m6.
3674*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
3675*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m2
3676*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11, m8
3677*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
3678*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12, m8
3679*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
3680*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m13, m8
3681*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
; m14 is shuffled with the pattern for the next stage rather than the one
; used for m11-m13; the same offset applies to m14 in passes 2 and 3.
3682*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q1021
3683*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14, m8
3684*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m9
; Pass 2: coefficient pair at +12*1, accumulated into the partial sums.
3685*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
3686*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m11, m8
3687*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3688*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
3689*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m12, m8
3690*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3691*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
3692*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m13, m8
3693*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3694*c0909341SAndroid Build Coastguard Worker    shufps               m8, m7, q2121
3695*c0909341SAndroid Build Coastguard Worker    paddw                m1, m10
3696*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14, m8
3697*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3698*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
; Pass 3: coefficient pair at +12*2; the source registers are overwritten
; because they are not needed afterwards.
3699*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
3700*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m8
3701*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
3702*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m8
3703*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m9
; These two bytes are read from the original edge while tlq still points at
; it; tlq is redirected to the stack copy a few lines below.
3704*c0909341SAndroid Build Coastguard Worker    movzx               r4d, byte [tlq-63]
3705*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [tlq-62]
3706*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
3707*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
3708*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m8
3709*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9
3710*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m7
3711*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
3712*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
3713*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
; Together with the shr by 3 below: r2d = (7*a + b + 4) >> 3, where a is the
; byte read from tlq-63 and b the byte read from tlq-62.
3714*c0909341SAndroid Build Coastguard Worker    sub                 r2d, r4d
3715*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
; NOTE(review): m3 is loaded outside this view; the "64*8" note in .h32_main
; suggests it holds 512, which would make pmulhrsw a rounded >>6 -- confirm.
3716*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3717*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
3718*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3719*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
3720*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 3
; Stack copy layout: [rsp+30] = unfiltered byte a, [rsp+31] = the blended
; byte in r2b, [rsp+32..rsp+95] = the 64 filtered bytes; tlq becomes rsp+95.
3721*c0909341SAndroid Build Coastguard Worker    mov            [rsp+31], r2b
3722*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+95]
3723*c0909341SAndroid Build Coastguard Worker    mov            [tlq-65], r4b
3724*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 65
3725*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
3726*c0909341SAndroid Build Coastguard Worker    cmove          maxbased, r4d ; w == 64: maxbase = 65, covering the two extra bytes
3727*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
3728*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6
3729*c0909341SAndroid Build Coastguard Worker    mova           [tlq-63], m0
3730*c0909341SAndroid Build Coastguard Worker    mova           [tlq-31], m1
3731*c0909341SAndroid Build Coastguard Worker.h32_main:
; Produces one 32-byte vector per iteration and pushes it onto the stack.
; Lanes whose threshold in m9/m10 still exceeds the running position in m6 get
; a value interpolated from the edge at tlq; all other lanes get the byte
; broadcast in m7. Once r4 reaches the bound in maxbased, the remaining
; vectors are filled with m7 only (.h32_end_loop).
; NOTE(review): m3, m4 and m5 are set up before this view; from their use here
; m4 is the fraction mask and m5 the weight total -- confirm their values.
3732*c0909341SAndroid Build Coastguard Worker    movd                xm6, dyd
3733*c0909341SAndroid Build Coastguard Worker    mov                  r4, tlq
3734*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 8
3735*c0909341SAndroid Build Coastguard Worker    neg                 dyq ; dyq = -dy from here on
3736*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6 ; m6 = running position, starts at dy in every word
3737*c0909341SAndroid Build Coastguard Worker    sub                  r4, maxbaseq
3738*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
3739*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [r4] ; m7 = byte at (entry tlq - maxbase), the fill value
3740*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63] ; r4 = 63 - dy, stepped by -dy per iteration
3741*c0909341SAndroid Build Coastguard Worker    movd                xm9, maxbased
; not + add 64 below leaves maxbased = 63 - (maxbase << 6), the value r4 is
; compared against at the bottom of the loop.
3742*c0909341SAndroid Build Coastguard Worker    not            maxbased
3743*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z3_shuf]
3744*c0909341SAndroid Build Coastguard Worker    add            maxbased, 64
3745*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m9, xm9
3746*c0909341SAndroid Build Coastguard Worker    psubw                m9, [z_base_inc] ; per-lane thresholds: (maxbase << 6) - z_base_inc
3747*c0909341SAndroid Build Coastguard Worker    mova                m11, m6 ; m11 = per-iteration position increment (dy)
3748*c0909341SAndroid Build Coastguard Worker    psubw               m10, m9, m3 ; 64*8
3749*c0909341SAndroid Build Coastguard Worker.h32_loop:
3750*c0909341SAndroid Build Coastguard Worker    mov                  r5, r4
3751*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; r5 = r4 >> 6 (arithmetic): byte offset into the edge
; Build pmaddubsw weights: low byte of each word = m5 - frac, high byte = frac,
; with frac = m6 & m4.
3752*c0909341SAndroid Build Coastguard Worker    pand                 m1, m4, m6
3753*c0909341SAndroid Build Coastguard Worker    psubw                m2, m5, m1
3754*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
3755*c0909341SAndroid Build Coastguard Worker    por                  m2, m1
; Gather four 16-byte windows at offsets 0/-16 (m0) and -8/-24 (m1) from
; tlq+r5; z3_shuf then arranges them into the byte pairs pmaddubsw weights.
3756*c0909341SAndroid Build Coastguard Worker    movu                xm0, [tlq+r5- 0]
3757*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tlq+r5-16], 1
3758*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+r5- 8]
3759*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tlq+r5-24], 1
3760*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 32 ; make room for this iteration's output vector
3761*c0909341SAndroid Build Coastguard Worker    add                  r4, dyq ; advance the scalar position by -dy
3762*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
3763*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
3764*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3765*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3766*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3767*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3768*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
; Byte mask: all-ones where the threshold is greater than the current position.
3769*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m9, m6
3770*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m2, m10, m6
3771*c0909341SAndroid Build Coastguard Worker    packsswb             m1, m2
3772*c0909341SAndroid Build Coastguard Worker    paddw                m6, m11
3773*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m1 ; mask set: keep interpolated byte, else m7
3774*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0
3775*c0909341SAndroid Build Coastguard Worker    dec                  wd
3776*c0909341SAndroid Build Coastguard Worker    jz .h32_transpose ; all w vectors produced
3777*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
3778*c0909341SAndroid Build Coastguard Worker    jg .h32_loop ; keep interpolating while r4 is above the bound
3779*c0909341SAndroid Build Coastguard Worker.h32_end_loop:
; Every remaining vector consists solely of the fill byte in m7.
3780*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 32
3781*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m7
3782*c0909341SAndroid Build Coastguard Worker    dec                  wd
3783*c0909341SAndroid Build Coastguard Worker    jg .h32_end_loop
3784*c0909341SAndroid Build Coastguard Worker.h32_transpose:
; Pops the 32-byte vectors pushed by the loops above, eight at a time, and
; transposes each batch into 32 rows of 8 bytes. dst starts at offset org_w-8
; and moves back by 8 bytes per batch, so the most recently pushed vectors are
; written first.
3785*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+org_wq-8]
3786*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
3787*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*5]
3788*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq+r2*2] ; stride*7
3789*c0909341SAndroid Build Coastguard Worker.h32_w8_loop:
; m7 receives the top of the stack (pushed last), m0 the eighth entry down.
3790*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+32*0]
3791*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+32*1]
3792*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+32*2]
3793*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+32*3]
3794*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+32*4]
3795*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+32*5]
3796*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*6]
3797*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+32*7]
3798*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+strideq*8] ; r6 = row 8 of this column group
3799*c0909341SAndroid Build Coastguard Worker    add                 rsp, 32*8 ; release the eight vectors just loaded
; Stage 1: interleave bytes of adjacent vector pairs.
3800*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m0, m1
3801*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m1
3802*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3
3803*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
3804*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m5
3805*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m5
3806*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m7
3807*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m7
; Stage 2: interleave the resulting words.
3808*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m8, m1
3809*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m1
3810*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m2
3811*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
3812*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m5
3813*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m5
3814*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4, m6
3815*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
; Stage 3: interleave dwords; every 64-bit half is now one 8-byte output row.
3816*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m2
3817*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m2
3818*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m8, m3
3819*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m3
3820*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m1, m5
3821*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m5
3822*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m0, m4
3823*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m4
; Rows 0-7 (via dstq) and 8-15 (via r6) come from the low 128-bit lanes; each
; vextracti128 moves the high lane down for rows 16-31 written further below.
3824*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm6
3825*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm6
3826*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m6, 1
3827*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm7
3828*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm7
3829*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m7, 1
3830*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*4], xm2
3831*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], xm2
3832*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m2, 1
3833*c0909341SAndroid Build Coastguard Worker    movq   [dstq+r2*2     ], xm8
3834*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r4       ], xm8
3835*c0909341SAndroid Build Coastguard Worker    vextracti128        xm8, m8, 1
3836*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*0], xm3
3837*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*1], xm3
3838*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m3, 1
3839*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*2], xm1
3840*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r2       ], xm1
3841*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m1, 1
3842*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*4], xm5
3843*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r3       ], xm5
3844*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m5, 1
3845*c0909341SAndroid Build Coastguard Worker    movq     [r6+r2*2     ], xm0
3846*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r4       ], xm0
3847*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*8] ; r6 = row 16
3848*c0909341SAndroid Build Coastguard Worker    vextracti128        xm0, m0, 1
3849*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*0], xm6
3850*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*1], xm6
3851*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*2], xm7
3852*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r2       ], xm7
3853*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*4], xm2
3854*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r3       ], xm2
3855*c0909341SAndroid Build Coastguard Worker    movq     [r6+r2*2     ], xm8
3856*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r4       ], xm8
3857*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*8] ; r6 = row 24
3858*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*0], xm3
3859*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*1], xm3
3860*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*2], xm1
3861*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r2       ], xm1
3862*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*4], xm5
3863*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r3       ], xm5
3864*c0909341SAndroid Build Coastguard Worker    movq     [r6+r2*2     ], xm0
3865*c0909341SAndroid Build Coastguard Worker    movhps   [r6+r4       ], xm0
3866*c0909341SAndroid Build Coastguard Worker    sub                dstq, 8 ; step back to the previous 8-byte column group
3867*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8
3868*c0909341SAndroid Build Coastguard Worker    jg .h32_w8_loop ; repeat while columns remain
3869*c0909341SAndroid Build Coastguard Worker    RET
3870*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3871*c0909341SAndroid Build Coastguard Worker.h64:
; 64-row path. Sets maxbase = w+63 and, unless bit 0x400 of angle is set,
; filters 128 edge bytes addressed at negative offsets from tlq into a stack
; buffer in two 64-byte halves, redirecting tlq to that copy (rsp+127) before
; falling through into .h64_main.
; NOTE(review): ALLOC_STACK is an x86inc macro defined outside this view; the
; arguments presumably request 128 bytes of stack and 16 vector registers.
3872*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        -128, 16
3873*c0909341SAndroid Build Coastguard Worker    lea            maxbased, [wq+63]
3874*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
3875*c0909341SAndroid Build Coastguard Worker    jnz .h64_main
3876*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 21
; xm11: dword 0 = four copies of the byte at tlq-127; blend mask 0x0e takes
; dwords 1-3 from the 16 bytes at tlq-130.
3877*c0909341SAndroid Build Coastguard Worker    vpbroadcastb       xm11, [tlq-127]
3878*c0909341SAndroid Build Coastguard Worker    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
3879*c0909341SAndroid Build Coastguard Worker    sub                 r4d, wd ; 21-w
3880*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 3
3881*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq-116], 1    ; 104-111
3882*c0909341SAndroid Build Coastguard Worker    movu                 m7, [z_filter_s+4]
3883*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
3884*c0909341SAndroid Build Coastguard Worker    cmove               r4d, r5d ; clamp index: 3 when w == 32, otherwise 21-w
3885*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m7, [z_filter_s+8], 1
3886*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m6, [pb_0to15]
3887*c0909341SAndroid Build Coastguard Worker    movd                xm1, r4d
3888*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
3889*c0909341SAndroid Build Coastguard Worker    movu               xm12, [tlq-122]       ; 112-119
3890*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tlq-108], 1    ;  96-103
3891*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m1, xm1
3892*c0909341SAndroid Build Coastguard Worker    movu               xm13, [tlq- 98]       ;  88- 95
3893*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tlq- 84], 1    ;  72- 79
3894*c0909341SAndroid Build Coastguard Worker    movu               xm14, [tlq- 90]       ;  80- 87
3895*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tlq- 76], 1    ;  64- 71
3896*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [z_filter_s+16], 0
; Upper half (index ranges 64-127 per the notes above), pass 1: shuffle with
; m8 and multiply-add with the coefficient pair at z_filter_k+4*2+12*0.
3897*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11, m8
3898*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
3899*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12, m8
3900*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
; pmaxsb against the pb_0to15 table (presumably the byte values 0..15) makes
; a pshufb mask that replaces lane indices below the clamp index with it.
3901*c0909341SAndroid Build Coastguard Worker    pmaxsb               m1, m6 ; clip (16|32)x64
3902*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m1
3903*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m13, m8
3904*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
3905*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14, m8
3906*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m9
; Pass 2: coefficient pair at +12*1 with the shuffle kept in m15.
3907*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
3908*c0909341SAndroid Build Coastguard Worker    shufps              m15, m8, m7, q1021
3909*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m11, m15
3910*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3911*c0909341SAndroid Build Coastguard Worker    paddw                m0, m10
3912*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m12, m15
3913*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3914*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
3915*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m13, m15
3916*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3917*c0909341SAndroid Build Coastguard Worker    paddw                m1, m10
3918*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14, m15
3919*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
3920*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
; Pass 3: coefficient pair at +12*2 with the shuffle kept in m10. m8, m15 and
; m10 stay live so the lower half can reuse all three shuffles.
3921*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
3922*c0909341SAndroid Build Coastguard Worker    shufps              m10, m8, m7, q2132
3923*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m10
3924*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
3925*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m10
3926*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m9
3927*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m10
3928*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9
3929*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m10
3930*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
3931*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
3932*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
3933*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
3934*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
; Load the lower-half source bytes now, while tlq still addresses the
; original edge; tlq is redirected to the stack copy a few lines below.
3935*c0909341SAndroid Build Coastguard Worker    movu               xm11, [tlq-66]    ; 56-63
3936*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tlq-52], 1 ; 40-47
3937*c0909341SAndroid Build Coastguard Worker    movu               xm12, [tlq-58]    ; 48-55
3938*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tlq-44], 1 ; 32-39
3939*c0909341SAndroid Build Coastguard Worker    movu               xm13, [tlq-34]    ; 24-31
3940*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tlq-20], 1 ;  8-15
3941*c0909341SAndroid Build Coastguard Worker    movu               xm14, [tlq-28]    ; 16-23
3942*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tlq-14], 1 ;  0- 7
; NOTE(review): m3 is loaded outside this view; if it holds 512, as the
; "64*8" note in .h32_main suggests, pmulhrsw is a rounded >>6 -- confirm.
3943*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3944*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
3945*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3946*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
3947*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+127] ; tlq now addresses the stack copy
3948*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
3949*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6
3950*c0909341SAndroid Build Coastguard Worker    mova          [tlq-127], m0 ; filtered upper half -> rsp+0..rsp+63
3951*c0909341SAndroid Build Coastguard Worker    mova          [tlq- 95], m1
; Lower half: the same three coefficient pairs applied in reverse order
; (+12*2 is still in m9, then +12*1, then +12*0). m14 uses a different shuffle
; at each stage (m7, m10, m15) than m11-m13 do (m10, m15, m8).
3952*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11, m10
3953*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
3954*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12, m10
3955*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m9
3956*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m13, m10
3957*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
3958*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14, m7
3959*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m9
3960*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
3961*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m11, m15 ; m7 is free to serve as scratch from here
3962*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m9
3963*c0909341SAndroid Build Coastguard Worker    paddw                m0, m7
3964*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m12, m15
3965*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m9
3966*c0909341SAndroid Build Coastguard Worker    paddw                m2, m7
3967*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m13, m15
3968*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m9
3969*c0909341SAndroid Build Coastguard Worker    paddw                m1, m7
3970*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14, m10
3971*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m9
3972*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7
3973*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
3974*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m8
3975*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m9
3976*c0909341SAndroid Build Coastguard Worker    pshufb              m12, m8
3977*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, m9
3978*c0909341SAndroid Build Coastguard Worker    pshufb              m13, m8
3979*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, m9
3980*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m15
3981*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m9
3982*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
3983*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
3984*c0909341SAndroid Build Coastguard Worker    paddw                m1, m13
3985*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
3986*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
3987*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
3988*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
3989*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
3990*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
3991*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m6
3992*c0909341SAndroid Build Coastguard Worker    mova           [tlq-63], m0 ; filtered lower half -> rsp+64..rsp+127
3993*c0909341SAndroid Build Coastguard Worker    mova           [tlq-31], m1
3994*c0909341SAndroid Build Coastguard Worker.h64_main:
3995*c0909341SAndroid Build Coastguard Worker    movd               xm12, dyd
3996*c0909341SAndroid Build Coastguard Worker    neg            maxbaseq
3997*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [z3_shuf]
3998*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m7, [tlq+maxbaseq]
3999*c0909341SAndroid Build Coastguard Worker    shl            maxbased, 6
4000*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        m12, xm12
4001*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [dyq+maxbaseq-64]
4002*c0909341SAndroid Build Coastguard Worker    neg                 dyq
4003*c0909341SAndroid Build Coastguard Worker    or             maxbased, 63
4004*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dyq+63]
4005*c0909341SAndroid Build Coastguard Worker    movd                xm6, r5d
4006*c0909341SAndroid Build Coastguard Worker    mova               xm10, [pb_1to32+16]
4007*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [pb_1to32], 1
4008*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pb_32]
4009*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m6, xm6
4010*c0909341SAndroid Build Coastguard Worker.h64_loop:
4011*c0909341SAndroid Build Coastguard Worker    mov                  r5, r4
4012*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6
4013*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5-24]
4014*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5-32]
4015*c0909341SAndroid Build Coastguard Worker    pand                 m2, m4, m6
4016*c0909341SAndroid Build Coastguard Worker    psubw                m9, m5, m2
4017*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
4018*c0909341SAndroid Build Coastguard Worker    por                  m9, m2
4019*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
4020*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
4021*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
4022*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
4023*c0909341SAndroid Build Coastguard Worker    psraw                m2, m6, 6
4024*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64
4025*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
4026*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
4027*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
4028*c0909341SAndroid Build Coastguard Worker    paddb                m2, m10
4029*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
4030*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m2
4031*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m0
4032*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5-56]
4033*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r5-64]
4034*c0909341SAndroid Build Coastguard Worker    add                  r4, dyq
4035*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
4036*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m8
4037*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m9
4038*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m9
4039*c0909341SAndroid Build Coastguard Worker    paddb                m2, m11
4040*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
4041*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
4042*c0909341SAndroid Build Coastguard Worker    paddw                m6, m12
4043*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
4044*c0909341SAndroid Build Coastguard Worker    vpblendvb            m0, m7, m0, m2
4045*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0
4046*c0909341SAndroid Build Coastguard Worker    dec                  wd
4047*c0909341SAndroid Build Coastguard Worker    jz .h64_transpose
4048*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, maxbased
4049*c0909341SAndroid Build Coastguard Worker    jg .h64_loop
4050*c0909341SAndroid Build Coastguard Worker.h64_end_loop:
4051*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64
4052*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m7
4053*c0909341SAndroid Build Coastguard Worker    mova           [rsp+ 0], m7
4054*c0909341SAndroid Build Coastguard Worker    dec                  wd
4055*c0909341SAndroid Build Coastguard Worker    jg .h64_end_loop
4056*c0909341SAndroid Build Coastguard Worker.h64_transpose:
4057*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
4058*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*5]
4059*c0909341SAndroid Build Coastguard Worker    imul                 r5, strideq, -8
4060*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+org_wq-16]
4061*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq+r2*2] ; stride*7
4062*c0909341SAndroid Build Coastguard Worker.h64_transpose_loop0:
4063*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+16*3]
4064*c0909341SAndroid Build Coastguard Worker.h64_transpose_loop:
4065*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r6+64*15]
4066*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r6+64* 7], 1
4067*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r6+64*14]
4068*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r6+64* 6], 1
4069*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r6+64*13]
4070*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r6+64* 5], 1
4071*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r6+64*12]
4072*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [r6+64* 4], 1
4073*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r6+64*11]
4074*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r6+64* 3], 1
4075*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r6+64*10]
4076*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r6+64* 2], 1
4077*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r6+64* 9]
4078*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r6+64* 1], 1
4079*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r6+64* 8]
4080*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r6+64* 0], 1
4081*c0909341SAndroid Build Coastguard Worker    sub                  r6, 16
4082*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m0, m1
4083*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m1
4084*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3
4085*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
4086*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4, m5
4087*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m5
4088*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m7
4089*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m7
4090*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m8, m1
4091*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m1
4092*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m2
4093*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
4094*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m5
4095*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m5
4096*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4, m6
4097*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
4098*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m2
4099*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m2
4100*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m8, m3
4101*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m3
4102*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m1, m5
4103*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m5
4104*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m0, m4
4105*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m4
4106*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, q3120
4107*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, q3120
4108*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120
4109*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m8, q3120
4110*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q3120
4111*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
4112*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, q3120
4113*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
4114*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm6
4115*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m6, 1
4116*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm7
4117*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r2       ], m7, 1
4118*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*4], xm2
4119*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r3       ], m2, 1
4120*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r2*2     ], xm8
4121*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r4       ], m8, 1
4122*c0909341SAndroid Build Coastguard Worker    sub               dstq, r5
4123*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm3
4124*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m3, 1
4125*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*2], xm1
4126*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r2       ], m1, 1
4127*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*4], xm5
4128*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r3       ], m5, 1
4129*c0909341SAndroid Build Coastguard Worker    mova         [dstq+r2*2     ], xm0
4130*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r4       ], m0, 1
4131*c0909341SAndroid Build Coastguard Worker    sub                dstq, r5
4132*c0909341SAndroid Build Coastguard Worker    cmp                  r6, rsp
4133*c0909341SAndroid Build Coastguard Worker    jae .h64_transpose_loop
4134*c0909341SAndroid Build Coastguard Worker    add                 rsp, 64*16
4135*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+r5*8-16]
4136*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 16
4137*c0909341SAndroid Build Coastguard Worker    jg .h64_transpose_loop0
4138*c0909341SAndroid Build Coastguard Worker.h64_end:
4139*c0909341SAndroid Build Coastguard Worker    RET
4140*c0909341SAndroid Build Coastguard Worker
; FILTER_XMM dst, src, tmp, shuf  (128-bit filter step)
;   %1 dst  - register number that receives the packed 8-bit results
;   %2 src  - register number holding the input bytes; overwritten by the
;             initial pshufb
;   %3 tmp  - scratch register number (clobbered)
;   %4 shuf - pshufb control: a register number, or any other operand
;             (e.g. a memory reference), which is then used as-is
; Inputs expected in fixed registers: xm1 is added to the sum before the
; shift, xm2-xm5 are the multipliers for dwords 0-3 of the shuffled source
; (ipred_filter_8bpc loads xm1 from pw_8 and xm2-xm5 from the tap table).
; pmaddubsw treats the shuffled pixels as unsigned and xm2-xm5 as signed bytes.
; Result: eight words, shifted right by 4 and saturated to unsigned bytes;
; packuswb dst,dst leaves the same 8 bytes in both halves of xm<dst>.
4141*c0909341SAndroid Build Coastguard Worker%macro FILTER_XMM 4 ; dst, src, tmp, shuf
4142*c0909341SAndroid Build Coastguard Worker%ifnum %4
4143*c0909341SAndroid Build Coastguard Worker    pshufb             xm%2, xm%4
4144*c0909341SAndroid Build Coastguard Worker%else
4145*c0909341SAndroid Build Coastguard Worker    pshufb             xm%2, %4
4146*c0909341SAndroid Build Coastguard Worker%endif
4147*c0909341SAndroid Build Coastguard Worker    pshufd             xm%1, xm%2, q0000 ; p0 p1
4148*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xm%1, xm2
4149*c0909341SAndroid Build Coastguard Worker    pshufd             xm%3, xm%2, q1111 ; p2 p3
4150*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xm%3, xm3
4151*c0909341SAndroid Build Coastguard Worker    paddw              xm%1, xm1         ; add the bias held in xm1
4152*c0909341SAndroid Build Coastguard Worker    paddw              xm%1, xm%3
4153*c0909341SAndroid Build Coastguard Worker    pshufd             xm%3, xm%2, q2222 ; p4 p5
4154*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xm%3, xm4
4155*c0909341SAndroid Build Coastguard Worker    paddw              xm%1, xm%3
4156*c0909341SAndroid Build Coastguard Worker    pshufd             xm%3, xm%2, q3333 ; p6 __
4157*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xm%3, xm5
4158*c0909341SAndroid Build Coastguard Worker    paddw              xm%1, xm%3
4159*c0909341SAndroid Build Coastguard Worker    psraw              xm%1, 4           ; arithmetic shift: sums may be negative
4160*c0909341SAndroid Build Coastguard Worker    packuswb           xm%1, xm%1        ; clamp to 0..255, result duplicated in both qwords
4161*c0909341SAndroid Build Coastguard Worker%endmacro
4162*c0909341SAndroid Build Coastguard Worker
; FILTER_YMM dst, src, tmp, shuf  (256-bit filter step)
;   %1 dst  - register number that receives the packed 8-bit results
;   %2 src  - register number holding the input bytes; overwritten by pshufb
;   %3 tmp  - scratch register number (clobbered)
;   %4 shuf - register number holding the pshufb control (always a register,
;             unlike FILTER_XMM)
; Uses m1 (added before the shift) and m2-m5 (multipliers for dwords 0-3 of
; each 128-bit lane) exactly as FILTER_XMM uses xm1-xm5; both lanes are
; filtered independently.
; Output layout: tmp receives dst with its lanes swapped, so after packuswb
; each lane holds its own 8 result bytes in the low qword and the other lane's
; 8 result bytes in the high qword.
4163*c0909341SAndroid Build Coastguard Worker%macro FILTER_YMM 4 ; dst, src, tmp, shuf
4164*c0909341SAndroid Build Coastguard Worker    pshufb              m%2, m%4
4165*c0909341SAndroid Build Coastguard Worker    pshufd              m%1, m%2, q0000
4166*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%1, m2
4167*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%2, q1111
4168*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m3
4169*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m1          ; add the bias held in m1
4170*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%3
4171*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%2, q2222
4172*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m4
4173*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%3
4174*c0909341SAndroid Build Coastguard Worker    pshufd              m%3, m%2, q3333
4175*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m5
4176*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%3
4177*c0909341SAndroid Build Coastguard Worker    psraw               m%1, 4
4178*c0909341SAndroid Build Coastguard Worker    vperm2i128          m%3, m%1, m%1, 0x01 ; tmp = dst with 128-bit lanes swapped
4179*c0909341SAndroid Build Coastguard Worker    packuswb            m%1, m%3            ; per lane: own result | other lane's result
4180*c0909341SAndroid Build Coastguard Worker%endmacro
4181*c0909341SAndroid Build Coastguard Worker
4182*c0909341SAndroid Build Coastguard Worker; The ipred_filter SIMD processes 4x2 blocks in the following order which
4183*c0909341SAndroid Build Coastguard Worker; increases parallelism compared to doing things row by row. One redundant
4184*c0909341SAndroid Build Coastguard Worker; block is calculated for w8 and w16, two for w32.
4185*c0909341SAndroid Build Coastguard Worker;     w4     w8       w16             w32
4186*c0909341SAndroid Build Coastguard Worker;     1     1 2     1 2 3 5     1 2 3 5 b c d f
4187*c0909341SAndroid Build Coastguard Worker;     2     2 3     2 4 5 7     2 4 5 7 c e f h
4188*c0909341SAndroid Build Coastguard Worker;     3     3 4     4 6 7 9     4 6 7 9 e g h j
4189*c0909341SAndroid Build Coastguard Worker; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
4190*c0909341SAndroid Build Coastguard Worker;           5       8           8       i
4191*c0909341SAndroid Build Coastguard Worker
; ipred_filter_8bpc(dst, stride, tl, w, h, filter)
; 8-bit filter intra prediction, AVX2. The picture is produced in 4x2 blocks
; by FILTER_XMM / FILTER_YMM; each block's inputs are edge pixels read through
; tl and/or pixels of previously produced blocks.
; Setup common to all widths:
;   filterq = filter_intra_taps + (filter & 0xff) * 64
;   m1      = broadcast of pw_8 (bias added before the >>4 in the macros)
;   m2-m5   = the four 16-byte tap vectors of the selected filter, broadcast
;             to both 128-bit lanes
;   xm0     = 8 bytes loaded from tl-3 (inputs of the first block)
;   hd      = h
; Control then jumps through ipred_filter_avx2_table, indexed by tzcnt(w),
; to .w4 / .w8 / .w16 / .w32. The table stores 32-bit offsets relative to its
; own address. The table itself is defined outside this section.
; .w16_main is an internal subroutine (entered with `call`, left with `ret`)
; that predicts a 16-pixel-wide column; .w32 uses it for the left half and
; then predicts the right half, taking its left-neighbour pixels from dst.
; .main is a tiny subroutine: FILTER_YMM with src m0, shuffle m8, result m7,
; scratch m9.
4192*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
4193*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_filter_avx2_table
4194*c0909341SAndroid Build Coastguard Worker    lea                  r6, [filter_intra_taps]
4195*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm               ; jump-table index = number of trailing zero bits of w
; Zero-extend the low byte of the filter argument, whether it already lives
; in the filter register or still has to be read from its memory slot.
4196*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm
4197*c0909341SAndroid Build Coastguard Worker    movzx           filterd, filterb
4198*c0909341SAndroid Build Coastguard Worker%else
4199*c0909341SAndroid Build Coastguard Worker    movzx           filterd, byte filterm
4200*c0909341SAndroid Build Coastguard Worker%endif
4201*c0909341SAndroid Build Coastguard Worker    shl             filterd, 6                ; 64 bytes (4 x 16) of taps per filter
; NOTE(review): WIN64_SPILL_XMM / WIN64_PUSH_XMM are x86inc macros defined
; outside this section; presumably they save callee-saved xmm registers on
; WIN64 (here 9 in use now, room reserved for up to 15) - confirm in x86inc.
4202*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       9, 15
4203*c0909341SAndroid Build Coastguard Worker    add             filterq, r6
4204*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ipred_filter_avx2_table] ; from here on [base+sym] addresses sym via r6
4205*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
4206*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
4207*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [base+pw_8]
4208*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [filterq+16*0]
4209*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [filterq+16*1]
4210*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [filterq+16*2]
4211*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [filterq+16*3]
4212*c0909341SAndroid Build Coastguard Worker    add                  wq, r6               ; table entry is relative to the table address
4213*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4214*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 4: one 4x2 block per iteration, fully in xmm registers ----
4215*c0909341SAndroid Build Coastguard Worker.w4:
4216*c0909341SAndroid Build Coastguard Worker    mova                xm8, [base+filter_shuf2]
4217*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 3
4218*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq               ; [tlq+hq] now walks the edge as hq counts down
4219*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop_start
4220*c0909341SAndroid Build Coastguard Worker.w4_loop:
; Next input = previous output (xm6) with dword 0 replaced by 4 edge bytes.
4221*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, xm6, [tlq+hq], 0
4222*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4223*c0909341SAndroid Build Coastguard Worker.w4_loop_start:
4224*c0909341SAndroid Build Coastguard Worker    FILTER_XMM            6, 0, 7, 8
4225*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm6
4226*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm6, 1
4227*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2                ; two rows per iteration
4228*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4229*c0909341SAndroid Build Coastguard Worker    RET
4230*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 8: first block via FILTER_XMM, then one FILTER_YMM (.main) per
; iteration; each iteration stores two 8-pixel rows ----
4231*c0909341SAndroid Build Coastguard Worker.w8:
4232*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       10
4233*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+filter_shuf1]
4234*c0909341SAndroid Build Coastguard Worker    FILTER_XMM            7, 0, 6, [base+filter_shuf2]
4235*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+4]
4236*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [tlq+5]
4237*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
4238*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
4239*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, xm7
4240*c0909341SAndroid Build Coastguard Worker    vpblendd             m7, m6, 0x20         ; dword 5 of m7 <- bytes from tl+5
4241*c0909341SAndroid Build Coastguard Worker.w8_loop:
4242*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm6, [tlq+hq]
4243*c0909341SAndroid Build Coastguard Worker    palignr              m6, m0, 12
4244*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, m7, 0xeb     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
4245*c0909341SAndroid Build Coastguard Worker                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4246*c0909341SAndroid Build Coastguard Worker    mova                xm6, xm7              ; keep the previous result; .main overwrites m7
4247*c0909341SAndroid Build Coastguard Worker    call .main
4248*c0909341SAndroid Build Coastguard Worker    vpblendd            xm6, xm7, 0x0c
4249*c0909341SAndroid Build Coastguard Worker    pshufd              xm6, xm6, q3120
4250*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm6
4251*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm6
4252*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4253*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4254*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4255*c0909341SAndroid Build Coastguard Worker    RET
4256*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 16: everything is done by .w16_main; hd is pre-decremented by 2
; because .w16_main stores the final two rows after its loop ----
4257*c0909341SAndroid Build Coastguard Worker.w16:
4258*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4259*c0909341SAndroid Build Coastguard Worker    call .w16_main
; On WIN64 the function leaves through the RET at .end instead of a local RET.
4260*c0909341SAndroid Build Coastguard Worker%if WIN64
4261*c0909341SAndroid Build Coastguard Worker    jmp .end
4262*c0909341SAndroid Build Coastguard Worker%else
4263*c0909341SAndroid Build Coastguard Worker    RET
4264*c0909341SAndroid Build Coastguard Worker%endif
; .w16_main: predicts a 16-pixel-wide column.
;   In:  dstq, strideq, tlq, hd = rows - 2, xm0 = bytes from tl-3, m1-m5
;   Out: dstq points at the second-to-last row pair start (last two rows are
;        stored at dstq+strideq*0/1 without advancing dstq); tlq is lowered
;        by 6 + hq (hq as on entry); m8 = filter_shuf1, m11, m14 stay set up
;        for a caller that continues (see .w32).
4265*c0909341SAndroid Build Coastguard Worker.w16_main:
4266*c0909341SAndroid Build Coastguard Worker    ; The spills are into the caller's stack frame
; stack_size is temporarily grown by gprsize around WIN64_PUSH_XMM, presumably
; to account for the return address pushed by `call .w16_main`.
4267*c0909341SAndroid Build Coastguard Worker    %assign stack_size stack_size + gprsize
4268*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       15, 9
4269*c0909341SAndroid Build Coastguard Worker    %assign stack_size stack_size - gprsize
4270*c0909341SAndroid Build Coastguard Worker    FILTER_XMM           12, 0, 7, [base+filter_shuf2]
4271*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+5]
4272*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, [tlq-12], 0x14
4273*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+filter_shuf1]
4274*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, xm12
4275*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0xc2         ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
4276*c0909341SAndroid Build Coastguard Worker                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4277*c0909341SAndroid Build Coastguard Worker    call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
4278*c0909341SAndroid Build Coastguard Worker    movlps              xm9, xm7, [tlq+5]     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
; m14 = shuffle for the second column pipeline: low lane filter_shuf3,
; high lane taken from m8 (filter_shuf1).
4279*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, m8, [base+filter_shuf3], 0
4280*c0909341SAndroid Build Coastguard Worker    vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
4281*c0909341SAndroid Build Coastguard Worker    FILTER_XMM            6, 9, 10, 14
4282*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
4283*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [tlq+13]
4284*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [tlq+12]
4285*c0909341SAndroid Build Coastguard Worker    psrld               m11, m8, 4            ; m11 = vpermd indices derived from filter_shuf1 (used at the row store)
4286*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m9, 0x20         ; top
4287*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 6
4288*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
; Each iteration runs .main (m0 -> m7) and a second FILTER_YMM (m10 -> m6),
; then assembles and stores two complete 16-pixel rows from m12.
4289*c0909341SAndroid Build Coastguard Worker.w16_loop:
4290*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm9, [tlq+hq]
4291*c0909341SAndroid Build Coastguard Worker    palignr              m9, m0, 12
4292*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m9, m7, 0xe2     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
4293*c0909341SAndroid Build Coastguard Worker                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4294*c0909341SAndroid Build Coastguard Worker    mova               xm13, xm7
4295*c0909341SAndroid Build Coastguard Worker    call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
4296*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m12, m10, 0xf0
4297*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m6, 0xc0
4298*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m9, q3333
4299*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m6, 0xee
4300*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4301*c0909341SAndroid Build Coastguard Worker                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4302*c0909341SAndroid Build Coastguard Worker    FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
4303*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
4304*c0909341SAndroid Build Coastguard Worker    vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
4305*c0909341SAndroid Build Coastguard Worker    vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
4306*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm9
4307*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m9, 1
4308*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4309*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4310*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
; Tail: one more 128-bit filter step completes the last two rows.
4311*c0909341SAndroid Build Coastguard Worker    vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
4312*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4313*c0909341SAndroid Build Coastguard Worker    FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
4314*c0909341SAndroid Build Coastguard Worker    vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
4315*c0909341SAndroid Build Coastguard Worker    shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
4316*c0909341SAndroid Build Coastguard Worker    shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
4317*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm0
4318*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm6
4319*c0909341SAndroid Build Coastguard Worker    ret
4320*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 32: left 16 columns via .w16_main, then the right 16 columns with
; the same pipeline, but left-neighbour pixels are read back from dst ----
4321*c0909341SAndroid Build Coastguard Worker.w32:
4322*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4323*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+16]        ; start of the right half, saved across the call
4324*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [hq-2]           ; loop counter for the right half (hd is consumed by .w16_main)
4325*c0909341SAndroid Build Coastguard Worker    call .w16_main
; .w16_main lowered tlq by 6 + hq and r5 = hq - 2, so tlq is now 8 below its
; value on entry to .w32.
4326*c0909341SAndroid Build Coastguard Worker    add                 tlq, r5
4327*c0909341SAndroid Build Coastguard Worker    mov                dstq, r3
4328*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq-4]      ; dstq+r3 = 4 bytes left of the right half, one row down
4329*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r3+strideq*2]
4330*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq+21]
4331*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [dstq-4], 2      ; last 4 pixels of the left half, row 0
4332*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [dstq+r3*1], 3   ; last 4 pixels of the left half, row 1
4333*c0909341SAndroid Build Coastguard Worker    FILTER_XMM           12, 0, 7, 14         ; a0 b0 a0 b0
4334*c0909341SAndroid Build Coastguard Worker    movq                xm7, [dstq+r3*2]
4335*c0909341SAndroid Build Coastguard Worker    pinsrd              xm7, [dstq+r4], 2
4336*c0909341SAndroid Build Coastguard Worker    palignr             xm7, xm0, 12          ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
4337*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [tlq+28]
4338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [tlq+29]
4339*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m8, [base+filter_shuf1+16]
4340*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m9, 0x20
4341*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0x0f
4342*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m7, xm12
4343*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m7, 0xc2         ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4344*c0909341SAndroid Build Coastguard Worker    call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
4345*c0909341SAndroid Build Coastguard Worker    add                  r3, 2                ; r3 = stride-2, so r3*4 = stride*4-8 in the loop below
4346*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+strideq*2]
4347*c0909341SAndroid Build Coastguard Worker    movlps              xm9, xm7, [tlq+29]    ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4348*c0909341SAndroid Build Coastguard Worker    vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
4349*c0909341SAndroid Build Coastguard Worker    FILTER_XMM            6, 9, 10, 14
4350*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
4351*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [tlq+37]
4352*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [tlq+36]
4353*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m9, 0x20         ; top
4354*c0909341SAndroid Build Coastguard Worker.w32_loop:
; Fetch the next left-neighbour pixels from rows already written to dst.
4355*c0909341SAndroid Build Coastguard Worker    movq                xm9, [dstq+r3*4]
4356*c0909341SAndroid Build Coastguard Worker    pinsrd              xm9, [dstq+r4], 2
; Entered directly (skipping the two loads above) for the final iteration.
4357*c0909341SAndroid Build Coastguard Worker.w32_loop_last:
4358*c0909341SAndroid Build Coastguard Worker    palignr              m9, m0, 12
4359*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m9, m7, 0xe2     ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4360*c0909341SAndroid Build Coastguard Worker    mova               xm13, xm7              ; c0 d0
4361*c0909341SAndroid Build Coastguard Worker    call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
4362*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m12, m10, 0xf0
4363*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m6, 0xc0
4364*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m9, q3333
4365*c0909341SAndroid Build Coastguard Worker    vpblendd             m9, m6, 0xee
4366*c0909341SAndroid Build Coastguard Worker    vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
4367*c0909341SAndroid Build Coastguard Worker                                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4368*c0909341SAndroid Build Coastguard Worker    FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
4369*c0909341SAndroid Build Coastguard Worker    vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
4370*c0909341SAndroid Build Coastguard Worker    vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
4371*c0909341SAndroid Build Coastguard Worker    vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
4372*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm9
4373*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m9, 1
4374*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4375*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
4376*c0909341SAndroid Build Coastguard Worker    jg .w32_loop                              ; more rows: reload left pixels from dst
4377*c0909341SAndroid Build Coastguard Worker    jz .w32_loop_last                         ; counter hit 0: one more pass without the dst loads
; Tail: same final 128-bit step as at the end of .w16_main.
4378*c0909341SAndroid Build Coastguard Worker    vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
4379*c0909341SAndroid Build Coastguard Worker    pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
4380*c0909341SAndroid Build Coastguard Worker    FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
4381*c0909341SAndroid Build Coastguard Worker    vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
4382*c0909341SAndroid Build Coastguard Worker    shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
4383*c0909341SAndroid Build Coastguard Worker    shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
4384*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], xm0
4385*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], xm6
4386*c0909341SAndroid Build Coastguard Worker.end:
4387*c0909341SAndroid Build Coastguard Worker    RET
4388*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: filter m0 with shuffle m8; result in m7, m9 clobbered.
4389*c0909341SAndroid Build Coastguard Worker.main:
4390*c0909341SAndroid Build Coastguard Worker    FILTER_YMM            7, 0, 9, 8
4391*c0909341SAndroid Build Coastguard Worker    ret
4392*c0909341SAndroid Build Coastguard Worker
; Pick the scratch register index behind "t0" for the CfL predictors below:
; 5 on WIN64, 7 everywhere else. DECLARE_REG_TMP is defined outside this chunk.
; NOTE(review): presumably r5 is usable as scratch on WIN64 because the 6th
; argument (ac) is not yet in a register there and is only fetched at the end
; of each prologue via "movifnidn acq, acmp" -- confirm against the ABI macros.
4393*c0909341SAndroid Build Coastguard Worker%if WIN64
4394*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
4395*c0909341SAndroid Build Coastguard Worker%else
4396*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
4397*c0909341SAndroid Build Coastguard Worker%endif
4398*c0909341SAndroid Build Coastguard Worker
; IPRED_CFL reg
; Converts the AC coefficients (signed words) held in m<reg> into predicted
; sample values (still words, not yet clamped), in place:
;   m<reg> = m0 + sign(ac * alpha) * ((|ac| * |alpha| + 32) >> 6)
; Register contract, as set up at the .sN labels of ipred_cfl_8bpc:
;   m0 = DC value in every word, m1 = alpha in every word,
;   m2 = |alpha| << 9 in every word.
; Clobbers m3. The caller saturates to bytes afterwards with packuswb.
4399*c0909341SAndroid Build Coastguard Worker%macro IPRED_CFL 1 ; ac in, unpacked pixels out
4400*c0909341SAndroid Build Coastguard Worker    psignw               m3, m%1, m1  ; m3 = ac negated/zeroed/kept per sign of alpha -> carries sign(ac * alpha)
4401*c0909341SAndroid Build Coastguard Worker    pabsw               m%1, m%1  ; |ac|
4402*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m2  ; (|ac| * (|alpha| << 9) + 0x4000) >> 15 == (|ac| * |alpha| + 32) >> 6
4403*c0909341SAndroid Build Coastguard Worker    psignw              m%1, m3  ; re-apply the sign of ac * alpha to the rounded magnitude
4404*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m0  ; add the DC term
4405*c0909341SAndroid Build Coastguard Worker%endmacro
4406*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_top_8bpc(dst, stride, tl, w, h, ac, alpha)
; Prologue for CfL prediction whose DC term comes from the bytes starting at
; tl+1 (the top edge, going by the function name). It only sets up state and
; then tail-jumps:
;   m0  = 32 bytes from tl+1, reduced to negated pair sums (words)
;   m2  = all ones (every byte/word == -1), used as multiplier by the reduction
;   xm3 = 0x8000 >> log2(w), the pmulhrsw multiplier for a rounded divide by w
;   wq  = address taken from ipred_cfl_splat_avx2_table for log2(w)
;   r6  = address taken from ipred_cfl_left_avx2_table for log2(w)
; NOTE(review): both tables are defined outside this chunk; presumably r6 lands
; on a .hN label of ipred_cfl_left_8bpc (summing w pixels) and wq on a .sN
; label of ipred_cfl_8bpc -- confirm against the table definitions.
4407*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
4408*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_left_avx2_table]
4409*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; wd = log2(w), assuming w is a power of two
4410*c0909341SAndroid Build Coastguard Worker    inc                 tlq  ; step from the top-left sample to the first sample after it
4411*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]  ; always loads 32 bytes; the reduction entered via r6 decides how many are used
4412*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4413*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x8000
4414*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, wd  ; 0x8000 >> log2(w)
4415*c0909341SAndroid Build Coastguard Worker    movd                xm3, r6d
4416*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+wq*4]  ; signed 32-bit table entry, made absolute by "add r6, t0" below
4417*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2  ; m2 = all ones
4418*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2  ; unsigned pixels * (-1), adjacent pairs added: words = -(p[2i] + p[2i+1])
4419*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
4420*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table  ; rebase t0 onto the splat table
4421*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
4422*c0909341SAndroid Build Coastguard Worker    add                  wq, t0  ; wq = absolute address of the per-width continuation
4423*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp  ; loaded last; on WIN64 t0 is declared as r5 (see DECLARE_REG_TMP above)
4424*c0909341SAndroid Build Coastguard Worker    jmp                  r6
4425*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_left_8bpc(dst, stride, tl, w, h, ac, alpha)
; CfL prediction whose DC term is the rounded average of the h bytes stored at
; tl-h .. tl-1 (the left edge, going by the function name).
; The prologue mirrors ipred_cfl_top_8bpc but keys the reduction on h:
;   m0  = 32 bytes from tl-h as negated pair sums, m2 = all ones,
;   xm3 = 0x8000 >> log2(h), wq = continuation from ipred_cfl_splat_avx2_table
;         for log2(w), r6 = entry from ipred_cfl_left_avx2_table for log2(h).
; The .h32/.h16/.h8/.h4 labels form a fall-through reduction chain: each entry
; point folds the vector in half until the sum sits in the low elements, .h4
; finishes the average and broadcasts it into m0, then control leaves via wq.
; NOTE(review): the tables are defined outside this chunk; presumably they
; point at these .hN labels and at the .sN labels of ipred_cfl_8bpc -- confirm.
4426*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
4427*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm ; zero upper half
4428*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd  ; r6d = log2(h), assuming h is a power of two
4429*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq  ; tlq -> first of the h bytes that end just before tl
4430*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; wd = log2(w)
4431*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]  ; always loads 32 bytes; the entry point chosen below decides how many are summed
4432*c0909341SAndroid Build Coastguard Worker    mov                 t0d, 0x8000
4433*c0909341SAndroid Build Coastguard Worker    shrx                t0d, t0d, r6d  ; 0x8000 >> log2(h): pmulhrsw multiplier for a rounded divide by h
4434*c0909341SAndroid Build Coastguard Worker    movd                xm3, t0d
4435*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_left_avx2_table]
4436*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]  ; signed 32-bit table entry, made absolute by "add r6, t0" below
4437*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2  ; m2 = all ones
4438*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2  ; words = -(p[2i] + p[2i+1])
4439*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
4440*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table  ; rebase t0 onto the splat table
4441*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
4442*c0909341SAndroid Build Coastguard Worker    add                  wq, t0  ; wq = absolute address of the per-width continuation
4443*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4444*c0909341SAndroid Build Coastguard Worker    jmp                  r6
4445*c0909341SAndroid Build Coastguard Worker.h32:
4446*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1  ; 16 words -> 8: add the upper 128-bit lane onto the lower
4447*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4448*c0909341SAndroid Build Coastguard Worker.h16:
4449*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0  ; 8 words -> 4: add the high qword onto the low
4450*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4451*c0909341SAndroid Build Coastguard Worker.h8:
4452*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32  ; 4 words -> 2: add words 2,3 onto words 0,1
4453*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4454*c0909341SAndroid Build Coastguard Worker.h4:
4455*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm2  ; dword 0 = -(word0 + word1): undoes the negation, giving the positive pixel sum
4456*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3  ; word 0 = (sum * xm3 + 0x4000) >> 15, i.e. a rounded divide; xm3 only has word 0 set
4457*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0  ; m0 = DC in every word
4458*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4459*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_8bpc(dst, stride, tl, w, h, ac, alpha)
; Full CfL prediction: DC = rounded average of the h bytes ending just before
; tl and the w bytes starting at tl+1, then every output pixel is
; DC + scaled AC (see the IPRED_CFL macro), saturated to 8 bits.
;
; Structure (all transitions are computed jumps through ipred_cfl_avx2_table,
; which is defined outside this chunk):
;   prologue -> .hN  (N = h): load the N left bytes as negated pair sums
;            -> .wN  (N = w): add the top bytes and the rounding bias, reduce,
;                             divide by w + h
;            -> .sN  (N = w): fall-through; apply alpha * ac and store rows
; Register roles between prologue and .sN:
;   m3  = all ones (every byte/word == -1)
;   xm4 = (w + h) >> 1 in word 0 (rounding bias)
;   xm5 = tzcnt(w + h) (power-of-two part of the divisor)
;   xm0 = running negated sum
; w + h is not always a power of two: the shift by xm5 (or a fixed shift)
; removes the power-of-two part, and pmulhuw with 0x5556 (~2^16/3) or
; 0x3334 (~2^16/5) removes the remaining factor of 3 or 5.
; NOTE(review): the .sN labels are presumably also entered from
; ipred_cfl_top/left/128 through ipred_cfl_splat_avx2_table with m0 already
; holding the DC -- confirm against the table definitions.
4460*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
4461*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4462*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
4463*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd  ; r6d = log2(h), assuming h is a power of two
4464*c0909341SAndroid Build Coastguard Worker    lea                 t0d, [wq+hq]  ; t0d = w + h = number of edge pixels averaged
4465*c0909341SAndroid Build Coastguard Worker    movd                xm4, t0d
4466*c0909341SAndroid Build Coastguard Worker    tzcnt               t0d, t0d  ; exponent of the largest power of two dividing w + h
4467*c0909341SAndroid Build Coastguard Worker    movd                xm5, t0d
4468*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_avx2_table]
4469*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd  ; wd = log2(w)
4470*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]  ; height entry (.hN)
4471*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4+4*4]  ; width entry (.wN): same indexing, 4 dwords further into the table
4472*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m3, m3  ; m3 = all ones
4473*c0909341SAndroid Build Coastguard Worker    psrlw               xm4, 1  ; xm4 word 0 = (w + h) >> 1
4474*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
4475*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
4476*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4477*c0909341SAndroid Build Coastguard Worker    jmp                  r6
4478*c0909341SAndroid Build Coastguard Worker.h4:
4479*c0909341SAndroid Build Coastguard Worker    movd                xm0, [tlq-4]  ; the 4 bytes ending just before tl
4480*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3  ; words = -(p[2i] + p[2i+1])
4481*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4482*c0909341SAndroid Build Coastguard Worker.w4:
4483*c0909341SAndroid Build Coastguard Worker    movd                xm1, [tlq+1]  ; the 4 bytes starting right after tl
4484*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
4485*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4  ; sums are negated, so subtracting the bias adds it to the magnitude
4486*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4487*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3  ; each dword = -(word pair): positive partial sums
4488*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
4489*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
4490*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 3  ; not h > 4: divide by 8 (w + h == 8 when h == 4)
4491*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
4492*c0909341SAndroid Build Coastguard Worker.w4_mul:
4493*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0  ; fold the remaining partial sums into word 0 (this and the paddw/psrlq below)
4494*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]  ; shift count 2*h; overwrites r2 (tl), which is not read again
4495*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x55563334  ; packed reciprocals: high word ~2^16/3, low word ~2^16/5
4496*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4497*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d  ; h == 8: >> 16 -> 0x5556; h == 16: count 32 wraps to 0 -> low word 0x3334
4498*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
4499*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4500*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4501*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, 2  ; divide by 4 ...
4502*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1  ; ... then by 3 or 5: total divisor 12 (h == 8) or 20 (h == 16)
4503*c0909341SAndroid Build Coastguard Worker.w4_end:
4504*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0  ; m0 = DC in every word
4505*c0909341SAndroid Build Coastguard Worker.s4:
4506*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham  ; m1 = alpha in every word
4507*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
4508*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4509*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9, the pmulhrsw multiplier expected by IPRED_CFL
4510*c0909341SAndroid Build Coastguard Worker.s4_loop:
4511*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]  ; 16 AC words = 4 rows of 4 pixels
4512*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4513*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4  ; saturate to bytes; per 128-bit lane: dword 0 = first row, dword 1 = second row
4514*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1  ; xm5 = rows 2 and 3
4515*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm4
4516*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm4, 1
4517*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], xm5
4518*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r6       ], xm5, 1
4519*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4520*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4521*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4  ; 4 rows per iteration
4522*c0909341SAndroid Build Coastguard Worker    jg .s4_loop
4523*c0909341SAndroid Build Coastguard Worker    RET
4524*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4525*c0909341SAndroid Build Coastguard Worker.h8:
4526*c0909341SAndroid Build Coastguard Worker    movq                xm0, [tlq-8]  ; the 8 bytes ending just before tl
4527*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
4528*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4529*c0909341SAndroid Build Coastguard Worker.w8:
4530*c0909341SAndroid Build Coastguard Worker    movq                xm1, [tlq+1]  ; the 8 bytes starting right after tl
4531*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1  ; upper lane of the left sums; presumably non-zero only when arriving from .h32
4532*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
4533*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4  ; add the rounding bias (sums are negated)
4534*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
4535*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm2, xm0, xm0  ; fold the high qword of the left sums onto the low
4536*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
4537*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1  ; add the top sums (4 words in the low qword)
4538*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32  ; fold words 2,3 onto words 0,1
4539*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4540*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3  ; dword 0 = -(word0 + word1) = total + bias
4541*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5  ; divide by the power-of-two part of w + h
4542*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
4543*c0909341SAndroid Build Coastguard Worker    je .w8_end
4544*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556  ; ~2^16/3, used unless h == 32 (e.g. w + h == 12 or 24)
4545*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334  ; ~2^16/5, used when h == 32 (w + h == 40); overwrites r2 (tl), not read again
4546*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
4547*c0909341SAndroid Build Coastguard Worker    cmove               r6d, r2d
4548*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4549*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1  ; divide by the remaining factor 3 or 5
4550*c0909341SAndroid Build Coastguard Worker.w8_end:
4551*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0  ; m0 = DC in every word
4552*c0909341SAndroid Build Coastguard Worker.s8:
4553*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4554*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
4555*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4556*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
4557*c0909341SAndroid Build Coastguard Worker.s8_loop:
4558*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]  ; rows 0,1 (8 words each)
4559*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+32]  ; rows 2,3
4560*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4561*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
4562*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5  ; low lane = row0 | row2, high lane = row1 | row3
4563*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
4564*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm4
4565*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], xm5
4566*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*2], xm4
4567*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r6       ], xm5
4568*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4569*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4570*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4  ; 4 rows per iteration
4571*c0909341SAndroid Build Coastguard Worker    jg .s8_loop
4572*c0909341SAndroid Build Coastguard Worker    RET
4573*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4574*c0909341SAndroid Build Coastguard Worker.h16:
4575*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tlq-16]  ; aligned load: relies on tl-16 being 16-byte aligned (guaranteed by the caller, not visible here)
4576*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm3
4577*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4578*c0909341SAndroid Build Coastguard Worker.w16:
4579*c0909341SAndroid Build Coastguard Worker    movu                xm1, [tlq+1]  ; the 16 bytes starting right after tl
4580*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1  ; upper lane of the left sums; presumably non-zero only when arriving from .h32
4581*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm3
4582*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4  ; add the rounding bias (sums are negated)
4583*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm2
4584*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1  ; left + top, 8 words
4585*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0  ; 8 words -> 4
4586*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4587*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32  ; 4 words -> 2
4588*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4589*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3  ; dword 0 = total + bias (positive)
4590*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5  ; divide by the power-of-two part of w + h
4591*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
4592*c0909341SAndroid Build Coastguard Worker    je .w16_end
4593*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556  ; ~2^16/3, kept when h has bit 3 or bit 5 set (h == 8 or 32: w + h == 24 or 48)
4594*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334  ; ~2^16/5, selected otherwise (e.g. h == 4: w + h == 20)
4595*c0909341SAndroid Build Coastguard Worker    test                 hb, 8|32
4596*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
4597*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4598*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1  ; divide by the remaining factor 3 or 5
4599*c0909341SAndroid Build Coastguard Worker.w16_end:
4600*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0  ; m0 = DC in every word
4601*c0909341SAndroid Build Coastguard Worker.s16:
4602*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4603*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4604*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
4605*c0909341SAndroid Build Coastguard Worker.s16_loop:
4606*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]  ; row 0 (16 words)
4607*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+32]  ; row 1
4608*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4609*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
4610*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5  ; per-lane pack interleaves halves: row0.lo row1.lo | row0.hi row1.hi
4611*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120  ; swap the middle qwords -> row0 | row1
4612*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm4
4613*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m4, 1
4614*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
4615*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4616*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2  ; 2 rows per iteration
4617*c0909341SAndroid Build Coastguard Worker    jg .s16_loop
4618*c0909341SAndroid Build Coastguard Worker    RET
4619*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4620*c0909341SAndroid Build Coastguard Worker.h32:
4621*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]  ; aligned load: relies on tl-32 being 32-byte aligned (guaranteed by the caller, not visible here)
4622*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
4623*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4624*c0909341SAndroid Build Coastguard Worker.w32:
4625*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]  ; the 32 bytes starting right after tl
4626*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
4627*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1  ; left + top, 16 words
4628*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4629*c0909341SAndroid Build Coastguard Worker    psubw               xm0, xm4  ; add the rounding bias (sums are negated)
4630*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1  ; 16 words -> 8
4631*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0  ; 8 words -> 4
4632*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4633*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32  ; 4 words -> 2
4634*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4635*c0909341SAndroid Build Coastguard Worker    pmaddwd             xm0, xm3  ; dword 0 = total + bias (positive)
4636*c0909341SAndroid Build Coastguard Worker    psrlw               xm0, xm5  ; divide by the power-of-two part of w + h
4637*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
4638*c0909341SAndroid Build Coastguard Worker    je .w32_end
4639*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [hq*2]  ; shift count 2*h; overwrites r2 (tl), which is not read again
4640*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x33345556  ; packed reciprocals: high word ~2^16/5, low word ~2^16/3
4641*c0909341SAndroid Build Coastguard Worker    shrx                r6d, r6d, r2d  ; h == 8: >> 16 -> 0x3334 (w + h == 40); h == 16: count wraps to 0 -> low word 0x5556 (48)
4642*c0909341SAndroid Build Coastguard Worker    movd                xm1, r6d
4643*c0909341SAndroid Build Coastguard Worker    pmulhuw             xm0, xm1  ; divide by the remaining factor 3 or 5 (only word 0 is used)
4644*c0909341SAndroid Build Coastguard Worker.w32_end:
4645*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0  ; m0 = DC in every word
4646*c0909341SAndroid Build Coastguard Worker.s32:
4647*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, alpham
4648*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
4649*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9  ; m2 = |alpha| << 9
4650*c0909341SAndroid Build Coastguard Worker.s32_loop:
4651*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]  ; pixels 0..15 of the row
4652*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+32]  ; pixels 16..31
4653*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
4654*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
4655*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5  ; per-lane pack: 0..7 16..23 | 8..15 24..31
4656*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, q3120  ; swap the middle qwords -> 0..31 in order
4657*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m4  ; aligned 32-byte store: relies on dst rows being 32-byte aligned
4658*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4659*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4660*c0909341SAndroid Build Coastguard Worker    dec                  hd  ; 1 row per iteration
4661*c0909341SAndroid Build Coastguard Worker    jg .s32_loop
4662*c0909341SAndroid Build Coastguard Worker    RET
4663*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_128_8bpc(dst, stride, tl, w, h, ac, alpha)
; CfL prediction with a constant DC term: no edge pixels are read. Loads the
; constant at pw_128 into m0 and jumps straight to the per-width continuation
; taken from ipred_cfl_splat_avx2_table.
; NOTE(review): pw_128 and the table are defined outside this chunk; presumably
; pw_128 is the word value 128 and the table points at the .sN labels of
; ipred_cfl_8bpc, which expect the DC in m0 -- confirm.
4664*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
4665*c0909341SAndroid Build Coastguard Worker    lea                  t0, [ipred_cfl_splat_avx2_table]
4666*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm  ; wd = log2(w), assuming w is a power of two
4667*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
4668*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]  ; signed 32-bit table entry, made absolute by "add wq, t0" below
4669*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [t0-ipred_cfl_splat_avx2_table+pw_128]  ; pw_128 addressed relative to t0, replicated to every dword
4670*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
4671*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
4672*c0909341SAndroid Build Coastguard Worker    jmp                  wq
4673*c0909341SAndroid Build Coastguard Worker
4674*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
4675*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4676*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
4677*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4678*c0909341SAndroid Build Coastguard Worker    mov                 szd, wd
4679*c0909341SAndroid Build Coastguard Worker    mov             ac_bakq, acq
4680*c0909341SAndroid Build Coastguard Worker    imul                szd, hd
4681*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2
4682*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd
4683*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pb_2]
4684*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
4685*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
4686*c0909341SAndroid Build Coastguard Worker    jg .w16
4687*c0909341SAndroid Build Coastguard Worker    je .w8
4688*c0909341SAndroid Build Coastguard Worker    ; fall-through
4689*c0909341SAndroid Build Coastguard Worker
4690*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
4691*c0909341SAndroid Build Coastguard Worker.w4:
4692*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4693*c0909341SAndroid Build Coastguard Worker.w4_loop:
4694*c0909341SAndroid Build Coastguard Worker    movq                xm0, [yq]
4695*c0909341SAndroid Build Coastguard Worker    movq                xm1, [yq+strideq]
4696*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [yq+strideq*2]
4697*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [yq+stride3q]
4698*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
4699*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm2
4700*c0909341SAndroid Build Coastguard Worker    paddw               xm0, xm1
4701*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm0
4702*c0909341SAndroid Build Coastguard Worker    paddw               xm4, xm0
4703*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4704*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4705*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4706*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4707*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4708*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4709*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q1111
4710*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
4711*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4712*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4713*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4714*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
4715*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
4716*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4717*c0909341SAndroid Build Coastguard Worker
4718*c0909341SAndroid Build Coastguard Worker.w8:
4719*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4720*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4721*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad
4722*c0909341SAndroid Build Coastguard Worker.w8_loop:
4723*c0909341SAndroid Build Coastguard Worker    mova                xm0, [yq]
4724*c0909341SAndroid Build Coastguard Worker    mova                xm1, [yq+strideq]
4725*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [yq+strideq*2], 1
4726*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [yq+stride3q], 1
4727*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4728*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4729*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4730*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4731*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4732*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4733*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4734*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4735*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4736*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4737*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4738*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
4739*c0909341SAndroid Build Coastguard Worker.w8_wpad:
4740*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle]
4741*c0909341SAndroid Build Coastguard Worker.w8_wpad_loop:
4742*c0909341SAndroid Build Coastguard Worker    movq                xm0, [yq]
4743*c0909341SAndroid Build Coastguard Worker    movq                xm1, [yq+strideq]
4744*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [yq+strideq*2], 1
4745*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [yq+stride3q], 1
4746*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4747*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4748*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4749*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
4750*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4751*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4752*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4753*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4754*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4755*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad_loop
4756*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4757*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4758*c0909341SAndroid Build Coastguard Worker.w8_hpad:
4759*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3232
4760*c0909341SAndroid Build Coastguard Worker.w8_hpad_loop:
4761*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4762*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4763*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4764*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4765*c0909341SAndroid Build Coastguard Worker    jg .w8_hpad_loop
4766*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4767*c0909341SAndroid Build Coastguard Worker
4768*c0909341SAndroid Build Coastguard Worker.w16:
4769*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4770*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4771*c0909341SAndroid Build Coastguard Worker.w16_loop:
4772*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4773*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4774*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4775*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4776*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4777*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4778*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4779*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4780*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4781*c0909341SAndroid Build Coastguard Worker    dec                  hd
4782*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4783*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4784*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4785*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad_loop
4786*c0909341SAndroid Build Coastguard Worker.w16_wpad:
4787*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
4788*c0909341SAndroid Build Coastguard Worker    lea               iptrq, [ipred_cfl_ac_420_avx2_table]
4789*c0909341SAndroid Build Coastguard Worker    shl               wpadd, 2
4790*c0909341SAndroid Build Coastguard Worker    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
4791*c0909341SAndroid Build Coastguard Worker                              ipred_cfl_ac_420_avx2_table+wpadq*8-32]
4792*c0909341SAndroid Build Coastguard Worker    movsxd            wpadq, [iptrq+wpadq+4]
4793*c0909341SAndroid Build Coastguard Worker    add               iptrq, wpadq
4794*c0909341SAndroid Build Coastguard Worker    jmp iptrq
4795*c0909341SAndroid Build Coastguard Worker.w16_pad3:
4796*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [yq]
4797*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [yq+strideq]
4798*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4799*c0909341SAndroid Build Coastguard Worker.w16_pad2:
4800*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [yq]
4801*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [yq+strideq]
4802*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4803*c0909341SAndroid Build Coastguard Worker.w16_pad1:
4804*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4805*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4806*c0909341SAndroid Build Coastguard Worker    ; fall-through
4807*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
4808*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4809*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4810*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4811*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
4812*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4813*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4814*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4815*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4816*c0909341SAndroid Build Coastguard Worker    dec                  hd
4817*c0909341SAndroid Build Coastguard Worker    jz .w16_wpad_done
4818*c0909341SAndroid Build Coastguard Worker    jmp iptrq
4819*c0909341SAndroid Build Coastguard Worker.w16_wpad_done:
4820*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4821*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4822*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop:
4823*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4824*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4825*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4826*c0909341SAndroid Build Coastguard Worker    dec               hpadd
4827*c0909341SAndroid Build Coastguard Worker    jg .w16_hpad_loop
4828*c0909341SAndroid Build Coastguard Worker    ; fall-through
4829*c0909341SAndroid Build Coastguard Worker
4830*c0909341SAndroid Build Coastguard Worker.calc_avg:
4831*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pw_1]
4832*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4, m2
4833*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1
4834*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, szd
4835*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4836*c0909341SAndroid Build Coastguard Worker    movd                xm2, r1d
4837*c0909341SAndroid Build Coastguard Worker    movd                xm3, szd
4838*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
4839*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4840*c0909341SAndroid Build Coastguard Worker    psrad               xm3, 1
4841*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
4842*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm3
4843*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
4844*c0909341SAndroid Build Coastguard Worker    psrad               xm0, xm2
4845*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4846*c0909341SAndroid Build Coastguard Worker.sub_loop:
4847*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ac_bakq]
4848*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
4849*c0909341SAndroid Build Coastguard Worker    mova          [ac_bakq], m1
4850*c0909341SAndroid Build Coastguard Worker    add             ac_bakq, 32
4851*c0909341SAndroid Build Coastguard Worker    sub                 szd, 16
4852*c0909341SAndroid Build Coastguard Worker    jg .sub_loop
4853*c0909341SAndroid Build Coastguard Worker    RET
4854*c0909341SAndroid Build Coastguard Worker
; -----------------------------------------------------------------------------
; ipred_cfl_ac_422_8bpc(ac, y, stride, wpad, hpad, w, h)
;
; Fills the 16-bit buffer at ac from the 8-bit rows at y, then subtracts the
; rounded average of all stored values from every stored value.
;
; Each stored word is the pmaddubsw result of one adjacent byte pair of a
; source row against m2. m2 is a broadcast of pb_4, which is defined outside
; this block; presumably every byte is 4, giving word = 4*(a+b) -- TODO confirm.
; One source row produces one output row of w words.
;
; Names as used below (sz and ac_bak are scratch names, not arguments):
;   ac      write pointer into the output buffer
;   ac_bak  copy of the initial ac, consumed by .sub_loop
;   y       source row pointer, advanced by multiples of stride
;   wpad    non-zero selects the right-edge padding paths (.w8_wpad/.w16_wpad)
;   hpad    multiplied by 4 on entry to give a row count; these rows are not
;           read from y but filled by repeating the last computed row
;   w, h    output width in words / height in rows
;   sz      w * h, the number of 16-bit values in the buffer
;   m4, m5  16-bit running sums of the stored values
;
; NOTE(review): "4, 9, 6" are the x86inc cglobal counts (arguments preloaded
; into registers, GPRs used, vector registers used) -- confirm against x86inc.
; -----------------------------------------------------------------------------
4855*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
4856*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4857*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
4858*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4859*c0909341SAndroid Build Coastguard Worker    mov                 szd, wd
4860*c0909341SAndroid Build Coastguard Worker    mov             ac_bakq, acq  ; remember the buffer start for .sub_loop
4861*c0909341SAndroid Build Coastguard Worker    imul                szd, hd  ; sz = w * h (h still includes the hpad rows)
4862*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2  ; hpad *= 4 -> number of padded rows
4863*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd  ; h = rows that are actually read from y
4864*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pb_4]  ; pmaddubsw multiplier used by every path
4865*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4  ; clear both running sums
4866*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
4867*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8  ; dispatch on w: >8 -> .w16, ==8 -> .w8, else .w4
4868*c0909341SAndroid Build Coastguard Worker    jg .w16
4869*c0909341SAndroid Build Coastguard Worker    je .w8
4870*c0909341SAndroid Build Coastguard Worker    ; fall-through
4871*c0909341SAndroid Build Coastguard Worker
; w is not read past this point, so its register slot is renamed to stride3.
4872*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
; --- w == 4: four rows per iteration, 8 source bytes -> 4 words per row ------
4873*c0909341SAndroid Build Coastguard Worker.w4:
4874*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4875*c0909341SAndroid Build Coastguard Worker.w4_loop:
4876*c0909341SAndroid Build Coastguard Worker    movq                xm1, [yq]  ; xm1 = row 0 (low half) | row 1 (high half)
4877*c0909341SAndroid Build Coastguard Worker    movhps              xm1, [yq+strideq]
4878*c0909341SAndroid Build Coastguard Worker    movq                xm0, [yq+strideq*2]  ; xm0 = row 2 | row 3
4879*c0909341SAndroid Build Coastguard Worker    movhps              xm0, [yq+stride3q]
4880*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
4881*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm2
4882*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm1
4883*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], xm0
4884*c0909341SAndroid Build Coastguard Worker    paddw               xm4, xm0
4885*c0909341SAndroid Build Coastguard Worker    paddw               xm5, xm1
4886*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4887*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4888*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4889*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4890*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4891*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4892*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q1111  ; replicate qword 1 (row 3, the last row) into all four qwords
4893*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
4894*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0  ; 32 bytes = four padded rows of 4 words
4895*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4896*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4897*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
4898*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
4899*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4900*c0909341SAndroid Build Coastguard Worker
; --- w == 8: four rows per iteration, 16 source bytes -> 8 words per row -----
4901*c0909341SAndroid Build Coastguard Worker.w8:
4902*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4903*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4904*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad
4905*c0909341SAndroid Build Coastguard Worker.w8_loop:
4906*c0909341SAndroid Build Coastguard Worker    mova                xm1, [yq]  ; m1 = row 0 (low lane) | row 1 (high lane)
4907*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [yq+strideq], 1
4908*c0909341SAndroid Build Coastguard Worker    mova                xm0, [yq+strideq*2]  ; m0 = row 2 | row 3
4909*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [yq+stride3q], 1
4910*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4911*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4912*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4913*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
4914*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4915*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4916*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4917*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4918*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4919*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4920*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4921*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4922*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
; Right-edge padding for w == 8. m3 is a pshufb control defined outside this
; block; presumably it copies the last valid word of each row into the padded
; columns -- TODO confirm against cfl_ac_w8_pad1_shuffle.
4923*c0909341SAndroid Build Coastguard Worker.w8_wpad:
4924*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle]
4925*c0909341SAndroid Build Coastguard Worker.w8_wpad_loop:
4926*c0909341SAndroid Build Coastguard Worker    movq                xm1, [yq]  ; rows 0 and 2 are loaded as 8 bytes only
4927*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [yq+strideq], 1
4928*c0909341SAndroid Build Coastguard Worker    movq                xm0, [yq+strideq*2]
4929*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [yq+stride3q], 1
4930*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4931*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4932*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3  ; rearrange the words of each lane per m3
4933*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
4934*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4935*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
4936*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4937*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4938*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4939*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4940*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4941*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad_loop
4942*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4943*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
; Bottom padding for w == 8, shared by the padded and unpadded width paths.
4944*c0909341SAndroid Build Coastguard Worker.w8_hpad:
4945*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3232  ; copy the high lane (row 3, the last row) into both lanes
4946*c0909341SAndroid Build Coastguard Worker.w8_hpad_loop:
4947*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0  ; 32 bytes = two padded rows of 8 words
4948*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4949*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4950*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4951*c0909341SAndroid Build Coastguard Worker    jg .w8_hpad_loop
4952*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4953*c0909341SAndroid Build Coastguard Worker
; --- w > 8: two rows per iteration, 32 source bytes -> 16 words per row ------
4954*c0909341SAndroid Build Coastguard Worker.w16:
4955*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4956*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4957*c0909341SAndroid Build Coastguard Worker.w16_loop:
4958*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq]  ; m1 = first row, m0 = second row
4959*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq]
4960*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4961*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4962*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4963*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
4964*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4965*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4966*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4967*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4968*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4969*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4970*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4971*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
4972*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad_loop
; Right-edge padding for w > 8. wpad picks both a 32-byte pshufb control
; (one entry per wpad value, the first entry belonging to wpad == 1) and a
; load variant reached through a table of 32-bit offsets relative to the table
; base. The table itself is defined outside this block; presumably its entries
; are .w16_pad1/.w16_pad2/.w16_pad3 -- TODO confirm. The stride3 register slot
; is renamed to iptr and ends up holding the absolute address of that variant.
4973*c0909341SAndroid Build Coastguard Worker.w16_wpad:
4974*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
4975*c0909341SAndroid Build Coastguard Worker    lea               iptrq, [ipred_cfl_ac_422_avx2_table]
4976*c0909341SAndroid Build Coastguard Worker    shl               wpadd, 2  ; wpad *= 4: byte offset of a 32-bit table entry
4977*c0909341SAndroid Build Coastguard Worker    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
4978*c0909341SAndroid Build Coastguard Worker                              ipred_cfl_ac_422_avx2_table+wpadq*8-32]
4979*c0909341SAndroid Build Coastguard Worker    movsxd            wpadq, [iptrq+wpadq+4]  ; signed offset of the load variant
4980*c0909341SAndroid Build Coastguard Worker    add               iptrq, wpadq  ; iptr = table base + offset
4981*c0909341SAndroid Build Coastguard Worker    jmp iptrq
4982*c0909341SAndroid Build Coastguard Worker.w16_pad3:
4983*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [yq]  ; read only 8 bytes per row
4984*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [yq+strideq]
4985*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4986*c0909341SAndroid Build Coastguard Worker.w16_pad2:
4987*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [yq]  ; read only 16 bytes per row
4988*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [yq+strideq]
4989*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_end
4990*c0909341SAndroid Build Coastguard Worker.w16_pad1:
4991*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq]  ; read the full 32 bytes per row
4992*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq]
4993*c0909341SAndroid Build Coastguard Worker    ; fall-through
4994*c0909341SAndroid Build Coastguard Worker.w16_wpad_end:
4995*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4996*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4997*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3  ; rearrange the words of each lane per m3
4998*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
4999*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5000*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5001*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
5002*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
5003*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
5004*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5005*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5006*c0909341SAndroid Build Coastguard Worker    jz .w16_wpad_done
5007*c0909341SAndroid Build Coastguard Worker    jmp iptrq  ; next row pair: re-enter the same load variant
5008*c0909341SAndroid Build Coastguard Worker.w16_wpad_done:
5009*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5010*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
; Bottom padding for w > 8: m0 still holds the second (last) row of the final
; row pair; it is stored twice per iteration and added to both sums.
5011*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop:
5012*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
5013*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5014*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
5015*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
5016*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5017*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
5018*c0909341SAndroid Build Coastguard Worker    jg .w16_hpad_loop
5019*c0909341SAndroid Build Coastguard Worker    ; fall-through
5020*c0909341SAndroid Build Coastguard Worker
; --- average: avg = (sum + (sz >> 1)) >> tzcnt(sz) ---------------------------
; tzcnt(sz) equals log2(sz) only when sz is a power of two; that is assumed
; here -- TODO confirm with the callers. pw_1 is defined outside this block;
; presumably all words are 1, so pmaddwd adds adjacent word pairs into dwords.
5021*c0909341SAndroid Build Coastguard Worker.calc_avg:
5022*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pw_1]
5023*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m5, m2  ; widen both 16-bit sums to 32-bit partial sums
5024*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4, m2
5025*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
5026*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m0, 1  ; horizontal reduction: add high lane to low lane
5027*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, szd  ; shift count; r1 presumably aliases y (slot 2), which is no longer read
5028*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
5029*c0909341SAndroid Build Coastguard Worker    movd                xm2, r1d
5030*c0909341SAndroid Build Coastguard Worker    movd                xm3, szd
5031*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0  ; add the high qword to the low qword
5032*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
5033*c0909341SAndroid Build Coastguard Worker    psrad               xm3, 1  ; rounding term: sz >> 1
5034*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32  ; add dword 1 to dword 0
5035*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm3
5036*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
5037*c0909341SAndroid Build Coastguard Worker    psrad               xm0, xm2  ; dword 0 = (sum + (sz >> 1)) >> tzcnt(sz)
5038*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0  ; replicate the low word of the average to all 16 words
; Second pass over the buffer from its start: subtract the average from 16
; words (32 bytes) per iteration until sz values have been processed.
5039*c0909341SAndroid Build Coastguard Worker.sub_loop:
5040*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ac_bakq]
5041*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
5042*c0909341SAndroid Build Coastguard Worker    mova          [ac_bakq], m1
5043*c0909341SAndroid Build Coastguard Worker    add             ac_bakq, 32
5044*c0909341SAndroid Build Coastguard Worker    sub                 szd, 16
5045*c0909341SAndroid Build Coastguard Worker    jg .sub_loop
5046*c0909341SAndroid Build Coastguard Worker    RET
5047*c0909341SAndroid Build Coastguard Worker
5048*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
5049*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
5050*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
5051*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
5052*c0909341SAndroid Build Coastguard Worker    mov                 szd, wd
5053*c0909341SAndroid Build Coastguard Worker    imul                szd, hd
5054*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2
5055*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd
5056*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
5057*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_1]
5058*c0909341SAndroid Build Coastguard Worker    tzcnt               r8d, wd
5059*c0909341SAndroid Build Coastguard Worker    lea                  r5, [ipred_cfl_ac_444_avx2_table]
5060*c0909341SAndroid Build Coastguard Worker    movsxd               r8, [r5+r8*4+12]
5061*c0909341SAndroid Build Coastguard Worker    add                  r5, r8
5062*c0909341SAndroid Build Coastguard Worker
5063*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
5064*c0909341SAndroid Build Coastguard Worker    mov             ac_bakq, acq
5065*c0909341SAndroid Build Coastguard Worker    jmp                  r5
5066*c0909341SAndroid Build Coastguard Worker
5067*c0909341SAndroid Build Coastguard Worker.w4:
5068*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5069*c0909341SAndroid Build Coastguard Worker    pxor                xm2, xm2
5070*c0909341SAndroid Build Coastguard Worker.w4_loop:
5071*c0909341SAndroid Build Coastguard Worker    movd                xm1, [yq]
5072*c0909341SAndroid Build Coastguard Worker    movd                xm0, [yq+strideq*2]
5073*c0909341SAndroid Build Coastguard Worker    pinsrd              xm1, [yq+strideq], 1
5074*c0909341SAndroid Build Coastguard Worker    pinsrd              xm0, [yq+stride3q], 1
5075*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm1, xm2
5076*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm2
5077*c0909341SAndroid Build Coastguard Worker    psllw               xm1, 3
5078*c0909341SAndroid Build Coastguard Worker    psllw               xm0, 3
5079*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm1
5080*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], xm0
5081*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0
5082*c0909341SAndroid Build Coastguard Worker    paddw               xm4, xm1
5083*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
5084*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
5085*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5086*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
5087*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5088*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_mul
5089*c0909341SAndroid Build Coastguard Worker    pshufd              xm0, xm0, q3232
5090*c0909341SAndroid Build Coastguard Worker    paddw               xm1, xm0, xm0
5091*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
5092*c0909341SAndroid Build Coastguard Worker    mova              [acq], xm0
5093*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], xm0
5094*c0909341SAndroid Build Coastguard Worker    paddw               xm4, xm1
5095*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
5096*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
5097*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
5098*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_mul
5099*c0909341SAndroid Build Coastguard Worker
5100*c0909341SAndroid Build Coastguard Worker.w8:
5101*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
5102*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5103*c0909341SAndroid Build Coastguard Worker.w8_loop:
5104*c0909341SAndroid Build Coastguard Worker    movq                xm1, [yq]
5105*c0909341SAndroid Build Coastguard Worker    movq                xm0, [yq+strideq*2]
5106*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [yq+strideq], 1
5107*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [yq+stride3q], 1
5108*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2
5109*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2
5110*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
5111*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
5112*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5113*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5114*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
5115*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
5116*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
5117*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5118*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5119*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5120*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5121*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_mul
5122*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3232
5123*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, m0
5124*c0909341SAndroid Build Coastguard Worker.w8_hpad_loop:
5125*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
5126*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5127*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
5128*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5129*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 4
5130*c0909341SAndroid Build Coastguard Worker    jg .w8_hpad_loop
5131*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_mul
5132*c0909341SAndroid Build Coastguard Worker
5133*c0909341SAndroid Build Coastguard Worker.w16:
5134*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
5135*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
5136*c0909341SAndroid Build Coastguard Worker.w16_loop:
5137*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [yq]
5138*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [yq+strideq]
5139*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
5140*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
5141*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5142*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5143*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
5144*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
5145*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
5146*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
5147*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5148*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5149*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5150*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5151*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
5152*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad
5153*c0909341SAndroid Build Coastguard Worker.w16_wpad:
5154*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cfl_ac_444_w16_pad1_shuffle]
5155*c0909341SAndroid Build Coastguard Worker.w16_wpad_loop:
5156*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [yq]
5157*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [yq+strideq]
5158*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
5159*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
5160*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
5161*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
5162*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5163*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5164*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
5165*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
5166*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
5167*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
5168*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5169*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5170*c0909341SAndroid Build Coastguard Worker    jg .w16_wpad_loop
5171*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5172*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
5173*c0909341SAndroid Build Coastguard Worker.w16_hpad:
5174*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, m0
5175*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
5176*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop:
5177*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
5178*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5179*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1
5180*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5181*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
5182*c0909341SAndroid Build Coastguard Worker    jg .w16_hpad_loop
5183*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
5184*c0909341SAndroid Build Coastguard Worker
5185*c0909341SAndroid Build Coastguard Worker.w32:
5186*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
5187*c0909341SAndroid Build Coastguard Worker    jnz .w32_wpad
5188*c0909341SAndroid Build Coastguard Worker.w32_loop:
5189*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [yq]
5190*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [yq+16]
5191*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
5192*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
5193*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5194*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5195*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1, m0
5196*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
5197*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
5198*c0909341SAndroid Build Coastguard Worker    add                  yq, strideq
5199*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5200*c0909341SAndroid Build Coastguard Worker    dec                  hd
5201*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5202*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5203*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
5204*c0909341SAndroid Build Coastguard Worker    jmp .w32_hpad_loop
5205*c0909341SAndroid Build Coastguard Worker.w32_wpad:
5206*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
5207*c0909341SAndroid Build Coastguard Worker    lea               iptrq, [ipred_cfl_ac_444_avx2_table]
5208*c0909341SAndroid Build Coastguard Worker    add               wpadd, wpadd
5209*c0909341SAndroid Build Coastguard Worker    mova                 m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
5210*c0909341SAndroid Build Coastguard Worker    movsxd            wpadq, [iptrq+wpadq+4]
5211*c0909341SAndroid Build Coastguard Worker    add               iptrq, wpadq
5212*c0909341SAndroid Build Coastguard Worker    jmp iptrq
5213*c0909341SAndroid Build Coastguard Worker.w32_pad3:
5214*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m1, [yq]
5215*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
5216*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m1, q3232
5217*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
5218*c0909341SAndroid Build Coastguard Worker.w32_pad2:
5219*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [yq]
5220*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m1, q3333
5221*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3333
5222*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_end
5223*c0909341SAndroid Build Coastguard Worker.w32_pad1:
5224*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m1, [yq]
5225*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m0, [yq+16]
5226*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
5227*c0909341SAndroid Build Coastguard Worker    ; fall-through
5228*c0909341SAndroid Build Coastguard Worker.w32_wpad_end:
5229*c0909341SAndroid Build Coastguard Worker    psllw                m1, 3
5230*c0909341SAndroid Build Coastguard Worker    psllw                m0, 3
5231*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5232*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5233*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1, m0
5234*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
5235*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
5236*c0909341SAndroid Build Coastguard Worker    add                  yq, strideq
5237*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5238*c0909341SAndroid Build Coastguard Worker    dec                  hd
5239*c0909341SAndroid Build Coastguard Worker    jz .w32_wpad_done
5240*c0909341SAndroid Build Coastguard Worker    jmp iptrq
5241*c0909341SAndroid Build Coastguard Worker.w32_wpad_done:
5242*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
5243*c0909341SAndroid Build Coastguard Worker    jz .calc_avg
5244*c0909341SAndroid Build Coastguard Worker.w32_hpad_loop:
5245*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5246*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m0
5247*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
5248*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
5249*c0909341SAndroid Build Coastguard Worker    dec               hpadd
5250*c0909341SAndroid Build Coastguard Worker    jg .w32_hpad_loop
5251*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
5252*c0909341SAndroid Build Coastguard Worker
5253*c0909341SAndroid Build Coastguard Worker.calc_avg_mul:
5254*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m5
5255*c0909341SAndroid Build Coastguard Worker.calc_avg:
5256*c0909341SAndroid Build Coastguard Worker    vextracti128        xm1, m4, 1
5257*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, szd
5258*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm4, xm1
5259*c0909341SAndroid Build Coastguard Worker    movd                xm2, r1d
5260*c0909341SAndroid Build Coastguard Worker    movd                xm3, szd
5261*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm1, xm0, xm0
5262*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
5263*c0909341SAndroid Build Coastguard Worker    psrad               xm3, 1
5264*c0909341SAndroid Build Coastguard Worker    psrlq               xm1, xm0, 32
5265*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm3
5266*c0909341SAndroid Build Coastguard Worker    paddd               xm0, xm1
5267*c0909341SAndroid Build Coastguard Worker    psrad               xm0, xm2
5268*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
5269*c0909341SAndroid Build Coastguard Worker.sub_loop:
5270*c0909341SAndroid Build Coastguard Worker    mova                 m1, [ac_bakq]
5271*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
5272*c0909341SAndroid Build Coastguard Worker    mova          [ac_bakq], m1
5273*c0909341SAndroid Build Coastguard Worker    add             ac_bakq, 32
5274*c0909341SAndroid Build Coastguard Worker    sub                 szd, 16
5275*c0909341SAndroid Build Coastguard Worker    jg .sub_loop
5276*c0909341SAndroid Build Coastguard Worker    RET
5277*c0909341SAndroid Build Coastguard Worker
; pal_pred_8bpc(dst, stride, pal, idx, w, h)
;
; Palette prediction: expands packed indices from idx into bytes at dst by
; looking each index up in an 8-byte table read from pal.
;   - Every idx byte holds two 4-bit indices; the low nibble is the earlier
;     output byte, the high nibble the one after it.
;   - The lookup is done with pshufb, which selects by the low 4 bits of a
;     selector byte and writes 0 when bit 7 of the selector is set.
;     NOTE(review): this presumably relies on every nibble being < 8 so that
;     bit 7 is never set -- confirm against the producer of idx.
;   - idx is read linearly; dst is written row by row using stride.
;   - Dispatch is by tzcnt(w) through pal_pred_avx2_table, which is defined
;     outside this block.
;     NOTE(review): the table is presumably biased so that w = 4/8/16/32/64
;     land on .w4/.w8/.w16/.w32/.w64 -- confirm at the table definition.
;   - mova is used for the 16- and 32-byte stores, so dst rows are assumed to
;     be suitably aligned for w >= 16 -- TODO confirm with callers.
5278*c0909341SAndroid Build Coastguard Workercglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
5279*c0909341SAndroid Build Coastguard Worker    vpbroadcastq         m4, [palq] ; m4 = the 8 palette bytes repeated in every qword
5280*c0909341SAndroid Build Coastguard Worker    lea                  r2, [pal_pred_avx2_table] ; r2 reused as table base (palette is already in m4)
5281*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; w = index of lowest set bit of the width argument
5282*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm ; bring h into a register if it is not there already
5283*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r2+wq*4] ; signed 32-bit table entry
5284*c0909341SAndroid Build Coastguard Worker    add                  wq, r2 ; entry + table base = branch target
5285*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3] ; r2 reused again: offset of the 4th row
5286*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- 4 bytes per row, 4 rows per iteration (8 idx bytes -> 16 output bytes)
5287*c0909341SAndroid Build Coastguard Worker.w4:
5288*c0909341SAndroid Build Coastguard Worker    movq                xm0, [idxq]
5289*c0909341SAndroid Build Coastguard Worker    add                idxq, 8
5290*c0909341SAndroid Build Coastguard Worker    psrlw               xm1, xm0, 4 ; low 4 bits of each byte now hold that byte's high nibble
5291*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1 ; interleave selectors: low nibble, high nibble, ...
5292*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm4, xm0 ; palette lookup for all 16 selectors
5293*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm0
5294*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm0, 1
5295*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm0, 2
5296*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r2       ], xm0, 3
5297*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5298*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5299*c0909341SAndroid Build Coastguard Worker    jg .w4 ; loop while rows remain
5300*c0909341SAndroid Build Coastguard Worker    RET
; ---- 8 bytes per row, 4 rows per iteration (16 idx bytes -> 32 output bytes)
5301*c0909341SAndroid Build Coastguard Worker.w8:
5302*c0909341SAndroid Build Coastguard Worker    movu                xm2, [idxq]
5303*c0909341SAndroid Build Coastguard Worker    add                idxq, 16
5304*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm4, xm2 ; lookup by low nibbles (even output bytes)
5305*c0909341SAndroid Build Coastguard Worker    psrlw               xm2, 4 ; move high nibbles into the selector bits
5306*c0909341SAndroid Build Coastguard Worker    pshufb              xm2, xm4, xm2 ; lookup by high nibbles (odd output bytes)
5307*c0909341SAndroid Build Coastguard Worker    punpcklbw           xm0, xm1, xm2 ; output bytes 0..15  = rows 0 and 1
5308*c0909341SAndroid Build Coastguard Worker    punpckhbw           xm1, xm2 ; output bytes 16..31 = rows 2 and 3
5309*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], xm0
5310*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], xm0
5311*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], xm1
5312*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], xm1
5313*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5314*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5315*c0909341SAndroid Build Coastguard Worker    jg .w8
5316*c0909341SAndroid Build Coastguard Worker    RET
; ---- 16 bytes per row, 4 rows per iteration (32 idx bytes -> 64 output bytes)
5317*c0909341SAndroid Build Coastguard Worker.w16:
5318*c0909341SAndroid Build Coastguard Worker    movu                 m2, [idxq]
5319*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
5320*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2 ; lookup by low nibbles
5321*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 4
5322*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4, m2 ; lookup by high nibbles
5323*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; per 128-bit lane: low lane = row 0, high lane = row 2
5324*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2 ; per 128-bit lane: low lane = row 1, high lane = row 3
5325*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm0
5326*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*1], xm1
5327*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*2], m0, 1
5328*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+r2       ], m1, 1
5329*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5330*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5331*c0909341SAndroid Build Coastguard Worker    jg .w16
5332*c0909341SAndroid Build Coastguard Worker    RET
; ---- 32 bytes per row, 2 rows per iteration (32 idx bytes -> 64 output bytes)
5333*c0909341SAndroid Build Coastguard Worker.w32:
5334*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [idxq], q3120 ; qword order 0,2,1,3 so the in-lane unpacks below give contiguous rows
5335*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
5336*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
5337*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 4
5338*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4, m2
5339*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; expands idx qwords 0 and 1 = row 0
5340*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2 ; expands idx qwords 2 and 3 = row 1
5341*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
5342*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
5343*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5344*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5345*c0909341SAndroid Build Coastguard Worker    jg .w32
5346*c0909341SAndroid Build Coastguard Worker    RET
; ---- 64 bytes per row, 1 row per iteration (32 idx bytes -> 64 output bytes)
5347*c0909341SAndroid Build Coastguard Worker.w64:
5348*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [idxq], q3120 ; same qword reordering as .w32
5349*c0909341SAndroid Build Coastguard Worker    add                idxq, 32
5350*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
5351*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 4
5352*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4, m2
5353*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; output bytes 0..31 of the row
5354*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2 ; output bytes 32..63 of the row
5355*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m0
5356*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m1
5357*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5358*c0909341SAndroid Build Coastguard Worker    dec                  hd
5359*c0909341SAndroid Build Coastguard Worker    jg .w64
5360*c0909341SAndroid Build Coastguard Worker    RET
5361*c0909341SAndroid Build Coastguard Worker
5362*c0909341SAndroid Build Coastguard Worker%endif
5363