xref: /aosp_15_r20/external/libdav1d/src/x86/ipred_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2018-2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2018, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 16
30*c0909341SAndroid Build Coastguard Worker
; SMOOTH_WEIGHT_TABLE w0, w1, ...
; Emits two bytes per argument w: (w - 128) followed by (127 - w).
; %rep %0 runs once per argument and %rotate 1 shifts the argument list so
; that %1 names the next weight on every pass.
31*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_WEIGHT_TABLE 1-*
32*c0909341SAndroid Build Coastguard Worker    %rep %0
33*c0909341SAndroid Build Coastguard Worker        db %1-128, 127-%1 ; for w in 0..255 both values stay within -128..127
34*c0909341SAndroid Build Coastguard Worker        %rotate 1
35*c0909341SAndroid Build Coastguard Worker    %endrep
36*c0909341SAndroid Build Coastguard Worker%endmacro
37*c0909341SAndroid Build Coastguard Worker
38*c0909341SAndroid Build Coastguard Worker; sm_weights[], but modified to precalculate x and 256-x with offsets to
39*c0909341SAndroid Build Coastguard Worker; enable efficient use of pmaddubsw (which requires signed values)
; Each weight w listed below is stored as the byte pair (w-128, 127-w) by
; SMOOTH_WEIGHT_TABLE. 128 weights are listed, so the table is 256 bytes.
; The first two arguments are 0; after them come runs of 2, 4, 8, 16, 32 and
; 64 values, each restarting at 255, so the run of length n starts at
; argument index n (byte offset 2*n).
; NOTE(review): presumably one weight set per block dimension n - confirm
; against the smooth predictors that index this table (outside this chunk).
40*c0909341SAndroid Build Coastguard Workersmooth_weights: SMOOTH_WEIGHT_TABLE         \
41*c0909341SAndroid Build Coastguard Worker      0,   0, 255, 128, 255, 149,  85,  64, \
42*c0909341SAndroid Build Coastguard Worker    255, 197, 146, 105,  73,  50,  37,  32, \
43*c0909341SAndroid Build Coastguard Worker    255, 225, 196, 170, 145, 123, 102,  84, \
44*c0909341SAndroid Build Coastguard Worker     68,  54,  43,  33,  26,  20,  17,  16, \
45*c0909341SAndroid Build Coastguard Worker    255, 240, 225, 210, 196, 182, 169, 157, \
46*c0909341SAndroid Build Coastguard Worker    145, 133, 122, 111, 101,  92,  83,  74, \
47*c0909341SAndroid Build Coastguard Worker     66,  59,  52,  45,  39,  34,  29,  25, \
48*c0909341SAndroid Build Coastguard Worker     21,  17,  14,  12,  10,   9,   8,   8, \
49*c0909341SAndroid Build Coastguard Worker    255, 248, 240, 233, 225, 218, 210, 203, \
50*c0909341SAndroid Build Coastguard Worker    196, 189, 182, 176, 169, 163, 156, 150, \
51*c0909341SAndroid Build Coastguard Worker    144, 138, 133, 127, 121, 116, 111, 106, \
52*c0909341SAndroid Build Coastguard Worker    101,  96,  91,  86,  82,  77,  73,  69, \
53*c0909341SAndroid Build Coastguard Worker     65,  61,  57,  54,  50,  47,  44,  41, \
54*c0909341SAndroid Build Coastguard Worker     38,  35,  32,  29,  27,  25,  22,  20, \
55*c0909341SAndroid Build Coastguard Worker     18,  16,  15,  13,  12,  10,   9,   8, \
56*c0909341SAndroid Build Coastguard Worker      7,   6,   6,   5,   5,   4,   4,   4
57*c0909341SAndroid Build Coastguard Worker
; Read-only lookup tables and replicated constants.
; Naming: pb_* are runs of one byte value, pw_* runs of one 16-bit word value,
; pd_* 32-bit dwords. The *_shuf names suggest byte-shuffle masks; most users
; of these tables are outside this chunk, so their exact roles are not
; restated here.
; Not every entry is a full 16 bytes: several are only 4 or 8 bytes long, so
; a 16-byte load from one of those also reads whatever data follows it.
58*c0909341SAndroid Build Coastguard Workeripred_v_shuf:     db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
59*c0909341SAndroid Build Coastguard Workeripred_h_shuf:     db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
60*c0909341SAndroid Build Coastguard Workeripred_paeth_shuf: db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
61*c0909341SAndroid Build Coastguard Workerz_upsample1:      db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
62*c0909341SAndroid Build Coastguard Workerz_upsample2:      db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
63*c0909341SAndroid Build Coastguard Workerz_transpose4:     db  8, 12,  0,  4,  9, 13,  1,  5, 10, 14,  2,  6, 11, 15,  3,  7
64*c0909341SAndroid Build Coastguard Workerz3_shuf:          db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
65*c0909341SAndroid Build Coastguard Workerz3_shuf_h4:       db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
66*c0909341SAndroid Build Coastguard Workerfilter_shuf1:     db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
67*c0909341SAndroid Build Coastguard Workerfilter_shuf2:     db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
68*c0909341SAndroid Build Coastguard Workerz_filter_wh4:     db  7,  7, 19,  7,
69*c0909341SAndroid Build Coastguard Workerz_filter_wh8:     db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
70*c0909341SAndroid Build Coastguard Workerpd_32768:         dd 32768
71*c0909341SAndroid Build Coastguard Workerz3_filter_k_tail: db 64,  0, 64,  0, 64,  0, 56,  8
72*c0909341SAndroid Build Coastguard Workerz1_shuf_w4:       db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
73*c0909341SAndroid Build Coastguard Workerpb_0to15:         db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
74*c0909341SAndroid Build Coastguard Workerpb_15to0:         db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
75*c0909341SAndroid Build Coastguard Workerz_base_inc:       dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
76*c0909341SAndroid Build Coastguard Workerz3_base_inc:      dw   7*64,   6*64,   5*64,   4*64,   3*64,   2*64,   1*64,   0*64
77*c0909341SAndroid Build Coastguard Workerz_filter_wh16:    db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
78*c0909341SAndroid Build Coastguard Workerz_filter_t_w48:   db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
79*c0909341SAndroid Build Coastguard Worker                  db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
80*c0909341SAndroid Build Coastguard Workerz_filter_t_w16:   db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
81*c0909341SAndroid Build Coastguard Workerz_filter_s:       db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
82*c0909341SAndroid Build Coastguard Worker                  db  7,  8,  8,  9,  9, 10, 10, 11
83*c0909341SAndroid Build Coastguard Workerz_filter_k_tail:  db  0, 64,  0, 64,  8, 56,  0, 64
84*c0909341SAndroid Build Coastguard Workerz2_h_shuf:        db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
85*c0909341SAndroid Build Coastguard Workerz2_upsample:      db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
86*c0909341SAndroid Build Coastguard Workerz2_dy_offset:     dw 88*64, 88*64, 87*64, 87*64
87*c0909341SAndroid Build Coastguard Workerpw_m1to4:         dw -1, -2, -3, -4
; z_filter_k: eight rows of 8 bytes, each row one byte pair repeated 4 times.
88*c0909341SAndroid Build Coastguard Workerz_filter_k:       times  4 db  0, 16
89*c0909341SAndroid Build Coastguard Worker                  times  4 db  0, 20
90*c0909341SAndroid Build Coastguard Worker                  times  4 db  8, 16
91*c0909341SAndroid Build Coastguard Worker                  times  4 db 32, 16
92*c0909341SAndroid Build Coastguard Worker                  times  4 db 24, 20
93*c0909341SAndroid Build Coastguard Worker                  times  4 db 16, 16
94*c0909341SAndroid Build Coastguard Worker                  times  4 db  0,  0
95*c0909341SAndroid Build Coastguard Worker                  times  4 db  0,  0
; Full 16-byte replicated constants.
96*c0909341SAndroid Build Coastguard Workerpw_8:             times  8 db  8,  0 ; byte pair 8,0 = little-endian word 8
97*c0909341SAndroid Build Coastguard Workerpb_3:             times 16 db 3
98*c0909341SAndroid Build Coastguard Workerpb_16:            times 16 db 16
99*c0909341SAndroid Build Coastguard Workerpw_62:            times  8 dw 62
100*c0909341SAndroid Build Coastguard Workerpw_64:            times  8 dw 64
101*c0909341SAndroid Build Coastguard Workerpw_256:           times  8 dw 256
102*c0909341SAndroid Build Coastguard Workerpw_512:           times  8 dw 512
103*c0909341SAndroid Build Coastguard Workerpw_m256:          times  8 dw -256
; Every constant from here to the end of the list is only 8 bytes long.
104*c0909341SAndroid Build Coastguard Workerpb_2:             times  8 db 2
105*c0909341SAndroid Build Coastguard Workerpb_4:             times  8 db 4
106*c0909341SAndroid Build Coastguard Workerpb_8:             times  8 db 8
107*c0909341SAndroid Build Coastguard Workerpb_128:           times  8 db 128
108*c0909341SAndroid Build Coastguard Workerpb_m16:           times  8 db -16
109*c0909341SAndroid Build Coastguard Workerpw_128:           times  4 dw 128
110*c0909341SAndroid Build Coastguard Workerpw_255:           times  4 dw 255
111*c0909341SAndroid Build Coastguard Workerpb_36_m4:         times  4 db 36, -4
112*c0909341SAndroid Build Coastguard Workerpb_127_m127:      times  4 db 127, -127
113*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, suffix, label0, label1, ...
;   %1     function base name (e.g. ipred_h)
;   %2     instruction-set suffix (e.g. ssse3)
;   %3...  local labels inside that function, one table entry each
; Emits one 32-bit entry per label: the offset of .label inside the function
; mangle(private_prefix_<name>_8bpc_<suffix>), measured from (%%table - 2*4).
; <name>_<suffix>_table is defined as that same biased address, i.e. two
; entries before the first one actually stored. ipred_h_8bpc indexes its table
; with tzcnt(width), so with w4 as the first label the first stored entry is
; reached with index 2.
; A dispatcher loads an entry with movsxd and adds the table address to it to
; obtain the absolute jump target.
114*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 3-*
115*c0909341SAndroid Build Coastguard Worker    %xdefine %1_%2_table (%%table - 2*4)
116*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
117*c0909341SAndroid Build Coastguard Worker    %%table:
118*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
119*c0909341SAndroid Build Coastguard Worker        dd %%base %+ .%3 - (%%table - 2*4)
120*c0909341SAndroid Build Coastguard Worker        %rotate 1
121*c0909341SAndroid Build Coastguard Worker    %endrep
122*c0909341SAndroid Build Coastguard Worker%endmacro
123*c0909341SAndroid Build Coastguard Worker
; The "splat" tables are not separate data: they alias the s* entries stored
; at the tail of the ipred_dc / ipred_cfl tables. ipred_dc lists 10 labels
; (h4-h64, w4-w64) before s4 and ipred_cfl lists 8 (h4-h32, w4-w32), hence the
; +10*4 and +8*4 byte offsets.
; The matching "-10*4" / "-8*4" written on the s* labels below cancels that
; offset inside the stored value, so those entries are relative to the splat
; alias rather than to the main table address.
124*c0909341SAndroid Build Coastguard Worker%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
125*c0909341SAndroid Build Coastguard Worker%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
126*c0909341SAndroid Build Coastguard Worker
; One table per predictor; the labels are the per-size entry points (.wN by
; width, .hN by height, .sN for the shared store loops).
127*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_h,          ssse3, w4, w8, w16, w32, w64
128*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
129*c0909341SAndroid Build Coastguard Worker                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
130*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_dc_left,    ssse3, h4, h8, h16, h32, h64
131*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
132*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
133*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
134*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
135*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z1,         ssse3, w4, w8, w16, w32, w64
136*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z2,         ssse3, w4, w8, w16, w32, w64
137*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_z3,         ssse3, h4, h8, h16, h32, h64
138*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
139*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
140*c0909341SAndroid Build Coastguard Worker                                s4-8*4, s8-8*4, s16-8*4, s32-8*4
141*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_cfl_left,   ssse3, h4, h8, h16, h32
142*c0909341SAndroid Build Coastguard WorkerJMP_TABLE ipred_filter,     ssse3, w4, w8, w16, w32
143*c0909341SAndroid Build Coastguard Worker
144*c0909341SAndroid Build Coastguard Workercextern dr_intra_derivative
145*c0909341SAndroid Build Coastguard Workercextern filter_intra_taps
146*c0909341SAndroid Build Coastguard Worker
147*c0909341SAndroid Build Coastguard WorkerSECTION .text
148*c0909341SAndroid Build Coastguard Worker
149*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
150*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_h_8bpc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
151*c0909341SAndroid Build Coastguard Worker;                                         const int width, const int height, const int a);
152*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; IPRED_SET width, row_offset, imm8
;   %1  block width in pixels; decides how many 16-byte stores are emitted
;   %2  operand added to dstq to address the row (0, strideq, strideq*2, stride3q)
;   %3  pshuflw immediate choosing which of the four low words of m0 is used
; Input: m0. Clobbers: m1.
; All stores use mova, so dstq + %2 has to be 16-byte aligned.
153*c0909341SAndroid Build Coastguard Worker%macro IPRED_SET   3                                          ; width, row offset, pshuflw imm8
154*c0909341SAndroid Build Coastguard Worker    pshuflw                      m1, m0, %3                   ; low 4 words of m1 = words of m0 selected by %3
155*c0909341SAndroid Build Coastguard Worker    punpcklqdq                   m1, m1                       ; copy the low qword into the high qword
156*c0909341SAndroid Build Coastguard Worker    mova           [dstq +      %2], m1                       ; bytes 0..15 of the row
157*c0909341SAndroid Build Coastguard Worker%if %1 > 16
158*c0909341SAndroid Build Coastguard Worker    mova           [dstq + 16 + %2], m1                       ; bytes 16..31 (width 32 and 64)
159*c0909341SAndroid Build Coastguard Worker%endif
160*c0909341SAndroid Build Coastguard Worker%if %1 > 32
161*c0909341SAndroid Build Coastguard Worker    mova           [dstq + 32 + %2], m1                       ; bytes 32..47 (width 64)
162*c0909341SAndroid Build Coastguard Worker    mova           [dstq + 48 + %2], m1                       ; bytes 48..63 (width 64)
163*c0909341SAndroid Build Coastguard Worker%endif
164*c0909341SAndroid Build Coastguard Worker%endmacro
165*c0909341SAndroid Build Coastguard Worker
; IPRED_H width
; Body of one width-specific loop of ipred_h_8bpc. It is expanded directly
; after the matching .w<width> label, which is also the loop target used by
; the "jg .w%1" at the end.
; Each pass steps tlq back by 4, reads the 4 bytes there and writes 4 rows:
; the byte at the highest address fills row 0, the byte at the lowest address
; fills row 3.
; NOTE(review): those bytes are presumably the left-edge pixels, stored in
; reverse row order below topleft - confirm against the caller's edge layout.
; Clobbers m0, m1 and flags; advances dstq by 4 rows and subtracts 4 from hd
; per pass; returns through RET once hd is no longer positive.
166*c0909341SAndroid Build Coastguard Worker%macro IPRED_H 1                                            ; width
167*c0909341SAndroid Build Coastguard Worker    sub                         tlq, 4
168*c0909341SAndroid Build Coastguard Worker    movd                         m0, [tlq]                  ; load the 4 bytes at the new tlq
169*c0909341SAndroid Build Coastguard Worker    punpcklbw                    m0, m0                     ; duplicate each byte: word n = byte n twice
170*c0909341SAndroid Build Coastguard Worker%if %1 == 4
171*c0909341SAndroid Build Coastguard Worker    pshuflw                      m1, m0, q2233              ; dword 0 = word 3 twice, dword 1 = word 2 twice
172*c0909341SAndroid Build Coastguard Worker    movd           [dstq+strideq*0], m1                     ; row 0 <- byte 3
173*c0909341SAndroid Build Coastguard Worker    psrlq                        m1, 32
174*c0909341SAndroid Build Coastguard Worker    movd           [dstq+strideq*1], m1                     ; row 1 <- byte 2
175*c0909341SAndroid Build Coastguard Worker    pshuflw                      m0, m0, q0011              ; dword 0 = word 1 twice, dword 1 = word 0 twice
176*c0909341SAndroid Build Coastguard Worker    movd           [dstq+strideq*2], m0                     ; row 2 <- byte 1
177*c0909341SAndroid Build Coastguard Worker    psrlq                        m0, 32
178*c0909341SAndroid Build Coastguard Worker    movd           [dstq+stride3q ], m0                     ; row 3 <- byte 0
179*c0909341SAndroid Build Coastguard Worker
180*c0909341SAndroid Build Coastguard Worker%elif %1 == 8
181*c0909341SAndroid Build Coastguard Worker    punpcklwd                    m0, m0                     ; dword n = byte n four times
182*c0909341SAndroid Build Coastguard Worker    punpckhdq                    m1, m0, m0                 ; m1: low qword = byte 2 x8, high qword = byte 3 x8
183*c0909341SAndroid Build Coastguard Worker    punpckldq                    m0, m0                     ; m0: low qword = byte 0 x8, high qword = byte 1 x8
184*c0909341SAndroid Build Coastguard Worker    movq           [dstq+strideq*1], m1                     ; row 1 <- byte 2
185*c0909341SAndroid Build Coastguard Worker    movhps         [dstq+strideq*0], m1                     ; row 0 <- byte 3
186*c0909341SAndroid Build Coastguard Worker    movq           [dstq+stride3q ], m0                     ; row 3 <- byte 0
187*c0909341SAndroid Build Coastguard Worker    movhps         [dstq+strideq*2], m0                     ; row 2 <- byte 1
188*c0909341SAndroid Build Coastguard Worker%else
    ; widths 16/32/64: broadcast one word per row and store it 1, 2 or 4 times
189*c0909341SAndroid Build Coastguard Worker    IPRED_SET                    %1,         0, q3333
190*c0909341SAndroid Build Coastguard Worker    IPRED_SET                    %1,   strideq, q2222
191*c0909341SAndroid Build Coastguard Worker    IPRED_SET                    %1, strideq*2, q1111
192*c0909341SAndroid Build Coastguard Worker    IPRED_SET                    %1,  stride3q, q0000
193*c0909341SAndroid Build Coastguard Worker%endif
194*c0909341SAndroid Build Coastguard Worker    lea                        dstq, [dstq+strideq*4]       ; advance 4 rows
195*c0909341SAndroid Build Coastguard Worker    sub                          hd, 4
196*c0909341SAndroid Build Coastguard Worker    jg .w%1                                                  ; loop while rows remain (signed compare)
197*c0909341SAndroid Build Coastguard Worker    RET
198*c0909341SAndroid Build Coastguard Worker%endmacro
199*c0909341SAndroid Build Coastguard Worker
; ipred_h_8bpc(dst, stride, tl, w, h): width dispatcher followed by the five
; width-specific loops generated by IPRED_H.
; cglobal arguments: 3 = arguments preloaded into registers (dst, stride, tl),
; 6 = GPRs used, 2 = XMM registers used. w and h are fetched through wm / hm.
; The names map to r0-r5 in declaration order, so stride3 is r5 - the same
; register that first holds the jump-table address. stride3q must therefore
; be written only after "add wq, r5".
200*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
201*c0909341SAndroid Build Coastguard Workercglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
202*c0909341SAndroid Build Coastguard Worker    LEA                          r5, ipred_h_ssse3_table   ; r5 = biased table address
203*c0909341SAndroid Build Coastguard Worker    tzcnt                        wd, wm                    ; table index = trailing zero count of width
204*c0909341SAndroid Build Coastguard Worker    movifnidn                    hd, hm                    ; load h unless it already lives in hd
205*c0909341SAndroid Build Coastguard Worker    movsxd                       wq, [r5+wq*4]             ; signed 32-bit offset of .w<width>
206*c0909341SAndroid Build Coastguard Worker    add                          wq, r5                    ; wq = absolute address of .w<width>
207*c0909341SAndroid Build Coastguard Worker    lea                    stride3q, [strideq*3]           ; overwrites r5; the table address is dead by now
208*c0909341SAndroid Build Coastguard Worker    jmp                          wq
209*c0909341SAndroid Build Coastguard Worker.w4:
210*c0909341SAndroid Build Coastguard Worker    IPRED_H                       4
211*c0909341SAndroid Build Coastguard Worker.w8:
212*c0909341SAndroid Build Coastguard Worker    IPRED_H                       8
213*c0909341SAndroid Build Coastguard Worker.w16:
214*c0909341SAndroid Build Coastguard Worker    IPRED_H                      16
215*c0909341SAndroid Build Coastguard Worker.w32:
216*c0909341SAndroid Build Coastguard Worker    IPRED_H                      32
217*c0909341SAndroid Build Coastguard Worker.w64:
218*c0909341SAndroid Build Coastguard Worker    IPRED_H                      64
219*c0909341SAndroid Build Coastguard Worker
220*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
221*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_v_8bpc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
222*c0909341SAndroid Build Coastguard Worker;                                         const int width, const int height, const int a);
223*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; ipred_v_8bpc(dst, stride, tl, w, h): entry stub only.
; Loads 64 bytes starting at tl+1 into m0-m3, then jumps through the splat
; alias of ipred_dc's jump table to that function's .s<width> label; the
; stores, the row loop and the RET all live there.
; The four loads are unconditional, whatever the width.
; NOTE(review): this assumes the buffer behind tl is readable up to tl+64 even
; for narrow blocks - confirm with the caller's edge-buffer allocation.
; The register counts (7 GPRs, 6 XMM) equal those declared by ipred_dc_8bpc,
; presumably because the shared tail returns through its epilogue.
; stride3 is r5 here as well, so it is only written after "add wq, r5".
224*c0909341SAndroid Build Coastguard Workercglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
225*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_splat_ssse3_table ; r5 = ipred_dc table + 10 entries (s4..s64)
226*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm                         ; table index = trailing zero count of width
227*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+ 1]                   ; bytes  1..16 after tl (unaligned load)
228*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+17]                   ; bytes 17..32
229*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+33]                   ; bytes 33..48
230*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+49]                   ; bytes 49..64
231*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm                         ; load h unless it already lives in hd
232*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]                  ; signed 32-bit offset of .s<width>
233*c0909341SAndroid Build Coastguard Worker    add                  wq, r5                         ; wq = absolute address of .s<width>
234*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]                ; overwrites r5; the table address is dead by now
235*c0909341SAndroid Build Coastguard Worker    jmp                  wq
236*c0909341SAndroid Build Coastguard Worker
237*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
238*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_dc_8bpc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
239*c0909341SAndroid Build Coastguard Worker;                                          const int width, const int height, const int a);
240*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
241*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
242*c0909341SAndroid Build Coastguard Worker    movifnidn                    hd, hm
243*c0909341SAndroid Build Coastguard Worker    movifnidn                    wd, wm
244*c0909341SAndroid Build Coastguard Worker    tzcnt                       r6d, hd
245*c0909341SAndroid Build Coastguard Worker    lea                         r5d, [wq+hq]
246*c0909341SAndroid Build Coastguard Worker    movd                         m4, r5d
247*c0909341SAndroid Build Coastguard Worker    tzcnt                       r5d, r5d
248*c0909341SAndroid Build Coastguard Worker    movd                         m5, r5d
249*c0909341SAndroid Build Coastguard Worker    LEA                          r5, ipred_dc_ssse3_table
250*c0909341SAndroid Build Coastguard Worker    tzcnt                        wd, wd
251*c0909341SAndroid Build Coastguard Worker    movsxd                       r6, [r5+r6*4]
252*c0909341SAndroid Build Coastguard Worker    movsxd                       wq, [r5+wq*4+20]
253*c0909341SAndroid Build Coastguard Worker    pcmpeqd                      m3, m3
254*c0909341SAndroid Build Coastguard Worker    psrlw                        m4, 1                             ; dc = (width + height) >> 1;
255*c0909341SAndroid Build Coastguard Worker    add                          r6, r5
256*c0909341SAndroid Build Coastguard Worker    add                          wq, r5
257*c0909341SAndroid Build Coastguard Worker    lea                    stride3q, [strideq*3]
258*c0909341SAndroid Build Coastguard Worker    jmp r6
259*c0909341SAndroid Build Coastguard Worker.h4:
260*c0909341SAndroid Build Coastguard Worker    movd                         m0, [tlq-4]
261*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m0, m3
262*c0909341SAndroid Build Coastguard Worker    jmp                          wq
263*c0909341SAndroid Build Coastguard Worker.w4:
264*c0909341SAndroid Build Coastguard Worker    movd                         m1, [tlq+1]
265*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
266*c0909341SAndroid Build Coastguard Worker    psubw                        m0, m4
267*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
268*c0909341SAndroid Build Coastguard Worker    pmaddwd                      m0, m3
269*c0909341SAndroid Build Coastguard Worker    cmp                          hd, 4
270*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
271*c0909341SAndroid Build Coastguard Worker    psrlw                        m0, 3                             ; dc >>= ctz(width + height);
272*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
273*c0909341SAndroid Build Coastguard Worker.w4_mul:
274*c0909341SAndroid Build Coastguard Worker    punpckhqdq                   m1, m0, m0
275*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
276*c0909341SAndroid Build Coastguard Worker    psrlq                        m1, m0, 32
277*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
278*c0909341SAndroid Build Coastguard Worker    psrlw                        m0, 2
279*c0909341SAndroid Build Coastguard Worker    mov                         r6d, 0x5556
280*c0909341SAndroid Build Coastguard Worker    mov                         r2d, 0x3334
281*c0909341SAndroid Build Coastguard Worker    test                         hd, 8
282*c0909341SAndroid Build Coastguard Worker    cmovz                       r6d, r2d
283*c0909341SAndroid Build Coastguard Worker    movd                         m5, r6d
284*c0909341SAndroid Build Coastguard Worker    pmulhuw                      m0, m5
285*c0909341SAndroid Build Coastguard Worker.w4_end:
286*c0909341SAndroid Build Coastguard Worker    pxor                         m1, m1
287*c0909341SAndroid Build Coastguard Worker    pshufb                       m0, m1
288*c0909341SAndroid Build Coastguard Worker.s4:
289*c0909341SAndroid Build Coastguard Worker    movd           [dstq+strideq*0], m0
290*c0909341SAndroid Build Coastguard Worker    movd           [dstq+strideq*1], m0
291*c0909341SAndroid Build Coastguard Worker    movd           [dstq+strideq*2], m0
292*c0909341SAndroid Build Coastguard Worker    movd           [dstq+stride3q ], m0
293*c0909341SAndroid Build Coastguard Worker    lea                        dstq, [dstq+strideq*4]
294*c0909341SAndroid Build Coastguard Worker    sub                          hd, 4
295*c0909341SAndroid Build Coastguard Worker    jg .s4
296*c0909341SAndroid Build Coastguard Worker    RET
297*c0909341SAndroid Build Coastguard WorkerALIGN function_align
298*c0909341SAndroid Build Coastguard Worker.h8:
299*c0909341SAndroid Build Coastguard Worker    movq                         m0, [tlq-8]
300*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m0, m3
301*c0909341SAndroid Build Coastguard Worker    jmp                          wq
302*c0909341SAndroid Build Coastguard Worker.w8:
303*c0909341SAndroid Build Coastguard Worker    movq                         m1, [tlq+1]
304*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
305*c0909341SAndroid Build Coastguard Worker    psubw                        m4, m0
306*c0909341SAndroid Build Coastguard Worker    punpckhqdq                   m0, m0
307*c0909341SAndroid Build Coastguard Worker    psubw                        m0, m4
308*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
309*c0909341SAndroid Build Coastguard Worker    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
310*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
311*c0909341SAndroid Build Coastguard Worker    pmaddwd                      m0, m3
312*c0909341SAndroid Build Coastguard Worker    psrlw                        m0, m5
313*c0909341SAndroid Build Coastguard Worker    cmp                          hd, 8
314*c0909341SAndroid Build Coastguard Worker    je .w8_end
315*c0909341SAndroid Build Coastguard Worker    mov                         r6d, 0x5556
316*c0909341SAndroid Build Coastguard Worker    mov                         r2d, 0x3334
317*c0909341SAndroid Build Coastguard Worker    cmp                          hd, 32
318*c0909341SAndroid Build Coastguard Worker    cmovz                       r6d, r2d
319*c0909341SAndroid Build Coastguard Worker    movd                         m1, r6d
320*c0909341SAndroid Build Coastguard Worker    pmulhuw                      m0, m1
321*c0909341SAndroid Build Coastguard Worker.w8_end:
322*c0909341SAndroid Build Coastguard Worker    pxor                         m1, m1
323*c0909341SAndroid Build Coastguard Worker    pshufb                       m0, m1
324*c0909341SAndroid Build Coastguard Worker.s8:
325*c0909341SAndroid Build Coastguard Worker    movq           [dstq+strideq*0], m0
326*c0909341SAndroid Build Coastguard Worker    movq           [dstq+strideq*1], m0
327*c0909341SAndroid Build Coastguard Worker    movq           [dstq+strideq*2], m0
328*c0909341SAndroid Build Coastguard Worker    movq           [dstq+stride3q ], m0
329*c0909341SAndroid Build Coastguard Worker    lea                        dstq, [dstq+strideq*4]
330*c0909341SAndroid Build Coastguard Worker    sub                          hd, 4
331*c0909341SAndroid Build Coastguard Worker    jg .s8
332*c0909341SAndroid Build Coastguard Worker    RET
333*c0909341SAndroid Build Coastguard WorkerALIGN function_align
334*c0909341SAndroid Build Coastguard Worker.h16:
335*c0909341SAndroid Build Coastguard Worker    mova                         m0, [tlq-16]
336*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m0, m3
337*c0909341SAndroid Build Coastguard Worker    jmp                          wq
338*c0909341SAndroid Build Coastguard Worker.w16:
339*c0909341SAndroid Build Coastguard Worker    movu                         m1, [tlq+1]
340*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
341*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
342*c0909341SAndroid Build Coastguard Worker    psubw                        m4, m0
343*c0909341SAndroid Build Coastguard Worker    punpckhqdq                   m0, m0
344*c0909341SAndroid Build Coastguard Worker    psubw                        m0, m4
345*c0909341SAndroid Build Coastguard Worker    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
346*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
347*c0909341SAndroid Build Coastguard Worker    pmaddwd                      m0, m3
348*c0909341SAndroid Build Coastguard Worker    psrlw                        m0, m5
349*c0909341SAndroid Build Coastguard Worker    cmp                          hd, 16
350*c0909341SAndroid Build Coastguard Worker    je .w16_end
351*c0909341SAndroid Build Coastguard Worker    mov                         r6d, 0x5556
352*c0909341SAndroid Build Coastguard Worker    mov                         r2d, 0x3334
353*c0909341SAndroid Build Coastguard Worker    test                         hd, 8|32
354*c0909341SAndroid Build Coastguard Worker    cmovz                       r6d, r2d
355*c0909341SAndroid Build Coastguard Worker    movd                         m1, r6d
356*c0909341SAndroid Build Coastguard Worker    pmulhuw                      m0, m1
357*c0909341SAndroid Build Coastguard Worker.w16_end:
358*c0909341SAndroid Build Coastguard Worker    pxor                         m1, m1
359*c0909341SAndroid Build Coastguard Worker    pshufb                       m0, m1
360*c0909341SAndroid Build Coastguard Worker.s16:
361*c0909341SAndroid Build Coastguard Worker    mova           [dstq+strideq*0], m0
362*c0909341SAndroid Build Coastguard Worker    mova           [dstq+strideq*1], m0
363*c0909341SAndroid Build Coastguard Worker    mova           [dstq+strideq*2], m0
364*c0909341SAndroid Build Coastguard Worker    mova           [dstq+stride3q ], m0
365*c0909341SAndroid Build Coastguard Worker    lea                        dstq, [dstq+strideq*4]
366*c0909341SAndroid Build Coastguard Worker    sub                          hd, 4
367*c0909341SAndroid Build Coastguard Worker    jg .s16
368*c0909341SAndroid Build Coastguard Worker    RET
369*c0909341SAndroid Build Coastguard WorkerALIGN function_align
370*c0909341SAndroid Build Coastguard Worker.h32:
    ; Height-32 entry of the DC predictor: fold the 32 bytes stored just below
    ; tlq (tlq-32 .. tlq-1, read with aligned loads) into the word lanes of m0,
    ; then continue at the width-specific code whose address is in wq.
    ; NOTE(review): m3, m4, m5 and wq are initialised before this chunk.
    ; m3 is presumably all-ones (-1 in every byte/word) as in ipred_dc_left,
    ; which would make the running sums in m0 negative - confirm.
371*c0909341SAndroid Build Coastguard Worker    mova                         m0, [tlq-32]
372*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m0, m3
373*c0909341SAndroid Build Coastguard Worker    mova                         m2, [tlq-16]
374*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m2, m3
375*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m2
376*c0909341SAndroid Build Coastguard Worker    jmp wq
377*c0909341SAndroid Build Coastguard Worker.w32:
    ; Width-32 tail: add the 32 bytes at tlq+1 .. tlq+32 (unaligned loads) to
    ; the sum that the .hN entry left in m0.
378*c0909341SAndroid Build Coastguard Worker    movu                         m1, [tlq+1]
379*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
380*c0909341SAndroid Build Coastguard Worker    movu                         m2, [tlq+17]
381*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m2, m3
382*c0909341SAndroid Build Coastguard Worker    paddw                        m1, m2
383*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
    ; The next three instructions leave hi(m0) + lo(m0) - m4 in the low four
    ; word lanes, i.e. they halve the vector and merge m4 in one go.
    ; NOTE(review): m4 presumably carries the rounding bias - confirm.
384*c0909341SAndroid Build Coastguard Worker    psubw                        m4, m0
385*c0909341SAndroid Build Coastguard Worker    punpckhqdq                   m0, m0
386*c0909341SAndroid Build Coastguard Worker    psubw                        m0, m4
387*c0909341SAndroid Build Coastguard Worker    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
388*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
    ; Multiply-add the last two word lanes by m3 into dword 0, then shift right
    ; by the count held in m5.
389*c0909341SAndroid Build Coastguard Worker    pmaddwd                      m0, m3
390*c0909341SAndroid Build Coastguard Worker    psrlw                        m0, m5
391*c0909341SAndroid Build Coastguard Worker    cmp                          hd, 32
392*c0909341SAndroid Build Coastguard Worker    je .w32_end
    ; h != 32: one more scaling step is needed. 0x5556 ~= 2^16/3 and
    ; 0x3334 ~= 2^16/5; pmulhuw keeps the high 16 bits of the product, so the
    ; value is scaled by ~1/3 when h has bit 16 or 64 set and by ~1/5 otherwise.
    ; NOTE(review): presumably 32+h is 3*2^k or 5*2^k and the psrlw above has
    ; already removed the power-of-two part - confirm against the m5 setup.
    ; r2d only serves as the cmov source here; nothing reads it afterwards in
    ; this section.
394*c0909341SAndroid Build Coastguard Worker    mov                         r6d, 0x5556
395*c0909341SAndroid Build Coastguard Worker    mov                         r2d, 0x3334
396*c0909341SAndroid Build Coastguard Worker    test                         hd, 64|16
397*c0909341SAndroid Build Coastguard Worker    cmovz                       r6d, r2d
398*c0909341SAndroid Build Coastguard Worker    movd                         m1, r6d
399*c0909341SAndroid Build Coastguard Worker    pmulhuw                      m0, m1
400*c0909341SAndroid Build Coastguard Worker.w32_end:
    ; pshufb with an all-zero mask copies byte 0 of m0 into all 16 bytes.
401*c0909341SAndroid Build Coastguard Worker    pxor                         m1, m1
402*c0909341SAndroid Build Coastguard Worker    pshufb                       m0, m1
403*c0909341SAndroid Build Coastguard Worker    mova                         m1, m0
404*c0909341SAndroid Build Coastguard Worker.s32:
    ; Store loop: writes m0/m1 as one 32-byte row, four rows per iteration,
    ; with aligned stores; runs until hd has been counted down to <= 0.
    ; Expects stride3q = 3*strideq (ipred_dc_left and ipred_dc_128 set it up
    ; before jumping through wq).
405*c0909341SAndroid Build Coastguard Worker    mova                     [dstq], m0
406*c0909341SAndroid Build Coastguard Worker    mova                  [dstq+16], m1
407*c0909341SAndroid Build Coastguard Worker    mova             [dstq+strideq], m0
408*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq+16], m1
409*c0909341SAndroid Build Coastguard Worker    mova           [dstq+strideq*2], m0
410*c0909341SAndroid Build Coastguard Worker    mova        [dstq+strideq*2+16], m1
411*c0909341SAndroid Build Coastguard Worker    mova            [dstq+stride3q], m0
412*c0909341SAndroid Build Coastguard Worker    mova         [dstq+stride3q+16], m1
413*c0909341SAndroid Build Coastguard Worker    lea                        dstq, [dstq+strideq*4]
414*c0909341SAndroid Build Coastguard Worker    sub                          hd, 4
415*c0909341SAndroid Build Coastguard Worker    jg .s32
416*c0909341SAndroid Build Coastguard Worker    RET
417*c0909341SAndroid Build Coastguard WorkerALIGN function_align
418*c0909341SAndroid Build Coastguard Worker.h64:
    ; Height-64 entry of the DC predictor: fold the 64 bytes stored just below
    ; tlq (tlq-64 .. tlq-1, aligned loads) into the word lanes of m0, then
    ; continue at the width-specific code whose address is in wq.
    ; NOTE(review): m3, m4, m5 and wq are initialised before this chunk.
    ; m3 is presumably all-ones (-1 in every byte/word) as in ipred_dc_left - confirm.
419*c0909341SAndroid Build Coastguard Worker    mova                         m0, [tlq-64]
420*c0909341SAndroid Build Coastguard Worker    mova                         m1, [tlq-48]
421*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m0, m3
422*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
423*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
424*c0909341SAndroid Build Coastguard Worker    mova                         m1, [tlq-32]
425*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
426*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
427*c0909341SAndroid Build Coastguard Worker    mova                         m1, [tlq-16]
428*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
429*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
430*c0909341SAndroid Build Coastguard Worker    jmp wq
431*c0909341SAndroid Build Coastguard Worker.w64:
    ; Width-64 tail: add the 64 bytes at tlq+1 .. tlq+64 (unaligned loads) to
    ; the sum that the .hN entry left in m0.
432*c0909341SAndroid Build Coastguard Worker    movu                         m1, [tlq+ 1]
433*c0909341SAndroid Build Coastguard Worker    movu                         m2, [tlq+17]
434*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m1, m3
435*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m2, m3
436*c0909341SAndroid Build Coastguard Worker    paddw                        m1, m2
437*c0909341SAndroid Build Coastguard Worker    movu                         m2, [tlq+33]
438*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m2, m3
439*c0909341SAndroid Build Coastguard Worker    paddw                        m1, m2
440*c0909341SAndroid Build Coastguard Worker    movu                         m2, [tlq+49]
441*c0909341SAndroid Build Coastguard Worker    pmaddubsw                    m2, m3
442*c0909341SAndroid Build Coastguard Worker    paddw                        m1, m2
443*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
    ; The next three instructions leave hi(m0) + lo(m0) - m4 in the low four
    ; word lanes, i.e. they halve the vector and merge m4 in one go.
    ; NOTE(review): m4 presumably carries the rounding bias - confirm.
444*c0909341SAndroid Build Coastguard Worker    psubw                        m4, m0
445*c0909341SAndroid Build Coastguard Worker    punpckhqdq                   m0, m0
446*c0909341SAndroid Build Coastguard Worker    psubw                        m0, m4
447*c0909341SAndroid Build Coastguard Worker    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
448*c0909341SAndroid Build Coastguard Worker    paddw                        m0, m1
    ; Multiply-add the last two word lanes by m3 into dword 0, then shift right
    ; by the count held in m5.
449*c0909341SAndroid Build Coastguard Worker    pmaddwd                      m0, m3
450*c0909341SAndroid Build Coastguard Worker    psrlw                        m0, m5
451*c0909341SAndroid Build Coastguard Worker    cmp                          hd, 64
452*c0909341SAndroid Build Coastguard Worker    je .w64_end
    ; h != 64: one more scaling step is needed. 0x5556 ~= 2^16/3 and
    ; 0x3334 ~= 2^16/5; pmulhuw keeps the high 16 bits of the product, so the
    ; value is scaled by ~1/3 when h has bit 32 set and by ~1/5 otherwise.
    ; NOTE(review): presumably 64+h is 3*2^k or 5*2^k and the psrlw above has
    ; already removed the power-of-two part - confirm against the m5 setup.
453*c0909341SAndroid Build Coastguard Worker    mov                         r6d, 0x5556
454*c0909341SAndroid Build Coastguard Worker    mov                         r2d, 0x3334
455*c0909341SAndroid Build Coastguard Worker    test                         hd, 32
456*c0909341SAndroid Build Coastguard Worker    cmovz                       r6d, r2d
457*c0909341SAndroid Build Coastguard Worker    movd                         m1, r6d
458*c0909341SAndroid Build Coastguard Worker    pmulhuw                      m0, m1
459*c0909341SAndroid Build Coastguard Worker.w64_end:
    ; pshufb with an all-zero mask copies byte 0 of m0 into all 16 bytes; the
    ; copies in m1-m3 let .s64 store a 64-byte row from four registers.
    ; m3 is overwritten here, after its last use as a multiplier.
460*c0909341SAndroid Build Coastguard Worker    pxor                         m1, m1
461*c0909341SAndroid Build Coastguard Worker    pshufb                       m0, m1
462*c0909341SAndroid Build Coastguard Worker    mova                         m1, m0
463*c0909341SAndroid Build Coastguard Worker    mova                         m2, m0
464*c0909341SAndroid Build Coastguard Worker    mova                         m3, m0
465*c0909341SAndroid Build Coastguard Worker.s64:
    ; Store loop: writes m0-m3 as one 64-byte row, two rows per iteration,
    ; with aligned stores; runs until hd has been counted down to <= 0.
466*c0909341SAndroid Build Coastguard Worker    mova                     [dstq], m0
467*c0909341SAndroid Build Coastguard Worker    mova                  [dstq+16], m1
468*c0909341SAndroid Build Coastguard Worker    mova                  [dstq+32], m2
469*c0909341SAndroid Build Coastguard Worker    mova                  [dstq+48], m3
470*c0909341SAndroid Build Coastguard Worker    mova             [dstq+strideq], m0
471*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq+16], m1
472*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq+32], m2
473*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq+48], m3
474*c0909341SAndroid Build Coastguard Worker    lea                        dstq, [dstq+strideq*2]
475*c0909341SAndroid Build Coastguard Worker    sub                          hd, 2
476*c0909341SAndroid Build Coastguard Worker    jg .s64
477*c0909341SAndroid Build Coastguard Worker    RET
478*c0909341SAndroid Build Coastguard Worker
479*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
480*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
481*c0909341SAndroid Build Coastguard Worker;                                    const int width, const int height, const int a);
482*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; DC_LEFT prediction, 8 bpc: sums the h bytes stored at tl-h .. tl-1, turns the
; sum into a rounded mean, broadcasts that byte into m0-m3 and tail-jumps to the
; code selected by width through ipred_dc_splat_ssse3_table.
;   r5 - ipred_dc_left_ssse3_table, later rebased to ipred_dc_splat_ssse3_table
;   r6 - tzcnt(h) (log2 h for power-of-two heights), then the address of the
;        summing entry taken from ipred_dc_left_ssse3_table
;   wq - tzcnt(w), then the address taken from ipred_dc_splat_ssse3_table
;   m2 - all-ones (-1 in every byte/word): pmaddubsw yields negated pair sums,
;        the final pmaddwd negates them back into a positive total
;   m3 - pd_32768 >> log2(h); as the pmulhrsw factor this gives (sum + h/2) / h
; ipred_dc_top dispatches into the .hN labels as well, using the width as N.
; NOTE(review): both jump tables are defined outside this chunk; presumably the
; left table points at the .hN labels below and the splat table at the .sN
; store loops of ipred_dc - confirm.
483*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
484*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_left_ssse3_table
485*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm                ; zero upper half
486*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
487*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq                                 ; tlq = first of the h bytes below topleft
488*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
489*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
490*c0909341SAndroid Build Coastguard Worker    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
491*c0909341SAndroid Build Coastguard Worker    movd                 m2, r6d
492*c0909341SAndroid Build Coastguard Worker    psrld                m3, m2                                 ; m3 = pd_32768 >> log2(h)
493*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+r6*4]
494*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2                                 ; m2 = all-ones
495*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2                                 ; words = -(byte pair sums) of the first 16 bytes
496*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
497*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
498*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
499*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
500*c0909341SAndroid Build Coastguard Worker    jmp                  r6
    ; The entries below fall through into each other: every step either adds
    ; further input bytes or halves the number of live lanes in m0.
501*c0909341SAndroid Build Coastguard Worker.h64:
502*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+48]                           ; unaligned when jumping here from dc_top
503*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
504*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
505*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+32]                           ; unaligned when jumping here from dc_top
506*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
507*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
508*c0909341SAndroid Build Coastguard Worker.h32:
509*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
510*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
511*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
512*c0909341SAndroid Build Coastguard Worker.h16:
513*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q3232                          ; m1 = high qword of m0 (like psrldq m1, m0, 8)
514*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
515*c0909341SAndroid Build Coastguard Worker.h8:
516*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                          ; swap the dwords of the low qword (acts as psrlq m1, m0, 32)
517*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
518*c0909341SAndroid Build Coastguard Worker.h4:
519*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m2                                 ; dword 0 = positive sum of the last two word lanes
520*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3                                 ; rounded division by h
    ; NOTE(review): stride3 is the sixth named register, presumably r5 under
    ; x86inc naming; the table base in r5 was last read when wq was computed.
521*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
522*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
523*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1                                 ; broadcast byte 0 to all 16 bytes
524*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
525*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
526*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
527*c0909341SAndroid Build Coastguard Worker    jmp                  wq
528*c0909341SAndroid Build Coastguard Worker
529*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
530*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
531*c0909341SAndroid Build Coastguard Worker;                                    const int width, const int height, const int a);
532*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; DC_128 prediction, 8 bpc: loads the pb_128 constant into m0-m3 and tail-jumps
; to the code selected by width through ipred_dc_splat_ssse3_table. No edge
; pixels are read; tlq is never referenced in this routine.
; NOTE(review): the splat table is defined outside this chunk; presumably its
; entries are the .sN store loops of ipred_dc - confirm.
533*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
534*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_splat_ssse3_table
535*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm                ; table index (log2 w for power-of-two widths)
536*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm                ; x86inc: presumably loads h only if it is not already in its register
537*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
538*c0909341SAndroid Build Coastguard Worker    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128] ; 8 bytes of pb_128 duplicated into both halves
539*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
540*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
541*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
542*c0909341SAndroid Build Coastguard Worker    add                  wq, r5                ; table entries are offsets relative to the table base
543*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
544*c0909341SAndroid Build Coastguard Worker    jmp                  wq
545*c0909341SAndroid Build Coastguard Worker
546*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
547*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
548*c0909341SAndroid Build Coastguard Worker;                                    const int width, const int height, const int a);
549*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; DC_TOP prediction, 8 bpc: mirrors the set-up of ipred_dc_left, but starts
; reading at tl+1 and uses the width for both the table index and the divisor.
; The summing/averaging code itself is shared: the jump through
; ipred_dc_left_ssse3_table continues inside ipred_dc_left.
;   r6 - address taken from ipred_dc_left_ssse3_table, indexed by tzcnt(w)
;   wq - address taken from ipred_dc_splat_ssse3_table, indexed by tzcnt(w)
;   m2 - all-ones multiplier, m3 - pd_32768 >> log2(w)
; NOTE(review): both tables are defined outside this chunk - confirm targets.
550*c0909341SAndroid Build Coastguard Workercglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
551*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_dc_left_ssse3_table
552*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
553*c0909341SAndroid Build Coastguard Worker    inc                 tlq                    ; tlq = first byte after topleft
554*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
555*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm                ; x86inc: presumably loads h only if it is not already in its register
556*c0909341SAndroid Build Coastguard Worker    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
557*c0909341SAndroid Build Coastguard Worker    movd                 m2, wd
558*c0909341SAndroid Build Coastguard Worker    psrld                m3, m2                ; m3 = pd_32768 >> log2(w)
559*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [r5+wq*4]
560*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2                ; m2 = all-ones
561*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2                ; words = -(byte pair sums) of the first 16 bytes
562*c0909341SAndroid Build Coastguard Worker    add                  r6, r5
563*c0909341SAndroid Build Coastguard Worker    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
564*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]
565*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
566*c0909341SAndroid Build Coastguard Worker    jmp                  r6
567*c0909341SAndroid Build Coastguard Worker
568*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
569*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
570*c0909341SAndroid Build Coastguard Worker;                                    const int width, const int height, const int a);
571*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
572*c0909341SAndroid Build Coastguard Worker%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
                ; Computes two weighted blends and packs them into one register:
                ;   m6 = packuswb((m%3 (x) m%1 + m%5) >> 8, (m%4 (x) m%2 + m%6) >> 8)
                ; where (x) is pmaddubsw: m%3/m%4 supply the unsigned bytes,
                ; m%1/m%2 the signed bytes, and adjacent byte products are summed
                ; into words.
                ; Result in m6; m0 is clobbered. m%1 is read before m0 is written,
                ; so %1 may be 0 (the w4/w8 loops of ipred_smooth_v rely on that).
                ; NOTE(review): the callers presumably pass (w - 128, 127 - w)
                ; weight pairs in m%1/m%2, matching the derivation below - confirm.
573*c0909341SAndroid Build Coastguard Worker                ;            w * a         = (w - 128) * a + 128 * a
574*c0909341SAndroid Build Coastguard Worker                ;            (256 - w) * b = (127 - w) * b + 129 * b
575*c0909341SAndroid Build Coastguard Worker                ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
576*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m%3, m%1
577*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m%4, m%2                    ; (w - 128) * a + (127 - w) * b
578*c0909341SAndroid Build Coastguard Worker    paddw                m6, m%5
579*c0909341SAndroid Build Coastguard Worker    paddw                m0, m%6                         ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
580*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 8
581*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
582*c0909341SAndroid Build Coastguard Worker    packuswb             m6, m0
583*c0909341SAndroid Build Coastguard Worker%endmacro
584*c0909341SAndroid Build Coastguard Worker
585*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
; SMOOTH_V prediction, 8 bpc. Every output pixel blends the pixel above its
; column (tl[1 + x], "top") with the single pixel at tl[-h] ("bottom") using a
; per-row weight pair from smooth_weights; the SMOOTH macro evaluates
; (w * top + (256 - w) * bottom + 128) >> 8 in its split form.
;   r6       - ipred_smooth_v_ssse3_table; `base` addresses other constants
;              relative to it
;   wq       - tzcnt(w), then the address of the .wN entry taken from the table
;   weightsq - smooth_weights + h*4; with hq = -h the loops read
;              [weightsq+hq*2] = smooth_weights + 2*h + 2*row
;   hq       - negative row counter, counts up from -h and stops at 0
;   m0, m1   - pb_127_m127 and pw_128 on entry to each .wN section
;   m5       - bottom pixel broadcast to all 16 bytes
586*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_v_ssse3_table
587*c0909341SAndroid Build Coastguard Worker    LEA                  r6, ipred_smooth_v_ssse3_table
588*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
589*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
590*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
591*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pb_127_m127]
592*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+pw_128]
593*c0909341SAndroid Build Coastguard Worker    lea            weightsq, [base+smooth_weights+hq*4]
594*c0909341SAndroid Build Coastguard Worker    neg                  hq
595*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+hq]                    ; byte 0 = tl[-h]
596*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
597*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m2                          ; broadcast it (bottom)
598*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
599*c0909341SAndroid Build Coastguard Worker    jmp                  wq
600*c0909341SAndroid Build Coastguard Worker.w4:
    ; Four rows of four pixels per iteration. m2 holds the four top pixels twice,
    ; so one register covers two rows.
601*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+1]
602*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m2
603*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5                          ; top, bottom
604*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
    ; m4/m5 become pshufb masks built from the low/high half of ipred_v_shuf
    ; (defined elsewhere); they spread the loaded weight pairs so that m0 serves
    ; rows 0-1 and m1 rows 2-3. m5 (bottom) is no longer needed at this point.
605*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+ipred_v_shuf]
606*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
607*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m4
608*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m5
609*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m0                      ; m3: 127 * top - 127 * bottom
610*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2                          ; m1:   1 * top + 256 * bottom + 128, overflow is ok
611*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1                          ; m3: 128 * top + 129 * bottom + 128
612*c0909341SAndroid Build Coastguard Worker.w4_loop:
613*c0909341SAndroid Build Coastguard Worker    movu                 m1, [weightsq+hq*2]
614*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m4                      ;m2, m3, m4 and m5 should be stable in loop
615*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
616*c0909341SAndroid Build Coastguard Worker    SMOOTH                0, 1, 2, 2, 3, 3
    ; m6 now holds rows 0..3 in dwords 0..3; move each dword down to store it.
617*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m6
618*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m6, q1032
619*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
620*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m6
621*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m6
622*c0909341SAndroid Build Coastguard Worker    psrlq                m6, 32
623*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r3       ], m6
624*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
625*c0909341SAndroid Build Coastguard Worker    add                  hq, 4
626*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
627*c0909341SAndroid Build Coastguard Worker    RET
628*c0909341SAndroid Build Coastguard WorkerALIGN function_align
629*c0909341SAndroid Build Coastguard Worker.w8:
    ; Two rows of eight pixels per iteration: m2 = eight (top, bottom) pairs,
    ; m4/m5 = dword 0 / dword 1 of ipred_v_shuf broadcast, used as pshufb masks
    ; that produce the weights for the first / second row.
630*c0909341SAndroid Build Coastguard Worker    movq                 m2, [tlq+1]
631*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5
632*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+ipred_v_shuf]
633*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
634*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m5, q0000
635*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q1111
636*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m0
637*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
638*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1                           ; m3 is output for loop
639*c0909341SAndroid Build Coastguard Worker.w8_loop:
640*c0909341SAndroid Build Coastguard Worker    movq                 m1, [weightsq+hq*2]
641*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m4
642*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
643*c0909341SAndroid Build Coastguard Worker    SMOOTH                0, 1, 2, 2, 3, 3
644*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m6
645*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m6
646*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
647*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
648*c0909341SAndroid Build Coastguard Worker    jl .w8_loop
649*c0909341SAndroid Build Coastguard Worker    RET
650*c0909341SAndroid Build Coastguard WorkerALIGN function_align
651*c0909341SAndroid Build Coastguard Worker.w16:
    ; One row of sixteen pixels per iteration: m2/m3 = low/high eight
    ; (top, bottom) pairs, m4/m5 = the matching 128*top + 129*bottom + 128 terms.
    ; m0 and m1 are consumed while building m4/m5.
652*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+1]
653*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m5
654*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m5
655*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
656*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
657*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
658*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
659*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
660*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1                           ; m4 and m5 is output for loop
661*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; Broadcast this row's weight pair (word 0 of the load) to all eight words.
662*c0909341SAndroid Build Coastguard Worker    movd                 m1, [weightsq+hq*2]
663*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
664*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
665*c0909341SAndroid Build Coastguard Worker    SMOOTH 1, 1, 2, 3, 4, 5
666*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m6
667*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
668*c0909341SAndroid Build Coastguard Worker    add                  hq, 1
669*c0909341SAndroid Build Coastguard Worker    jl .w16_loop
670*c0909341SAndroid Build Coastguard Worker    RET
671*c0909341SAndroid Build Coastguard WorkerALIGN function_align
672*c0909341SAndroid Build Coastguard Worker.w32:
    ; Rows are processed as two 16-pixel chunks. The bottom pixel moves to m7
    ; because m5 is reused for per-chunk terms; m0/m1 are reloaded for every
    ; chunk because the chunk body overwrites them.
    ; NOTE(review): WIN64_PUSH_XMM is an x86inc helper; presumably it extends the
    ; saved XMM range on Win64 so that xmm7 may be used here - confirm.
673*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM        8, 7
674*c0909341SAndroid Build Coastguard Worker    mova                 m7, m5
675*c0909341SAndroid Build Coastguard Worker.w32_loop_init:
676*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 2                           ; chunks per row
677*c0909341SAndroid Build Coastguard Worker.w32_loop:
678*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pb_127_m127]
679*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+pw_128]
680*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+1]
681*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m7
682*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m7
683*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
684*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
685*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
686*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
687*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
688*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
689*c0909341SAndroid Build Coastguard Worker    movd                 m1, [weightsq+hq*2]
690*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
691*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
692*c0909341SAndroid Build Coastguard Worker    SMOOTH                1, 1, 2, 3, 4, 5
693*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m6
694*c0909341SAndroid Build Coastguard Worker    add                 tlq, 16
695*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
696*c0909341SAndroid Build Coastguard Worker    dec                 r3d
697*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
    ; Row done: rewind dstq/tlq by the 32 bytes consumed and step to the next row.
698*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq-32+strideq]
699*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 32
700*c0909341SAndroid Build Coastguard Worker    add                  hq, 1
701*c0909341SAndroid Build Coastguard Worker    jl .w32_loop_init
702*c0909341SAndroid Build Coastguard Worker    RET
703*c0909341SAndroid Build Coastguard WorkerALIGN function_align
704*c0909341SAndroid Build Coastguard Worker.w64:
    ; Same scheme as .w32 with four 16-pixel chunks per row.
    ; NOTE(review): WIN64_PUSH_XMM is an x86inc helper; presumably it extends the
    ; saved XMM range on Win64 so that xmm7 may be used here - confirm.
705*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM        8, 7
706*c0909341SAndroid Build Coastguard Worker    mova                 m7, m5
707*c0909341SAndroid Build Coastguard Worker.w64_loop_init:
708*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 4                           ; chunks per row
709*c0909341SAndroid Build Coastguard Worker.w64_loop:
710*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pb_127_m127]
711*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+pw_128]
712*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+1]
713*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m7
714*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m7
715*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2, m0
716*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m3, m0
717*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1, m2
718*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
719*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
720*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
721*c0909341SAndroid Build Coastguard Worker    movd                 m1, [weightsq+hq*2]
722*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
723*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
724*c0909341SAndroid Build Coastguard Worker    SMOOTH                1, 1, 2, 3, 4, 5
725*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m6
726*c0909341SAndroid Build Coastguard Worker    add                 tlq, 16
727*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
728*c0909341SAndroid Build Coastguard Worker    dec                 r3d
729*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
    ; Row done: rewind dstq/tlq by the 64 bytes consumed and step to the next row.
730*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq-64+strideq]
731*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 64
732*c0909341SAndroid Build Coastguard Worker    add                  hq, 1
733*c0909341SAndroid Build Coastguard Worker    jl .w64_loop_init
734*c0909341SAndroid Build Coastguard Worker    RET
735*c0909341SAndroid Build Coastguard Worker
736*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
737*c0909341SAndroid Build Coastguard Worker;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
738*c0909341SAndroid Build Coastguard Worker;                                    const int width, const int height, const int a);
739*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; ipred_smooth_h_8bpc -- horizontal "smooth" intra prediction, 8 bits per pixel.
;
; For every row, each output byte blends that row's left neighbour with the
; single pixel read from tl[w] ("right"):
;     dst[x] = (128*left + 129*right + madd((left,right), weights[x]) + 128) >> 8
; where madd is pmaddubsw of the (left, right) byte pair against a per-column
; signed byte pair taken from smooth_weights.
; NOTE(review): smooth_weights is defined outside this chunk; the expression only
; reduces to (wt*left + (256-wt)*right + 128) >> 8 if each table entry is the
; pair (wt-128, 127-wt) -- confirm against the table definition.
;
; Named arguments: dst, stride, tl, w, h (w and h are fetched from wm/hm).
; Register roles after the prologue:
;   m3 = right, replicated in every byte    m4 = pb_127_m127
;   m5 = pw_128                             m6/m7 = per-width weights or masks
; The code jumps through ipred_smooth_h_ssse3_table, indexed by tzcnt(w), to one
; of the width-specific paths (.w4 ... .w64) below.
740*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
741*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_h_ssse3_table
742*c0909341SAndroid Build Coastguard Worker    LEA                  r6, ipred_smooth_h_ssse3_table
743*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm                          ; w = width argument
744*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq+wq]                    ; 4 bytes starting at tl[w]
745*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1                          ; all-zero mask: pshufb then replicates byte 0
746*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1                          ; right = tl[w] in every byte
747*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd                          ; trailing-zero count of w = jump table index
748*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm                          ; h = height argument
749*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]                   ; sign-extended 32-bit table entry
750*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+pb_127_m127]          ; multiplier used for 127*left - 127*right
751*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pw_128]               ; term added before the final >> 8
752*c0909341SAndroid Build Coastguard Worker    add                  wq, r6                          ; table entry is an offset from r6
753*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; Width 4: four rows (4x4 bytes) per iteration.
754*c0909341SAndroid Build Coastguard Worker.w4:
755*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+smooth_weights+4*2]   ; 8 weight bytes, duplicated into both halves
756*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_h_shuf]         ; NOTE(review): table not visible here; presumably replicates each of the 4 left bytes 4x
757*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
758*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq                          ; tlq = tl-4-h, so [tlq+hq] starts at tl[-4] and moves down as h shrinks
759*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]                 ; r3 = 3*stride, offset of the 4th row
760*c0909341SAndroid Build Coastguard Worker.w4_loop:
761*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+hq]                    ; left (4 pixels, one per row)
762*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
763*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3                      ; left, right (as a 16-bit word: left + 256*right)
764*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
765*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
766*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1                          ; 128 * left + 129 * right
767*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6                          ; weighted (left, right) pairs
768*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5                          ; + pw_128
769*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
770*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4                      ; same computation for the high 8 byte pairs
771*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
772*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
773*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
774*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
775*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
776*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
777*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1                          ; 16 result bytes = 4 rows of 4 pixels
778*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0                          ; dword 0
779*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                   ; swap the two low dwords
780*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1                          ; dword 1
781*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0                          ; bring the high qword down
782*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m0                          ; dword 2
783*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
784*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r3       ], m0                          ; dword 3
785*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
786*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4                           ; 4 rows done
787*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
788*c0909341SAndroid Build Coastguard Worker    RET
789*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Width 8: two rows (2x8 bytes) per iteration.
790*c0909341SAndroid Build Coastguard Worker.w8:
791*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+smooth_weights+8*2]   ; 16 weight bytes for the 8 columns
792*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_h_shuf]
793*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
794*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq                          ; [tlq+hq] starts at tl[-4]; the window moves 2 bytes per iteration
795*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m7                          ; duplicate mask dwords 0 and 1 so each selected left byte covers 8 lanes
796*c0909341SAndroid Build Coastguard Worker.w8_loop:
797*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+hq]                    ; left
798*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
799*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3                      ; left, right
800*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
801*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
802*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1                          ; 128 * left + 129 * right
803*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
804*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
805*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
806*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
807*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
808*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6
809*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
810*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
811*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
812*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
813*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
814*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0                          ; low 8 bytes
815*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0                          ; high 8 bytes
816*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
817*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2                           ; 2 rows done
818*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
819*c0909341SAndroid Build Coastguard Worker    RET
820*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Width 16: one row (16 bytes) per iteration.
821*c0909341SAndroid Build Coastguard Worker.w16:
822*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+smooth_weights+16*2]  ; weights for columns 0-7
823*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+smooth_weights+16*3]  ; weights for columns 8-15
824*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 1
825*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq                          ; [tlq+hq] = tl[-1] for the first row, one byte lower per row
826*c0909341SAndroid Build Coastguard Worker.w16_loop:
827*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1                          ; m1 is overwritten below, so the zero mask is rebuilt per row
828*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+hq]                    ; left
829*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1                          ; replicate byte 0 (this row's left pixel)
830*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3                      ; left, right
831*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
832*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
833*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1                          ; 128 * left + 129 * right
834*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
835*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
836*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
837*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
838*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
839*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
840*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
841*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
842*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
843*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
844*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
845*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0                          ; 16-byte store via mova
846*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq]
847*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
848*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
849*c0909341SAndroid Build Coastguard Worker    RET
850*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Width 32: per row, an inner loop of two 16-byte chunks (r5 counts chunks,
; r3 walks the weights, 32 bytes per chunk).
851*c0909341SAndroid Build Coastguard Worker.w32:
852*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 1
853*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
854*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6                          ; zero mask for the byte-0 broadcast
855*c0909341SAndroid Build Coastguard Worker.w32_loop_init:
856*c0909341SAndroid Build Coastguard Worker    mov                  r5, 2                           ; 2 chunks of 16 pixels per row
857*c0909341SAndroid Build Coastguard Worker    lea                  r3, [base+smooth_weights+16*4]  ; restart the weight pointer for each row
858*c0909341SAndroid Build Coastguard Worker.w32_loop:
859*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3]                        ; weights for the low 8 columns of this chunk
860*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
861*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+hq]                    ; left
862*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
863*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3                      ; left, right
864*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
865*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
866*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1                          ; 128 * left + 129 * right
867*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
868*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
869*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
870*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
871*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
872*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3]                        ; weights for the high 8 columns of this chunk
873*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
874*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
875*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
876*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
877*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
878*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
879*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
880*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
881*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
882*c0909341SAndroid Build Coastguard Worker    dec                  r5
883*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
884*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq-32+strideq]           ; undo the 2*16 column advance, step to the next row
885*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
886*c0909341SAndroid Build Coastguard Worker    jg .w32_loop_init
887*c0909341SAndroid Build Coastguard Worker    RET
888*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Width 64: same structure as .w32 with four 16-byte chunks per row.
889*c0909341SAndroid Build Coastguard Worker.w64:
890*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 1
891*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
892*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6                          ; zero mask for the byte-0 broadcast
893*c0909341SAndroid Build Coastguard Worker.w64_loop_init:
894*c0909341SAndroid Build Coastguard Worker    mov                  r5, 4                           ; 4 chunks of 16 pixels per row
895*c0909341SAndroid Build Coastguard Worker    lea                  r3, [base+smooth_weights+16*8]  ; restart the weight pointer for each row
896*c0909341SAndroid Build Coastguard Worker.w64_loop:
897*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3]                        ; weights for the low 8 columns of this chunk
898*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
899*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+hq]                    ; left
900*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6
901*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2, m3                      ; left, right
902*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
903*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
904*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1                          ; 128 * left + 129 * right
905*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
906*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
907*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
908*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
909*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
910*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3]                        ; weights for the high 8 columns of this chunk
911*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
912*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
913*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
914*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
915*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
916*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
917*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
918*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
919*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
920*c0909341SAndroid Build Coastguard Worker    dec                  r5
921*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
922*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq-64+strideq]           ; undo the 4*16 column advance, step to the next row
923*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
924*c0909341SAndroid Build Coastguard Worker    jg .w64_loop_init
925*c0909341SAndroid Build Coastguard Worker    RET
926*c0909341SAndroid Build Coastguard Worker
927*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
928*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_smooth_8bpc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
929*c0909341SAndroid Build Coastguard Worker;                                   const int width, const int height, const int a);
930*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; SMOOTH_2D_END src1, src2, mul1, mul2, add1, add2, m3src
;
; Final combine step of the 2-D smooth predictor; leaves 16 packed bytes in m0:
;     m0 = pmaddubsw(m<mul1>, m<src1>) + add1   (mul = unsigned bytes, src = signed bytes)
;     m1 = pmaddubsw(m<mul2>, m<src2>) + add2
;     m0 = packuswb(pavgw(m0, m2) >> 8, pavgw(m1, m3) >> 8)
; Arguments 1-4 are register numbers. Arguments 5 and 6 are either register
; numbers or complete operands (e.g. a memory reference). If argument 7 is not a
; number, it is an operand that gets loaded into m3 before the averaging; if it
; is a number, m3 is used as it stands.
; m2 (and m3 when argument 7 is numeric) must already hold the other 16-bit sums.
; Clobbers m0, m1 and m6 (plus m3 when argument 7 is an operand). Inputs are
; consumed in the order mul1/src1, mul2/src2, add1, add2 while m6, m0 and m1 are
; overwritten along the way, so src2/mul2 must not be 0 and add1/add2 must not
; be 0 or 1.
931*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_2D_END  7                                  ; src[1-2], mul[1-2], add[1-2], m3
932*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m%3, m%1                    ; first half: bytes of mul1 times signed bytes of src1
933*c0909341SAndroid Build Coastguard Worker    mova                 m0, m6
934*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m%4, m%2                    ; second half
935*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
936*c0909341SAndroid Build Coastguard Worker%ifnum %5
937*c0909341SAndroid Build Coastguard Worker    paddw                m0, m%5                         ; add1 given as a register number
938*c0909341SAndroid Build Coastguard Worker%else
939*c0909341SAndroid Build Coastguard Worker    paddw                m0, %5                          ; add1 given as a full operand
940*c0909341SAndroid Build Coastguard Worker%endif
941*c0909341SAndroid Build Coastguard Worker%ifnum %6
942*c0909341SAndroid Build Coastguard Worker    paddw                m1, m%6                         ; add2 given as a register number
943*c0909341SAndroid Build Coastguard Worker%else
944*c0909341SAndroid Build Coastguard Worker    paddw                m1, %6                          ; add2 given as a full operand
945*c0909341SAndroid Build Coastguard Worker%endif
946*c0909341SAndroid Build Coastguard Worker%ifnum %7
947*c0909341SAndroid Build Coastguard Worker%else
948*c0909341SAndroid Build Coastguard Worker    mova                 m3, %7                          ; fetch the second-half partner sum
949*c0909341SAndroid Build Coastguard Worker%endif
950*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m2                          ; (m0 + m2 + 1) >> 1 per word
951*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m3
952*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 8
953*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 8
954*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1                          ; 16 result bytes in m0
955*c0909341SAndroid Build Coastguard Worker%endmacro
956*c0909341SAndroid Build Coastguard Worker
; SMOOTH_OUTPUT_16B top, buf1, buf2, buf3, buf4, w1, w2, hbuf, vshuf, sv_m0, sv_m4, sv_m5
;
; Computes 16 horizontally adjacent pixels of one row of the 2-D smooth predictor
; and stores them with mova to [dstq]. Numeric arguments are indices of 16-byte
; stack slots ([rsp+16*n]):
;   top            slot holding the 16 top pixels for these columns (read only)
;   buf1-buf4      scratch: low top/bottom pairs, low vertical base term,
;                  high top/bottom pairs, high vertical base term
;   w1, w2         operands with the horizontal weights for columns 0-7 / 8-15
;   hbuf           scratch for the horizontal sum of columns 8-15
;   vshuf          slot holding the pshufb mask applied to the vertical weights
;   sv_m0/m4/m5    slots from which m0, m4 and m5 are reloaded at the end
; Register contract, as used by the code below:
;   in : m0 = bottom pixel bytes, m3 = 16-bit term added to the top/bottom pairs
;        (reset to pw_255 on exit), m4 = right pixel bytes, m5 = pmaddubsw
;        multiplier; [tlq+hq] addresses the left pixels, [v_weightsq] the
;        vertical weights of the current row
;   out: m0/m4/m5 reloaded from their save slots, m3 = pw_255;
;        m1, m2, m6 and m7 are clobbered
; The macro does not advance dstq, tlq, hq or v_weightsq.
957*c0909341SAndroid Build Coastguard Worker%macro SMOOTH_OUTPUT_16B  12      ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
958*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*%1]                  ; top
959*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m1, m0                       ; top, bottom
960*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0                           ; top, bottom
961*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m5                       ; 127 * top - 127 * bottom (low half)
962*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*%2], m1                           ; spill low pairs
963*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3                           ;   1 * top + 255 * bottom + 255
964*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1                           ; 128 * top + 129 * bottom + 255
965*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*%3], m2                           ; spill low vertical base term
966*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6, m5                       ; same for the high half
967*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*%4], m6                           ; spill high pairs
968*c0909341SAndroid Build Coastguard Worker    paddw                m6, m3                           ;   1 * top + 255 * bottom + 255
969*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6                           ; 128 * top + 129 * bottom + 255
970*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*%5], m2                           ; spill high vertical base term
971*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+hq]                     ; left
972*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [base+pb_3]                  ; topleft[-(1 + y)]
973*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4                           ; left, right
974*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m5                       ; 127 * left - 127 * right
975*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1                           ; 128 * left + 129 * right
976*c0909341SAndroid Build Coastguard Worker    mova                 m3, m2                           ; same base term serves both column halves
977*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1, %6                       ; weights_hor = &dav1d_sm_weights[width];
978*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, %7                           ; horizontal weights, columns 8-15
979*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3, m0                       ; horizontal sum, columns 0-7
980*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1                           ; horizontal sum, columns 8-15
981*c0909341SAndroid Build Coastguard Worker    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
982*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*%9]                  ; shuffle mask for the vertical weights
983*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
984*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*%8], m3                           ; park the high horizontal sum; m3 is reused below
985*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*%2]                  ; low top/bottom pairs
986*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*%3]                  ; low vertical base term
987*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*%4]                  ; high top/bottom pairs
988*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*%5]                  ; high vertical base term
989*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         1, 1, 4, 3, 5, 7, [rsp+16*%8]
990*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0                           ; 16 finished pixels
991*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_255]                ; recovery
992*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*%10]                 ; recovery
993*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*%11]                 ; recovery
994*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*%12]                 ; recovery
995*c0909341SAndroid Build Coastguard Worker%endmacro
996*c0909341SAndroid Build Coastguard Worker
997*c0909341SAndroid Build Coastguard Workercglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
998*c0909341SAndroid Build Coastguard Worker%define base r6-ipred_smooth_ssse3_table
999*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
1000*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
1001*c0909341SAndroid Build Coastguard Worker    LEA                  r6, ipred_smooth_ssse3_table
1002*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq+wq]                     ; right
1003*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1004*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2
1005*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1006*c0909341SAndroid Build Coastguard Worker    mov                  r5, tlq
1007*c0909341SAndroid Build Coastguard Worker    sub                  r5, hq
1008*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
1009*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pb_127_m127]
1010*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r5]
1011*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2                           ; bottom
1012*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pw_255]
1013*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1014*c0909341SAndroid Build Coastguard Worker    lea          v_weightsq, [base+smooth_weights+hq*2]   ; weights_ver = &dav1d_sm_weights[height]
1015*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1016*c0909341SAndroid Build Coastguard Worker.w4:
1017*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1018*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+1]                      ; left
1019*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q0000
1020*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
1021*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1022*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1023*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0                           ; top, bottom
1024*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1100
1025*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3322
1026*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m5
1027*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1                           ;   1 * top + 255 * bottom + 255
1028*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
1029*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
1030*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1031*c0909341SAndroid Build Coastguard Worker    movq                 m1,  [base+smooth_weights+4*2]   ; weights_hor = &dav1d_sm_weights[width];
1032*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
1033*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m1
1034*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m4
1035*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m6
1036*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m5
1037*c0909341SAndroid Build Coastguard Worker.w4_loop:
1038*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+hq]                 ; left
1039*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [base+ipred_h_shuf]
1040*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4                   ; left, right
1041*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
1042*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m0, m5                   ; 127 * left - 127 * right
1043*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m5
1044*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0                       ; 128 * left + 129 * right
1045*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1046*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*2]
1047*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
1048*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
1049*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1050*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1051*c0909341SAndroid Build Coastguard Worker    movq                 m1, [v_weightsq]             ; weights_ver = &dav1d_sm_weights[height];
1052*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 8
1053*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m6
1054*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
1055*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*0]
1056*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*1]
1057*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END         0, 1, 4, 4, 5, 5, 3
1058*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*3]
1059*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*4]
1060*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*5]
1061*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
1062*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
1063*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
1064*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
1065*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m0
1066*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
1067*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r3       ], m0
1068*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1069*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1070*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
1071*c0909341SAndroid Build Coastguard Worker    RET
1072*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1073*c0909341SAndroid Build Coastguard Worker.w8:
1074*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1075*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tlq+1]                  ; left
1076*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
1077*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4
1078*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1079*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0
1080*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q0000
1081*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q1111
1082*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m5
1083*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1084*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
1085*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
1086*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1087*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
1088*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m1
1089*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m4
1090*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m6
1091*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m5
1092*c0909341SAndroid Build Coastguard Worker.w8_loop:
1093*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+hq]                  ; left
1094*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [base+ipred_h_shuf]
1095*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1100
1096*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4
1097*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
1098*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m0, m5
1099*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m1, m5
1100*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1101*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1102*c0909341SAndroid Build Coastguard Worker    mova                 m4,  [rsp+16*2]
1103*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
1104*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
1105*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1106*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1107*c0909341SAndroid Build Coastguard Worker    movd                 m1, [v_weightsq]              ; weights_ver = &dav1d_sm_weights[height];
1108*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 4
1109*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m6
1110*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
1111*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*0]
1112*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*1]
1113*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
1114*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*3]
1115*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16*4]
1116*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*5]
1117*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
1118*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
1119*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1120*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1121*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
1122*c0909341SAndroid Build Coastguard Worker    RET
1123*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1124*c0909341SAndroid Build Coastguard Worker.w16:
; 16-pixel-wide path: writes one 16-byte row per .w16_loop iteration.
; NOTE(review): m0, m3, m4, m5 and v_weightsq are set up before this label
; (outside this excerpt). Going by the inline comments below, m0 = bottom pixel,
; m4 = right pixel, m5 = (127, -127) byte pairs and m3 = the +255 word bias;
; confirm against the function prologue.
; Stack slots used by this path:
;   rsp+16*0 / rsp+16*5 : top/bottom byte interleave, low / high 8 pixels
;   rsp+16*1 / rsp+16*6 : 128 * top + 129 * bottom + 255, low / high 8 pixels
;   rsp+16*2            : dword 0 of ipred_v_shuf replicated (pshufb mask for
;                         the vertical weights)
;   rsp+16*3 / rsp+16*4 : saved m4 / m5, reloaded after SMOOTH_2D_END
;   rsp+16*7            : second horizontal sum (weights at smooth_weights+16*3)
1125*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1126*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
1127*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4                          ; with the next line: tlq = tl - 4 - h, so that
1128*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq                         ; [tlq+hq] walks down the left edge as h counts down
1129*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m1, m0                      ; top, bottom
1130*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0                          ; top, bottom
1131*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0000
1132*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m7
1133*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6, m5                      ; 127 * top - 127 * bottom
1134*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m6                          ; keep the raw interleave before m6 is modified
1135*c0909341SAndroid Build Coastguard Worker    paddw                m6, m3                          ;   1 * top + 256 * bottom + 255
1136*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6                          ; 128 * top + 129 * bottom + 255
1137*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*6], m2
1138*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m5                      ; 127 * top - 127 * bottom
1139*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1                          ;   1 * top + 256 * bottom + 255
1140*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1                          ; m1 is stored unmodified (the sum went into m3)
1141*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3                          ; 128 * top + 129 * bottom + 255
1142*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1143*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m4
1144*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m5
1145*c0909341SAndroid Build Coastguard Worker.w16_loop:
1146*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+hq]                    ; left
1147*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [base+pb_3]                 ; topleft[-(1 + y)]
1148*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4                          ; left, right
1149*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m5                      ; 127 * left - 127 * right
1150*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1                          ; 128 * left + 129 * right
1151*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1                          ; copies: the two pmaddubsw below overwrite
1152*c0909341SAndroid Build Coastguard Worker    mova                 m3, m2                          ; their destination operands
1153*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, [base+smooth_weights+16*2]  ; weights_hor = &dav1d_sm_weights[width];
1154*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, [base+smooth_weights+16*3]
1155*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1156*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
1157*c0909341SAndroid Build Coastguard Worker    movd                 m1, [v_weightsq]                ; weights_ver = &dav1d_sm_weights[height];
1158*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 2
1159*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*2]                  ; reload the mask: m7 is overwritten further down
1160*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7
1161*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*7], m3                          ; spill: m3 is needed for the high top/bottom interleave
1162*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*0]
1163*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*1]
1164*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*5]
1165*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+16*6]
1166*c0909341SAndroid Build Coastguard Worker    SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
1167*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*3]                  ; restore the m4 / m5 values saved before the loop
1168*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*4]
1169*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0                          ; 16 output pixels
1170*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq]
1171*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
1172*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
1173*c0909341SAndroid Build Coastguard Worker    RET
1174*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1175*c0909341SAndroid Build Coastguard Worker.w32:
; 32-pixel-wide path: each row is produced as two 16-byte chunks, one
; SMOOTH_OUTPUT_16B invocation per chunk.
; Stack slots written here:
;   rsp+16*0 / rsp+16*1 : top row bytes 0-15 / 16-31
;   rsp+16*2            : dword 0 of ipred_v_shuf replicated
;   rsp+16*3..5         : copies of m0, m4, m5 as they are on entry to this label
; NOTE(review): SMOOTH_OUTPUT_16B is defined outside this excerpt. Its first
; argument (0, 1) matches the rsp+16*N slot holding that chunk of the top row;
; confirm the meaning of the remaining arguments against the macro definition.
1176*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
1177*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+17]                    ; top
1178*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
1179*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1180*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4                          ; tlq = tl - 4 - h after the next line
1181*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1182*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1183*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0000
1184*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m7
1185*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0
1186*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m4
1187*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m5
1188*c0909341SAndroid Build Coastguard Worker.w32_loop:
1189*c0909341SAndroid Build Coastguard Worker    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
1190*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1191*c0909341SAndroid Build Coastguard Worker    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
1192*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq-16+strideq]         ; undo the +16 above and step to the next row
1193*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 2                          ; advance 2 bytes in the vertical weights per row
1194*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
1195*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
1196*c0909341SAndroid Build Coastguard Worker    RET
1197*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1198*c0909341SAndroid Build Coastguard Worker.w64:
; 64-pixel-wide path: each row is produced as four 16-byte chunks, one
; SMOOTH_OUTPUT_16B invocation per chunk.
; Stack slots written here:
;   rsp+16*0, 16*1, 16*11, 16*12 : top row bytes 0-15, 16-31, 32-47, 48-63
;   rsp+16*2                     : dword 0 of ipred_v_shuf replicated
;   rsp+16*3..5                  : copies of m0, m4, m5 as they are on entry
; NOTE(review): SMOOTH_OUTPUT_16B is defined outside this excerpt. Its first
; argument (0, 1, 11, 12) matches the rsp+16*N slot holding that chunk of the
; top row; confirm the remaining arguments against the macro definition.
1199*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
1200*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+17]                    ; top
1201*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m1
1202*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
1203*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+33]                    ; top
1204*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+49]                    ; top
1205*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m1
1206*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m2
1207*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4                          ; tlq = tl - 4 - h after the next line
1208*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
1209*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+ipred_v_shuf]
1210*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0000
1211*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m7
1212*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0
1213*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m4
1214*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m5
1215*c0909341SAndroid Build Coastguard Worker.w64_loop:
1216*c0909341SAndroid Build Coastguard Worker    SMOOTH_OUTPUT_16B  0, 6, 7, 8, 9,  [base+smooth_weights+16*8],  [base+smooth_weights+16*9], 10, 2, 3, 4, 5
1217*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1218*c0909341SAndroid Build Coastguard Worker    SMOOTH_OUTPUT_16B  1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
1219*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1220*c0909341SAndroid Build Coastguard Worker    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
1221*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
1222*c0909341SAndroid Build Coastguard Worker    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
1223*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq-48+strideq]         ; undo the three +16 steps and move to the next row
1224*c0909341SAndroid Build Coastguard Worker    add          v_weightsq, 2                          ; advance 2 bytes in the vertical weights per row
1225*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
1226*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
1227*c0909341SAndroid Build Coastguard Worker    RET
1228*c0909341SAndroid Build Coastguard Worker
; ipred_z1_8bpc: entry point and width dispatch.
; Register names given to cglobal: dst, stride, tl, w, h, angle, dx. cglobal is
; told to auto-load 3 arguments; w, h and angle are fetched from wm / hm /
; anglem below, and dx is only ever written here before it is read.
; x86-64: pw_62 / pw_64 / pw_512 are loaded into m8-m10 once.
; x86-32: m8-m10 are %defined as memory operands instead. r1 is repurposed as
; the constant base, so the incoming stride (r1) is first parked at
; [rsp+16*12] (stridemp); strideq is aliased to r3, which is also used as a
; scratch register, hence the "movifnidn strideq, stridemp" reloads later on.
1229*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1230*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
1231*c0909341SAndroid Build Coastguard Worker    %define            base  r7-$$
1232*c0909341SAndroid Build Coastguard Worker    lea                  r7, [$$]
1233*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_62]
1234*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_64]
1235*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pw_512]
1236*c0909341SAndroid Build Coastguard Worker%else
1237*c0909341SAndroid Build Coastguard Workercglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
1238*c0909341SAndroid Build Coastguard Worker    %define            base  r1-$$
1239*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pw_62]
1240*c0909341SAndroid Build Coastguard Worker    %define              m9  [base+pw_64]
1241*c0909341SAndroid Build Coastguard Worker    %define             m10  [base+pw_512]
1242*c0909341SAndroid Build Coastguard Worker    %define         strideq  r3
1243*c0909341SAndroid Build Coastguard Worker    %define        stridemp  dword [rsp+16*12]
1244*c0909341SAndroid Build Coastguard Worker    mov            stridemp, r1 ; save stride before r1 becomes the base pointer
1245*c0909341SAndroid Build Coastguard Worker    LEA                  r1, $$
1246*c0909341SAndroid Build Coastguard Worker%endif
1247*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing-zero count of the width: jump table index
1248*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
1249*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1250*c0909341SAndroid Build Coastguard Worker    inc                 tlq ; tlq now points one byte past the incoming tl pointer
1251*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z1_ssse3_table+wq*4] ; signed 32-bit table entry
1252*c0909341SAndroid Build Coastguard Worker    mov                 dxd, angled
1253*c0909341SAndroid Build Coastguard Worker    and                 dxd, 0x7e ; byte offset into dr_intra_derivative (word entries)
1254*c0909341SAndroid Build Coastguard Worker    add              angled, 165 ; ~90
1255*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+wq+ipred_z1_ssse3_table] ; entry is relative to the table address
1256*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [base+dr_intra_derivative+dxq]
1257*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x4ff ; d = 90 - angle
1258*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; dispatch to the per-width label (.w4, .w8, ...)
1259*c0909341SAndroid Build Coastguard Worker.w4:
; Width-4 path. The two tests below decide whether the top edge is upsampled;
; if either fails, control goes to .w4_no_upsample. On the upsample path the
; edge is rebuilt on the stack at rsp (original bytes interleaved with filtered
; bytes) and two output rows are produced per loop iteration.
; NOTE(review): the filter taps come from pb_36_m4 and the shuffles from
; z_upsample1 / z_upsample2, all defined outside this excerpt; the constant
; name suggests (36, -4) byte pairs. Confirm against the data section.
1260*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+88]
1261*c0909341SAndroid Build Coastguard Worker    test                r3d, 0x480
1262*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
1263*c0909341SAndroid Build Coastguard Worker    sar                 r3d, 9
1264*c0909341SAndroid Build Coastguard Worker    add                 r3d, hd
1265*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1266*c0909341SAndroid Build Coastguard Worker    jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
1267*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-1] ; 16 bytes from the incoming tl pointer (aligned load)
1268*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, [base+z_upsample1]
1269*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [base+z_upsample2]
1270*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pb_36_m4]
1271*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd ; dx *= 2 on the upsampled edge
1272*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
1273*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m1, q3333
1274*c0909341SAndroid Build Coastguard Worker    movd           [rsp+16], m7 ; top[max_base_x]
1275*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
1276*c0909341SAndroid Build Coastguard Worker    movd                 m6, dxd
1277*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1278*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+pw_256]
1279*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; sum of the two partial filter products
1280*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq] ; 8 original edge bytes
1281*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10 ; rounding scale by m10 (pw_512)
1282*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6 ; 2 * dx: per-iteration step (two rows per iteration)
1283*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7 ; xpos0 xpos1
1284*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
1285*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1 ; interleave original bytes with the filtered bytes
1286*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
1287*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0 ; upsampled edge at rsp, padded by the dword stored at rsp+16
1288*c0909341SAndroid Build Coastguard Worker.w4_upsample_loop:
1289*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r5+dxq] ; xpos of the second row
1290*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6      ; base0
1291*c0909341SAndroid Build Coastguard Worker    movq                 m0, [rsp+r5]
1292*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r2+dxq] ; xpos for the next iteration
1293*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6      ; base1
1294*c0909341SAndroid Build Coastguard Worker    movhps               m0, [rsp+r2]
1295*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m6 ; frac
1296*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2 ; 64-frac
1297*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
1298*c0909341SAndroid Build Coastguard Worker    por                  m1, m2     ; 64-frac, frac
1299*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1     ; per byte pair: a * (64-frac) + b * frac
1300*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7     ; xpos += dx
1301*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1302*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
1303*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
1304*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1032 ; swap the two low dwords: second row into the low dword
1305*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m0
1306*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1307*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1308*c0909341SAndroid Build Coastguard Worker    jg .w4_upsample_loop
1309*c0909341SAndroid Build Coastguard Worker    RET
1310*c0909341SAndroid Build Coastguard Worker.w4_no_upsample:
; Width-4 path without upsampling. Optionally smooths the top edge into a copy
; at rsp (when a non-zero filter strength is selected), then interpolates two
; rows per .w4_loop iteration. r3 holds max_base; xpos (r5) is kept relative to
; max_base << 6, so it stays negative while the row still reads inside the
; edge. Once it is non-negative, the remaining rows are filled with
; top[max_base_x] in .w4_end_loop.
; NOTE(review): z_filter_wh4, z_filter_t_w48, z_filter_s and z_filter_k are
; tables defined outside this excerpt; their layout is not described here.
1311*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 7     ; max_base
1312*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1313*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
1314*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3] ; h + 3, broadcast below and matched against z_filter_wh4
1315*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
1316*c0909341SAndroid Build Coastguard Worker    movd                 m2, angled
1317*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1318*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
1319*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1 ; all-zero mask: broadcast the low byte
1320*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1
1321*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0, [base+z_filter_wh4]
1322*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
1323*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
1324*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
1325*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 7 ; r3d was used as scratch above: back to max_base = 7
1326*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1327*c0909341SAndroid Build Coastguard Worker    jz .w4_main ; filter_strength == 0
1328*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-1] ; 16 bytes from the incoming tl pointer (aligned load)
1329*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
1330*c0909341SAndroid Build Coastguard Worker    movu                 m7, [base+z_filter_s+8]
1331*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
1332*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+pb_8]
1333*c0909341SAndroid Build Coastguard Worker    pminub               m7, m0 ; clamp the shuffle indices with pb_8
1334*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, [base+z_filter_s]
1335*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+z_filter_k-8+r5*8+24*0] ; three coefficient sets, indexed by filter_strength
1336*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
1337*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+z_filter_k-8+r5*8+24*1]
1338*c0909341SAndroid Build Coastguard Worker    shufps               m2, m0, m3, q2121
1339*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+z_filter_k-8+r5*8+24*2]
1340*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
1341*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2, m4
1342*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1343*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
1344*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, m5
1345*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6
1346*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1347*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
1348*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
1349*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q3333
1350*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1351*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1352*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 9
1353*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp ; from here on the edge is read from the filtered copy
1354*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
1355*c0909341SAndroid Build Coastguard Worker    cmovne              r3d, r5d ; max_base = (h == 4) ? 7 : 9
1356*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1357*c0909341SAndroid Build Coastguard Worker    mova              [tlq], m0 ; filtered edge stored at rsp
1358*c0909341SAndroid Build Coastguard Worker.w4_main:
1359*c0909341SAndroid Build Coastguard Worker    add                 tlq, r3 ; tlq = &top[max_base_x]
1360*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1361*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+z_base_inc] ; base_inc << 6
1362*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq] ; top[max_base_x]
1363*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; max_base_x << 6 (same fixed-point scale as xpos)
1364*c0909341SAndroid Build Coastguard Worker    movd                 m4, r3d
1365*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256]
1366*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1367*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [base+pw_m256]
1368*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; xpos relative to max_base_x << 6 (negative while in range)
1369*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256]
1370*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z1_shuf_w4]
1371*c0909341SAndroid Build Coastguard Worker    paddw                m6, m5, m5 ; 2 * dx: per-iteration step (two rows per iteration)
1372*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0 ; max_base_x
1373*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m6 ; xpos0 xpos1
1374*c0909341SAndroid Build Coastguard Worker.w4_loop:
1375*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r5+dxq] ; xpos of the second row
1376*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6      ; base0
1377*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq+r5]
1378*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r3+dxq] ; xpos for the next iteration
1379*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6      ; base1
1380*c0909341SAndroid Build Coastguard Worker    movhps               m0, [tlq+r3]
1381*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5 ; frac
1382*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2 ; 64-frac
1383*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
1384*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
1385*c0909341SAndroid Build Coastguard Worker    por                  m1, m2     ; 64-frac, frac
1386*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1     ; per byte pair: a * (64-frac) + b * frac
1387*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
1388*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m4, m5 ; base < max_base_x
1389*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1390*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6     ; xpos += dx
1391*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1     ; keep interpolated pixels where base < max_base_x
1392*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7     ; elsewhere take top[max_base_x]
1393*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
1394*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
1395*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
1396*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1032 ; swap the two low dwords: second row into the low dword
1397*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m0
1398*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1399*c0909341SAndroid Build Coastguard Worker    jz .w4_end
1400*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1401*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1402*c0909341SAndroid Build Coastguard Worker    jl .w4_loop ; keep interpolating while the relative xpos is still negative
1403*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m7 ; words -> bytes for the fill loop below
1404*c0909341SAndroid Build Coastguard Worker.w4_end_loop:
1405*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m7 ; remaining rows are filled from m7 (top[max_base_x])
1406*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m7
1407*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1408*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1409*c0909341SAndroid Build Coastguard Worker    jg .w4_end_loop
1410*c0909341SAndroid Build Coastguard Worker.w4_end:
1411*c0909341SAndroid Build Coastguard Worker    RET
1412*c0909341SAndroid Build Coastguard Worker.w8:
; Width-8 path. A single combined test decides whether the top edge is
; upsampled; otherwise control goes to .w8_no_upsample. On the upsample path
; the edge is rebuilt as 32 bytes at rsp (original bytes interleaved with
; filtered bytes) and two 8-pixel rows are produced per loop iteration.
; NOTE(review): z_upsample1, z_filter_s and pb_36_m4 are defined outside this
; excerpt; their contents are not described here.
1413*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+88]
1414*c0909341SAndroid Build Coastguard Worker    and                 r3d, ~0x7f
1415*c0909341SAndroid Build Coastguard Worker    or                  r3d, hd
1416*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
1417*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1418*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+z_upsample1]
1419*c0909341SAndroid Build Coastguard Worker    movu                 m3, [base+z_filter_s+6]
1420*c0909341SAndroid Build Coastguard Worker    movd                 m4, hd
1421*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-1] ; 16 bytes from the incoming tl pointer (aligned load)
1422*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+7] ; the following 16 bytes, overlapping by 8
1423*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1424*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7 ; all-zero mask: broadcast the low byte of h
1425*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+pb_36_m4]
1426*c0909341SAndroid Build Coastguard Worker    pminub               m4, m3 ; shuffle indices from z_filter_s+6, clamped to at most h
1427*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd ; dx *= 2 on the upsampled edge
1428*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m0, m5
1429*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
1430*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
1431*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
1432*c0909341SAndroid Build Coastguard Worker    movd                 m6, dxd
1433*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1, m5
1434*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
1435*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1436*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
1437*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+pw_256]
1438*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd ; xpos
1439*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0 ; sum of the two partial filter products (first half)
1440*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6 ; 2 * dx: per-iteration step (two rows per iteration)
1441*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1 ; sum of the two partial filter products (second half)
1442*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7 ; xpos0 xpos1
1443*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq] ; 16 original edge bytes
1444*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m10
1445*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m10
1446*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
1447*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; interleave original bytes with the filtered bytes
1448*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1449*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
1450*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0 ; upsampled edge, bytes 0-15
1451*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m1 ; upsampled edge, bytes 16-31
1452*c0909341SAndroid Build Coastguard Worker.w8_upsample_loop:
1453*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r5+dxq] ; xpos of the second row
1454*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6 ; base0
1455*c0909341SAndroid Build Coastguard Worker    movu                 m0, [rsp+r5]
1456*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r2+dxq] ; xpos for the next iteration
1457*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6 ; base1
1458*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+r2]
1459*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m6 ; frac (both rows)
1460*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m2 ; 64-frac
1461*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
1462*c0909341SAndroid Build Coastguard Worker    por                  m3, m2 ; 64-frac, frac
1463*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m3 ; frac0
1464*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
1465*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3     ; frac1
1466*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1467*c0909341SAndroid Build Coastguard Worker    paddw                m6, m7 ; xpos += dx
1468*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1469*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1470*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; row 0 in the low qword, row 1 in the high qword
1471*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
1472*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
1473*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1474*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1475*c0909341SAndroid Build Coastguard Worker    jg .w8_upsample_loop
1476*c0909341SAndroid Build Coastguard Worker    RET
1477*c0909341SAndroid Build Coastguard Worker.w8_no_upsample:
1478*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
1479*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
1480*c0909341SAndroid Build Coastguard Worker    and                 r3d, 7
1481*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8 ; imin(h+7, 15)
1482*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1483*c0909341SAndroid Build Coastguard Worker    jnz .w8_main
1484*c0909341SAndroid Build Coastguard Worker    movd                 m2, angled
1485*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1486*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
1487*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1
1488*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1
1489*c0909341SAndroid Build Coastguard Worker    movu                 m1, [base+z_filter_wh8]
1490*c0909341SAndroid Build Coastguard Worker    psrldq               m3, [base+z_filter_t_w48+angleq*8], 4
1491*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0
1492*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
1493*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m3
1494*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
1495*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1496*c0909341SAndroid Build Coastguard Worker    jz .w8_main ; filter_strength == 0
1497*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq-1]
1498*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+16*0]
1499*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
1500*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*1]
1501*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
1502*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+r3]
1503*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*4]
1504*c0909341SAndroid Build Coastguard Worker    sub                  r5, 3
1505*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m0
1506*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1507*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m1
1508*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7
1509*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7
1510*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*2], m3
1511*c0909341SAndroid Build Coastguard Worker    movq        [tlq+r3-15], m2
1512*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1513*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 1
1514*c0909341SAndroid Build Coastguard Worker    add                 r5d, 17
1515*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
1516*c0909341SAndroid Build Coastguard Worker    cmova               r3d, r5d
; ---------------------------------------------------------------------------
; .w8_main: write the 8-pixel-wide prediction rows.
; State consumed here (set up before this label, partly outside this excerpt):
;   tlq = base of the edge buffer, r3 = index of the last usable edge pixel,
;   dxd = per-row position step, hd = row count, dstq = output pointer.
; Positions carry 6 fractional bits (see the sar by 6 in the loop).
; m8/m9/m10 are not initialised in this excerpt; from their use below they act
; as fraction mask, weight total and pmulhrsw scale factor.
; NOTE(review): presumably pw_62 / pw_64 / pw_512 -- confirm at the prologue.
; NOTE(review): the three-operand SSE forms (e.g. "pand m1, m8, m5") are
; presumably expanded by the x86inc macro layer -- confirm.
; ---------------------------------------------------------------------------
1517*c0909341SAndroid Build Coastguard Worker.w8_main:
1518*c0909341SAndroid Build Coastguard Worker    add                 tlq, r3 ; tlq -> last usable edge pixel
1519*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1520*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq] ; last edge pixel = padding value
1521*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; last index scaled to position units
1522*c0909341SAndroid Build Coastguard Worker    movu                 m3, [base+z_filter_s+2] ; pshufb control applied to the source pixels in the loop
1523*c0909341SAndroid Build Coastguard Worker    movd                 m4, r3d
1524*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; replicate dx into every word lane (assumes pw_256 = words of 0x0100 -- TODO confirm)
1525*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
1526*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [base+pw_m256] ; padding pixel spread over word lanes (it is packed to bytes only after the loop)
1527*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; r5 = dx - (last << 6): row position relative to tlq, negative while inside the edge
1528*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256]
1529*c0909341SAndroid Build Coastguard Worker    psubw                m4, [base+z_base_inc] ; per-lane limit; z_base_inc is presumably a per-column offset -- TODO confirm
1530*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5 ; m6 = constant step, m5 = running position
1531*c0909341SAndroid Build Coastguard Worker.w8_loop:
1532*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1533*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (floor) of this row
1534*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r3] ; 16 edge bytes starting at that offset
1535*c0909341SAndroid Build Coastguard Worker    pand                 m1, m8, m5 ; m1 = position & m8 (fractional weight)
1536*c0909341SAndroid Build Coastguard Worker    psubw                m2, m9, m1 ; m2 = m9 - frac (complementary weight)
1537*c0909341SAndroid Build Coastguard Worker    psllw                m1, 8
1538*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
1539*c0909341SAndroid Build Coastguard Worker    por                  m1, m2 ; each word = (frac << 8) | (m9 - frac), i.e. a byte pair of weights
1540*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1 ; unsigned pixel bytes * signed weight bytes, adjacent products summed
1541*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m4, m5 ; lane mask: limit > position
1542*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6 ; advance the position for the next row
1543*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10 ; rounded scale of the weighted sums
1544*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1 ; keep interpolated lanes where the mask is set ...
1545*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7 ; ... and the padding pixel elsewhere
1546*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
1547*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0 ; words -> bytes
1548*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m0 ; store 8 pixels
1549*c0909341SAndroid Build Coastguard Worker    dec                  hd
1550*c0909341SAndroid Build Coastguard Worker    jz .w8_end
1551*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; x86inc macro: load stride unless it already lives in that register
1552*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1553*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1554*c0909341SAndroid Build Coastguard Worker    jl .w8_loop ; keep interpolating while the position is still negative
1555*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m7 ; padding pixel: words -> bytes
; Remaining rows receive the padding pixel only.
1556*c0909341SAndroid Build Coastguard Worker.w8_end_loop:
1557*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m7
1558*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1559*c0909341SAndroid Build Coastguard Worker    dec                  hd
1560*c0909341SAndroid Build Coastguard Worker    jg .w8_end_loop
1561*c0909341SAndroid Build Coastguard Worker.w8_end:
1562*c0909341SAndroid Build Coastguard Worker    RET
; ---------------------------------------------------------------------------
; .w16: 16-pixel-wide path.
;   1. Derive the edge-filter strength from h and the angle; bit 0x400 of
;      angled bypasses filtering altogether.
;   2. When filtering, copy the edge into stack scratch with padding on both
;      sides, run .filter_edge, and for h > 16 compute a short filtered tail.
;   3. .w16_main/.w16_loop: per row, blend adjacent edge pixels with
;      position-derived weights; lanes past the last edge pixel, and all
;      remaining rows, receive that last pixel.
; "edge[i]" below means the byte at [tlq+i] for the tlq value on entry.
; m8/m9/m10 are not initialised in this excerpt; from their use below they act
; as fraction mask, weight total and pmulhrsw scale factor.
; NOTE(review): presumably pw_62 / pw_64 / pw_512 -- confirm at the prologue.
; ---------------------------------------------------------------------------
1563*c0909341SAndroid Build Coastguard Worker.w16:
1564*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
1565*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d ; keep h+15 for the strength lookup below
1566*c0909341SAndroid Build Coastguard Worker    and                 r3d, 15
1567*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; imin(h+15, 31)
1568*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
1569*c0909341SAndroid Build Coastguard Worker    jnz .w16_main
1570*c0909341SAndroid Build Coastguard Worker    movd                 m2, angled
1571*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
1572*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
1573*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1 ; broadcast low byte of h+15
1574*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1 ; broadcast low byte of the angle
1575*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+z_filter_t_w16+angleq*4] ; threshold bytes selected by is_sm
1576*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, [base+z_filter_wh16] ; lanes whose table byte equals h+15
1577*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2 ; angle byte in matching lanes, 0 elsewhere
1578*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m0, m3 ; signed byte compare against the thresholds
1579*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m0 ; one bit per lane that exceeded its threshold
1580*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1581*c0909341SAndroid Build Coastguard Worker    jz .w16_main ; filter_strength == 0
1582*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq-1] ; 4 bytes starting at edge[-1]
1583*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+16*0]
1584*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924
1585*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*1]
1586*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
; The movd below does not modify EFLAGS, so the adc still consumes the carry
; (last bit shifted out) produced by the shr above.
1587*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+30]
1588*c0909341SAndroid Build Coastguard Worker    adc                  r5, -4 ; filter_strength-3
1589*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq+r3] ; last usable edge pixel (right padding value)
1590*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*4] ; from here on tlq addresses stack scratch
1591*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m0 ; edge[0..15]  -> scratch
1592*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1593*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m1 ; edge[16..31] -> scratch
1594*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7 ; broadcast edge[-1]
1595*c0909341SAndroid Build Coastguard Worker    movd              [rsp], m2 ; stash edge[30..33] for the tail computation
1596*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7 ; broadcast edge[r3]
1597*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*2], m4 ; left padding: 16 copies of edge[-1]
1598*c0909341SAndroid Build Coastguard Worker    movd        [tlq+r3-16], m3 ; right padding: 4 copies starting at the scratch slot of edge[r3]
; NOTE(review): .filter_edge's stores are outside this excerpt; the code after
; the call addresses the result relative to this tlq -- confirm where the
; filtered pixels land.
1599*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1600*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
1601*c0909341SAndroid Build Coastguard Worker    jle .w16_main ; h <= 16: r3 keeps imin(h+15, 31)
; h > 16: produce extra tail pixels from the stashed edge[30]/edge[31] pair
; and extend the last usable index accordingly.
1602*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, [rsp], q0000 ; replicate the low word (edge[30], edge[31])
1603*c0909341SAndroid Build Coastguard Worker    sar                  r5, 1 ; (filter_strength-3) >> 1
1604*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base+z_filter_k_tail+4+r5*4] ; 4 tail coefficient bytes selected by r5
1605*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r5+33] ; new last usable index
1606*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
; NOTE(review): on x86-32 m4 is expected to hold the scale factor here,
; presumably left there by .filter_edge -- confirm.
1607*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1608*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1609*c0909341SAndroid Build Coastguard Worker%else
1610*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
1611*c0909341SAndroid Build Coastguard Worker%endif
1612*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
1613*c0909341SAndroid Build Coastguard Worker    movd           [tlq+32], m0 ; 4 bytes written at index 32 of the buffer
; Row generation. r3 = index of the last usable edge pixel.
1614*c0909341SAndroid Build Coastguard Worker.w16_main:
1615*c0909341SAndroid Build Coastguard Worker    add                 tlq, r3 ; tlq -> last usable edge pixel
1616*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1617*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq] ; last edge pixel = padding value
1618*c0909341SAndroid Build Coastguard Worker    movd                 m4, r3d
1619*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; last index scaled to position units (6 fractional bits)
1620*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; replicate dx into every word lane (assumes pw_256 = words of 0x0100 -- TODO confirm)
1621*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1622*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6 ; broadcast the padding pixel to all 16 bytes
1623*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
1624*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6 ; broadcast the last index to all bytes
1625*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; r5 = dx - (last << 6): row position relative to tlq, negative while inside the edge
1626*c0909341SAndroid Build Coastguard Worker    psubb                m4, [base+pb_0to15] ; per-column limit: last - column (assumes pb_0to15 = bytes 0..15 -- TODO confirm)
1627*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5 ; m6 = constant step, m5 = running position
1628*c0909341SAndroid Build Coastguard Worker.w16_loop:
1629*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1630*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (floor) of this row
1631*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+0] ; pixels at base+column
1632*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8, m5 ; m0 = position & m8 (fractional weight)
1633*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+1] ; their right-hand neighbours
1634*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m0 ; m3 = m9 - frac (complementary weight)
1635*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
1636*c0909341SAndroid Build Coastguard Worker    por                  m3, m0 ; each word = (frac << 8) | (m9 - frac), i.e. a byte pair of weights
1637*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; interleave (pixel, neighbour) for columns 0..7
1638*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3 ; pixel*(m9-frac) + neighbour*frac
1639*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2 ; same for columns 8..15
1640*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1641*c0909341SAndroid Build Coastguard Worker    psrlw                m3, m5, 6 ; integer part of the position ...
1642*c0909341SAndroid Build Coastguard Worker    packsswb             m3, m3 ; ... narrowed to bytes for the limit test
1643*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10 ; rounded scale of the weighted sums
1644*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1645*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6 ; advance the position for the next row
1646*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, m4, m3 ; column mask: (last - column) > base
1647*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1648*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2 ; interpolated pixels where the mask is set ...
1649*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7 ; ... padding pixel elsewhere
1650*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1651*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
1652*c0909341SAndroid Build Coastguard Worker    dec                  hd
1653*c0909341SAndroid Build Coastguard Worker    jz .w16_end
1654*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; x86inc macro: load stride unless it already lives in that register
1655*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1656*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1657*c0909341SAndroid Build Coastguard Worker    jl .w16_loop ; keep interpolating while the position is still negative
; Remaining rows receive the padding pixel only.
1658*c0909341SAndroid Build Coastguard Worker.w16_end_loop:
1659*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m7
1660*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1661*c0909341SAndroid Build Coastguard Worker    dec                  hd
1662*c0909341SAndroid Build Coastguard Worker    jg .w16_end_loop
1663*c0909341SAndroid Build Coastguard Worker.w16_end:
1664*c0909341SAndroid Build Coastguard Worker    RET
; ---------------------------------------------------------------------------
; .w32: 32-pixel-wide path.
; Unless angle bit 0x400 is set, the edge is copied to stack scratch (padded on
; both sides) and smoothed with two .filter_edge passes at r5 = 0; for h > 32
; a short filtered tail is appended. .w32_main/.w32_loop then emit one
; 32-pixel row per iteration as two 16-byte halves.
; "edge[i]" below means the byte at [tlq+i] for the tlq value on entry.
; m8/m9/m10 are not initialised in this excerpt; from their use below they act
; as fraction mask, weight total and pmulhrsw scale factor.
; NOTE(review): presumably pw_62 / pw_64 / pw_512 -- confirm at the prologue.
; ---------------------------------------------------------------------------
1665*c0909341SAndroid Build Coastguard Worker.w32:
1666*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+31]
1667*c0909341SAndroid Build Coastguard Worker    and                 r3d, 31
1668*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32    ; imin(h+31, 63)
1669*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1670*c0909341SAndroid Build Coastguard Worker    jnz .w32_main
1671*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq-1] ; 4 bytes starting at edge[-1]
1672*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+16*0]
1673*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*1]
1674*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*2]
1675*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+16*3]
1676*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq+62] ; edge[62..65], stashed below for the tail
1677*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+r3] ; last usable edge pixel (right padding value)
1678*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*6] ; from here on tlq addresses stack scratch
1679*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*3], m0 ; edge[0..15]  -> rsp+16*3
1680*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1681*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*2], m1 ; edge[16..31] -> rsp+16*4
1682*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m7 ; broadcast edge[-1]
1683*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m2 ; edge[32..47] -> rsp+16*5
1684*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d ; filter_strength = 3
1685*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m3 ; edge[48..63] -> rsp+16*6
1686*c0909341SAndroid Build Coastguard Worker    movd              [rsp], m4
1687*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m7 ; broadcast edge[r3]
1688*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*4], m6 ; left padding: 16 copies of edge[-1]
1689*c0909341SAndroid Build Coastguard Worker    movd        [tlq+r3-48], m5 ; right padding: 4 copies starting at the scratch slot of edge[r3]
; Two passes of 32 pixels each: the first window is centred on the scratch
; copy of edge[32..63], the second (tlq lowered by 32) on edge[0..31].
; NOTE(review): .filter_edge's stores are outside this excerpt; the code after
; the calls addresses the result relative to the final tlq -- confirm where
; the filtered pixels land.
1690*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1691*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
1692*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1693*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
1694*c0909341SAndroid Build Coastguard Worker    jle .w32_main ; h <= 32: r3 keeps imin(h+31, 63)
; h > 32: produce extra tail pixels from the stashed edge[62]/edge[63] pair
; and move the last usable index up by 2.
1695*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, [rsp], q0000 ; replicate the low word (edge[62], edge[63])
1696*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base+z_filter_k_tail+4] ; 4 tail coefficient bytes (the r5 = 0 entry)
1697*c0909341SAndroid Build Coastguard Worker    add                 r3d, 2
1698*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
; NOTE(review): on x86-32 m4 is expected to hold the scale factor here,
; presumably left there by .filter_edge -- confirm.
1699*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1700*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1701*c0909341SAndroid Build Coastguard Worker%else
1702*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
1703*c0909341SAndroid Build Coastguard Worker%endif
1704*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
1705*c0909341SAndroid Build Coastguard Worker    movd           [tlq+64], m0 ; 4 bytes written at index 64 of the buffer
; Row generation. r3 = index of the last usable edge pixel.
1706*c0909341SAndroid Build Coastguard Worker.w32_main:
1707*c0909341SAndroid Build Coastguard Worker    add                 tlq, r3 ; tlq -> last usable edge pixel
1708*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
1709*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq] ; last edge pixel = padding value
1710*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; last index scaled to position units (6 fractional bits)
1711*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1712*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1713*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
1714*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6 ; broadcast the last index to all bytes
1715*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; replicate dx into every word lane (assumes pw_256 = words of 0x0100 -- TODO confirm)
1716*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; r5 = dx - (last << 6): row position relative to tlq, negative while inside the edge
1717*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6 ; broadcast the padding pixel to all 16 bytes
1718*c0909341SAndroid Build Coastguard Worker    psubb                m0, [base+pb_0to15] ; limits for columns 0..15 (assumes pb_0to15 = bytes 0..15 -- TODO confirm)
1719*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+pb_m16] ; presumably bytes of -16 -- TODO confirm
; The per-column limits are kept in the two lowest stack slots; [rsp] no
; longer needs the stashed tail pixels at this point.
1720*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0
1721*c0909341SAndroid Build Coastguard Worker    paddb                m0, m1
1722*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0 ; limits for columns 16..31
1723*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5 ; m6 = constant step, m5 = running position
1724*c0909341SAndroid Build Coastguard Worker.w32_loop:
1725*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1726*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (floor) of this row
; --- columns 0..15 ---
1727*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+16*0+0]
1728*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8, m5 ; m0 = position & m8 (fractional weight)
1729*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+16*0+1]
1730*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m0 ; m3 = m9 - frac (complementary weight)
1731*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
1732*c0909341SAndroid Build Coastguard Worker    por                  m3, m0 ; weight byte pairs, reused for both halves of the row
1733*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; interleave (pixel, neighbour)
1734*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3 ; pixel*(m9-frac) + neighbour*frac
1735*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1736*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1737*c0909341SAndroid Build Coastguard Worker    psrlw                m4, m5, 6 ; integer part of the position ...
1738*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10 ; rounded scale of the weighted sums
1739*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1740*c0909341SAndroid Build Coastguard Worker    packsswb             m4, m4 ; ... narrowed to bytes, reused for both limit tests
1741*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, [rsp+16*0], m4 ; column mask: limit > base
1742*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1743*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2 ; interpolated pixels where the mask is set ...
1744*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7 ; ... padding pixel elsewhere
1745*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
; --- columns 16..31 ---
1746*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+16*1+0]
1747*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+16*1+1]
1748*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
1749*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
1750*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
1751*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1752*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1753*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6 ; advance the position for the next row
1754*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1755*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1756*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, [rsp+16*1], m4
1757*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1758*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1759*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7
1760*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1761*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
1762*c0909341SAndroid Build Coastguard Worker    dec                  hd
1763*c0909341SAndroid Build Coastguard Worker    jz .w32_end
1764*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; x86inc macro: load stride unless it already lives in that register
1765*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1766*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1767*c0909341SAndroid Build Coastguard Worker    jl .w32_loop ; keep interpolating while the position is still negative
; Remaining rows receive the padding pixel only.
1768*c0909341SAndroid Build Coastguard Worker.w32_end_loop:
1769*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m7
1770*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m7
1771*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1772*c0909341SAndroid Build Coastguard Worker    dec                  hd
1773*c0909341SAndroid Build Coastguard Worker    jg .w32_end_loop
1774*c0909341SAndroid Build Coastguard Worker.w32_end:
1775*c0909341SAndroid Build Coastguard Worker    RET
; ---------------------------------------------------------------------------
; .w64: 64-pixel-wide path.
; Unless angle bit 0x400 is set, 128 edge bytes are copied to stack scratch
; (padded on both sides) and smoothed with three or four .filter_edge passes
; at r5 = 0. .w64_main/.w64_loop then emit one 64-pixel row per iteration as
; four 16-byte quarters.
; "edge[i]" below means the byte at [tlq+i] for the tlq value on entry.
; m8/m9/m10 are not initialised in this excerpt; from their use below they act
; as fraction mask, weight total and pmulhrsw scale factor.
; NOTE(review): presumably pw_62 / pw_64 / pw_512 -- confirm at the prologue.
; ---------------------------------------------------------------------------
1776*c0909341SAndroid Build Coastguard Worker.w64:
1777*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+63] ; index of the last usable edge pixel (no clamp applied in this block)
1778*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
1779*c0909341SAndroid Build Coastguard Worker    jnz .w64_main
1780*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq-1] ; 4 bytes starting at edge[-1]
1781*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+16*0]
1782*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*1]
1783*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*2]
1784*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+16*3]
1785*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0 ; edge[0..15]
1786*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1787*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m1 ; edge[16..31]
1788*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7 ; broadcast edge[-1]
1789*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m2 ; edge[32..47]
1790*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*6], m3 ; edge[48..63]
1791*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m4 ; left padding: 16 copies of edge[-1]
1792*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+16*4]
1793*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*5]
1794*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+16*6]
1795*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq+16*7]
1796*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq+r3] ; last usable edge pixel (right padding value)
1797*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*10] ; from here on tlq addresses stack scratch
1798*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*3], m0 ; edge[64..79]   -> rsp+16*7
1799*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d ; filter_strength = 3
1800*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*2], m1 ; edge[80..95]   -> rsp+16*8
1801*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7 ; broadcast edge[r3]
1802*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m2 ; edge[96..111]  -> rsp+16*9
1803*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m3 ; edge[112..127] -> rsp+16*10
1804*c0909341SAndroid Build Coastguard Worker    movd      [tlq+r3-16*7], m4 ; right padding: 4 copies starting at the scratch slot of edge[r3]
; Up to four passes of 32 pixels each, from the highest window downwards
; (tlq is lowered by 32 between passes).
; NOTE(review): .filter_edge's stores are outside this excerpt; the code after
; the calls addresses the result relative to the final tlq -- confirm where
; the filtered pixels land.
1805*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 64
1806*c0909341SAndroid Build Coastguard Worker    jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
1807*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1808*c0909341SAndroid Build Coastguard Worker.w64_filter96:
1809*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
1810*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1811*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
1812*c0909341SAndroid Build Coastguard Worker    call .filter_edge
1813*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
1814*c0909341SAndroid Build Coastguard Worker    call .filter_edge
; Row generation. r3 = index of the last usable edge pixel.
1815*c0909341SAndroid Build Coastguard Worker.w64_main:
1816*c0909341SAndroid Build Coastguard Worker    add                 tlq, r3 ; tlq -> last usable edge pixel
1817*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
1818*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq] ; last edge pixel = padding value
1819*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6 ; last index scaled to position units (6 fractional bits)
1820*c0909341SAndroid Build Coastguard Worker    movd                 m5, dxd
1821*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1822*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dxd
1823*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6 ; broadcast the last index to all bytes
1824*c0909341SAndroid Build Coastguard Worker    sub                  r5, r3 ; r5 = dx - (last << 6): row position relative to tlq, negative while inside the edge
1825*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; replicate dx into every word lane (assumes pw_256 = words of 0x0100 -- TODO confirm)
1826*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6 ; broadcast the padding pixel to all 16 bytes
1827*c0909341SAndroid Build Coastguard Worker    psubb                m0, [base+pb_0to15] ; limits for columns 0..15 (assumes pb_0to15 = bytes 0..15 -- TODO confirm)
1828*c0909341SAndroid Build Coastguard Worker    movddup              m1, [base+pb_m16] ; presumably bytes of -16 -- TODO confirm
; Per-column limits for the four quarters are kept at rsp+16*0..16*3.
; NOTE(review): in the filtered path these slots overlap the left padding and
; the unfiltered edge[0..15] copy written above; this is only safe if nothing
; below rsp+16*4 is read afterwards -- confirm against .filter_edge's output
; location.
1829*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0
1830*c0909341SAndroid Build Coastguard Worker    paddb                m0, m1
1831*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0 ; limits for columns 16..31
1832*c0909341SAndroid Build Coastguard Worker    paddb                m0, m1
1833*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0 ; limits for columns 32..47
1834*c0909341SAndroid Build Coastguard Worker    paddb                m0, m1
1835*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0 ; limits for columns 48..63
1836*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5 ; m6 = constant step, m5 = running position
1837*c0909341SAndroid Build Coastguard Worker.w64_loop:
1838*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
1839*c0909341SAndroid Build Coastguard Worker    sar                  r3, 6 ; integer pixel offset (floor) of this row
; --- columns 0..15 ---
1840*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+16*0+0]
1841*c0909341SAndroid Build Coastguard Worker    pand                 m0, m8, m5 ; m0 = position & m8 (fractional weight)
1842*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+16*0+1]
1843*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m0 ; m3 = m9 - frac (complementary weight)
1844*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
1845*c0909341SAndroid Build Coastguard Worker    por                  m3, m0 ; weight byte pairs, reused for all four quarters of the row
1846*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; interleave (pixel, neighbour)
1847*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3 ; pixel*(m9-frac) + neighbour*frac
1848*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1849*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1850*c0909341SAndroid Build Coastguard Worker    psrlw                m4, m5, 6 ; integer part of the position ...
1851*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10 ; rounded scale of the weighted sums
1852*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1853*c0909341SAndroid Build Coastguard Worker    packsswb             m4, m4 ; ... narrowed to bytes, reused for all four limit tests
1854*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, [rsp+16*0], m4 ; column mask: limit > base
1855*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1856*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2 ; interpolated pixels where the mask is set ...
1857*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7 ; ... padding pixel elsewhere
1858*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
; --- columns 16..31 ---
1859*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+16*1+0]
1860*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+16*1+1]
1861*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
1862*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
1863*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
1864*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1865*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1866*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1867*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1868*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, [rsp+16*1], m4
1869*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1870*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1871*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7
1872*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
; --- columns 32..47 ---
1873*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+16*2+0]
1874*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+16*2+1]
1875*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
1876*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
1877*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
1878*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1879*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1880*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1881*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1882*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, [rsp+16*2], m4
1883*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1884*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1885*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7
1886*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
; --- columns 48..63 ---
1887*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r3+16*3+0]
1888*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+r3+16*3+1]
1889*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
1890*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
1891*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
1892*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1893*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
1894*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6 ; advance the position for the next row
1895*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
1896*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
1897*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m2, [rsp+16*3], m4
1898*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1899*c0909341SAndroid Build Coastguard Worker    pand                 m0, m2
1900*c0909341SAndroid Build Coastguard Worker    pandn                m2, m7
1901*c0909341SAndroid Build Coastguard Worker    por                  m0, m2
1902*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
1903*c0909341SAndroid Build Coastguard Worker    dec                  hd
1904*c0909341SAndroid Build Coastguard Worker    jz .w64_end
1905*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp ; x86inc macro: load stride unless it already lives in that register
1906*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1907*c0909341SAndroid Build Coastguard Worker    add                  r5, dxq
1908*c0909341SAndroid Build Coastguard Worker    jl .w64_loop ; keep interpolating while the position is still negative
; Remaining rows receive the padding pixel only.
1909*c0909341SAndroid Build Coastguard Worker.w64_end_loop:
1910*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m7
1911*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m7
1912*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m7
1913*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m7
1914*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1915*c0909341SAndroid Build Coastguard Worker    dec                  hd
1916*c0909341SAndroid Build Coastguard Worker    jg .w64_end_loop
1917*c0909341SAndroid Build Coastguard Worker.w64_end:
1918*c0909341SAndroid Build Coastguard Worker    RET
1919*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1920*c0909341SAndroid Build Coastguard Worker.filter_edge: ; 32 pixels/iteration
; Local subroutine (entered with `call`, left with `ret`): smooths 32 edge
; bytes with a 3- or 5-tap filter whose coefficients come from z_filter_k.
;   In:  tlq - edge pointer. Bytes tlq-18 .. tlq+16 are read (tlq+17 as well
;              on the 5-tap path). The mova accesses to [tlq-16], [tlq+0] and
;              [tlq+16] require tlq to be 16-byte aligned.
;        r5  - selects the coefficient rows; r5 == 0 also enables the extra
;              (5th) tap.
;        m10 - multiplier for the final pmulhrsw; it is set up by the
;              enclosing function, which is outside this view.
;   Out: 32 filtered bytes stored at tlq+0 .. tlq+31. Output byte j is built
;        from the input bytes tlq-18+j .. tlq-14+j, so the result is written
;        16 bytes above the tap centre.
;        NOTE(review): presumably the callers position tlq to account for
;        this offset; they are not visible here - confirm.
;   Clobbers: m0-m7, flags.
; Register roles: m0/m2 accumulate outputs 0-7/8-15, m1/m3 outputs 16-23/24-31
; (16-bit words until the final pack).
1921*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*0] ; first coefficient pair, repeated across the register
1922*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-18] ; tap 0 for outputs 0-15
1923*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq-17] ; tap 1 for outputs 0-15
1924*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq- 2] ; tap 0 for outputs 16-31
1925*c0909341SAndroid Build Coastguard Worker    movu                 m4, [tlq- 1] ; tap 1 for outputs 16-31
1926*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2, m1   ; interleave taps 0/1 so pmaddubsw sums each pair
1927*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7       ; outputs 0-7:   p0*k0 + p1*k1
1928*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m1
1929*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7       ; outputs 8-15
1930*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3, m4
1931*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7       ; outputs 16-23
1932*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
1933*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7       ; outputs 24-31
1934*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*1] ; second coefficient pair (taps 2/3)
1935*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq-16] ; tap 2 for outputs 0-15 (aligned load)
1936*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq-15] ; tap 3 for outputs 0-15
1937*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6   ; m4 is free again: its tap-1 data was consumed above
1938*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7
1939*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6
1940*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7
1941*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4       ; accumulate taps 2/3 into outputs 0-7
1942*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5       ; ... and outputs 8-15
1943*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tlq+ 0] ; tap 2 for outputs 16-31 (aligned load)
1944*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+ 1] ; tap 3 for outputs 16-31
1945*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m6
1946*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7
1947*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m6
1948*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7
1949*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4       ; accumulate taps 2/3 into outputs 16-23
1950*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5       ; ... and outputs 24-31
1951*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1952*c0909341SAndroid Build Coastguard Worker    jnz .filter_end ; 3-tap: a non-zero r5 skips the extra tap below
1953*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*8] ; coefficient pair for the extra tap (r5 == 0 only)
1954*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq-14] ; tap 4 for outputs 0-15
1955*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+ 2] ; tap 4 for outputs 16-31
1956*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5, m5   ; duplicate each byte: pmaddubsw then applies both coefficients of the pair to the same pixel
1957*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7
1958*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m5
1959*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7
1960*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
1961*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
1962*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m6
1963*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7
1964*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m6
1965*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m7
1966*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
1967*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
1968*c0909341SAndroid Build Coastguard Worker.filter_end: ; scale the four word accumulators, pack to bytes, store
1969*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1970*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m2, m1, m3
1971*c0909341SAndroid Build Coastguard Worker%else
1972*c0909341SAndroid Build Coastguard Worker    mova                 m4, m10      ; x86-32 only has xmm0-7, so m10 must be an alias defined outside this view; fetch it into m4 once
1973*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m4 }, m0, m2, m1, m3
1974*c0909341SAndroid Build Coastguard Worker%endif
1975*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2       ; outputs 0-15, saturated to unsigned bytes
1976*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3       ; outputs 16-31
1977*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m0       ; stores happen only after every load above has been issued
1978*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*1], m1
1979*c0909341SAndroid Build Coastguard Worker    ret
1980*c0909341SAndroid Build Coastguard Worker
1981*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1982*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
1983*c0909341SAndroid Build Coastguard Worker    %define            base  r7-$$
1984*c0909341SAndroid Build Coastguard Worker    %define           maxwm  r6m
1985*c0909341SAndroid Build Coastguard Worker    %define           maxhm  r7m
1986*c0909341SAndroid Build Coastguard Worker    lea                  r7, [$$]
1987*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
1988*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_62]
1989*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_64]
1990*c0909341SAndroid Build Coastguard Worker    lea                 r9d, [wq-4]
1991*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pw_512]
1992*c0909341SAndroid Build Coastguard Worker    shl                 r9d, 6
1993*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+z1_shuf_w4]
1994*c0909341SAndroid Build Coastguard Worker    or                  r9d, hd
1995*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+z2_h_shuf]
1996*c0909341SAndroid Build Coastguard Worker%else
1997*c0909341SAndroid Build Coastguard Workercglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
1998*c0909341SAndroid Build Coastguard Worker    %define            base  r1-$$
1999*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pw_62]
2000*c0909341SAndroid Build Coastguard Worker    %define              m9  [base+pw_64]
2001*c0909341SAndroid Build Coastguard Worker    %define             m10  [base+pw_512]
2002*c0909341SAndroid Build Coastguard Worker    %define             m11  [rsp+16*16]
2003*c0909341SAndroid Build Coastguard Worker    %define             m12  [rsp+16*17]
2004*c0909341SAndroid Build Coastguard Worker    %define             r9b  byte [rsp+16*18+4*0]
2005*c0909341SAndroid Build Coastguard Worker    %define             r9d  dword [rsp+16*18+4*0]
2006*c0909341SAndroid Build Coastguard Worker    %define            r10d  dword [rsp+16*18+4*1]
2007*c0909341SAndroid Build Coastguard Worker    %define            r11d  dword [rsp+16*18+4*2]
2008*c0909341SAndroid Build Coastguard Worker    %define           maxwm  [rsp+16*18+4*3]
2009*c0909341SAndroid Build Coastguard Worker    %define           maxhm  [rsp+16*19+4*0]
2010*c0909341SAndroid Build Coastguard Worker    %define        stridemp  [rsp+16*19+4*1]
2011*c0909341SAndroid Build Coastguard Worker    %define         strideq  r3
2012*c0909341SAndroid Build Coastguard Worker    %define             dyd  r4
2013*c0909341SAndroid Build Coastguard Worker    %define             dyq  r4
2014*c0909341SAndroid Build Coastguard Worker    mov            stridemp, r1
2015*c0909341SAndroid Build Coastguard Worker    mov                 r1d, r6m
2016*c0909341SAndroid Build Coastguard Worker    mov                 r4d, r7m
2017*c0909341SAndroid Build Coastguard Worker    mov               maxwm, r1d
2018*c0909341SAndroid Build Coastguard Worker    mov               maxhm, r4d
2019*c0909341SAndroid Build Coastguard Worker    LEA                  r1, $$
2020*c0909341SAndroid Build Coastguard Worker    lea                  hd, [wq-4]
2021*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+z1_shuf_w4]
2022*c0909341SAndroid Build Coastguard Worker    shl                  hd, 6
2023*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+z2_h_shuf]
2024*c0909341SAndroid Build Coastguard Worker    or                   hd, hm
2025*c0909341SAndroid Build Coastguard Worker    mova                m11, m0
2026*c0909341SAndroid Build Coastguard Worker    mov                 r9d, hd
2027*c0909341SAndroid Build Coastguard Worker    mova                m12, m1
2028*c0909341SAndroid Build Coastguard Worker%endif
2029*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2030*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
2031*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [base+ipred_z2_ssse3_table+wq*4]
2032*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2033*c0909341SAndroid Build Coastguard Worker    movzx               dxd, angleb
2034*c0909341SAndroid Build Coastguard Worker%else
2035*c0909341SAndroid Build Coastguard Worker    movzx               dxd, byte anglem
2036*c0909341SAndroid Build Coastguard Worker%endif
2037*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
2038*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16*4]
2039*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dxd
2040*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-16*3]
2041*c0909341SAndroid Build Coastguard Worker    neg                 dxq
2042*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*2]
2043*c0909341SAndroid Build Coastguard Worker    and                 dyd, ~1
2044*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*1]
2045*c0909341SAndroid Build Coastguard Worker    and                 dxq, ~1
2046*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq]
2047*c0909341SAndroid Build Coastguard Worker    movu                 m5, [tlq+16*0+1]
2048*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+16*1+1]
2049*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [base+dr_intra_derivative+dyq-90]  ; angle - 90
2050*c0909341SAndroid Build Coastguard Worker    movzx               dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
2051*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
2052*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2053*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m1
2054*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7
2055*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m2
2056*c0909341SAndroid Build Coastguard Worker    lea                  wq, [base+ipred_z2_ssse3_table+wq]
2057*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m3
2058*c0909341SAndroid Build Coastguard Worker    neg                 dxd
2059*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*6], m4
2060*c0909341SAndroid Build Coastguard Worker    or                  dyd, 4<<16
2061*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*7], m4
2062*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*8], m5
2063*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*9], m6
2064*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+z_base_inc+2]
2065*c0909341SAndroid Build Coastguard Worker    movsldup             m1, [base+z2_dy_offset]
2066*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+pw_256] ; 4<<6
2067*c0909341SAndroid Build Coastguard Worker    movq    [rsp+16*14+8*0], m0
2068*c0909341SAndroid Build Coastguard Worker    movq    [rsp+16*15+8*0], m1
2069*c0909341SAndroid Build Coastguard Worker    movq    [rsp+16*15+8*1], m2
2070*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2071*c0909341SAndroid Build Coastguard Worker    lea                r10d, [dxq+(128<<6)] ; xpos
2072*c0909341SAndroid Build Coastguard Worker%else
2073*c0909341SAndroid Build Coastguard Worker    mov      [rsp+16*7+4*1], dyd
2074*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [dxq+(128<<6)]
2075*c0909341SAndroid Build Coastguard Worker    mov                r10d, r4d
2076*c0909341SAndroid Build Coastguard Worker    movzx                hd, r9b
2077*c0909341SAndroid Build Coastguard Worker%endif
2078*c0909341SAndroid Build Coastguard Worker    mov                r11d, (128-4)<<6
2079*c0909341SAndroid Build Coastguard Worker    jmp                  wq
2080*c0909341SAndroid Build Coastguard Worker.w4:
2081*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2082*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
2083*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+4]
2084*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
2085*c0909341SAndroid Build Coastguard Worker    add              angled, 1022
2086*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m7
2087*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
2088*c0909341SAndroid Build Coastguard Worker    movd       [rsp+16*8+4], m5
2089*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
2090*c0909341SAndroid Build Coastguard Worker    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
2091*c0909341SAndroid Build Coastguard Worker    call .upsample_above
2092*c0909341SAndroid Build Coastguard Worker    sub              angled, 1075 ; angle - 53
2093*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
2094*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
2095*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
2096*c0909341SAndroid Build Coastguard Worker    movd                 m6, angled
2097*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2098*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7
2099*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m7
2100*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, [base+z_filter_wh4]
2101*c0909341SAndroid Build Coastguard Worker    pand                 m6, m0
2102*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m6, [base+z_filter_t_w48+angleq*8]
2103*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
2104*c0909341SAndroid Build Coastguard Worker.upsample_above: ; w4/w8
; Local subroutine of ipred_z2 (reached through `call .upsample_above` from
; the .w4 and .w8 paths): doubles the resolution of the top-edge copy kept on
; the stack and rescales the horizontal stepping state to match.
; Stack slots are addressed as rsp+gprsize+X here where the function body uses
; rsp+X; this presumably compensates for the return address pushed by `call`.
;   In:  [rsp+16*8]  top edge, copied from tlq+1 by the function body; the
;                    bytes just below it ([rsp+16*7]) hold the replicated
;                    top-left pixel.
;        m10         pw_512 (register on x86-64, memory alias on x86-32).
;        dxd         horizontal step; r10d / [rsp+16*18+4*1] = xpos.
;   Out: [rsp+16*8]  16 bytes alternating interpolated and original pixels.
;        dxd doubled; the four words at [rsp+16*14] doubled; xpos advanced by
;        the old dx plus 1<<6; r11d = (128-7)<<6; m11 = pb_0to15.
;   Clobbers: m0-m5, flags, and r3 on x86-32.
2105*c0909341SAndroid Build Coastguard Worker    movq                 m3, [rsp+gprsize+16*8-2] ; p[i-2]
2106*c0909341SAndroid Build Coastguard Worker    movq                 m1, [rsp+gprsize+16*8-1] ; p[i-1]
2107*c0909341SAndroid Build Coastguard Worker    movq                 m0, [rsp+gprsize+16*8+0] ; p[i]
2108*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+gprsize+16*8+1] ; p[i+1]
2109*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pb_36_m4] ; presumably byte pairs (36, -4), going by the name - confirm
2110*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3       ; pairs (p[i-1], p[i-2])
2111*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m0, m4   ; pairs (p[i], p[i+1])
2112*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
2113*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
2114*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2115*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pb_0to15] ; replaces the z1_shuf_w4 shuffle loaded in the prologue
2116*c0909341SAndroid Build Coastguard Worker    lea                r10d, [r10+dxq+(1<<6)] ; xpos += dx + (1<<6), using dx before it is doubled
2117*c0909341SAndroid Build Coastguard Worker    mov                r11d, (128-7)<<6       ; was (128-4)<<6 in the non-upsampled setup
2118*c0909341SAndroid Build Coastguard Worker%else
2119*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+pb_0to15]  ; m3 is free: p[i-2] was consumed above
2120*c0909341SAndroid Build Coastguard Worker    mov                 r3d, [rsp+gprsize+16*18+4*1] ; stack slot that backs r10d (xpos) on x86-32
2121*c0909341SAndroid Build Coastguard Worker    mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 ; stack slot that backs r11d
2122*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r3+dxq+(1<<6)]  ; xpos += dx + (1<<6)
2123*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+16*18+4*1], r3d
2124*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*16], m3              ; stack slot that backs m11
2125*c0909341SAndroid Build Coastguard Worker%endif
2126*c0909341SAndroid Build Coastguard Worker    add                 dxd, dxd      ; dx *= 2
2127*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2       ; sum of the two pmaddubsw pair products (the 4-tap result)
2128*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10      ; rounded >>6, assuming pw_512 holds words of 512
2129*c0909341SAndroid Build Coastguard Worker    movq                 m2, [rsp+gprsize+16*14] ; four words initialised from z_base_inc+2 in the function body
2130*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2       ; doubled, like dx
2131*c0909341SAndroid Build Coastguard Worker    movq [rsp+gprsize+16*14], m2
2132*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1       ; saturate the interpolated samples to unsigned bytes
2133*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0       ; interleave: interpolated, original, interpolated, ...
2134*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*8], m1       ; overwrite the top-edge copy with the upsampled row
2135*c0909341SAndroid Build Coastguard Worker    ret
2136*c0909341SAndroid Build Coastguard Worker.w4_no_upsample_above:
2137*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+3]
2138*c0909341SAndroid Build Coastguard Worker    mov               [rsp], angled
2139*c0909341SAndroid Build Coastguard Worker    sub              angled, 1112 ; angle - 90
2140*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
2141*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 90
2142*c0909341SAndroid Build Coastguard Worker    movd                 m1, angled
2143*c0909341SAndroid Build Coastguard Worker    sub                 r3d, angled ; 180 - angle
2144*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2145*c0909341SAndroid Build Coastguard Worker    movu                 m3, [base+z_filter_wh4]
2146*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_filter_t_w48+angleq*8]
2147*c0909341SAndroid Build Coastguard Worker    call .w8_filter_top
2148*c0909341SAndroid Build Coastguard Worker    mov              angled, [rsp]
2149*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+2]
2150*c0909341SAndroid Build Coastguard Worker    sub              angled, 139
2151*c0909341SAndroid Build Coastguard Worker    shl                 r3d, 6
2152*c0909341SAndroid Build Coastguard Worker    test                r3d, angled
2153*c0909341SAndroid Build Coastguard Worker    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
2154*c0909341SAndroid Build Coastguard Worker.upsample_left: ; w4/w8
2155*c0909341SAndroid Build Coastguard Worker    neg                  hq
2156*c0909341SAndroid Build Coastguard Worker    movd                 m0, [tlq+hq]
2157*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7
2158*c0909341SAndroid Build Coastguard Worker    movd    [rsp+16*6+hq-4], m0
2159*c0909341SAndroid Build Coastguard Worker    movq                 m3, [rsp+16*5+7]
2160*c0909341SAndroid Build Coastguard Worker    movq                 m0, [rsp+16*5+8]
2161*c0909341SAndroid Build Coastguard Worker    movq                 m2, [rsp+16*5+9]
2162*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+16*5+10]
2163*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+pb_36_m4]
2164*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0, m3
2165*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
2166*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
2167*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
2168*c0909341SAndroid Build Coastguard Worker    movshdup             m3, [base+z2_dy_offset]
2169*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2170*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+z2_upsample]
2171*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
2172*c0909341SAndroid Build Coastguard Worker%else
2173*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z2_upsample]
2174*c0909341SAndroid Build Coastguard Worker    shl dword [rsp+16*7+4*1], 1
2175*c0909341SAndroid Build Coastguard Worker    mova                m12, m4
2176*c0909341SAndroid Build Coastguard Worker%endif
2177*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2178*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2179*c0909341SAndroid Build Coastguard Worker    movq        [rsp+16*15], m3
2180*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
2181*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1
2182*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m0
2183*c0909341SAndroid Build Coastguard Worker.w4_main:
2184*c0909341SAndroid Build Coastguard Worker    movd                 m6, dxd
2185*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2186*c0909341SAndroid Build Coastguard Worker    movd                 m3, dyd
2187*c0909341SAndroid Build Coastguard Worker%else
2188*c0909341SAndroid Build Coastguard Worker    movd                 m3, [rsp+16*7+4*1]
2189*c0909341SAndroid Build Coastguard Worker%endif
2190*c0909341SAndroid Build Coastguard Worker    movddup              m0, [rsp+16*14+8*0]
2191*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+pw_256]
2192*c0909341SAndroid Build Coastguard Worker    paddw                m7, m6, m6
2193*c0909341SAndroid Build Coastguard Worker    movq                 m5, [base+pw_m1to4]
2194*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m3, q0000
2195*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7
2196*c0909341SAndroid Build Coastguard Worker    pmullw               m4, m5
2197*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q1111
2198*c0909341SAndroid Build Coastguard Worker    paddw                m6, m0
2199*c0909341SAndroid Build Coastguard Worker    mov                 r2d, r10d
2200*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m4, q3333
2201*c0909341SAndroid Build Coastguard Worker    psubw                m4, [rsp+16*15]
2202*c0909341SAndroid Build Coastguard Worker    movq     [rsp+16*6+8*1], m3
2203*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*1], m0 ; dy*4
2204*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
2205*c0909341SAndroid Build Coastguard Worker.w4_loop0:
2206*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m6
2207*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*0], m4
2208*c0909341SAndroid Build Coastguard Worker    pand                 m0, m4, m8
2209*c0909341SAndroid Build Coastguard Worker    psraw                m4, 6
2210*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m0
2211*c0909341SAndroid Build Coastguard Worker    psllw                m0, 8
2212*c0909341SAndroid Build Coastguard Worker    por                  m0, m1       ; 64-frac_y, frac_y
2213*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*3], m0
2214*c0909341SAndroid Build Coastguard Worker    pabsw                m4, m4
2215*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*2], m4
2216*c0909341SAndroid Build Coastguard Worker    movzx                hd, r9b
2217*c0909341SAndroid Build Coastguard Worker.w4_loop:
2218*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2219*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6        ; base_x0
2220*c0909341SAndroid Build Coastguard Worker    movq                 m0, [rsp+r2]
2221*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2222*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; base_x1
2223*c0909341SAndroid Build Coastguard Worker    movhps               m0, [rsp+r3]
2224*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [r2+dxq]
2225*c0909341SAndroid Build Coastguard Worker    shr                 r2d, 6        ; base_x2
2226*c0909341SAndroid Build Coastguard Worker    movq                 m1, [rsp+r2]
2227*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r3+dxq]
2228*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 6        ; base_x3
2229*c0909341SAndroid Build Coastguard Worker    movhps               m1, [rsp+r3]
2230*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m6
2231*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m6, m7
2232*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m2
2233*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2234*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m11
2235*c0909341SAndroid Build Coastguard Worker    por                  m2, m3
2236*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
2237*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5
2238*c0909341SAndroid Build Coastguard Worker    psubw                m3, m9, m2
2239*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2240*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m11
2241*c0909341SAndroid Build Coastguard Worker    por                  m2, m3
2242*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
2243*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 127 ; topleft
2244*c0909341SAndroid Build Coastguard Worker    jge .w4_toponly
2245*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*2+0] ; base_y0
2246*c0909341SAndroid Build Coastguard Worker    movq                 m3, [rsp+r3]
2247*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*2+2] ; base_y1
2248*c0909341SAndroid Build Coastguard Worker    movhps               m3, [rsp+r3]
2249*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*2+4] ; base_y2
2250*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+r3]
2251*c0909341SAndroid Build Coastguard Worker    movzx               r3d, byte [rsp+8*2+6] ; base_y3
2252*c0909341SAndroid Build Coastguard Worker    movhps               m4, [rsp+r3]
2253*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m12
2254*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
2255*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
2256*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
2257*c0909341SAndroid Build Coastguard Worker    movddup              m4, [rsp+8*3]
2258*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
2259*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4
2260*c0909341SAndroid Build Coastguard Worker    psraw                m6, 15       ; base_x < topleft
2261*c0909341SAndroid Build Coastguard Worker    pand                 m2, m6
2262*c0909341SAndroid Build Coastguard Worker    pandn                m6, m0
2263*c0909341SAndroid Build Coastguard Worker    por                  m0, m2, m6
2264*c0909341SAndroid Build Coastguard Worker    psraw                m6, m5, 15
2265*c0909341SAndroid Build Coastguard Worker    pand                 m3, m6
2266*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
2267*c0909341SAndroid Build Coastguard Worker    por                  m1, m3, m6
2268*c0909341SAndroid Build Coastguard Worker.w4_toponly:
2269*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2270*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2271*c0909341SAndroid Build Coastguard Worker    movifnidn       strideq, stridemp
2272*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2273*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
2274*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
2275*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
2276*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2277*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
2278*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
2279*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
2280*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m0
2281*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2282*c0909341SAndroid Build Coastguard Worker    jz .w4_end
2283*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+8*2]
2284*c0909341SAndroid Build Coastguard Worker    movq                 m3, [rsp+16*6+8*1]
2285*c0909341SAndroid Build Coastguard Worker    paddw                m6, m5, m7   ; xpos += dx
2286*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3
2287*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*2], m4
2288*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2289*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, r11d
2290*c0909341SAndroid Build Coastguard Worker    jge .w4_loop
2291*c0909341SAndroid Build Coastguard Worker    movddup              m5, [rsp+8*3]
2292*c0909341SAndroid Build Coastguard Worker.w4_leftonly_loop:
2293*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*2+0] ; base_y0
2294*c0909341SAndroid Build Coastguard Worker    movq                 m1, [rsp+r2]
2295*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*2+2] ; base_y1
2296*c0909341SAndroid Build Coastguard Worker    movhps               m1, [rsp+r2]
2297*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*2+4] ; base_y2
2298*c0909341SAndroid Build Coastguard Worker    movq                 m2, [rsp+r2]
2299*c0909341SAndroid Build Coastguard Worker    movzx               r2d, byte [rsp+8*2+6] ; base_y3
2300*c0909341SAndroid Build Coastguard Worker    movhps               m2, [rsp+r2]
2301*c0909341SAndroid Build Coastguard Worker    psubw                m4, m3
2302*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
2303*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12
2304*c0909341SAndroid Build Coastguard Worker    movq          [rsp+8*2], m4
2305*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
2306*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
2307*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
2308*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
2309*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2310*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2311*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2312*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
2313*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
2314*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
2315*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2316*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
2317*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
2318*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
2319*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m0
2320*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2321*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
2322*c0909341SAndroid Build Coastguard Worker    jg .w4_leftonly_loop
; .w4_end: reached when one 4-pixel-wide column strip has been written.
; r9d is decremented by 1<<8 per strip; once the result goes negative there
; is nothing left to do and the function returns through .w4_ret.
; Otherwise the per-strip state is advanced and control re-enters .w4_loop0:
;   - r5 += 4 and becomes the new dstq (presumably r5 tracks the top of the
;     current strip; it is set up outside this excerpt -- TODO confirm)
;   - m4 (base_y) and m6 (base_x) are reloaded from their stack slots and
;     stepped as described by the inline comments
;   - r10d accumulates the word stored at [rsp+16*15+8*1]
2323*c0909341SAndroid Build Coastguard Worker.w4_end:
2324*c0909341SAndroid Build Coastguard Worker    sub                 r9d, 1<<8
2325*c0909341SAndroid Build Coastguard Worker    jl .w4_ret
2326*c0909341SAndroid Build Coastguard Worker    movq                 m4, [rsp+8*1]
2327*c0909341SAndroid Build Coastguard Worker    add                  r5, 4
2328*c0909341SAndroid Build Coastguard Worker    mov                dstq, r5
2329*c0909341SAndroid Build Coastguard Worker    paddw                m4, [rsp+8*0] ; base_y += 4*dy
; The same stack qword supplies both the scalar (low word -> r2d) and the
; vector (broadcast to both halves of m6) copy of the x step.
2330*c0909341SAndroid Build Coastguard Worker    movzx               r2d, word [rsp+16*15+8*1]
2331*c0909341SAndroid Build Coastguard Worker    movddup              m6, [rsp+16*15+8*1]
2332*c0909341SAndroid Build Coastguard Worker    paddw                m6, [rsp+16*12] ; base_x += (4 << upsample_above)
2333*c0909341SAndroid Build Coastguard Worker    add                 r2d, r10d
2334*c0909341SAndroid Build Coastguard Worker    mov                r10d, r2d
2335*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop0
; Common exit; RET is the x86inc epilogue macro (restores registers/stack).
2336*c0909341SAndroid Build Coastguard Worker.w4_ret:
2337*c0909341SAndroid Build Coastguard Worker    RET
; .w8: edge preparation for blocks that are 8 pixels wide.
; If bit 0x400 of angled is set, no edge processing is done here and control
; goes straight to .w4_main. Otherwise the top edge is either upsampled
; (.upsample_above) or smoothed (.w8_filter_top), and the left edge is then
; either upsampled (.upsample_left) or smoothed (.filter_left).
; m7 is used as a pshufb control throughout; it is initialised outside this
; excerpt (presumably all-zero so that byte 0 is broadcast -- TODO confirm).
2338*c0909341SAndroid Build Coastguard Worker.w8:
2339*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2340*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
; Pad the stacked top edge: the bytes read from tlq+8 are shuffled by m7 and
; written right after the first 8 top pixels at [rsp+16*8+8].
2341*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+8]
; r3d = angle+126 with its low byte replaced by h, so the single unsigned
; compare against 8 below tests all conditions of the branch comment at once.
2342*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [angleq+126]
2343*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m7
2344*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2345*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2346*c0909341SAndroid Build Coastguard Worker%else
; x86-32 variant of the byte insert above (clear the low byte, then OR in h).
2347*c0909341SAndroid Build Coastguard Worker    xor                 r3b, r3b
2348*c0909341SAndroid Build Coastguard Worker    or                  r3d, hd
2349*c0909341SAndroid Build Coastguard Worker%endif
2350*c0909341SAndroid Build Coastguard Worker    movd       [rsp+16*8+8], m5
2351*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2352*c0909341SAndroid Build Coastguard Worker    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
; Top edge is upsampled. Afterwards only the left-edge filter mask (m6) has
; to be computed: m0 = broadcast(h+7) is matched against z_filter_wh8 and the
; broadcast angle term is compared with the thresholds from z_filter_t_w48.
2353*c0909341SAndroid Build Coastguard Worker    call .upsample_above
2354*c0909341SAndroid Build Coastguard Worker    sub              angled, 53
2355*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2356*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x7f ; 180 - angle
2357*c0909341SAndroid Build Coastguard Worker    movu                 m1, [base+z_filter_wh8]
2358*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
2359*c0909341SAndroid Build Coastguard Worker    movd                 m6, angled
2360*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2361*c0909341SAndroid Build Coastguard Worker    psrldq               m2, [base+z_filter_t_w48+angleq*8], 4
2362*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m7
2363*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m7
2364*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, m1
2365*c0909341SAndroid Build Coastguard Worker    pand                 m6, m0
2366*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m6, m2
; Store the x step for the upsampled edge. On x86-32 m10 is not a register
; here, so the value is bounced through m0 first.
2367*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2368*c0909341SAndroid Build Coastguard Worker    movq    [rsp+16*15+8*1], m10 ; 8<<6
2369*c0909341SAndroid Build Coastguard Worker%else
2370*c0909341SAndroid Build Coastguard Worker    movq                 m0, m10
2371*c0909341SAndroid Build Coastguard Worker    movq    [rsp+16*15+8*1], m0
2372*c0909341SAndroid Build Coastguard Worker%endif
2373*c0909341SAndroid Build Coastguard Worker    jmp .w8_filter_left
; No top upsampling: set up the inputs of .w8_filter_top
;   m0 = h+7, m1 = angle-90, r3d = 180-angle, m3 = z_filter_wh8,
;   m4 = thresholds from z_filter_t_w48 (shifted right by 4 bytes)
; The unmodified angle is parked at [rsp] so it can be re-read after the call.
2374*c0909341SAndroid Build Coastguard Worker.w8_no_upsample_above:
2375*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+7]
2376*c0909341SAndroid Build Coastguard Worker    mov               [rsp], angled
2377*c0909341SAndroid Build Coastguard Worker    sub              angled, 90
2378*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
2379*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 90
2380*c0909341SAndroid Build Coastguard Worker    movd                 m1, angled
2381*c0909341SAndroid Build Coastguard Worker    sub                 r3d, angled ; 180 - angle
2382*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2383*c0909341SAndroid Build Coastguard Worker    movu                 m3, [base+z_filter_wh8]
2384*c0909341SAndroid Build Coastguard Worker    psrldq               m4, [base+z_filter_t_w48+angleq*8], 4
2385*c0909341SAndroid Build Coastguard Worker    call .w8_filter_top
; Same packed-compare trick as above, now deciding on left-edge upsampling:
; r3d = (angle-141) with the low byte replaced by h.
2386*c0909341SAndroid Build Coastguard Worker    mov                 r3d, [rsp]
2387*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 141
2388*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2389*c0909341SAndroid Build Coastguard Worker    mov                 r3b, hb
2390*c0909341SAndroid Build Coastguard Worker%else
2391*c0909341SAndroid Build Coastguard Worker    xor                 r3b, r3b
2392*c0909341SAndroid Build Coastguard Worker    or                  r3d, hd
2393*c0909341SAndroid Build Coastguard Worker%endif
2394*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2395*c0909341SAndroid Build Coastguard Worker    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
; m6 is the left-edge comparison mask. An all-zero mask means no left
; filtering; otherwise the mask is collapsed into a filter strength and
; .filter_left is entered with r5 = strength-3 and r3 = edge pointer (tlq).
2396*c0909341SAndroid Build Coastguard Worker.w8_filter_left:
2397*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m6
2398*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2399*c0909341SAndroid Build Coastguard Worker    jz .w4_main
2400*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
2401*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq
2402*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
2403*c0909341SAndroid Build Coastguard Worker    sub                  r5, 3 ; filter_strength-3
2404*c0909341SAndroid Build Coastguard Worker    jmp .filter_left
; .w8_filter_top: local subroutine (entered via call, leaves via ret) that
; smooths the 8 top-edge pixels kept at [rsp+16*8] in the caller's frame.
; Because the call pushed a return address, every stack reference in here is
; offset by gprsize.
; In:  m0 = h+7, m1 = angle-90, r3d = 180-angle (scalars in the low dword),
;      m3 = z_filter_wh8, m4 = filter thresholds, m7 = pshufb control,
;      tlq = source edge pointer
; Out: m6 = left-edge comparison mask (consumed by .w8_filter_left),
;      8 top pixels at [rsp+16*8] (caller view) filtered if strength != 0
; Clobbers m0-m2, m7, r3d, r5 as written below.
2405*c0909341SAndroid Build Coastguard Worker.w8_filter_top:
2406*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3d
2407*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m7}, m0, m1, m6
; m0 becomes a byte mask of the entries of m3 that equal h+7; it gates both
; the top (m1) and the left (m6) threshold comparison.
2408*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, m3
2409*c0909341SAndroid Build Coastguard Worker    pand                 m1, m0
2410*c0909341SAndroid Build Coastguard Worker    pand                 m6, m0
2411*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4
2412*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m6, m4
2413*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
2414*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2415*c0909341SAndroid Build Coastguard Worker    jz .w8_filter_top_end ; filter_strength == 0
; Collapse the mask into a strength and use it to index z_filter_k. Pixels
; are combined in pairs with pmaddubsw: (p[-2],p[-1]), (p[0],p[+1]) and
; (p[+2],p[+2]), each pair with its own coefficient row (24 bytes apart).
2416*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
2417*c0909341SAndroid Build Coastguard Worker    movq                 m0, [rsp+gprsize+16*8-2]
2418*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
2419*c0909341SAndroid Build Coastguard Worker    movq                 m1, [rsp+gprsize+16*8-1]
2420*c0909341SAndroid Build Coastguard Worker    sub                  r5, 3 ; filter_strength-3
2421*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*0]
2422*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1
2423*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2424*c0909341SAndroid Build Coastguard Worker    movq                 m1, [rsp+gprsize+16*8+0]
2425*c0909341SAndroid Build Coastguard Worker    movq                 m2, [rsp+gprsize+16*8+1]
2426*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*1]
2427*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2
2428*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2429*c0909341SAndroid Build Coastguard Worker    movq                 m2, [rsp+gprsize+16*8+2]
2430*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*2]
2431*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m2
2432*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2433*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2434*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
; Fetch maxw. The stack-argument slot differs from the one used outside the
; subroutine because of the pushed return address.
2435*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2436*c0909341SAndroid Build Coastguard Worker    mov                 r3d, r7m ; maxw, offset due to call
2437*c0909341SAndroid Build Coastguard Worker%else
2438*c0909341SAndroid Build Coastguard Worker    mov                 r3d, [rsp+gprsize+16*18+4*3]
2439*c0909341SAndroid Build Coastguard Worker%endif
; Round the sums via pmulhrsw with m10 (presumably pw_512, i.e. (x+32)>>6 --
; m10 is defined outside this excerpt) and pack back to bytes.
; NOTE(review): only the low 8 bytes of the packed result are stored below,
; so the pmulhrsw on m1 appears to have no effect on the output.
2440*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2441*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2442*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2443*c0909341SAndroid Build Coastguard Worker    movq [rsp+gprsize+16*8], m0
; When maxw < 8, overwrite the stacked edge from index maxw onward with the
; unfiltered source pixels starting at tlq[maxw+1].
2444*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 8
2445*c0909341SAndroid Build Coastguard Worker    jge .w8_filter_top_end
2446*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq+r3+1]
2447*c0909341SAndroid Build Coastguard Worker    movq [rsp+gprsize+r3+16*8], m0
2448*c0909341SAndroid Build Coastguard Worker.w8_filter_top_end:
2449*c0909341SAndroid Build Coastguard Worker    ret
; .w16: edge preparation for blocks that are 16 pixels wide.
; If bit 0x400 of angled is set, control goes straight to .w4_main.
; Otherwise the top (m1) and left (m6) strength masks are computed from
; h+15, angle-90 and 180-angle against z_filter_wh16 / z_filter_t_w16; the
; 16 top pixels at [rsp+16*8] are smoothed in place when the top strength is
; non-zero, and control continues with the left edge.
; r3 keeps the original tlq for the "restore unfiltered pixels" steps.
2450*c0909341SAndroid Build Coastguard Worker.w16:
2451*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2452*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
2453*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+15]
2454*c0909341SAndroid Build Coastguard Worker    sub              angled, 90
2455*c0909341SAndroid Build Coastguard Worker    movd                 m0, r3d
2456*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 90
2457*c0909341SAndroid Build Coastguard Worker    movd                 m1, angled
2458*c0909341SAndroid Build Coastguard Worker    sub                 r3d, angled ; 180 - angle
2459*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2460*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3d
2461*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m7}, m0, m1, m6
2462*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+z_filter_t_w16+angleq*4]
2463*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, [base+z_filter_wh16]
2464*c0909341SAndroid Build Coastguard Worker    pand                 m1, m0
2465*c0909341SAndroid Build Coastguard Worker    pand                 m6, m0
2466*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m3
2467*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m6, m3
2468*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
2469*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq
2470*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2471*c0909341SAndroid Build Coastguard Worker    jz .w16_filter_left ; filter_strength == 0
; Mask -> strength: shr leaves the last bit shifted out (bit 29) in CF and
; the adc folds it into the result, yielding r5 = filter_strength-3.
; m5 is shuffled and stored just past the 16 top pixels as edge padding
; (m5 itself is loaded outside this excerpt).
2472*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924
2473*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+z_filter_t_w16] ; tlq[16]
2474*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
2475*c0909341SAndroid Build Coastguard Worker    adc                  r5, -4 ; filter_strength-3
2476*c0909341SAndroid Build Coastguard Worker    movd         [rsp+16*9], m5
; Pair (p[-2],p[-1]) for all 16 positions: low half in m0, high half in m1.
2477*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*0]
2478*c0909341SAndroid Build Coastguard Worker    movu                 m1, [rsp+16*8-2]
2479*c0909341SAndroid Build Coastguard Worker    movu                 m2, [rsp+16*8-1]
2480*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
2481*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2482*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
2483*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
; Pair (p[0],p[+1]), accumulated into m0/m1.
2484*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*2+r5*8+24*1]
2485*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*8+0]
2486*c0909341SAndroid Build Coastguard Worker    movu                 m4, [rsp+16*8+1]
2487*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
2488*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2489*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
2490*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
2491*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2492*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
; Only strength 3 (r5 == 0) adds the p[+2] term; other strengths skip it.
2493*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2494*c0909341SAndroid Build Coastguard Worker    jnz .w16_filter_end ; 3-tap
2495*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k+8*8]
2496*c0909341SAndroid Build Coastguard Worker    movu                 m3, [rsp+16*8+2]
2497*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m3
2498*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2499*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m3
2500*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
2501*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2502*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
; Round, pack and store the 16 filtered pixels; if maxw < 16, restore the
; unfiltered source pixels from index maxw onward (r3 = original tlq).
2503*c0909341SAndroid Build Coastguard Worker.w16_filter_end:
2504*c0909341SAndroid Build Coastguard Worker    mov                 r2d, maxwm
2505*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2506*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2507*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2508*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*8], m0
2509*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, 16
2510*c0909341SAndroid Build Coastguard Worker    jge .w16_filter_left
2511*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r3+r2+1]
2512*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*8], m0
; Left edge: a zero mask in m6 means no filtering; otherwise enter
; .filter_left with r5 = filter_strength-3 (same shr/adc trick as above).
2513*c0909341SAndroid Build Coastguard Worker.w16_filter_left:
2514*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m6
2515*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2516*c0909341SAndroid Build Coastguard Worker    jz .w4_main
2517*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924
2518*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30
2519*c0909341SAndroid Build Coastguard Worker    adc                  r5, -4 ; filter_strength-3
2520*c0909341SAndroid Build Coastguard Worker    jmp .filter_left
; .w32: edge preparation for blocks that are 32 pixels wide.
; If bit 0x400 of angled is set, control goes straight to .w4_main.
; Otherwise both edges are always filtered with strength 3 (r5d = 0), using
; the shared .filter_edge helper of the z1 predictor. r3 keeps the original
; tlq while tlq is repointed into the stack buffer for the helper.
2521*c0909341SAndroid Build Coastguard Worker.w32:
2522*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2523*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
; Padding past the 32 top pixels; m6 is loaded outside this excerpt.
2524*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [base+z_filter_t_w16] ; tlq[32]
2525*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq
2526*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*9]
2527*c0909341SAndroid Build Coastguard Worker    movd         [tlq+16*1], m6
2528*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d ; filter_strength = 3
2529*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
; Copy the helper's 32 output bytes from [tlq] to the top-edge slots.
; NOTE(review): the helper's exact in/out contract (including whether it
; moves tlq) is defined in ipred_z1 and is not visible here.
2530*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq+16*0]
2531*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq+16*1]
2532*c0909341SAndroid Build Coastguard Worker    mov                 r2d, maxwm
2533*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*8], m0
2534*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*9], m1
; If maxw < 32, restore 32 unfiltered source bytes from index maxw onward.
2535*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, 32
2536*c0909341SAndroid Build Coastguard Worker    jge .filter_left
2537*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r3+r2+16*0+1]
2538*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2+16*1+1]
2539*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*8], m0
2540*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*9], m1
2541*c0909341SAndroid Build Coastguard Worker    jmp .filter_left
; .w64: edge preparation for blocks that are 64 pixels wide.
; The upper 32 top pixels (tlq[33..64]) are first copied to [rsp+16*10] and
; [rsp+16*11] unconditionally. If bit 0x400 of angled is set, control then
; goes straight to .w4_main. Otherwise the whole 64-pixel top edge is
; filtered with strength 3 in two 32-byte passes of the z1 .filter_edge
; helper, after which control falls through into .filter_left.
2542*c0909341SAndroid Build Coastguard Worker.w64:
2543*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+16*2+1]
2544*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16*3+1]
2545*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m0
2546*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m1
2547*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2548*c0909341SAndroid Build Coastguard Worker    jnz .w4_main
; Padding past the last top pixel is derived from m1 (the bytes just loaded
; from tlq+16*3+1). r3 keeps the original tlq.
2549*c0909341SAndroid Build Coastguard Worker    pshufb               m1, [base+z_filter_t_w16] ; tlq[64]
2550*c0909341SAndroid Build Coastguard Worker    mov                  r3, tlq
2551*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*11]
2552*c0909341SAndroid Build Coastguard Worker    movd         [tlq+16*1], m1
2553*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d ; filter_strength = 3
; Two helper passes, the second one starting 32 bytes lower.
2554*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
2555*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
2556*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
; Move the 64 result bytes at [tlq] into the top-edge slots 8..11.
2557*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq+16*0]
2558*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq+16*1]
2559*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq+16*2]
2560*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq+16*3]
2561*c0909341SAndroid Build Coastguard Worker    mov                 r2d, maxwm
2562*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 8], m0
2563*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 9], m1
2564*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m2
2565*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m3
; If maxw < 64, restore 32 unfiltered source bytes starting at index maxw;
; if maxw < 32 as well, restore the following 32 bytes too.
2566*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, 64
2567*c0909341SAndroid Build Coastguard Worker    jge .filter_left
2568*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r3+r2+16*0+1]
2569*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2+16*1+1]
2570*c0909341SAndroid Build Coastguard Worker    movu     [rsp+r2+16* 8], m0
2571*c0909341SAndroid Build Coastguard Worker    movu     [rsp+r2+16* 9], m1
2572*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, 32
2573*c0909341SAndroid Build Coastguard Worker    jge .filter_left
2574*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r3+r2+16*2+1]
2575*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2+16*3+1]
2576*c0909341SAndroid Build Coastguard Worker    movu     [rsp+r2+16*10], m0
2577*c0909341SAndroid Build Coastguard Worker    movu     [rsp+r2+16*11], m1
; .filter_left: smooth the left edge held in the stack buffer, then continue
; with the main prediction loop at .w4_main.
; In:  r3 = original edge pointer (tlq on entry to the width-specific code),
;      r5 = filter_strength-3 as set up by the caller, hq = block height
; From here on hq holds -h, so left-edge addresses grow downward from the
; top-left position.
2578*c0909341SAndroid Build Coastguard Worker.filter_left:
2579*c0909341SAndroid Build Coastguard Worker    neg                  hq
; Broadcast the byte at r3[-h] (pshufb with an all-zero control) and store it
; 4 bytes below the last left pixel as padding for the filter.
2580*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r3+hq]
2581*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
2582*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1
2583*c0909341SAndroid Build Coastguard Worker    movd    [rsp+16*6+hq-4], m0
2584*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*5]
2585*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
; Only h > 32 (i.e. -h < -32) needs a second 32-byte pass, whose result is
; copied to slots 2..3.
2586*c0909341SAndroid Build Coastguard Worker    cmp                  hd, -32
2587*c0909341SAndroid Build Coastguard Worker    jge .filter_left_end
2588*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
2589*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
2590*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq+16*0]
2591*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq+16*1]
2592*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
2593*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m1
; Shift the 48 bytes in slots 5..7 down by one slot (to 4..6), then compare
; r2 = -maxh with -h: if maxh >= h nothing has to be restored.
2594*c0909341SAndroid Build Coastguard Worker.filter_left_end:
2595*c0909341SAndroid Build Coastguard Worker    mov                 r2d, maxhm
2596*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*5]
2597*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*6]
2598*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+16*7]
2599*c0909341SAndroid Build Coastguard Worker    neg                  r2
2600*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m0
2601*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m1
2602*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*6], m2
2603*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, hd
2604*c0909341SAndroid Build Coastguard Worker    jle .w4_main
; maxh < h: restore 32 unfiltered source bytes ending at row maxh ...
2605*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r3+r2-16*2]
2606*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2-16*1]
2607*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*4], m0
2608*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*5], m1
; ... and, when maxh < 32, the 32 bytes below those as well.
2609*c0909341SAndroid Build Coastguard Worker    cmp                 r2d, -32
2610*c0909341SAndroid Build Coastguard Worker    jle .w4_main
2611*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r3+r2-16*4]
2612*c0909341SAndroid Build Coastguard Worker    movu                 m1, [r3+r2-16*3]
2613*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*2], m0
2614*c0909341SAndroid Build Coastguard Worker    movu      [rsp+r2+16*3], m1
2615*c0909341SAndroid Build Coastguard Worker    jmp .w4_main
2616*c0909341SAndroid Build Coastguard Worker
; ipred_z3_8bpc: entry point and height dispatch of the 8 bpc "z3"
; directional intra predictor. Named arguments follow the cglobal list:
; dst, stride, tl, w, h, angle (dy and org_w are working registers).
; This prologue sets up the constant base pointer and the rounding/fraction
; constants, converts the angle into dy through dr_intra_derivative, and
; jumps to the .hN handler selected by log2(h) via ipred_z3_ssse3_table.
2617*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: 16*10 bytes of stack, constants live in m8-m10, and the original
; width is preserved in org_w.
2618*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
2619*c0909341SAndroid Build Coastguard Worker    %define            base  r7-$$
2620*c0909341SAndroid Build Coastguard Worker    lea                  r7, [$$]
2621*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_62]
2622*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pw_64]
2623*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pw_512]
2624*c0909341SAndroid Build Coastguard Worker    mov              org_wd, wd
2625*c0909341SAndroid Build Coastguard Worker%else
; x86-32: only 8 vector registers, so m8-m10 are memory operands, and
; org_w aliases r5. stride and w are written into the destination buffer
; itself (first bytes of rows 0 and 1) before r1 is reused as the constant
; base pointer (presumably they are read back later in the function, outside
; this excerpt -- TODO confirm).
2626*c0909341SAndroid Build Coastguard Workercglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
2627*c0909341SAndroid Build Coastguard Worker    %define            base  r1-$$
2628*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pw_62]
2629*c0909341SAndroid Build Coastguard Worker    %define              m9  [base+pw_64]
2630*c0909341SAndroid Build Coastguard Worker    %define             m10  [base+pw_512]
2631*c0909341SAndroid Build Coastguard Worker    %define          org_wd  r5
2632*c0909341SAndroid Build Coastguard Worker    %define          org_wq  r5
2633*c0909341SAndroid Build Coastguard Worker    mov    [dstq+strideq*0], strideq
2634*c0909341SAndroid Build Coastguard Worker    mov    [dstq+strideq*1], wd
2635*c0909341SAndroid Build Coastguard Worker    LEA                  r1, $$
2636*c0909341SAndroid Build Coastguard Worker%endif
; h -> table index (count of trailing zeros), tlq is moved back by one byte,
; and hq is turned into the absolute address of the handler.
2637*c0909341SAndroid Build Coastguard Worker    tzcnt                hd, hm
2638*c0909341SAndroid Build Coastguard Worker    movifnidn        angled, anglem
2639*c0909341SAndroid Build Coastguard Worker    dec                 tlq
2640*c0909341SAndroid Build Coastguard Worker    movsxd               hq, [base+ipred_z3_ssse3_table+hq*4]
; angle is rebased by -180 and bit 0x400 is flipped; dy starts as the negated
; rebased angle, is OR-ed with ~0x7e, and is then used as a byte offset into
; dr_intra_derivative to fetch the 16-bit step.
2641*c0909341SAndroid Build Coastguard Worker    sub              angled, 180
2642*c0909341SAndroid Build Coastguard Worker    mov                 dyd, angled
2643*c0909341SAndroid Build Coastguard Worker    neg                 dyd
2644*c0909341SAndroid Build Coastguard Worker    xor              angled, 0x400
2645*c0909341SAndroid Build Coastguard Worker    or                  dyq, ~0x7e
2646*c0909341SAndroid Build Coastguard Worker    lea                  hq, [base+ipred_z3_ssse3_table+hq]
2647*c0909341SAndroid Build Coastguard Worker    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq]
2648*c0909341SAndroid Build Coastguard Worker    jmp                  hq
; .h4: handler for h == 4. Decides whether the edge is upsampled; if not,
; control goes to .h4_no_upsample. In the upsample path the edge is
; interpolated to twice its resolution, stored at the old stack top (tlq),
; and w*4 bytes are carved out below it for the output, which is produced
; 8 bytes per loop iteration and finally handed to .h4_transpose.
2649*c0909341SAndroid Build Coastguard Worker.h4:
2650*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq+88]
2651*c0909341SAndroid Build Coastguard Worker    test                r4d, 0x480
2652*c0909341SAndroid Build Coastguard Worker    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
2653*c0909341SAndroid Build Coastguard Worker    sar                 r4d, 9
2654*c0909341SAndroid Build Coastguard Worker    add                 r4d, wd
2655*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, 8
2656*c0909341SAndroid Build Coastguard Worker    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
; m3 = the 16 edge bytes starting at tlq-7. m0 and m1 are two shuffled views
; of it (z_upsample1-4 and z_filter_s+2) that are each multiplied with the
; pb_36_m4 coefficient pairs and summed to form the interpolated samples.
2657*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq-7]
2658*c0909341SAndroid Build Coastguard Worker    movu                 m1, [base+z_upsample1-4]
2659*c0909341SAndroid Build Coastguard Worker    movu                 m4, [base+z_filter_s+2]
2660*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3, m1
; pshufb with an all-zero control broadcasts byte 0 of m3 (the pixel at
; tlq-7); it is stored above the edge buffer as padding.
2661*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
2662*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, m1
2663*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m4
2664*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m2 ; top[max_base_y]
2665*c0909341SAndroid Build Coastguard Worker    movddup              m2, [base+pb_36_m4]
; dy is doubled for the upsampled edge; m5 = broadcast dy, r5d = first ypos.
2666*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd
2667*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
2668*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
2669*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2670*c0909341SAndroid Build Coastguard Worker    mov                 r5d, dyd
2671*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256]
2672*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2673*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
; Reserve w*4 bytes of output below the current frame; tlq keeps the old
; rsp, where the upsampled edge is stored.
2674*c0909341SAndroid Build Coastguard Worker    shl                  wd, 2
2675*c0909341SAndroid Build Coastguard Worker    mov                 tlq, rsp
2676*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq
; Interleave interpolated and original samples, then reorder the bytes with
; pb_15to0 (presumably a full byte reversal, judging by the name).
; m5 = (dy, 2*dy) per half, m6 = 2*dy is the per-iteration increment.
2677*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
2678*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m3
2679*c0909341SAndroid Build Coastguard Worker    paddw                m6, m5, m5
2680*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m6
2681*c0909341SAndroid Build Coastguard Worker    pshufb               m0, [base+pb_15to0]
2682*c0909341SAndroid Build Coastguard Worker    mova              [tlq], m0
; Two output columns per iteration: r5 and r4 alternate as ypos, and ypos>>6
; is the byte offset into the edge. The blend weights are built per word as
; (64-frac) in the low byte and frac in the high byte, with frac = ypos & 62
; (m8 = pw_62, m9 = pw_64); pmulhrsw with m10 (pw_512) rounds the result.
2683*c0909341SAndroid Build Coastguard Worker.h4_upsample_loop:
2684*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r5+dyq]
2685*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 6
2686*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq+r5]
2687*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [r4+dyq]
2688*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
2689*c0909341SAndroid Build Coastguard Worker    movhps               m0, [tlq+r4]
2690*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5
2691*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2
2692*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2693*c0909341SAndroid Build Coastguard Worker    por                  m1, m2
2694*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
2695*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
2696*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2697*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
; Output is written from the top of the reserved area downward; wd counts
; the remaining bytes (w*4) in steps of 8.
2698*c0909341SAndroid Build Coastguard Worker    movq         [rsp+wq-8], m0
2699*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2700*c0909341SAndroid Build Coastguard Worker    jg .h4_upsample_loop
2701*c0909341SAndroid Build Coastguard Worker    jmp .h4_transpose
; .h4_no_upsample: h == 4 without edge upsampling.
; r4d is the edge offset handed to .h4_main (which subtracts it from tlq):
; 7 when the edge is left unfiltered, and 7 (w == 4) or 9 (otherwise) when a
; filtered copy of the edge has been built at [rsp].
2702*c0909341SAndroid Build Coastguard Worker.h4_no_upsample:
2703*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 7
2704*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
2705*c0909341SAndroid Build Coastguard Worker    jnz .h4_main
; Filter-strength decision: broadcast w+3 and the angle byte, keep the angle
; where w+3 matches z_filter_wh4, and compare against the thresholds in
; z_filter_t_w48 (row selected by is_sm). r4d is temporarily reused for w+3
; and reset to 7 afterwards.
2706*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+3]
2707*c0909341SAndroid Build Coastguard Worker    movd                 m0, r4d
2708*c0909341SAndroid Build Coastguard Worker    movd                 m2, angled
2709*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2710*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
2711*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1
2712*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1
2713*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0, [base+z_filter_wh4]
2714*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
2715*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
2716*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1
2717*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 7
2718*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2719*c0909341SAndroid Build Coastguard Worker    jz .h4_main ; filter_strength == 0
; Filter the 16 edge bytes starting at tlq-7. r5 becomes the strength and
; selects three coefficient rows of z_filter_k (m5, m6, m7); the shuffle
; controls come from z_filter_s-2 and z_upsample2, with shufps building the
; control for the upper half from both.
2720*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq-7]
2721*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
2722*c0909341SAndroid Build Coastguard Worker    movu                 m3, [base+z_filter_s-2]
2723*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; filter_strength
2724*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+z_upsample2]
2725*c0909341SAndroid Build Coastguard Worker    movddup              m5, [base+z_filter_k-8+r5*8+24*0]
2726*c0909341SAndroid Build Coastguard Worker    movddup              m6, [base+z_filter_k-8+r5*8+24*1]
2727*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+z_filter_k-8+r5*8+24*2]
2728*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2, m3
2729*c0909341SAndroid Build Coastguard Worker    shufps               m3, m4, q2121
2730*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m0, m5
2731*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m6
2732*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m2, m3
2733*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5, m6
2734*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7
2735*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
2736*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
; Accumulate the partial products into two word vectors (m0, m1) and round
; them with pmulhrsw against m10 (pw_512).
2737*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
2738*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2739*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
2740*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2741*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2742*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
; r4d = 7 if w == 4, else 9 (flags from cmp survive lea/cmov).
2743*c0909341SAndroid Build Coastguard Worker    lea                 r2d, [r4+2]
2744*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2745*c0909341SAndroid Build Coastguard Worker    cmovne              r4d, r2d
; The low dword of m0 is replicated across m0 before packing, so the low
; 8 bytes of the stored edge repeat the same four filtered values; tlq is
; repointed at the last byte of the 16-byte buffer at [rsp].
2746*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q0000
2747*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+15]
2748*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2749*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0
; .h4_main: build the 4-pixel-tall prediction two columns per iteration into a
; scratch buffer carved from the stack, then transpose that buffer into dst.
; In: r4 = offset (below tlq) of the pixel used as padding, dyd = position step.
; m8/m9/m10 are loaded before this view. NOTE(review): from their use here they
; look like the fraction mask, the weight total and the pmulhrsw rounding
; factor -- confirm against the function prologue.
2750*c0909341SAndroid Build Coastguard Worker.h4_main:
2751*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2752*c0909341SAndroid Build Coastguard Worker    movddup              m0, [base+z_base_inc] ; base_inc << 6
2753*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4 ; tlq -> pixel loaded into m7 as the fill value
2754*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6 ; same offset scaled by 64 (positions are shifted right by 6 below)
2755*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq]
2756*c0909341SAndroid Build Coastguard Worker    movd                 m4, r4d
2757*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; presumably broadcasts dy to every word -- confirm pw_256
2758*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2759*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [base+pw_m256] ; presumably the fill pixel widened to words -- confirm pw_m256
2760*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z3_shuf_h4]
2761*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63] ; ypos
2762*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256]
2763*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0 ; max_base_y
2764*c0909341SAndroid Build Coastguard Worker    shl                  wd, 2 ; w * 4 = scratch buffer size in bytes
2765*c0909341SAndroid Build Coastguard Worker    paddw                m6, m5, m5 ; m6 = 2*dy, added to m5 once per iteration
2766*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq ; reserve the scratch buffer
2767*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m6 ; low qword: dy, high qword: 2*dy (two columns)
; Each iteration interpolates two columns (r5 and r4 hold their positions)
; and stores 8 bytes.
2768*c0909341SAndroid Build Coastguard Worker.h4_loop:
2769*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; position of the second column
2770*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6 ; integer part of the first position
2771*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq+r5-4]
2772*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq] ; position for the next iteration
2773*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
2774*c0909341SAndroid Build Coastguard Worker    movhps               m0, [tlq+r4-4]
2775*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5 ; f = m8 & m5
2776*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2
2777*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2778*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
2779*c0909341SAndroid Build Coastguard Worker    por                  m1, m2 ; per word: (m9 - f) | (f << 8), the byte weights for pmaddubsw
2780*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
2781*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m4, m5 ; mask of lanes where max_base_y > m5
2782*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
2783*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2784*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1
2785*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
2786*c0909341SAndroid Build Coastguard Worker    por                  m0, m1 ; lanes outside the mask take the fill pixel
2787*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
2788*c0909341SAndroid Build Coastguard Worker    movq         [rsp+wq-8], m0 ; buffer is filled from its top downward
2789*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2790*c0909341SAndroid Build Coastguard Worker    jz .h4_transpose
2791*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2792*c0909341SAndroid Build Coastguard Worker    jg .h4_loop ; keep interpolating while the position is still positive
2793*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m7
2794*c0909341SAndroid Build Coastguard Worker.h4_end_loop:
2795*c0909341SAndroid Build Coastguard Worker    movq         [rsp+wq-8], m7 ; remaining columns are pure fill
2796*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2797*c0909341SAndroid Build Coastguard Worker    jg .h4_end_loop
; Transpose: each 16-byte chunk popped from the stack becomes a 4x4 tile.
; The walk starts at the rightmost tile (dst+org_w-4) and moves left.
2798*c0909341SAndroid Build Coastguard Worker.h4_transpose:
2799*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+z_transpose4]
2800*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2801*c0909341SAndroid Build Coastguard Worker    mov             strideq, [dstq]
2802*c0909341SAndroid Build Coastguard Worker    mov              org_wd, [dstq+strideq]
2803*c0909341SAndroid Build Coastguard Worker%endif
2804*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
2805*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+org_wq-4]
2806*c0909341SAndroid Build Coastguard Worker.h4_transpose_loop:
2807*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp]
2808*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16 ; release the chunk just read
2809*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1
2810*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0 ; dword 0 -> row 0
2811*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, m0, q1032
2812*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m2 ; dword 1 -> row 1
2813*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
2814*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m0 ; dword 2 -> row 2
2815*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
2816*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r2       ], m0 ; dword 3 -> row 3
2817*c0909341SAndroid Build Coastguard Worker    sub                dstq, 4
2818*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 4
2819*c0909341SAndroid Build Coastguard Worker    jg .h4_transpose_loop
2820*c0909341SAndroid Build Coastguard Worker    RET
; .h8: 8-pixel-tall case. Decides whether the upsampled-edge path applies; if
; so, builds a 32-byte edge on the stack (filtered bytes interleaved with the
; original ones) and interpolates two columns per iteration from it.
; m8/m9/m10 are loaded before this view. NOTE(review): presumably the fraction
; mask, the weight total and the pmulhrsw rounding factor -- confirm.
2821*c0909341SAndroid Build Coastguard Worker.h8:
2822*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [angleq+88]
2823*c0909341SAndroid Build Coastguard Worker    and                 r4d, ~0x7f ; keep only bits 7 and up of angle+88
2824*c0909341SAndroid Build Coastguard Worker    or                  r4d, wd ; merge w so one unsigned compare tests everything
2825*c0909341SAndroid Build Coastguard Worker    cmp                 r4d, 8
2826*c0909341SAndroid Build Coastguard Worker    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
2827*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tlq-15] ; 16 edge bytes ending at tlq
2828*c0909341SAndroid Build Coastguard Worker    and                 r4d, 4
2829*c0909341SAndroid Build Coastguard Worker    movu                 m3, [tlq- 9]
2830*c0909341SAndroid Build Coastguard Worker    movd                 m1, r4d
2831*c0909341SAndroid Build Coastguard Worker    movu                 m2, [base+z_filter_s+2]
2832*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2833*c0909341SAndroid Build Coastguard Worker    movu                 m5, [base+z_filter_s+6]
2834*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+pb_36_m4] ; filter taps; presumably the byte pair (36, -4) -- confirm
2835*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0 ; w & 4
2836*c0909341SAndroid Build Coastguard Worker    movu                 m0, [base+z_upsample1-4]
2837*c0909341SAndroid Build Coastguard Worker    pmaxub               m1, m0 ; clip 4x8
2838*c0909341SAndroid Build Coastguard Worker    add                 dyd, dyd ; dy *= 2: the edge built below has two bytes per source byte
2839*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m1
2840*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7
2841*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4, m2
2842*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m7
2843*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3, [base+z_upsample1]
2844*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7
2845*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5
2846*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
2847*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2848*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2849*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0 ; sum the two partial products (first 8 outputs)
2850*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3 ; same for the other 8 outputs
2851*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2852*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m10
2853*c0909341SAndroid Build Coastguard Worker    shl                  wd, 3 ; w * 8 = scratch buffer size in bytes
2854*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16] ; edge lives above the scratch buffer reserved next
2855*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256]
2856*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq
2857*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
2858*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+63] ; starting position (dyq is already negated)
2859*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4 ; interleave filtered bytes with the original edge
2860*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
2861*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m0
2862*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*0], m1
2863*c0909341SAndroid Build Coastguard Worker    paddw                m6, m5, m5 ; per-iteration step = 2 columns
2864*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m6 ; low qword: first column, high qword: second column
2865*c0909341SAndroid Build Coastguard Worker.h8_upsample_loop:
2866*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r5+dyq] ; position of the second column
2867*c0909341SAndroid Build Coastguard Worker    sar                  r5, 6
2868*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r5]
2869*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+dyq] ; position for the next iteration
2870*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
2871*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4]
2872*c0909341SAndroid Build Coastguard Worker    pand                 m3, m8, m5 ; f = m8 & m5
2873*c0909341SAndroid Build Coastguard Worker    psubw                m2, m9, m3
2874*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2875*c0909341SAndroid Build Coastguard Worker    por                  m3, m2 ; per word: f | ((m9 - f) << 8)
2876*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q1010 ; first column's weights in both halves
2877*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
2878*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3 ; second column's weights in both halves
2879*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
2880*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
2881*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2882*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2883*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m0 ; second column in the low 8 bytes, first in the high 8
2884*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m1 ; buffer is filled from its top downward
2885*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2886*c0909341SAndroid Build Coastguard Worker    jg .h8_upsample_loop
2887*c0909341SAndroid Build Coastguard Worker    jmp .h8_transpose
; .h8_no_upsample: pick the edge-filter strength for the 8-tall case and, when
; it is non-zero, copy the edge to the stack (padded with replicated end
; pixels) and run it through the z1 filter_edge helper.
; Falls through to .h8_main with r4 = offset of the fill pixel below tlq.
2888*c0909341SAndroid Build Coastguard Worker.h8_no_upsample:
2889*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+7]
2890*c0909341SAndroid Build Coastguard Worker    movd                 m0, r4d ; w+7, compared against z_filter_wh8 below
2891*c0909341SAndroid Build Coastguard Worker    and                 r4d, 7
2892*c0909341SAndroid Build Coastguard Worker    or                  r4d, 8 ; imin(w+7, 15)
2893*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
2894*c0909341SAndroid Build Coastguard Worker    jnz .h8_main ; !enable_intra_edge_filter
2895*c0909341SAndroid Build Coastguard Worker    movd                 m2, angled
2896*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
2897*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
2898*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1 ; broadcast the low byte of w+7
2899*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1 ; broadcast the low byte of angle
2900*c0909341SAndroid Build Coastguard Worker    movu                 m1, [base+z_filter_wh8]
2901*c0909341SAndroid Build Coastguard Worker    psrldq               m3, [base+z_filter_t_w48+angleq*8], 4
2902*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0
2903*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
2904*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m3
2905*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1 ; one bit per table entry that matched and exceeded its threshold
2906*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
2907*c0909341SAndroid Build Coastguard Worker    jz .h8_main ; filter_strength == 0
2908*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-15] ; 16 edge bytes ending at tlq
2909*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x55555555
2910*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+1]
2911*c0909341SAndroid Build Coastguard Worker    neg                  r4
2912*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq+r4] ; pixel at the far end of the edge
2913*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; top two bits of the product = filter_strength
2914*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2915*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*2] ; tlq now points at the stack copy
2916*c0909341SAndroid Build Coastguard Worker    sub                  r5, 3 ; filter_strength-3
2917*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m0
2918*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m7 ; replicate the byte loaded from old tlq+1
2919*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*1], m1
2920*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7 ; replicate the far-end pixel
2921*c0909341SAndroid Build Coastguard Worker    movq         [tlq+r4+8], m2 ; 8 bytes of far-end padding (r4 is negative here)
2922*c0909341SAndroid Build Coastguard Worker    neg                 r4d ; restore the positive offset
2923*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
2924*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 1
2925*c0909341SAndroid Build Coastguard Worker    add                 tlq, 31
2926*c0909341SAndroid Build Coastguard Worker    add                 r5d, 17
2927*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
2928*c0909341SAndroid Build Coastguard Worker    cmova               r4d, r5d ; w > 8: use the strength-derived offset instead
; .h8_main: build the 8-pixel-tall prediction one 8-byte column per iteration
; into a scratch buffer carved from the stack.
; In: r4 = offset (below tlq) of the pixel used as padding, dyd = position step.
; m8/m9/m10 are loaded before this view. NOTE(review): presumably the fraction
; mask, the weight total and the pmulhrsw rounding factor -- confirm.
2929*c0909341SAndroid Build Coastguard Worker.h8_main:
2930*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
2931*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4 ; tlq -> pixel loaded into m7 as the fill value
2932*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6 ; same offset scaled by 64
2933*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq]
2934*c0909341SAndroid Build Coastguard Worker    movd                 m4, r4d
2935*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; presumably broadcasts dy to every word -- confirm pw_256
2936*c0909341SAndroid Build Coastguard Worker    neg                 dyq
2937*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [base+pw_m256] ; presumably the fill pixel widened to words -- confirm pw_m256
2938*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z3_shuf]
2939*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63] ; starting position (dyq is already negated)
2940*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [base+pw_256]
2941*c0909341SAndroid Build Coastguard Worker    psubw                m4, [base+z3_base_inc] ; per-lane limit compared against m5 in the loop
2942*c0909341SAndroid Build Coastguard Worker    shl                  wd, 3 ; w * 8 = scratch buffer size in bytes
2943*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5 ; per-iteration step = dy (one column)
2944*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq ; reserve the scratch buffer
2945*c0909341SAndroid Build Coastguard Worker.h8_loop:
2946*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
2947*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; integer part of the position
2948*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8]
2949*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5 ; f = m8 & m5
2950*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2
2951*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
2952*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
2953*c0909341SAndroid Build Coastguard Worker    por                  m1, m2 ; per word: (m9 - f) | (f << 8), the byte weights for pmaddubsw
2954*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
2955*c0909341SAndroid Build Coastguard Worker    pcmpgtw              m1, m4, m5 ; mask of lanes where m4 > m5
2956*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
2957*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2958*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1
2959*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
2960*c0909341SAndroid Build Coastguard Worker    por                  m0, m1 ; lanes outside the mask take the fill pixel
2961*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
2962*c0909341SAndroid Build Coastguard Worker    movq         [rsp+wq-8], m0 ; buffer is filled from its top downward
2963*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2964*c0909341SAndroid Build Coastguard Worker    jz .h8_transpose
2965*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq ; advance the position; flags feed the jg below
2966*c0909341SAndroid Build Coastguard Worker    jg .h8_loop
2967*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m7
2968*c0909341SAndroid Build Coastguard Worker.h8_end_loop:
2969*c0909341SAndroid Build Coastguard Worker    movq         [rsp+wq-8], m7 ; remaining columns are pure fill
2970*c0909341SAndroid Build Coastguard Worker    sub                  wd, 8
2971*c0909341SAndroid Build Coastguard Worker    jg .h8_end_loop
; .h8_transpose: write the 8-tall scratch buffer to dst. Widths other than 4
; go to the shared .end_transpose code (outside this view); for width 4 the
; two 16-byte chunks are interleaved here and written by .write_4x8.
2972*c0909341SAndroid Build Coastguard Worker.h8_transpose:
2973*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2974*c0909341SAndroid Build Coastguard Worker    mov             strideq, [dstq]
2975*c0909341SAndroid Build Coastguard Worker    mov              org_wd, [dstq+strideq]
2976*c0909341SAndroid Build Coastguard Worker%endif
2977*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8 ; NOTE(review): presumably passes the height to .end_transpose -- confirm
2978*c0909341SAndroid Build Coastguard Worker    cmp              org_wd, 4
2979*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2980*c0909341SAndroid Build Coastguard Worker    jne .end_transpose_main
2981*c0909341SAndroid Build Coastguard Worker%else
2982*c0909341SAndroid Build Coastguard Worker    jne .end_transpose_loop
2983*c0909341SAndroid Build Coastguard Worker%endif
2984*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+16*1]
2985*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*0]
2986*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3] ; r2 = 3*stride, expected by .write_4x8
2987*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16*2 ; release the scratch buffer
2988*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m1, m0
2989*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m0
2990*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m1, m2
2991*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2
; Shared tail: write the 4x8 tile held in m0/m1, then return from the function.
2992*c0909341SAndroid Build Coastguard Worker.write_4x8_end:
2993*c0909341SAndroid Build Coastguard Worker    call .write_4x8
2994*c0909341SAndroid Build Coastguard Worker    RET
; .write_4x8: local subroutine that stores eight 4-byte rows to dst.
; In:  m0 = rows 3,2,1,0 (dword 0 is row 3 ... dword 3 is row 0)
;      m1 = rows 7,6,5,4 in the same order
;      dstq = address of row 0, strideq = row pitch, r2 = 3*strideq
; Out: dstq advanced by 4*strideq (it points at row 4 on return)
; Clobbers: m0, m1, m4
2995*c0909341SAndroid Build Coastguard Worker.write_4x8:
2996*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r2       ], m0 ; dword 0 -> row 3
2997*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m0, q1032 ; swap the two low dwords
2998*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m4 ; dword 1 -> row 2
2999*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0 ; bring the high qword down
3000*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m0 ; dword 2 -> row 1
3001*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
3002*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0 ; dword 3 -> row 0
3003*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4] ; move on to rows 4..7
3004*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r2       ], m1 ; dword 0 -> row 7
3005*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m1, q1032
3006*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m4 ; dword 1 -> row 6
3007*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
3008*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1 ; dword 2 -> row 5
3009*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32
3010*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m1 ; dword 3 -> row 4
3011*c0909341SAndroid Build Coastguard Worker    ret
; .h16: 16-pixel-tall case. Picks the edge-filter strength and, when it is
; non-zero, copies the edge to the stack (padded with replicated end pixels)
; and runs it through the z1 filter_edge helper; for w > 16 one extra group of
; output bytes is filtered here with z3_filter_k_tail.
; Falls through to .h16_main with r4 = offset of the fill pixel below tlq.
3012*c0909341SAndroid Build Coastguard Worker.h16:
3013*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+15]
3014*c0909341SAndroid Build Coastguard Worker    movd                 m0, r4d ; w+15, compared against z_filter_wh16 below
3015*c0909341SAndroid Build Coastguard Worker    and                 r4d, 15
3016*c0909341SAndroid Build Coastguard Worker    or                  r4d, 16 ; imin(w+15, 31)
3017*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400
3018*c0909341SAndroid Build Coastguard Worker    jnz .h16_main ; !enable_intra_edge_filter
3019*c0909341SAndroid Build Coastguard Worker    movd                 m2, angled
3020*c0909341SAndroid Build Coastguard Worker    shr              angled, 8 ; is_sm << 1
3021*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
3022*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1 ; broadcast the low byte of w+15
3023*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1 ; broadcast the low byte of angle
3024*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+z_filter_t_w16+angleq*4]
3025*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m0, [base+z_filter_wh16]
3026*c0909341SAndroid Build Coastguard Worker    pand                 m1, m2
3027*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m3
3028*c0909341SAndroid Build Coastguard Worker    pmovmskb            r5d, m1 ; one bit per table entry that matched and exceeded its threshold
3029*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
3030*c0909341SAndroid Build Coastguard Worker    jz .h16_main ; filter_strength == 0
3031*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16*2+1]
3032*c0909341SAndroid Build Coastguard Worker    imul                r5d, 0x24924924
3033*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-16*1+1]
3034*c0909341SAndroid Build Coastguard Worker    neg                  r4
3035*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tlq-16*0+1]
3036*c0909341SAndroid Build Coastguard Worker    shr                 r5d, 30 ; CF = last bit shifted out, consumed by the adc below
3037*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq+r4] ; pixel at the far end of the edge
3038*c0909341SAndroid Build Coastguard Worker    adc                  r5, -4 ; filter_strength-3
3039*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3040*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*2] ; tlq now points into the stack copy
3041*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m0
3042*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m7 ; replicate the byte loaded from old tlq+1
3043*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m1
3044*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m7 ; replicate the far-end pixel
3045*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*1], m2
3046*c0909341SAndroid Build Coastguard Worker    movq         [tlq+r4+8], m3 ; 8 bytes of far-end padding (r4 is negative here)
3047*c0909341SAndroid Build Coastguard Worker    neg                 r4d ; restore the positive offset
3048*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3049*c0909341SAndroid Build Coastguard Worker    add                 tlq, 31
3050*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
3051*c0909341SAndroid Build Coastguard Worker    jle .h16_main
3052*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, [tlq-47], q0000 ; w > 16: filter the tail of the edge
3053*c0909341SAndroid Build Coastguard Worker    sar                  r5, 1
3054*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+z3_filter_k_tail+r5*4]
3055*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r5+33] ; new fill-pixel offset for this case
3056*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
3057*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3058*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3059*c0909341SAndroid Build Coastguard Worker%else
3060*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4 ; NOTE(review): presumably m4 holds the same constant on x86-32 -- confirm
3061*c0909341SAndroid Build Coastguard Worker%endif
3062*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
3063*c0909341SAndroid Build Coastguard Worker    movd           [tlq-35], m0
; .h16_main: build the 16-pixel-tall prediction one 16-byte column per
; iteration into a scratch buffer carved from the stack.
; In: r4 = offset (below tlq) of the pixel used as padding, dyd = position step.
; Unlike the narrower cases, the range check here is done on bytes (pcmpgtb).
; m8/m9/m10 are loaded before this view. NOTE(review): presumably the fraction
; mask, the weight total and the pmulhrsw rounding factor -- confirm.
3064*c0909341SAndroid Build Coastguard Worker.h16_main:
3065*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
3066*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4 ; tlq -> pixel loaded into m7 as the fill value
3067*c0909341SAndroid Build Coastguard Worker    movd                 m4, r4d ; unscaled offset, for the byte-wise limit
3068*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6 ; same offset scaled by 64
3069*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq]
3070*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
3071*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256] ; presumably broadcasts dy to every word -- confirm pw_256
3072*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3073*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6 ; fill pixel replicated to all 16 bytes
3074*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z3_shuf]
3075*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63] ; starting position (dyq is already negated)
3076*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6 ; offset replicated to all 16 bytes
3077*c0909341SAndroid Build Coastguard Worker    psubb                m4, [base+pb_15to0] ; per-byte limit compared in the loop
3078*c0909341SAndroid Build Coastguard Worker    shl                  wd, 4 ; w * 16 = scratch buffer size in bytes
3079*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5 ; per-iteration step = dy (one column)
3080*c0909341SAndroid Build Coastguard Worker    sub                 rsp, wq ; reserve the scratch buffer
3081*c0909341SAndroid Build Coastguard Worker.h16_loop:
3082*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
3083*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5 ; f = m8 & m5
3084*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6 ; integer part of the position
3085*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2
3086*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
3087*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*2]
3088*c0909341SAndroid Build Coastguard Worker    por                  m2, m1 ; per word: (m9 - f) | (f << 8), the byte weights for pmaddubsw
3089*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*1]
3090*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3091*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3092*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3093*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3094*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m5, 6 ; m5 >> 6, packed to bytes below for the range check
3095*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
3096*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3097*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3098*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
3099*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3100*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2 ; mask of bytes where m4 > m2
3101*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1
3102*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3103*c0909341SAndroid Build Coastguard Worker    por                  m0, m1 ; bytes outside the mask take the fill pixel
3104*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m0 ; buffer is filled from its top downward
3105*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
3106*c0909341SAndroid Build Coastguard Worker    jz .h16_transpose
3107*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq ; advance the position; flags feed the jg below
3108*c0909341SAndroid Build Coastguard Worker    jg .h16_loop
3109*c0909341SAndroid Build Coastguard Worker.h16_end_loop:
3110*c0909341SAndroid Build Coastguard Worker    mova        [rsp+wq-16], m7 ; remaining columns are pure fill
3111*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
3112*c0909341SAndroid Build Coastguard Worker    jg .h16_end_loop
; .h16_transpose: write the 16-tall scratch buffer to dst. Widths other than 4
; go to the shared .end_transpose code (outside this view); for width 4 the
; four 16-byte chunks are interleaved here and written as two 4x8 tiles.
3113*c0909341SAndroid Build Coastguard Worker.h16_transpose:
3114*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3115*c0909341SAndroid Build Coastguard Worker    mov             strideq, [dstq]
3116*c0909341SAndroid Build Coastguard Worker    mov              org_wd, [dstq+strideq]
3117*c0909341SAndroid Build Coastguard Worker%endif
3118*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; NOTE(review): presumably passes the height to .end_transpose -- confirm
3119*c0909341SAndroid Build Coastguard Worker    cmp              org_wd, 4
3120*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3121*c0909341SAndroid Build Coastguard Worker    jne .end_transpose_main
3122*c0909341SAndroid Build Coastguard Worker%else
3123*c0909341SAndroid Build Coastguard Worker    jne .end_transpose_loop
3124*c0909341SAndroid Build Coastguard Worker%endif
3125*c0909341SAndroid Build Coastguard Worker.h16_transpose_w4:
3126*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+16*3]
3127*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*2]
3128*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*1]
3129*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*0]
3130*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3] ; r2 = 3*stride, expected by .write_4x8
3131*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16*4 ; release the scratch buffer
3132*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2, m4 ; byte interleave of chunks 3 and 2
3133*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
3134*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m3, m0 ; byte interleave of chunks 1 and 0
3135*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m0
3136*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1, m4 ; word interleave -> one 4-byte row per dword
3137*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4
3138*c0909341SAndroid Build Coastguard Worker    call .write_4x8 ; rows 0..7; dstq comes back advanced by 4 rows
3139*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4] ; skip the other 4 rows just written
3140*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2, m3
3141*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3
3142*c0909341SAndroid Build Coastguard Worker    jmp .write_4x8_end ; rows 8..15, then return
3143*c0909341SAndroid Build Coastguard Worker.h32:
3144*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+31]
3145*c0909341SAndroid Build Coastguard Worker    and                 r4d, 31
3146*c0909341SAndroid Build Coastguard Worker    or                  r4d, 32 ; imin(w+31, 63)
3147*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
3148*c0909341SAndroid Build Coastguard Worker    jnz .h32_main
3149*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16*4+1]
3150*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-16*3+1]
3151*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*2+1]
3152*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*1+1]
3153*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq-16*0+1]
3154*c0909341SAndroid Build Coastguard Worker    neg                  r4
3155*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+r4]
3156*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3157*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*4]
3158*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*3], m0
3159*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*2], m1
3160*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d ; filter_strength = 3
3161*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m2
3162*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7
3163*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m3
3164*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m7
3165*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*1], m4
3166*c0909341SAndroid Build Coastguard Worker    movq         [tlq+r4+8], m5
3167*c0909341SAndroid Build Coastguard Worker    neg                 r4d
3168*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3169*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
3170*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3171*c0909341SAndroid Build Coastguard Worker    add                 tlq, 63
3172*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
3173*c0909341SAndroid Build Coastguard Worker    jle .h32_main
3174*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, [tlq-79], q0000
3175*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+z3_filter_k_tail]
3176*c0909341SAndroid Build Coastguard Worker    add                 r4d, 2
3177*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
3178*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3179*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3180*c0909341SAndroid Build Coastguard Worker%else
3181*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
3182*c0909341SAndroid Build Coastguard Worker%endif
3183*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
3184*c0909341SAndroid Build Coastguard Worker    movd           [tlq-67], m0
3185*c0909341SAndroid Build Coastguard Worker.h32_main:
3186*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
3187*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
3188*c0909341SAndroid Build Coastguard Worker    movd                 m4, r4d
3189*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6
3190*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq]
3191*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
3192*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256]
3193*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3194*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6
3195*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z3_shuf]
3196*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63]
3197*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6
3198*c0909341SAndroid Build Coastguard Worker    psubb                m4, [base+pb_15to0]
3199*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5
3200*c0909341SAndroid Build Coastguard Worker.h32_loop:
3201*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
3202*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5
3203*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
3204*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2
3205*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
3206*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*4]
3207*c0909341SAndroid Build Coastguard Worker    por                  m2, m1
3208*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*3]
3209*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3210*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3211*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3212*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3213*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3214*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3215*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 32
3216*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3217*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0
3218*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*2]
3219*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*1]
3220*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3221*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3222*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3223*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3224*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3225*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3226*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m5, 6
3227*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
3228*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
3229*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3230*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2
3231*c0909341SAndroid Build Coastguard Worker    paddsb               m2, [base+pb_16]
3232*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1
3233*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3234*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
3235*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2
3236*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0
3237*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1, [rsp+16*0]
3238*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3239*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
3240*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0
3241*c0909341SAndroid Build Coastguard Worker    dec                  wd
3242*c0909341SAndroid Build Coastguard Worker    jz .h32_transpose
3243*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
3244*c0909341SAndroid Build Coastguard Worker    jg .h32_loop
3245*c0909341SAndroid Build Coastguard Worker.h32_end_loop:
3246*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 32
3247*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m7
3248*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m7
3249*c0909341SAndroid Build Coastguard Worker    dec                  wd
3250*c0909341SAndroid Build Coastguard Worker    jg .h32_end_loop
3251*c0909341SAndroid Build Coastguard Worker.h32_transpose:
3252*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
3253*c0909341SAndroid Build Coastguard Worker    jmp .end_transpose_main
3254*c0909341SAndroid Build Coastguard Worker.h64:
3255*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [wq+63]
3256*c0909341SAndroid Build Coastguard Worker    test             angled, 0x400 ; !enable_intra_edge_filter
3257*c0909341SAndroid Build Coastguard Worker    jnz .h64_main
3258*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16*8+1]
3259*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-16*7+1]
3260*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*6+1]
3261*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*5+1]
3262*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0
3263*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m1
3264*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m2
3265*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m3
3266*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16*4+1]
3267*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tlq-16*3+1]
3268*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16*2+1]
3269*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tlq-16*1+1]
3270*c0909341SAndroid Build Coastguard Worker    movd                 m4, [tlq-16*0+1]
3271*c0909341SAndroid Build Coastguard Worker    neg                  r4
3272*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq+r4]
3273*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3274*c0909341SAndroid Build Coastguard Worker    lea                 tlq, [rsp+16*8]
3275*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*3], m0
3276*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*2], m1
3277*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d ; filter_strength = 3
3278*c0909341SAndroid Build Coastguard Worker    mova         [tlq-16*1], m2
3279*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m7
3280*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*0], m3
3281*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m7
3282*c0909341SAndroid Build Coastguard Worker    mova         [tlq+16*1], m4
3283*c0909341SAndroid Build Coastguard Worker    movq         [tlq+r4+8], m5
3284*c0909341SAndroid Build Coastguard Worker    neg                 r4d
3285*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3286*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
3287*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3288*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
3289*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3290*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 16*2
3291*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
3292*c0909341SAndroid Build Coastguard Worker    jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
3293*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
3294*c0909341SAndroid Build Coastguard Worker.h64_filter96:
3295*c0909341SAndroid Build Coastguard Worker    add                 tlq, 127
3296*c0909341SAndroid Build Coastguard Worker.h64_main:
3297*c0909341SAndroid Build Coastguard Worker    movd                 m5, dyd
3298*c0909341SAndroid Build Coastguard Worker    sub                 tlq, r4
3299*c0909341SAndroid Build Coastguard Worker    movd                 m4, r4d
3300*c0909341SAndroid Build Coastguard Worker    shl                 r4d, 6
3301*c0909341SAndroid Build Coastguard Worker    movd                 m7, [tlq]
3302*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
3303*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [base+pw_256]
3304*c0909341SAndroid Build Coastguard Worker    neg                 dyq
3305*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6
3306*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+z3_shuf]
3307*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dyq+r4+63]
3308*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6
3309*c0909341SAndroid Build Coastguard Worker    psubb                m4, [base+pb_15to0]
3310*c0909341SAndroid Build Coastguard Worker    mova                 m6, m5
3311*c0909341SAndroid Build Coastguard Worker.h64_loop:
3312*c0909341SAndroid Build Coastguard Worker    mov                  r4, r5
3313*c0909341SAndroid Build Coastguard Worker    pand                 m2, m8, m5
3314*c0909341SAndroid Build Coastguard Worker    sar                  r4, 6
3315*c0909341SAndroid Build Coastguard Worker    psubw                m1, m9, m2
3316*c0909341SAndroid Build Coastguard Worker    psllw                m2, 8
3317*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*8]
3318*c0909341SAndroid Build Coastguard Worker    por                  m2, m1
3319*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*7]
3320*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3321*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3322*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3323*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3324*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3325*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3326*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64
3327*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3328*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0
3329*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*6]
3330*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*5]
3331*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3332*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3333*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3334*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3335*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3336*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3337*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3338*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0
3339*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*4]
3340*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*3]
3341*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3342*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3343*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3344*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3345*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3346*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3347*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3348*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
3349*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq+r4-8*2]
3350*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+r4-8*1]
3351*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
3352*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
3353*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3354*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3355*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
3356*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
3357*c0909341SAndroid Build Coastguard Worker    psrlw                m2, m5, 6
3358*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
3359*c0909341SAndroid Build Coastguard Worker    packsswb             m2, m2
3360*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3361*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2
3362*c0909341SAndroid Build Coastguard Worker    paddsb               m2, [base+pb_16]
3363*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1
3364*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3365*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
3366*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2
3367*c0909341SAndroid Build Coastguard Worker    paddsb               m2, [base+pb_16]
3368*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m0
3369*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1, [rsp+16*2]
3370*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3371*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
3372*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2
3373*c0909341SAndroid Build Coastguard Worker    paddsb               m2, [base+pb_16]
3374*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
3375*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1, [rsp+16*1]
3376*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3377*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
3378*c0909341SAndroid Build Coastguard Worker    pcmpgtb              m1, m4, m2
3379*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m0
3380*c0909341SAndroid Build Coastguard Worker    pand                 m0, m1, [rsp+16*0]
3381*c0909341SAndroid Build Coastguard Worker    pandn                m1, m7
3382*c0909341SAndroid Build Coastguard Worker    por                  m0, m1
3383*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m0
3384*c0909341SAndroid Build Coastguard Worker    dec                  wd
3385*c0909341SAndroid Build Coastguard Worker    jz .h64_transpose
3386*c0909341SAndroid Build Coastguard Worker    add                  r5, dyq
3387*c0909341SAndroid Build Coastguard Worker    jg .h64_loop
3388*c0909341SAndroid Build Coastguard Worker.h64_end_loop:
3389*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 64
3390*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m7
3391*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m7
3392*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m7
3393*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m7
3394*c0909341SAndroid Build Coastguard Worker    dec                  wd
3395*c0909341SAndroid Build Coastguard Worker    jg .h64_end_loop
3396*c0909341SAndroid Build Coastguard Worker.h64_transpose:
3397*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
3398*c0909341SAndroid Build Coastguard Worker.end_transpose_main:
3399*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3400*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r3*3]
3401*c0909341SAndroid Build Coastguard Worker    lea                  r7, [strideq*3]
3402*c0909341SAndroid Build Coastguard Worker%else
3403*c0909341SAndroid Build Coastguard Worker    mov             strideq, [dstq]
3404*c0909341SAndroid Build Coastguard Worker    mov              org_wd, [dstq+strideq]
3405*c0909341SAndroid Build Coastguard Worker%endif
3406*c0909341SAndroid Build Coastguard Worker.end_transpose_loop:
3407*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+r3-8]
3408*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dstq+org_wq-8]
3409*c0909341SAndroid Build Coastguard Worker.end_transpose_loop_y:
3410*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r4+r3*1]
3411*c0909341SAndroid Build Coastguard Worker    movq                 m4, [r4+r3*0]
3412*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3413*c0909341SAndroid Build Coastguard Worker    movq                 m1, [r4+r5  ]
3414*c0909341SAndroid Build Coastguard Worker    movq                 m5, [r4+r3*2]
3415*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r4+r3*4]
3416*c0909341SAndroid Build Coastguard Worker%else
3417*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r4+r3*2]
3418*c0909341SAndroid Build Coastguard Worker    movq                 m1, [r2+r3*1]
3419*c0909341SAndroid Build Coastguard Worker    movq                 m5, [r2+r3*0]
3420*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r2+r3*2]
3421*c0909341SAndroid Build Coastguard Worker%endif
3422*c0909341SAndroid Build Coastguard Worker    movq                 m2, [r2+r3*1]
3423*c0909341SAndroid Build Coastguard Worker    movq                 m6, [r2+r3*0]
3424*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3425*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r2+r5  ]
3426*c0909341SAndroid Build Coastguard Worker    movq                 m7, [r2+r3*2]
3427*c0909341SAndroid Build Coastguard Worker%else
3428*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r2+r3*2]
3429*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r2+r3*1]
3430*c0909341SAndroid Build Coastguard Worker    movq                 m7, [r2+r3*0]
3431*c0909341SAndroid Build Coastguard Worker%endif
3432*c0909341SAndroid Build Coastguard Worker    sub                  r4, 8
3433*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m4
3434*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m5
3435*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6
3436*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m7
3437*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1, m0
3438*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0
3439*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3, m2
3440*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2
3441*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m3, m1
3442*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m1
3443*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m0, m4
3444*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m4
3445*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*0], m0
3446*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*1], m0
3447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3448*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*2], m1
3449*c0909341SAndroid Build Coastguard Worker    movq     [r6+r7       ], m1
3450*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*4]
3451*c0909341SAndroid Build Coastguard Worker%else
3452*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
3453*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*0], m1
3454*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*1], m1
3455*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
3456*c0909341SAndroid Build Coastguard Worker%endif
3457*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*0], m2
3458*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*1], m2
3459*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3460*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*2], m3
3461*c0909341SAndroid Build Coastguard Worker    movq     [r6+r7       ], m3
3462*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*4]
3463*c0909341SAndroid Build Coastguard Worker%else
3464*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
3465*c0909341SAndroid Build Coastguard Worker    movhps   [r6+strideq*0], m3
3466*c0909341SAndroid Build Coastguard Worker    movq     [r6+strideq*1], m3
3467*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+strideq*2]
3468*c0909341SAndroid Build Coastguard Worker%endif
3469*c0909341SAndroid Build Coastguard Worker    cmp                  r4, rsp
3470*c0909341SAndroid Build Coastguard Worker    jae .end_transpose_loop_y
3471*c0909341SAndroid Build Coastguard Worker    lea                 rsp, [rsp+r3*8]
3472*c0909341SAndroid Build Coastguard Worker    sub              org_wd, 8
3473*c0909341SAndroid Build Coastguard Worker    jg .end_transpose_loop
3474*c0909341SAndroid Build Coastguard Worker    RET
3475*c0909341SAndroid Build Coastguard Worker
;-------------------------------------------------------------------------------
; void dav1d_pal_pred_8bpc_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                                const uint8_t *idx, int w, int h);
;
; Palette prediction: every output pixel is a palette entry selected by a
; 4-bit index. Each idx byte packs two horizontally adjacent pixels: the low
; nibble selects the left pixel, the high nibble the pixel to its right.
;
; Only 8 palette bytes are loaded (movq), so the upper half of m4 is zero.
; pshufb zeroes a lane whose selector has bit 7 set; that bit stays clear as
; long as every 4-bit index is < 8.
;
; The routine never writes a return register, hence the void prototype.
;
; Register use after the prologue:
;   r2 - first the jump-table base, then stride*3 (r2 is also palq, so the
;        palette has to be read before r2 is reused)
;   wq - tzcnt(w), then the absolute address of the width-specific loop
;   m4 - palette
;
; Assumptions not enforced here (confirm against the callers):
;   - w is a power of two with an entry in pal_pred_ssse3_table (.w4-.w64)
;   - h is a multiple of the rows written per iteration (4 for w4/w8,
;     2 for w16, 1 for w32/w64); the loops only stop once the counter is <= 0
;   - dst rows are 16-byte aligned for w >= 16 (mova stores)
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    movq                 m4, [palq]
    LEA                  r2, pal_pred_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4] ; table holds 32-bit offsets relative to r2
    add                  wq, r2
    lea                  r2, [strideq*3]
    jmp                  wq
.w4: ; 8 idx bytes -> 16 pixels -> 4 rows of 4
    movq                 m1, [idxq]
    add                idxq, 8
    psrlw                m0, m1, 4     ; high nibbles moved into bits 0-3
    punpcklbw            m1, m0        ; lo0 hi0 lo1 hi1 ... (pixel order)
    pshufb               m0, m4, m1    ; pshufb only uses selector bits 0-3 (+bit 7)
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032 ; bring dword 1 (row 1) down to dword 0
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0        ; rows 2-3 into the low qword
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8: ; 16 idx bytes -> 32 pixels -> 4 rows of 8
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0    ; even (left) pixels, from the low nibbles
    psrlw                m0, 4
    pshufb               m2, m4, m0    ; odd (right) pixels, from the high nibbles
    punpcklbw            m0, m1, m2    ; rows 0-1
    punpckhbw            m1, m2        ; rows 2-3
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r2       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16: ; 16 idx bytes -> 32 pixels -> 2 rows of 16
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2    ; row 0
    punpckhbw            m1, m2        ; row 1
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32: ; 16 idx bytes -> 32 pixels -> 1 row
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2    ; pixels  0-15
    punpckhbw            m1, m2        ; pixels 16-31
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64: ; 32 idx bytes -> 64 pixels -> 1 row, handled as two 32-pixel halves
    movu                 m0, [idxq+16*0]
    movu                 m2, [idxq+16*1]
    add                idxq, 32
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m3, m4, m0
    punpcklbw            m0, m1, m3    ; pixels  0-15
    punpckhbw            m1, m3        ; pixels 16-31
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m3, m4, m2
    punpcklbw            m0, m1, m3    ; pixels 32-47
    punpckhbw            m1, m3        ; pixels 48-63
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    sub                  hd, 1
    jg .w64
    RET
3572*c0909341SAndroid Build Coastguard Worker
3573*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
3574*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3575*c0909341SAndroid Build Coastguard Worker;                           const int width, const int height, const int16_t *ac, const int alpha);
3576*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; IPRED_CFL n
;   In:       m<n> = 16-bit ac values
;             m0   = value added to every lane at the end (the callers below
;                    broadcast the dc value, or pw_128, into it)
;             m1   = alpha in every word (only its sign is used here)
;             m2   = |alpha| << 9 in every word
;   Out:      m<n> = m0 + sign(ac) * sign(alpha) * pmulhrsw(|ac|, m2)
;             With m2 = |alpha| << 9, pmulhrsw yields (|ac| * |alpha| + 32) >> 6.
;             The result is still 16-bit; callers pack it with packuswb.
;   Clobbers: m3
3577*c0909341SAndroid Build Coastguard Worker%macro IPRED_CFL 1                   ; ac in, unpacked pixels out
; m3 = ac with alpha's sign applied (0 where alpha == 0); only its sign matters below
3578*c0909341SAndroid Build Coastguard Worker    psignw               m3, m%1, m1
; work on magnitudes so the rounding in pmulhrsw is symmetric around zero
3579*c0909341SAndroid Build Coastguard Worker    pabsw               m%1, m%1
3580*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m2
; restore the combined sign (lanes where m3 == 0 become 0)
3581*c0909341SAndroid Build Coastguard Worker    psignw              m%1, m3
3582*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m0
3583*c0909341SAndroid Build Coastguard Worker%endmacro
3584*c0909341SAndroid Build Coastguard Worker
; Select the register behind the t0 scratch name used by the ipred_cfl*
; functions that follow: register 7 when UNIX64 is set, register 5 otherwise.
; The mapping stays in effect until the next DECLARE_REG_TMP.
3585*c0909341SAndroid Build Coastguard Worker%if UNIX64
3586*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
3587*c0909341SAndroid Build Coastguard Worker%else
3588*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
3589*c0909341SAndroid Build Coastguard Worker%endif
3590*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_8bpc(dst, stride, tl, w, h, ac, alpha)
;
; Control flow: two indirect jumps through ipred_cfl_ssse3_table.
;   r6 -> .h<N>  selected by ctz(h): loads the N bytes ending just below tl
;   wq -> .w<N>  selected by ctz(w) (entries 16 bytes further into the table):
;                adds the N bytes starting at tl+1 and reduces everything to dc
;   .s<N>        store loop for an N-pixel-wide block: IPRED_CFL on the ac
;                words, then packuswb to 8-bit pixels
;
; Values prepared before the first jump:
;   m3 = all ones, i.e. -1 in every byte/word, so pmaddubsw/pmaddwd by m3
;        give negated sums of adjacent pairs
;   m4 = (w + h) >> 1 in the low word (rounding bias for the average)
;   m5 = ctz(w + h) in the low dword (shift count for the average)
;   t0 = address of ipred_cfl_ssse3_table
;
; When w != h, (w + h) >> ctz(w + h) is 3 or 5; the remaining division is done
; with pmulhuw by 0x5556 (~2^16/3) or 0x3334 (~2^16/5).
; r2 (tl) is reused as a scratch register once the edge bytes are loaded.
3591*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
3592*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
3593*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3594*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
3595*c0909341SAndroid Build Coastguard Worker    lea                 t0d, [wq+hq]
3596*c0909341SAndroid Build Coastguard Worker    movd                 m4, t0d
3597*c0909341SAndroid Build Coastguard Worker    tzcnt               t0d, t0d
3598*c0909341SAndroid Build Coastguard Worker    movd                 m5, t0d
3599*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_ssse3_table
3600*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
; table entries are 32-bit offsets relative to the table base (base added below)
3601*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]
3602*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4+16]
3603*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m3, m3
3604*c0909341SAndroid Build Coastguard Worker    psrlw                m4, 1
3605*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
3606*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
3607*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
3608*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; ---- h == 4: m0 = negated pair sums of the 4 bytes at tl-4 .. tl-1
3609*c0909341SAndroid Build Coastguard Worker.h4:
3610*c0909341SAndroid Build Coastguard Worker    movd                 m0, [tlq-4]
3611*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
3612*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 4: add the 4 bytes at tl+1 .. tl+4 and the rounding bias
3613*c0909341SAndroid Build Coastguard Worker.w4:
3614*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tlq+1]
3615*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
3616*c0909341SAndroid Build Coastguard Worker    psubw                m0, m4
3617*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
; multiplying by -1 while summing word pairs turns the negated sums positive
3618*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
3619*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 4
3620*c0909341SAndroid Build Coastguard Worker    jg .w4_mul
3621*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 3                             ; dc >>= ctz(width + height);
3622*c0909341SAndroid Build Coastguard Worker    jmp .w4_end
; h > 4: more than two dwords of partial sums exist, fold them into the low dword
3623*c0909341SAndroid Build Coastguard Worker.w4_mul:
3624*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m0
3625*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3626*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                     ; psrlq                m1, m0, 32
3627*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3628*c0909341SAndroid Build Coastguard Worker    psrlw                m0, 2
; pick the reciprocal: 0x5556 (~1/3) when bit 3 of h is set, else 0x3334 (~1/5)
3629*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
3630*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
3631*c0909341SAndroid Build Coastguard Worker    test                 hd, 8
3632*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
3633*c0909341SAndroid Build Coastguard Worker    movd                 m5, r6d
3634*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m5
; broadcast dc (low word) to all eight words of m0
3635*c0909341SAndroid Build Coastguard Worker.w4_end:
3636*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3637*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
; ---- 4-wide store: m1 = alpha, m2 = |alpha| << 9 (every word), r6 = stride * 3
3638*c0909341SAndroid Build Coastguard Worker.s4:
3639*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3640*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3641*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
3642*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
3643*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3644*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
; each iteration consumes 16 ac words (32 bytes, aligned loads) and writes
; four rows of four pixels
3645*c0909341SAndroid Build Coastguard Worker.s4_loop:
3646*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
3647*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16]
3648*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3649*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3650*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3651*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m4
3652*c0909341SAndroid Build Coastguard Worker    pshuflw              m4, m4, q1032
3653*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m4
3654*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4
3655*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m4
3656*c0909341SAndroid Build Coastguard Worker    psrlq                m4, 32
3657*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r6       ], m4
3658*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3659*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
3660*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3661*c0909341SAndroid Build Coastguard Worker    jg .s4_loop
3662*c0909341SAndroid Build Coastguard Worker    RET
3663*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- h == 8: m0 = negated pair sums of the 8 bytes at tl-8 .. tl-1
3664*c0909341SAndroid Build Coastguard Worker.h8:
3665*c0909341SAndroid Build Coastguard Worker    movq                 m0, [tlq-8]
3666*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
3667*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 8: add the 8 bytes at tl+1 .. tl+8, the rounding bias, and reduce
3668*c0909341SAndroid Build Coastguard Worker.w8:
3669*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tlq+1]
3670*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
; m4 = bias - m0, then m0 = high_qword(m0) - m4: the low qword of m0 ends up
; holding -(low half + high half + bias)
3671*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0
3672*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
3673*c0909341SAndroid Build Coastguard Worker    psubw                m0, m4
3674*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3675*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
3676*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3677*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
3678*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m5
3679*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 8
3680*c0909341SAndroid Build Coastguard Worker    je .w8_end
; w != h: 0x3334 (~1/5) when h == 32, otherwise 0x5556 (~1/3)
3681*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
3682*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
3683*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
3684*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
3685*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3686*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
3687*c0909341SAndroid Build Coastguard Worker.w8_end:
3688*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3689*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
; ---- 8-wide store: m1 = alpha, m2 = |alpha| << 9 (every word), r6 = stride * 3
3690*c0909341SAndroid Build Coastguard Worker.s8:
3691*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3692*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3693*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
3694*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
3695*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3696*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
; each iteration consumes 32 ac words (64 bytes) and writes four rows of eight pixels
3697*c0909341SAndroid Build Coastguard Worker.s8_loop:
3698*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
3699*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16]
3700*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3701*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3702*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3703*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], m4
3704*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq  ], m4
3705*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+32]
3706*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+48]
3707*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3708*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3709*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3710*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m4
3711*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r6       ], m4
3712*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3713*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
3714*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
3715*c0909341SAndroid Build Coastguard Worker    jg .s8_loop
3716*c0909341SAndroid Build Coastguard Worker    RET
3717*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- h == 16: m0 = negated pair sums of the 16 bytes at tl-16 .. tl-1 (aligned load)
3718*c0909341SAndroid Build Coastguard Worker.h16:
3719*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-16]
3720*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
3721*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 16: add the 16 bytes at tl+1 .. tl+16, the rounding bias, and reduce
3722*c0909341SAndroid Build Coastguard Worker.w16:
3723*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]
3724*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
3725*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
; fold the bias and both qword halves into the low qword (sum still negated)
3726*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0
3727*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
3728*c0909341SAndroid Build Coastguard Worker    psubw                m0, m4
3729*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
3730*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3731*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
3732*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m5
3733*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 16
3734*c0909341SAndroid Build Coastguard Worker    je .w16_end
; w != h: 0x5556 (~1/3) when h has bit 3 or bit 5 set, otherwise 0x3334 (~1/5)
3735*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
3736*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
3737*c0909341SAndroid Build Coastguard Worker    test                 hd, 8|32
3738*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
3739*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3740*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
3741*c0909341SAndroid Build Coastguard Worker.w16_end:
3742*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3743*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
; ---- 16-wide store: m1 = alpha, m2 = |alpha| << 9 (every word)
3744*c0909341SAndroid Build Coastguard Worker.s16:
3745*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3746*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3747*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
3748*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3749*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
; each iteration consumes 32 ac words (64 bytes) and writes two rows of
; sixteen pixels with aligned stores
3750*c0909341SAndroid Build Coastguard Worker.s16_loop:
3751*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
3752*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16]
3753*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3754*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3755*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3756*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m4
3757*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+32]
3758*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+48]
3759*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3760*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3761*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3762*c0909341SAndroid Build Coastguard Worker    mova     [dstq+strideq], m4
3763*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3764*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
3765*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3766*c0909341SAndroid Build Coastguard Worker    jg .s16_loop
3767*c0909341SAndroid Build Coastguard Worker    RET
3768*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- h == 32: m0 = negated pair sums of the 32 bytes at tl-32 .. tl-1 (aligned loads)
3769*c0909341SAndroid Build Coastguard Worker.h32:
3770*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tlq-32]
3771*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
3772*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tlq-16]
3773*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3
3774*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3775*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 32: add the 32 bytes at tl+1 .. tl+32, the rounding bias, and reduce
3776*c0909341SAndroid Build Coastguard Worker.w32:
3777*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+1]
3778*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
3779*c0909341SAndroid Build Coastguard Worker    movu                 m2, [tlq+17]
3780*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3
3781*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
3782*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
; fold the bias and both qword halves into the low qword (sum still negated)
3783*c0909341SAndroid Build Coastguard Worker    psubw                m4, m0
3784*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
3785*c0909341SAndroid Build Coastguard Worker    psubw                m0, m4
3786*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                   ; psrlq  m1, m0, 32
3787*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3788*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3
3789*c0909341SAndroid Build Coastguard Worker    psrlw                m0, m5
3790*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 32
3791*c0909341SAndroid Build Coastguard Worker    je .w32_end
; w != h: 0x5556 (~1/3) when h has bit 4 or bit 6 set, otherwise 0x3334 (~1/5)
3793*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x5556
3794*c0909341SAndroid Build Coastguard Worker    mov                 r2d, 0x3334
3795*c0909341SAndroid Build Coastguard Worker    test                 hd, 64|16
3796*c0909341SAndroid Build Coastguard Worker    cmovz               r6d, r2d
3797*c0909341SAndroid Build Coastguard Worker    movd                 m1, r6d
3798*c0909341SAndroid Build Coastguard Worker    pmulhuw              m0, m1
3799*c0909341SAndroid Build Coastguard Worker.w32_end:
3800*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3801*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
; ---- 32-wide store: m1 = alpha, m2 = |alpha| << 9 (every word)
3802*c0909341SAndroid Build Coastguard Worker.s32:
3803*c0909341SAndroid Build Coastguard Worker    movd                 m1, alpham
3804*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q0000
3805*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
3806*c0909341SAndroid Build Coastguard Worker    pabsw                m2, m1
3807*c0909341SAndroid Build Coastguard Worker    psllw                m2, 9
; each iteration consumes 32 ac words (64 bytes) and writes one row of
; thirty-two pixels with aligned stores
3808*c0909341SAndroid Build Coastguard Worker.s32_loop:
3809*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq]
3810*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+16]
3811*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3812*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3813*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3814*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m4
3815*c0909341SAndroid Build Coastguard Worker    mova                 m4, [acq+32]
3816*c0909341SAndroid Build Coastguard Worker    mova                 m5, [acq+48]
3817*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             4
3818*c0909341SAndroid Build Coastguard Worker    IPRED_CFL             5
3819*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3820*c0909341SAndroid Build Coastguard Worker    mova          [dstq+16], m4
3821*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3822*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
3823*c0909341SAndroid Build Coastguard Worker    dec                  hd
3824*c0909341SAndroid Build Coastguard Worker    jg .s32_loop
3825*c0909341SAndroid Build Coastguard Worker    RET
3826*c0909341SAndroid Build Coastguard Worker
3827*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
3828*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3829*c0909341SAndroid Build Coastguard Worker;                           const int width, const int height, const int16_t *ac, const int alpha);
3830*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; ipred_cfl_left_8bpc(dst, stride, tl, w, h, ac, alpha)
; Computes the rounded average of the h bytes at tl-h .. tl-1, broadcasts it
; into every word of m0, then jumps through ipred_cfl_splat_ssse3_table
; (indexed by ctz(w)) with acq loaded.
; NOTE(review): the splat table is not visible here; presumably its entries
; are the .s4/.s8/.s16/.s32 store loops of ipred_cfl_8bpc - confirm.
;
; Register roles while reducing:
;   m0 = running (negated) pair sums      m2 = all ones (-1 per byte/word)
;   m3 = 0x8000 >> ctz(h): pmulhrsw by it is a rounded right shift by ctz(h)
;   r6 = reduction entry point (.h4/.h8/.h16/.h32), wq = splat-table target
3831*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
3832*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm                                 ; zero upper half
3833*c0909341SAndroid Build Coastguard Worker    tzcnt               r6d, hd
; tl now points at the first of the h bytes that precede the original tl
3834*c0909341SAndroid Build Coastguard Worker    sub                 tlq, hq
3835*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
; always loads 16 bytes; for h < 16 the extra lanes never reach the low word
; that is broadcast at .h4
3836*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
3837*c0909341SAndroid Build Coastguard Worker    mov                 t0d, 0x8000
3838*c0909341SAndroid Build Coastguard Worker    movd                 m3, t0d
3839*c0909341SAndroid Build Coastguard Worker    movd                 m2, r6d
3840*c0909341SAndroid Build Coastguard Worker    psrld                m3, m2
3841*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_left_ssse3_table
; table entries are 32-bit offsets relative to the table base
3842*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+r6*4]
3843*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2
3844*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3845*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
; rebase t0 from the left table onto the splat table
3846*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
3847*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
3848*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
3849*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
3850*c0909341SAndroid Build Coastguard Worker    jmp                  r6
; The four labels below fall through into each other: each one halves the
; number of live partial sums.
3851*c0909341SAndroid Build Coastguard Worker.h32:
3852*c0909341SAndroid Build Coastguard Worker    movu                 m1, [tlq+16]                           ; movu: tl may be unaligned (ipred_cfl_top advances tl by 1 and uses the same table)
3853*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3854*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3855*c0909341SAndroid Build Coastguard Worker.h16:
3856*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q3232                          ; m1 = upper qword of m0 moved down (like psrldq by 8 bytes)
3857*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3858*c0909341SAndroid Build Coastguard Worker.h8:
3859*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
3860*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3861*c0909341SAndroid Build Coastguard Worker.h4:
; words are negated sums; multiplying by -1 while adding the last pair makes
; the total positive, then pmulhrsw divides by the edge length with rounding
3862*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m2
3863*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
; broadcast the low word to all eight words
3864*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q0000
3865*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
3866*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3867*c0909341SAndroid Build Coastguard Worker
3868*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
3869*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3870*c0909341SAndroid Build Coastguard Worker;                           const int width, const int height, const int16_t *ac, const int alpha);
3871*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; ipred_cfl_top_8bpc(dst, stride, tl, w, h, ac, alpha)
; Same setup as ipred_cfl_left_8bpc, but for the bytes starting at tl+1 and
; with everything keyed on w instead of h: m0 = negated pair sums of the first
; 16 bytes, m2 = all ones, m3 = 0x8000 >> ctz(w), wq = splat-table target.
; It then jumps through ipred_cfl_left_ssse3_table[ctz(w)].
; NOTE(review): that table is not visible here; presumably it points at the
; .h4/.h8/.h16/.h32 reduction labels of ipred_cfl_left_8bpc - confirm.
3872*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
3873*c0909341SAndroid Build Coastguard Worker    LEA                  t0, ipred_cfl_left_ssse3_table
3874*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
; skip the byte at tl itself; the edge starts at tl+1
3875*c0909341SAndroid Build Coastguard Worker    inc                 tlq
3876*c0909341SAndroid Build Coastguard Worker    movu                 m0, [tlq]
3877*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3878*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x8000
3879*c0909341SAndroid Build Coastguard Worker    movd                 m3, r6d
3880*c0909341SAndroid Build Coastguard Worker    movd                 m2, wd
; m3 = 0x8000 >> ctz(w): pmulhrsw multiplier for a rounded shift by ctz(w)
3881*c0909341SAndroid Build Coastguard Worker    psrld                m3, m2
3882*c0909341SAndroid Build Coastguard Worker    movsxd               r6, [t0+wq*4]
3883*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m2, m2
3884*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3885*c0909341SAndroid Build Coastguard Worker    add                  r6, t0
; rebase t0 from the left table onto the splat table
3886*c0909341SAndroid Build Coastguard Worker    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
3887*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
3888*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
3889*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
3890*c0909341SAndroid Build Coastguard Worker    jmp                  r6
3891*c0909341SAndroid Build Coastguard Worker
3892*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
3893*c0909341SAndroid Build Coastguard Worker;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
3894*c0909341SAndroid Build Coastguard Worker;                           const int width, const int height, const int16_t *ac, const int alpha);
3895*c0909341SAndroid Build Coastguard Worker;---------------------------------------------------------------------------------------
; ipred_cfl_128_8bpc(dst, stride, tl, w, h, ac, alpha)
; No edge pixels are read: m0 is filled from the pw_128 constant and control
; goes straight to the ipred_cfl_splat_ssse3_table entry selected by ctz(w).
; NOTE(review): pw_128 and the splat table are defined outside this view; by
; name pw_128 holds 16-bit 128s and the table targets the .s<w> store loops
; of ipred_cfl_8bpc - confirm.
3896*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
3897*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
3898*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3899*c0909341SAndroid Build Coastguard Worker    LEA                  r6, ipred_cfl_splat_ssse3_table
3900*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r6+wq*4]
; pw_128 is addressed relative to the table base already held in r6;
; movddup loads 8 bytes and duplicates them into both qwords of m0
3901*c0909341SAndroid Build Coastguard Worker    movddup              m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
3902*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
3903*c0909341SAndroid Build Coastguard Worker    movifnidn           acq, acmp
3904*c0909341SAndroid Build Coastguard Worker    jmp                  wq
3905*c0909341SAndroid Build Coastguard Worker
; RELOAD_ACQ_32 x
; Resets acq to the ac pointer saved in ac_bakq. The macro declares one
; parameter but the body never references it.
; ac_bakq is either a named register or, where ipred_cfl_ac_420_8bpc
; %defines it so, the acmp stack argument.
3906*c0909341SAndroid Build Coastguard Worker%macro RELOAD_ACQ_32 1
3907*c0909341SAndroid Build Coastguard Worker    mov                 acq, ac_bakq       ; restore acq
3908*c0909341SAndroid Build Coastguard Worker%endmacro
3909*c0909341SAndroid Build Coastguard Worker
3910*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3911*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
3912*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
3913*c0909341SAndroid Build Coastguard Worker    movddup              m2, [pb_2]
3914*c0909341SAndroid Build Coastguard Worker%else
3915*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
3916*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4
3917*c0909341SAndroid Build Coastguard Worker%define ac_bakq acmp
3918*c0909341SAndroid Build Coastguard Worker    mov                 t0d, 0x02020202
3919*c0909341SAndroid Build Coastguard Worker    movd                 m2, t0d
3920*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0000
3921*c0909341SAndroid Build Coastguard Worker%endif
3922*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
3923*c0909341SAndroid Build Coastguard Worker    mov                 t0d, hm
3924*c0909341SAndroid Build Coastguard Worker    mov                  hd, t0d
3925*c0909341SAndroid Build Coastguard Worker    imul                t0d, wd
3926*c0909341SAndroid Build Coastguard Worker    movd                 m5, t0d
3927*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
3928*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3929*c0909341SAndroid Build Coastguard Worker    mov             ac_bakq, acq
3930*c0909341SAndroid Build Coastguard Worker%endif
3931*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2
3932*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd
3933*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
3934*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3935*c0909341SAndroid Build Coastguard Worker    jg .w16
3936*c0909341SAndroid Build Coastguard Worker    je .w8
3937*c0909341SAndroid Build Coastguard Worker    ; fall-through
3938*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3939*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
3940*c0909341SAndroid Build Coastguard Worker%else
3941*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
3942*c0909341SAndroid Build Coastguard Worker%endif
; 4-wide path of the function whose header precedes this chunk. Every pass of
; .w4_loop reads four source rows of 8 bytes and emits two output rows of four
; words each (one 16-byte store), adding them to the running word sums in m4.
; NOTE(review): m2, m4, h and hpad are initialised before this chunk; m2 is
; presumably a per-byte weight for pmaddubsw - confirm against the prologue.
3943*c0909341SAndroid Build Coastguard Worker.w4:
3944*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3945*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; m0 = row 0 (low qword) + row 2 (high qword), m1 = row 1 + row 3
3946*c0909341SAndroid Build Coastguard Worker    movq                 m0, [yq]
3947*c0909341SAndroid Build Coastguard Worker    movq                 m1, [yq+strideq]
3948*c0909341SAndroid Build Coastguard Worker    movhps               m0, [yq+strideq*2]
3949*c0909341SAndroid Build Coastguard Worker    movhps               m1, [yq+stride3q]
    ; pmaddubsw: weighted sum of each horizontally adjacent byte pair -> word
3950*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3951*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
    ; add vertically adjacent rows: low qword = rows 0+1, high qword = rows 2+3
3952*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3953*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
3954*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
3955*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
3956*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
    ; h is reduced by the two output rows just written
3957*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3958*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
3959*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3960*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_4_8
    ; vertical padding: copy the last output row (high qword) into both halves
3961*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
3962*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
    ; each pass stores two copies of that row and also adds them to the sum
3963*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
3964*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
3965*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
3966*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
3967*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
3968*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_4_8
; 8-wide path: 16 source bytes per row become 8 output words per row.
; With wpad == 0 the unpadded loop runs; otherwise .w8_wpad reads only the
; first 8 bytes of each row and replicates the last valid output word.
; Both loops leave the last output row in m0 for the vertical padding loop.
3969*c0909341SAndroid Build Coastguard Worker.w8:
3970*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3971*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
3972*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad
3973*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; output row A = source rows 0+1, stored at [acq]
3974*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
3975*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
3976*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3977*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3978*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3979*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
3980*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
    ; output row B = source rows 2+3, stored at [acq+16]
3981*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq*2]
3982*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+stride3q]
3983*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3984*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
3985*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
3986*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
3987*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
3988*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
3989*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
3990*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3991*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
3992*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
3993*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_4_8
3994*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
3995*c0909341SAndroid Build Coastguard Worker.w8_wpad:                                              ; wpadd=1
    ; one output row per pass; movddup loads 8 bytes and mirrors them into the
    ; high qword, so after the adds words 4-7 are copies of words 0-3
3996*c0909341SAndroid Build Coastguard Worker    movddup              m0, [yq]
3997*c0909341SAndroid Build Coastguard Worker    movddup              m1, [yq+strideq]
3998*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
3999*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4000*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
    ; fill words 4-7 with the top word, i.e. the last valid output (word 3)
4001*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4002*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4003*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4004*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4005*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4006*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4007*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad
4008*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4009*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_4_8
4010*c0909341SAndroid Build Coastguard Worker.w8_hpad:
    ; vertical padding: repeat the last output row (still in m0) hpad times,
    ; counting every copy in the running sum
4011*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4012*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4013*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4014*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 1
4015*c0909341SAndroid Build Coastguard Worker    jg .w8_hpad
4016*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_4_8
; 16-wide path: each pass consumes two source rows and writes one output row of
; 16 words (32 bytes): m0 = left 8 words, m6 = right 8 words.
; wpad selects how much of the row is read from memory and how much is a copy
; of the last valid output word:
;   wpad == 0 : 16 words read           (.w16_loop)
;   wpad == 1 : 12 read, 4 replicated   (.w16_pad1)
;   wpad == 2 :  8 read, 8 replicated   (.w16_pad2)
;   otherwise :  4 read, 12 replicated  (.w16_pad3)
; All variants leave the last output row in m0/m6 for .w16_hpad_loop.
4017*c0909341SAndroid Build Coastguard Worker.w16:
4018*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4019*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4020*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; left half of the output row from source bytes 0-15 of both rows
4021*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4022*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4023*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4024*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4025*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4026*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4027*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
    ; right half from source bytes 16-31
4028*c0909341SAndroid Build Coastguard Worker    mova                 m6, [yq+16]
4029*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq+16]
4030*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m2
4031*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4032*c0909341SAndroid Build Coastguard Worker    paddw                m6, m1
4033*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m6
4034*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
4035*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4036*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4037*c0909341SAndroid Build Coastguard Worker    dec                  hd
4038*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4039*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4040*c0909341SAndroid Build Coastguard Worker    jz .calc_avg16
4041*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad_loop
4042*c0909341SAndroid Build Coastguard Worker.w16_wpad:
    ; three-way dispatch on wpad: <2 -> pad1, ==2 -> pad2, >2 falls into pad3
4043*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
4044*c0909341SAndroid Build Coastguard Worker    jl .w16_pad1
4045*c0909341SAndroid Build Coastguard Worker    je .w16_pad2
4046*c0909341SAndroid Build Coastguard Worker.w16_pad3:
    ; only 8 source bytes per row are read; words 0-3 are real, the rest of the
    ; row is filled with the last real word
4047*c0909341SAndroid Build Coastguard Worker    movddup              m0, [yq]
4048*c0909341SAndroid Build Coastguard Worker    movddup              m1, [yq+strideq]
4049*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4050*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4051*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4052*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4053*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4054*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
    ; m6 = high qword of m0 in both halves = eight copies of the last real word
    ; NOTE(review): the mova looks redundant because the 3-operand punpckhqdq
    ; below overwrites m6 completely - confirm before removing.
4055*c0909341SAndroid Build Coastguard Worker    mova                 m6, m0
4056*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m0, m0
4057*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m6
4058*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
4059*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4060*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4061*c0909341SAndroid Build Coastguard Worker    dec                  hd
4062*c0909341SAndroid Build Coastguard Worker    jg .w16_pad3
4063*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_done
4064*c0909341SAndroid Build Coastguard Worker.w16_pad2:
    ; left half is read normally; the right half is word 7 of the left half
    ; broadcast to all eight words
4065*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4066*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4067*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4068*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4069*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4070*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4071*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4072*c0909341SAndroid Build Coastguard Worker    pshufhw              m6, m0, q3333
4073*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m6
4074*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m6
4075*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
4076*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4077*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4078*c0909341SAndroid Build Coastguard Worker    dec                  hd
4079*c0909341SAndroid Build Coastguard Worker    jg .w16_pad2
4080*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_done
4081*c0909341SAndroid Build Coastguard Worker.w16_pad1:
    ; left half is read normally; the right half reads 8 more source bytes
    ; (words 8-11 real) and replicates word 11 into words 12-15
4082*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4083*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4084*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4085*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4086*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
4087*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4088*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4089*c0909341SAndroid Build Coastguard Worker    movddup              m6, [yq+16]
4090*c0909341SAndroid Build Coastguard Worker    movddup              m1, [yq+strideq+16]
4091*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m2
4092*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4093*c0909341SAndroid Build Coastguard Worker    paddw                m6, m1
4094*c0909341SAndroid Build Coastguard Worker    pshufhw              m6, m6, q3333
4095*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m6
4096*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
4097*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4098*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4099*c0909341SAndroid Build Coastguard Worker    dec                  hd
4100*c0909341SAndroid Build Coastguard Worker    jg .w16_pad1
4101*c0909341SAndroid Build Coastguard Worker.w16_wpad_done:
4102*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4103*c0909341SAndroid Build Coastguard Worker    jz .calc_avg16
4104*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop:
    ; vertical padding: repeat the last output row (m0 | m6) hpad times and
    ; keep adding it to the running sum
4105*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4106*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4107*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m6
4108*c0909341SAndroid Build Coastguard Worker    paddw                m4, m6
4109*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4110*c0909341SAndroid Build Coastguard Worker    dec               hpadd
4111*c0909341SAndroid Build Coastguard Worker    jg .w16_hpad_loop
4112*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg16
4113*c0909341SAndroid Build Coastguard Worker
; Averaging tail: widen the word sums in m4 to dwords, add a rounding bias,
; reduce to a single total, shift it down by tzcnt(sz) and subtract the result
; from every word that was written to the ac buffer.
; The argument registers are renamed here: the third and sixth slots become
; iptr and sz.
4114*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4115*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
4116*c0909341SAndroid Build Coastguard Worker%else
4117*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
4118*c0909341SAndroid Build Coastguard Worker%endif
4119*c0909341SAndroid Build Coastguard Worker.calc_avg_4_8:
    ; pmaddwd multiplies the word sums by the words of (m2 >> 9) and adds
    ; adjacent pairs into signed dwords.
    ; NOTE(review): m2 is loaded before this chunk; if it holds 0x02 in every
    ; byte the shift leaves 1 per word and this is a plain pairwise add -
    ; confirm against the prologue.
4120*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 9
4121*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m2
4122*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4123*c0909341SAndroid Build Coastguard Worker.calc_avg16:
    ; same pairwise widening, but treating the word sums as unsigned:
    ; m0 = upper word of each dword, m4 = lower word (zero-extended), then add
4124*c0909341SAndroid Build Coastguard Worker    psrld                m0, m4, 16
4125*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16
4126*c0909341SAndroid Build Coastguard Worker    psrld                m4, 16
4127*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
4128*c0909341SAndroid Build Coastguard Worker.calc_avg:
    ; NOTE(review): m5 is initialised before this chunk, presumably with
    ; width*height in its low dword - confirm. sz receives that value, half of
    ; it is added as a rounding bias, and tzcnt(sz) (log2 when sz is a power of
    ; two) becomes the shift count. r1 is the slot named y, reused as scratch.
4129*c0909341SAndroid Build Coastguard Worker    movd                szd, m5
4130*c0909341SAndroid Build Coastguard Worker    psrad                m5, 1
4131*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, szd
4132*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
4133*c0909341SAndroid Build Coastguard Worker    movd                 m1, r1d
    ; horizontal add of the four dwords: swap neighbours, add, swap qword
    ; halves, add - every dword lane of m0 then holds the total
4134*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m4, q2301
4135*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
4136*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032
4137*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
4138*c0909341SAndroid Build Coastguard Worker    psrad                m0, m1                        ; sum >>= log2sz;
    ; narrow to words so the average sits in all eight word lanes
4139*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m0
    ; NOTE(review): RELOAD_ACQ_32 is defined outside this chunk; presumably it
    ; rewinds acq to the start of the buffer - confirm.
4140*c0909341SAndroid Build Coastguard Worker    RELOAD_ACQ_32       acq
4141*c0909341SAndroid Build Coastguard Worker.sub_loop:
    ; eight words per pass; sz counts the words still to be processed
4142*c0909341SAndroid Build Coastguard Worker    mova                 m1, [acq]
4143*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0                        ; ac[x] -= sum;
4144*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4145*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4146*c0909341SAndroid Build Coastguard Worker    sub                 szd, 8
4147*c0909341SAndroid Build Coastguard Worker    jg .sub_loop
4148*c0909341SAndroid Build Coastguard Worker    RET
4149*c0909341SAndroid Build Coastguard Worker
; Entry sequence of ipred_cfl_ac_422_8bpc(ac, y, stride, wpad, hpad, w, h).
; Sets up:
;   m2     = 0x04 in every byte (weight for pmaddubsw); the x86-32 path builds
;            it from an immediate, the x86-64 path loads pb_4, which is defined
;            outside this chunk and presumably holds the same bytes
;   m6     = w * h in the low dword (used later for rounding and shift count)
;   hpad   = hpad * 4, h = h - hpad (rows that are really read from y)
;   m4, m5 = zeroed running word sums
;   ac_bak = copy of ac (x86-64 only)
; then dispatches on w: > 8 -> .w16, == 8 -> .w8, otherwise falls into .w4.
; NOTE(review): t0 is a temporary register alias declared outside this chunk.
4150*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4151*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
4152*c0909341SAndroid Build Coastguard Worker    movddup              m2, [pb_4]
4153*c0909341SAndroid Build Coastguard Worker%else
4154*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
4155*c0909341SAndroid Build Coastguard Worker    mov                 t0d, 0x04040404
4156*c0909341SAndroid Build Coastguard Worker    movd                 m2, t0d
4157*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0000
4158*c0909341SAndroid Build Coastguard Worker%endif
4159*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
    ; h is copied through t0 so that t0 can be multiplied by w without
    ; disturbing the row counter
4160*c0909341SAndroid Build Coastguard Worker    mov                 t0d, hm
4161*c0909341SAndroid Build Coastguard Worker    mov                  hd, t0d
4162*c0909341SAndroid Build Coastguard Worker    imul                t0d, wd
4163*c0909341SAndroid Build Coastguard Worker    movd                 m6, t0d
4164*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4166*c0909341SAndroid Build Coastguard Worker    mov             ac_bakq, acq
4167*c0909341SAndroid Build Coastguard Worker%endif
4168*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2
4169*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd
4170*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
4171*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
4172*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
4173*c0909341SAndroid Build Coastguard Worker    jg .w16
4174*c0909341SAndroid Build Coastguard Worker    je .w8
4175*c0909341SAndroid Build Coastguard Worker    ; fall-through
4176*c0909341SAndroid Build Coastguard Worker
; 4-wide path of ipred_cfl_ac_422_8bpc. The sixth argument slot (w) is renamed
; to stride3 from here on. Every pass of .w4_loop reads four source rows of
; 8 bytes and writes four output rows of four words each (two 16-byte stores);
; rows are not combined vertically. Sums go to m5 (rows 0-1) and m4 (rows 2-3).
4177*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4178*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
4179*c0909341SAndroid Build Coastguard Worker%else
4180*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
4181*c0909341SAndroid Build Coastguard Worker%endif
4182*c0909341SAndroid Build Coastguard Worker.w4:
4183*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4184*c0909341SAndroid Build Coastguard Worker.w4_loop:
    ; m1 = row 0 (low qword) + row 1 (high), m0 = row 2 (low) + row 3 (high)
4185*c0909341SAndroid Build Coastguard Worker    movq                 m1, [yq]
4186*c0909341SAndroid Build Coastguard Worker    movhps               m1, [yq+strideq]
4187*c0909341SAndroid Build Coastguard Worker    movq                 m0, [yq+strideq*2]
4188*c0909341SAndroid Build Coastguard Worker    movhps               m0, [yq+stride3q]
    ; weighted sum of each horizontally adjacent byte pair -> one word
4189*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4190*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4191*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4192*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4193*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4194*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4195*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4196*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4197*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4198*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4199*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4200*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_4
    ; vertical padding: copy the last output row (high qword of m0) into both
    ; halves so one store writes two padded rows
4201*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
4202*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
4203*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4204*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4205*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4206*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4207*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
4208*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_4
; 8-wide path of ipred_cfl_ac_422_8bpc: 16 source bytes per row become one
; output row of 8 words. With wpad == 0 the unpadded loop handles four rows per
; pass; otherwise .w8_wpad handles two rows per pass, reading only 8 bytes per
; row and replicating the last valid word. Sums are split between m5 and m4.
; Both loops end with the last output row in m0 for .w8_hpad.
4209*c0909341SAndroid Build Coastguard Worker.w8:
4210*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4211*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4212*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad
4213*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; rows 0 and 1 -> [acq], [acq+16]
4214*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq]
4215*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq]
4216*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4217*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4218*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4219*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4220*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4221*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
    ; rows 2 and 3 -> [acq+32], [acq+48]
4222*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq*2]
4223*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+stride3q]
4224*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4225*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4226*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4227*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4228*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4229*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4230*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4231*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4232*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4233*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4234*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4235*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4236*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad
4237*c0909341SAndroid Build Coastguard Worker.w8_wpad:
    ; movddup mirrors the 8 loaded bytes into the high qword; pshufhw then
    ; fills words 4-7 with the top word, which equals the last real word (3)
4238*c0909341SAndroid Build Coastguard Worker    movddup              m1, [yq]
4239*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4240*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333
4241*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4242*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4243*c0909341SAndroid Build Coastguard Worker    movddup              m0, [yq+strideq]
4244*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4245*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4246*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4247*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4248*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4249*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4250*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4251*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad
4252*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4253*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4254*c0909341SAndroid Build Coastguard Worker.w8_hpad:
    ; vertical padding: two copies of the last output row per pass, both
    ; counted in m4
4255*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0
4256*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4257*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4258*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4259*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4260*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4261*c0909341SAndroid Build Coastguard Worker    jg .w8_hpad
4262*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_8_16
; 16-wide path of ipred_cfl_ac_422_8bpc: each pass handles two source rows and
; writes two output rows of 16 words (32 bytes each). The first row of a pass
; is summed into m5, the second into m4.
; wpad selects how much of each row is read and how much is a copy of the last
; valid output word:
;   wpad == 0 : 16 words read           (.w16_loop)
;   wpad == 1 : 12 read, 4 replicated   (.w16_pad1)
;   wpad == 2 :  8 read, 8 replicated   (.w16_pad2)
;   otherwise :  4 read, 12 replicated  (.w16_pad3)
; All variants finish with the last output row in m1 (left half) and
; m0 (right half), which .w16_hpad_loop repeats.
4263*c0909341SAndroid Build Coastguard Worker.w16:
4264*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4265*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4266*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; first row: source bytes 0-15 -> m1, 16-31 -> m0
4267*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq]
4268*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+16]
4269*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4270*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4271*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4272*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4273*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4274*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
    ; second row
4275*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4276*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq+16]
4277*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4278*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4279*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4280*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4281*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4282*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4283*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4284*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4285*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4286*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4287*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4288*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4289*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad_loop
4290*c0909341SAndroid Build Coastguard Worker.w16_wpad:
    ; three-way dispatch on wpad: <2 -> pad1, ==2 -> pad2, >2 falls into pad3
4291*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
4292*c0909341SAndroid Build Coastguard Worker    jl .w16_pad1
4293*c0909341SAndroid Build Coastguard Worker    je .w16_pad2
4294*c0909341SAndroid Build Coastguard Worker.w16_pad3:
    ; first row: 8 source bytes read, words 0-3 real, words 4-7 = word 3
4295*c0909341SAndroid Build Coastguard Worker    movddup              m1, [yq]
4296*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4297*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333
4298*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4299*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
    ; right half of the row = eight copies of that last real word
4300*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
4301*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m1
4302*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
    ; second row, same scheme; m1 keeps the left half and m0 the right half
4303*c0909341SAndroid Build Coastguard Worker    movddup              m1, [yq+strideq]
4304*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4305*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333
4306*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4307*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4308*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m1, m1
4309*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4310*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4311*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4312*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4313*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4314*c0909341SAndroid Build Coastguard Worker    jg .w16_pad3
4315*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_done
4316*c0909341SAndroid Build Coastguard Worker.w16_pad2:
    ; first row: left half read normally, right half = word 7 broadcast
4317*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq]
4318*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4319*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4320*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4321*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333
4322*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
4323*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m1
4324*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
    ; second row: the broadcast is built in m0 so m1 keeps the left half
4325*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4326*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4327*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4328*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4329*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
4330*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4331*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
4332*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4333*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4334*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4335*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4336*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4337*c0909341SAndroid Build Coastguard Worker    jg .w16_pad2
4338*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_done
4339*c0909341SAndroid Build Coastguard Worker.w16_pad1:
    ; left half read normally; right half reads 8 more source bytes
    ; (words 8-11 real) and replicates word 11 into words 12-15
4340*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq]
4341*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4342*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4343*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4344*c0909341SAndroid Build Coastguard Worker    movddup              m0, [yq+16]
4345*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4346*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4347*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4348*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4349*c0909341SAndroid Build Coastguard Worker    mova                 m1, [yq+strideq]
4350*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4351*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4352*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4353*c0909341SAndroid Build Coastguard Worker    movddup              m0, [yq+strideq+16]
4354*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4355*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4356*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4357*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4358*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4359*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4360*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4361*c0909341SAndroid Build Coastguard Worker    jg .w16_pad1
4362*c0909341SAndroid Build Coastguard Worker.w16_wpad_done:
4363*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4364*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4365*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop:
    ; vertical padding: two copies of the last output row (m1 | m0) per pass;
    ; the left halves are added to m4 and the right halves to m5
4366*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4367*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4368*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4369*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4370*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4371*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4372*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4373*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4374*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4375*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4376*c0909341SAndroid Build Coastguard Worker    jg .w16_hpad_loop
4377*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_8_16
4378*c0909341SAndroid Build Coastguard Worker
4379*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4380*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
4381*c0909341SAndroid Build Coastguard Worker%else
4382*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
4383*c0909341SAndroid Build Coastguard Worker%endif
4384*c0909341SAndroid Build Coastguard Worker.calc_avg_4:
4385*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 10
4386*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2
4387*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4, m2
4388*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
; .calc_avg_8_16: widen the two 16-bit accumulators (m5, m4) to 32-bit lanes
; without pmaddwd. Each dword lane receives (upper word + lower word) of the
; original lane, both zero-extended, so the words are treated as unsigned.
; Result: m5 = widened m5, m0 = widened m4. Falls through into .calc_avg.
4389*c0909341SAndroid Build Coastguard Worker.calc_avg_8_16:
4390*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
4391*c0909341SAndroid Build Coastguard Worker    psrld                m5, 16  ; m5 = upper word of every dword
4392*c0909341SAndroid Build Coastguard Worker    pslld                m0, 16
4393*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16  ; m0 = lower word of every dword (shift up, then logical shift down)
4394*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0  ; m5 = upper + lower, as dwords
4395*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
4396*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16  ; m0 = upper word of every dword of m4
4397*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16
4398*c0909341SAndroid Build Coastguard Worker    psrld                m4, 16  ; m4 = lower word of every dword
4399*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4  ; m0 = upper + lower, as dwords
; .calc_avg: shared tail. Inputs: m5 and m0 hold dword partial sums, the low
; dword of m6 holds the element count (sz). Computes
;     avg = (sum + (sz >> 1)) >> ctz(sz)
; broadcasts it as 16-bit words and subtracts it from every stored AC value,
; 8 words (16 bytes) per iteration, starting at the reloaded ac pointer.
; NOTE(review): the rounding bias is added to m5 before the horizontal
; reduction, so m6 must be non-zero in only one dword lane - its setup for
; this function is outside this view; confirm.
4400*c0909341SAndroid Build Coastguard Worker.calc_avg:
4401*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0  ; merge the two partial-sum vectors
4402*c0909341SAndroid Build Coastguard Worker    movd                szd, m6  ; sz = low dword of m6
4403*c0909341SAndroid Build Coastguard Worker    psrad                m6, 1  ; m6 = sz >> 1 (rounding bias)
4404*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
4405*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6  ; add the rounding bias to the sum
4406*c0909341SAndroid Build Coastguard Worker    movd                 m1, r1d  ; m1 = shift count for psrad below
4407*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m5, q2301  ; swap adjacent dwords
4408*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
4409*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m0, q1032  ; swap 64-bit halves
4410*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5  ; every dword lane now holds the full sum
4411*c0909341SAndroid Build Coastguard Worker    psrad                m0, m1                        ; sum >>= log2sz;
4412*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m0  ; saturating pack: m0 = average replicated in all 8 words
4413*c0909341SAndroid Build Coastguard Worker    RELOAD_ACQ_32       acq                            ; ac = ac_orig
4414*c0909341SAndroid Build Coastguard Worker.sub_loop:
4415*c0909341SAndroid Build Coastguard Worker    mova                 m1, [acq]  ; aligned load of 8 AC words
4416*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0  ; ac[i] -= avg
4417*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4418*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4419*c0909341SAndroid Build Coastguard Worker    sub                 szd, 8  ; 8 elements handled per iteration
4420*c0909341SAndroid Build Coastguard Worker    jg .sub_loop
4421*c0909341SAndroid Build Coastguard Worker    RET
4422*c0909341SAndroid Build Coastguard Worker
; ipred_cfl_ac_444_8bpc(ac, y, stride, wpad, hpad, w, h) - entry and dispatch.
; Fills the 16-bit buffer at ac from the 8-bit pixel rows at y (one AC word
; per pixel; each word is the pixel times 8, see the per-width paths), while
; accumulating sums used later to remove the average.
; Register roles established here:
;   m2      = multiplier bytes for pmaddubsw (0x04 in every byte on x86-32;
;             pb_4 is defined elsewhere - presumably the same constant)
;   m4, m5  = 16-bit running sums (zeroed here)
;   m6      = w * h in the low dword
;   ac_bak  = original ac pointer (a register on x86-64, the stack slot
;             [rsp+16*4] on x86-32, hence the extra 16 bytes of stack there)
;   h       = h - 4*hpad, i.e. hpad is counted in units of 4 rows
; Dispatch: w > 16 -> .w32, w > 8 -> .w16, w == 8 -> .w8, otherwise .w4.
4423*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4424*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
4425*c0909341SAndroid Build Coastguard Worker    movddup              m2, [pb_4]
4426*c0909341SAndroid Build Coastguard Worker%else
4427*c0909341SAndroid Build Coastguard Workercglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
4428*c0909341SAndroid Build Coastguard Worker%define ac_bakq [rsp+16*4]
4429*c0909341SAndroid Build Coastguard Worker    mov                 t0d, 0x04040404
4430*c0909341SAndroid Build Coastguard Worker    movd                 m2, t0d
4431*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q0000  ; broadcast the dword to all four lanes
4432*c0909341SAndroid Build Coastguard Worker%endif
4433*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm  ; load w from its argument slot unless it is already in the register
4434*c0909341SAndroid Build Coastguard Worker    movifnidn         hpadd, hpadm
4435*c0909341SAndroid Build Coastguard Worker    movd                 m0, hpadd  ; park hpad in m0; NOTE(review): presumably t0 may share hpad's register - t0 is defined outside this view
4436*c0909341SAndroid Build Coastguard Worker    mov                 t0d, hm
4437*c0909341SAndroid Build Coastguard Worker    mov                  hd, t0d
4438*c0909341SAndroid Build Coastguard Worker    imul                t0d, wd  ; t0 = w * h
4439*c0909341SAndroid Build Coastguard Worker    movd                 m6, t0d  ; m6 = w * h (upper lanes zeroed by movd)
4440*c0909341SAndroid Build Coastguard Worker    movd              hpadd, m0  ; restore hpad
4441*c0909341SAndroid Build Coastguard Worker    mov             ac_bakq, acq  ; remember the start of the ac buffer
4442*c0909341SAndroid Build Coastguard Worker    shl               hpadd, 2  ; hpad *= 4 (rows)
4443*c0909341SAndroid Build Coastguard Worker    sub                  hd, hpadd  ; h = number of rows actually read from y
4444*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
4445*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
4446*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
4447*c0909341SAndroid Build Coastguard Worker    jg .w32
4448*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
4449*c0909341SAndroid Build Coastguard Worker    jg .w16
4450*c0909341SAndroid Build Coastguard Worker    je .w8
4451*c0909341SAndroid Build Coastguard Worker    ; fall-through
4452*c0909341SAndroid Build Coastguard Worker
; .w4: 4-pixel-wide path. The sixth argument register is re-labelled from
; "w" to "stride3" (w is no longer needed once the width has been dispatched).
; Main loop: 4 rows per iteration. Rows 0/1 are packed into m1 and rows 2/3
; into m0; every pixel byte is duplicated and pmaddubsw with m2 (4 per byte)
; yields 4*p + 4*p = 8*p per word. 16 words are stored and summed into m5.
; hpad loop: the last row read is replicated, 2 rows (16 bytes) per iteration.
; No wpad handling is present on this path.
4453*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4454*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
4455*c0909341SAndroid Build Coastguard Worker%else
4456*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
4457*c0909341SAndroid Build Coastguard Worker%endif
4458*c0909341SAndroid Build Coastguard Worker.w4:
4459*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4460*c0909341SAndroid Build Coastguard Worker.w4_loop:
4461*c0909341SAndroid Build Coastguard Worker    movd                 m1, [yq]
4462*c0909341SAndroid Build Coastguard Worker    movd                 m3, [yq+strideq]
4463*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3  ; m1 = row0 | row1 (4 bytes each)
4464*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1  ; duplicate every byte
4465*c0909341SAndroid Build Coastguard Worker    movd                 m0, [yq+strideq*2]
4466*c0909341SAndroid Build Coastguard Worker    movd                 m3, [yq+stride3q]
4467*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m3  ; m0 = row2 | row3
4468*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
4469*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2  ; words = pixel * 8
4470*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4471*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4472*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4473*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4474*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4475*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4476*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4477*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4478*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
4479*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4480*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_4
4481*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0  ; both halves = last row read (row 3 of the final group)
4482*c0909341SAndroid Build Coastguard Worker.w4_hpad_loop:
4483*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0  ; two padded rows per store
4484*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4485*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
4486*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4487*c0909341SAndroid Build Coastguard Worker    jg .w4_hpad_loop
4488*c0909341SAndroid Build Coastguard Worker.calc_avg_4:
4489*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 10  ; 0x0404 >> 10 = 1: m2 becomes a vector of word 1s
4490*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2  ; add adjacent word pairs of the sum -> dwords
4491*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4492*c0909341SAndroid Build Coastguard Worker
; .w8: 8-pixel-wide path.
;   .w8_loop  (wpad == 0): 4 rows per iteration, 8 pixels -> 8 words (p*8)
;             per row. Rows 0/1 are summed into m5, rows 2/3 into m4.
;   .w8_wpad  (wpad != 0): 2 rows per iteration; only 4 pixels are read per
;             row and the 4th is replicated into the right 4 columns.
;             Row 0 is summed into m5, row 1 into m4.
;   .w8_hpad: replicates the last row written (left in m0 by either loop),
;             2 rows per iteration, one added to m5 and one to m4.
; All exits go to .calc_avg_8_16.
4493*c0909341SAndroid Build Coastguard Worker.w8:
4494*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4495*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4496*c0909341SAndroid Build Coastguard Worker    jnz .w8_wpad
4497*c0909341SAndroid Build Coastguard Worker.w8_loop:
4498*c0909341SAndroid Build Coastguard Worker    movq                 m1, [yq]
4499*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1  ; duplicate every byte
4500*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2  ; words = pixel * 8
4501*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4502*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4503*c0909341SAndroid Build Coastguard Worker    movq                 m0, [yq+strideq]
4504*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
4505*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4506*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4507*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4508*c0909341SAndroid Build Coastguard Worker    movq                 m1, [yq+strideq*2]
4509*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4510*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4511*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4512*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4513*c0909341SAndroid Build Coastguard Worker    movq                 m0, [yq+stride3q]
4514*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
4515*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4516*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4517*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4518*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*4]
4519*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4520*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4521*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
4522*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4523*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4524*c0909341SAndroid Build Coastguard Worker    jmp .w8_hpad  ; m0 = last row (row 3)
4525*c0909341SAndroid Build Coastguard Worker.w8_wpad:
4526*c0909341SAndroid Build Coastguard Worker    movd                 m1, [yq]  ; only the 4 valid pixels are read
4527*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4528*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1  ; copy the 4 pixels into the upper half as well
4529*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4530*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333  ; upper 4 words = last valid pixel (right-edge padding)
4531*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4532*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4533*c0909341SAndroid Build Coastguard Worker    movd                 m0, [yq+strideq]
4534*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
4535*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
4536*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4537*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4538*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4539*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4540*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4541*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4542*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4543*c0909341SAndroid Build Coastguard Worker    jg .w8_wpad
4544*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4545*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4546*c0909341SAndroid Build Coastguard Worker.w8_hpad:
4547*c0909341SAndroid Build Coastguard Worker    mova              [acq], m0  ; replicate the last row twice per iteration
4548*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4549*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4550*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4551*c0909341SAndroid Build Coastguard Worker    add                 acq, 32
4552*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4553*c0909341SAndroid Build Coastguard Worker    jg .w8_hpad
4554*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg_8_16
4555*c0909341SAndroid Build Coastguard Worker
; .w16: 16-pixel-wide path, 2 rows per iteration in every variant. Each row
; produces two vectors of 8 words (pixel * 8): left half at +0/+32, right
; half at +16/+48. Row 0 is summed into m5, row 1 into m4.
;   .w16_loop  wpad == 0: all 16 pixels are read.
;   .w16_pad1  wpad == 1: 12 valid pixels, pixel 11 replicated to the right.
;   .w16_pad2  wpad == 2:  8 valid pixels, pixel 7 replicated to the right.
;   .w16_pad3  otherwise:  4 valid pixels, pixel 3 replicated to the right.
; Every variant leaves m1 = left half and m0 = right half of the last row,
; which .w16_hpad_loop then replicates 2 rows per iteration.
; .calc_avg_8_16 widens the 16-bit sums in m5/m4 to dwords (upper word +
; lower word of every dword, zero-extended), merges them into m5 and jumps
; to .calc_avg.
4556*c0909341SAndroid Build Coastguard Worker.w16:
4557*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4558*c0909341SAndroid Build Coastguard Worker    jnz .w16_wpad
4559*c0909341SAndroid Build Coastguard Worker.w16_loop:
4560*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]  ; aligned 16-byte load of row 0
4561*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4562*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1  ; pixels 0..7, every byte duplicated
4563*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2  ; words = pixel * 8
4564*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4565*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4566*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0  ; pixels 8..15
4567*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4568*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4569*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4570*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq]  ; row 1
4571*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4572*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4573*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4574*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4575*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4576*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4577*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4578*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4579*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4580*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4581*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4582*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4583*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
4584*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4585*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4586*c0909341SAndroid Build Coastguard Worker    jmp .w16_hpad_loop
4587*c0909341SAndroid Build Coastguard Worker.w16_wpad:
4588*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
4589*c0909341SAndroid Build Coastguard Worker    jl .w16_pad1
4590*c0909341SAndroid Build Coastguard Worker    je .w16_pad2
4591*c0909341SAndroid Build Coastguard Worker.w16_pad3:
4592*c0909341SAndroid Build Coastguard Worker    movd                 m1, [yq]  ; 4 valid pixels
4593*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4594*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
4595*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333  ; upper half = pixel 3 repeated
4596*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4597*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4598*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4599*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1  ; all 8 words = padded pixel
4600*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m1
4601*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4602*c0909341SAndroid Build Coastguard Worker    movd                 m1, [yq+strideq]
4603*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4604*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
4605*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333
4606*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4607*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4608*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4609*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m1, m1  ; non-destructive: m1 stays the left half for the hpad loop
4610*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4611*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4612*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4613*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4614*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4615*c0909341SAndroid Build Coastguard Worker    jg .w16_pad3
4616*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_done
4617*c0909341SAndroid Build Coastguard Worker.w16_pad2:
4618*c0909341SAndroid Build Coastguard Worker    movq                 m1, [yq]  ; 8 valid pixels
4619*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4620*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4621*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4622*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4623*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333
4624*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1  ; all 8 words = pixel 7 (scaled)
4625*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m1
4626*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4627*c0909341SAndroid Build Coastguard Worker    movq                 m1, [yq+strideq]
4628*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4629*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4630*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4631*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4632*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1  ; work on a copy so m1 stays the left half
4633*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4634*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
4635*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4636*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4637*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4638*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4639*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4640*c0909341SAndroid Build Coastguard Worker    jg .w16_pad2
4641*c0909341SAndroid Build Coastguard Worker    jmp .w16_wpad_done
4642*c0909341SAndroid Build Coastguard Worker.w16_pad1:
4643*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]  ; 12 valid pixels (a full 16 bytes are loaded)
4644*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4645*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4646*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4647*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4648*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4649*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0  ; pixels 8..15 duplicated
4650*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0  ; keep pixels 8..11 in both halves
4651*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333  ; upper half = pixel 11 repeated
4652*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4653*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4654*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4655*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq+strideq]
4656*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4657*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4658*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4659*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1
4660*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4661*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4662*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
4663*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333
4664*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4665*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4666*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4667*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq*2]
4668*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4669*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4670*c0909341SAndroid Build Coastguard Worker    jg .w16_pad1
4671*c0909341SAndroid Build Coastguard Worker.w16_wpad_done:
4672*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4673*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_8_16
4674*c0909341SAndroid Build Coastguard Worker.w16_hpad_loop:
4675*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1  ; padded row A: left half
4676*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0  ; padded row A: right half
4677*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4678*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4679*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m1  ; padded row B
4680*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m0
4681*c0909341SAndroid Build Coastguard Worker    paddw                m4, m1
4682*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4683*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4684*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 2
4685*c0909341SAndroid Build Coastguard Worker    jg .w16_hpad_loop
4686*c0909341SAndroid Build Coastguard Worker.calc_avg_8_16:
4687*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
4688*c0909341SAndroid Build Coastguard Worker    psrld                m5, 16  ; upper word of every dword
4689*c0909341SAndroid Build Coastguard Worker    pslld                m0, 16
4690*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16  ; lower word of every dword
4691*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0  ; m5 sums widened to dwords
4692*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
4693*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16
4694*c0909341SAndroid Build Coastguard Worker    pslld                m4, 16
4695*c0909341SAndroid Build Coastguard Worker    psrld                m4, 16
4696*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4  ; m4 sums widened to dwords (in m0)
4697*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0  ; m5 = combined dword partial sums
4698*c0909341SAndroid Build Coastguard Worker    jmp .calc_avg
4699*c0909341SAndroid Build Coastguard Worker
; .w32: 32-pixel-wide path, 1 row per iteration. A row yields four vectors of
; 8 words (pixel * 8) stored at ac+0/+16/+32/+48. m4 and m5 are needed as
; data/temporary registers here, so the four 16-bit column-group sums live
; in the stack slots [rsp], [rsp+16], [rsp+32], [rsp+48] (zeroed first).
; .w32_loop handles wpad == 0; on exit m1, m0, m3, m4 hold the four vectors
; of the last row. Continues at .calc_avg_32 or .w32_hpad_loop, both outside
; this view.
4700*c0909341SAndroid Build Coastguard Worker.w32:
4701*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4702*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m0
4703*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m0
4704*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m0
4705*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m0
4706*c0909341SAndroid Build Coastguard Worker    test              wpadd, wpadd
4707*c0909341SAndroid Build Coastguard Worker    jnz .w32_wpad
4708*c0909341SAndroid Build Coastguard Worker.w32_loop:
4709*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]  ; pixels 0..15
4710*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4711*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4712*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2  ; pixels 0..7 * 8
4713*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4714*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]  ; m5 = m1 + running sum of column group 0
4715*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4716*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4717*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2  ; pixels 8..15 * 8
4718*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4719*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4720*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4721*c0909341SAndroid Build Coastguard Worker    mova                 m4, [yq+16]  ; pixels 16..31
4722*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
4723*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
4724*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2  ; pixels 16..23 * 8
4725*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4726*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4727*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4728*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m4
4729*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2  ; pixels 24..31 * 8
4730*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4731*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4732*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4733*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4734*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4735*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4736*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
4737*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4738*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_32
4739*c0909341SAndroid Build Coastguard Worker    jmp .w32_hpad_loop
; .w32_wpad: dispatch on wpad (non-zero here) to one loop per padding amount:
; 1 -> .w32_pad1, 2 -> .w32_pad2, ... 6 -> .w32_pad6, anything larger falls
; through to .w32_pad7.
; .w32_pad7: only 4 pixels per row are read. m1 = pixels 0..3 followed by
; pixel 3 repeated; m0, m3 and m4 are all pixel 3 repeated (each scaled by
; 8). The per-column-group sums are kept in the stack slots at [rsp+0..48].
4740*c0909341SAndroid Build Coastguard Worker.w32_wpad:
4741*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 2
4742*c0909341SAndroid Build Coastguard Worker    jl .w32_pad1
4743*c0909341SAndroid Build Coastguard Worker    je .w32_pad2
4744*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 4
4745*c0909341SAndroid Build Coastguard Worker    jl .w32_pad3
4746*c0909341SAndroid Build Coastguard Worker    je .w32_pad4
4747*c0909341SAndroid Build Coastguard Worker    cmp               wpadd, 6
4748*c0909341SAndroid Build Coastguard Worker    jl .w32_pad5
4749*c0909341SAndroid Build Coastguard Worker    je .w32_pad6
4750*c0909341SAndroid Build Coastguard Worker.w32_pad7:
4751*c0909341SAndroid Build Coastguard Worker    movd                 m1, [yq]  ; 4 valid pixels
4752*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4753*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m1
4754*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q3333  ; upper half = pixel 3 repeated
4755*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4756*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4757*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4758*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4759*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
4760*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0  ; all 8 words = padded pixel
4761*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4762*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4763*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4764*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
4765*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4766*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4767*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4768*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
4769*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4770*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4771*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4772*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4773*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4774*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4775*c0909341SAndroid Build Coastguard Worker    jg .w32_pad7
4776*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_done
; .w32_pad6: 8 valid pixels per row. m1 = pixels 0..7 (scaled by 8); m0, m3
; and m4 are pixel 7 repeated. A full 16 bytes are loaded from y, but only
; the low 8 are used (m0 is overwritten by the pshufhw below). Sums are kept
; in the stack slots at [rsp+0..48], one per group of 8 columns.
4777*c0909341SAndroid Build Coastguard Worker.w32_pad6:
4778*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4779*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4780*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4781*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4782*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4783*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4784*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4785*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m1, q3333  ; upper half = pixel 7 repeated
4786*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0  ; all 8 words = padded pixel
4787*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4788*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4789*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4790*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
4791*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4792*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4793*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4794*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
4795*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4796*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4797*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4798*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4799*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4800*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4801*c0909341SAndroid Build Coastguard Worker    jg .w32_pad6
4802*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_done
; .w32_pad5: 12 valid pixels per row. m1 = pixels 0..7, m0 = pixels 8..11
; followed by pixel 11 repeated, m3 and m4 = pixel 11 repeated (all scaled
; by 8). Sums are kept in the stack slots at [rsp+0..48], one per group of
; 8 columns.
4803*c0909341SAndroid Build Coastguard Worker.w32_pad5:
4804*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4805*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4806*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4807*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4808*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4809*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp]  ; load + add form of the accumulate used by the other column groups
4810*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1
4811*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4812*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0  ; pixels 8..15 duplicated
4813*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0  ; keep pixels 8..11 in both halves
4814*c0909341SAndroid Build Coastguard Worker    pshufhw              m0, m0, q3333  ; upper half = pixel 11 repeated
4815*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4816*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4817*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4818*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4819*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
4820*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3  ; all 8 words = padded pixel
4821*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4822*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4823*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4824*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
4825*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4826*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4827*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4828*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4829*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4830*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4831*c0909341SAndroid Build Coastguard Worker    jg .w32_pad5
4832*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_done
4833*c0909341SAndroid Build Coastguard Worker.w32_pad4:
4834*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4835*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4836*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4837*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4838*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4839*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4840*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4841*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4842*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4843*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4844*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4845*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4846*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
4847*c0909341SAndroid Build Coastguard Worker    pshufhw              m3, m3, q3333
4848*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3
4849*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4850*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4851*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4852*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
4853*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4854*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4855*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4856*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4857*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4858*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4859*c0909341SAndroid Build Coastguard Worker    jg .w32_pad4
4860*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_done
4861*c0909341SAndroid Build Coastguard Worker.w32_pad3:
4862*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4863*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4864*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4865*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4866*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4867*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4868*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4869*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4870*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4871*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4872*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4873*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4874*c0909341SAndroid Build Coastguard Worker    movd                 m3, [yq+16]
4875*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
4876*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m3
4877*c0909341SAndroid Build Coastguard Worker    pshufhw              m3, m3, q3333
4878*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2
4879*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4880*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4881*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4882*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
4883*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4
4884*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4885*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4886*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4887*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4888*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4889*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4890*c0909341SAndroid Build Coastguard Worker    jg .w32_pad3
4891*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_done
4892*c0909341SAndroid Build Coastguard Worker.w32_pad2:
4893*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4894*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4895*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4896*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4897*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4898*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4899*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4900*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4901*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4902*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4903*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4904*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4905*c0909341SAndroid Build Coastguard Worker    mova                 m3, [yq+16]
4906*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
4907*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2
4908*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4909*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4910*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4911*c0909341SAndroid Build Coastguard Worker    pshufhw              m4, m3, q3333
4912*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4
4913*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4914*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4915*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4916*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4917*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4918*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4919*c0909341SAndroid Build Coastguard Worker    jg .w32_pad2
4920*c0909341SAndroid Build Coastguard Worker    jmp .w32_wpad_done
4921*c0909341SAndroid Build Coastguard Worker.w32_pad1:
4922*c0909341SAndroid Build Coastguard Worker    mova                 m0, [yq]
4923*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4924*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4925*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
4926*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4927*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4928*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4929*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m0
4930*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
4931*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4932*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4933*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4934*c0909341SAndroid Build Coastguard Worker    mova                 m4, [yq+16]
4935*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
4936*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
4937*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2
4938*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4939*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4940*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4941*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m4
4942*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m4
4943*c0909341SAndroid Build Coastguard Worker    pshufhw              m4, m4, q3333
4944*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m2
4945*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4946*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4947*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4948*c0909341SAndroid Build Coastguard Worker    lea                  yq, [yq+strideq]
4949*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4950*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
4951*c0909341SAndroid Build Coastguard Worker    jg .w32_pad1
4952*c0909341SAndroid Build Coastguard Worker.w32_wpad_done:
4953*c0909341SAndroid Build Coastguard Worker    test              hpadd, hpadd
4954*c0909341SAndroid Build Coastguard Worker    jz .calc_avg_32
4955*c0909341SAndroid Build Coastguard Worker.w32_hpad_loop:
4956*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
4957*c0909341SAndroid Build Coastguard Worker    mova           [acq+16], m0
4958*c0909341SAndroid Build Coastguard Worker    paddw                m5, m1, [rsp]
4959*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m5
4960*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0, [rsp+16]
4961*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m5
4962*c0909341SAndroid Build Coastguard Worker    mova           [acq+32], m3
4963*c0909341SAndroid Build Coastguard Worker    mova           [acq+48], m4
4964*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3, [rsp+32]
4965*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m5
4966*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4, [rsp+48]
4967*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m5
4968*c0909341SAndroid Build Coastguard Worker    add                 acq, 64
4969*c0909341SAndroid Build Coastguard Worker    sub               hpadd, 1
4970*c0909341SAndroid Build Coastguard Worker    jg .w32_hpad_loop
4971*c0909341SAndroid Build Coastguard Worker
4972*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4973*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
4974*c0909341SAndroid Build Coastguard Worker%else
4975*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
4976*c0909341SAndroid Build Coastguard Worker%endif
4977*c0909341SAndroid Build Coastguard Worker
4978*c0909341SAndroid Build Coastguard Worker.calc_avg_32:
4979*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp]
4980*c0909341SAndroid Build Coastguard Worker    mova                 m0, m5
4981*c0909341SAndroid Build Coastguard Worker    psrld                m5, 16
4982*c0909341SAndroid Build Coastguard Worker    pslld                m0, 16
4983*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16
4984*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0
4985*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16]
4986*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
4987*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16
4988*c0909341SAndroid Build Coastguard Worker    pslld                m3, 16
4989*c0909341SAndroid Build Coastguard Worker    psrld                m3, 16
4990*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3
4991*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0
4992*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+32]
4993*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
4994*c0909341SAndroid Build Coastguard Worker    psrld                m0, 16
4995*c0909341SAndroid Build Coastguard Worker    pslld                m3, 16
4996*c0909341SAndroid Build Coastguard Worker    psrld                m3, 16
4997*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3
4998*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+48]
4999*c0909341SAndroid Build Coastguard Worker    mova                 m3, m1
5000*c0909341SAndroid Build Coastguard Worker    psrld                m1, 16
5001*c0909341SAndroid Build Coastguard Worker    pslld                m3, 16
5002*c0909341SAndroid Build Coastguard Worker    psrld                m3, 16
5003*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
5004*c0909341SAndroid Build Coastguard Worker    paddd                m1, m0
5005*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
5006*c0909341SAndroid Build Coastguard Worker.calc_avg:
5007*c0909341SAndroid Build Coastguard Worker    movd                szd, m6
5008*c0909341SAndroid Build Coastguard Worker    psrad                m6, 1
5009*c0909341SAndroid Build Coastguard Worker    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
5010*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
5011*c0909341SAndroid Build Coastguard Worker    movd                 m1, r1d
5012*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m5, q2301
5013*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
5014*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m0, q1032
5015*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
5016*c0909341SAndroid Build Coastguard Worker    psrad                m0, m1                        ; sum >>= log2sz;
5017*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m0
5018*c0909341SAndroid Build Coastguard Worker    RELOAD_ACQ_32       acq                            ; ac = ac_orig
5019*c0909341SAndroid Build Coastguard Worker.sub_loop:
5020*c0909341SAndroid Build Coastguard Worker    mova                 m1, [acq]
5021*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
5022*c0909341SAndroid Build Coastguard Worker    mova              [acq], m1
5023*c0909341SAndroid Build Coastguard Worker    add                 acq, 16
5024*c0909341SAndroid Build Coastguard Worker    sub                 szd, 8
5025*c0909341SAndroid Build Coastguard Worker    jg .sub_loop
5026*c0909341SAndroid Build Coastguard Worker    RET
5027*c0909341SAndroid Build Coastguard Worker
; BLEND: bitwise select, result = (mask & true) | (~mask & false).
; The mask is used bit-for-bit, so a per-element select needs each element of
; the mask to be all-ones or all-zeros (PAETH feeds it pcmpeqb results).
5028*c0909341SAndroid Build Coastguard Worker; %1 simd register that holds the mask and will hold the result
5029*c0909341SAndroid Build Coastguard Worker; %2 simd register that holds the "true" values (clobbered: left as true & mask)
5030*c0909341SAndroid Build Coastguard Worker; %3 location of the "false" values (simd register/memory), only read
5031*c0909341SAndroid Build Coastguard Worker%macro BLEND 3 ; mask, true, false
5032*c0909341SAndroid Build Coastguard Worker    pand  %2, %1 ; true  &= mask
5033*c0909341SAndroid Build Coastguard Worker    pandn %1, %3 ; mask   = ~mask & false
5034*c0909341SAndroid Build Coastguard Worker    por   %1, %2 ; merge the two disjoint halves into the result
5035*c0909341SAndroid Build Coastguard Worker%endmacro
5036*c0909341SAndroid Build Coastguard Worker
; PAETH top, ldiff: Paeth-predict one vector of 8-bit pixels, result in m1.
; Per byte it selects, with base = left + top - topleft:
;   left     if ldiff <= tdiff and ldiff <= tldiff
;   top      else if tdiff <= tldiff
;   topleft  otherwise
; where ldiff = |topleft - top|, tdiff = |topleft - left| and
; tldiff = |left + top - 2*topleft| (clamped to 255 here).
; Inputs, all set up by ipred_paeth_8bpc before each expansion:
;   %1  number of the register holding the top pixels
;   %2  ldiff, either a register number or a memory operand
;   m3  left pixel(s), replicated by the caller
;   m4  byte mask loaded from ipred_paeth_shuf; it is ANDed with top ^ left
;       to recover the bit that pavgb rounded up.
;       NOTE(review): presumably 0x01 in every byte, the table is defined
;       outside this view, confirm.
;   m5  topleft pixel, replicated
; Clobbers m0 and m2. The top register, a register ldiff, m3, m4 and m5 are
; only read.
5037*c0909341SAndroid Build Coastguard Worker%macro PAETH 2                                 ; top, ldiff
5038*c0909341SAndroid Build Coastguard Worker    pavgb                m1, m%1, m3          ; (top + left + 1) >> 1
5039*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m%1, m3
5040*c0909341SAndroid Build Coastguard Worker    pand                 m0, m4               ; masked top ^ left: the rounding bit of the average
5041*c0909341SAndroid Build Coastguard Worker    psubusb              m2, m5, m1           ; max(topleft - rounded-up average, 0)
5042*c0909341SAndroid Build Coastguard Worker    psubb                m1, m0               ; take the rounding bit back out of the average
5043*c0909341SAndroid Build Coastguard Worker    psubusb              m1, m5               ; max(average - topleft, 0)
5044*c0909341SAndroid Build Coastguard Worker    por                  m1, m2               ; combine the two one-sided differences
5045*c0909341SAndroid Build Coastguard Worker    paddusb              m1, m1               ; double, saturating at 255
5046*c0909341SAndroid Build Coastguard Worker    por                  m1, m0               ; min(tldiff, 255)
5047*c0909341SAndroid Build Coastguard Worker    psubusb              m2, m5, m3
5048*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m3, m5
5049*c0909341SAndroid Build Coastguard Worker    por                  m2, m0               ; tdiff
5050*c0909341SAndroid Build Coastguard Worker%ifnum %2
5051*c0909341SAndroid Build Coastguard Worker    pminub               m2, m%2              ; min(ldiff, tdiff)
5052*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, m%2, m2          ; ldiff <= tdiff
5053*c0909341SAndroid Build Coastguard Worker%else
5054*c0909341SAndroid Build Coastguard Worker    mova                 m0, %2               ; ldiff comes from memory here
5055*c0909341SAndroid Build Coastguard Worker    pminub               m2, m0               ; min(ldiff, tdiff)
5056*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m0, m2               ; ldiff <= tdiff
5057*c0909341SAndroid Build Coastguard Worker%endif
5058*c0909341SAndroid Build Coastguard Worker    pminub               m1, m2
5059*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m1, m2               ; min(ldiff, tdiff) <= tldiff, i.e. ldiff <= tldiff || tdiff <= tldiff
5060*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3               ; copy left, BLEND overwrites its "true" operand
5061*c0909341SAndroid Build Coastguard Worker    BLEND                m0, m2, m%1          ; m0 = ldiff <= tdiff ? left : top
5062*c0909341SAndroid Build Coastguard Worker    BLEND                m1, m0, m5           ; m1 = closer of left/top if it beats tldiff, else topleft
5063*c0909341SAndroid Build Coastguard Worker%endmacro
5064*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; ipred_paeth_8bpc(dst, stride, tl, w, h): Paeth intra prediction, 8-bit.
;   dst, stride  output block and the byte offset between its rows
;   tl           pointer to the top-left edge pixel; the top row is read
;                upwards from tl+1, the left column downwards from tl-1
;   w, h         block size; w is read through wm and h through hm/hd
; The code dispatches on tzcnt(w) through ipred_paeth_ssse3_table, a table
; of signed 32-bit offsets relative to the table itself, to one of the
; .w4/.w8/.w16/.w32/.w64 paths.
; Register roles in every path:
;   m5 = topleft in every byte      m4 = byte mask used by PAETH
;   m6 = top pixels                 m7 = ldiff = |topleft - top|
;   m3 = left pixel(s)              m1 = PAETH result; m0, m2 = scratch
; The 7*16 bytes of stack named on the cglobal line hold the top/ldiff
; vectors that .w32 and .w64 spill to [rsp] .. [rsp+96].
; NOTE(review): .w16 and wider store with mova, so dst rows are presumably
; 16-byte aligned. Confirm against the callers.
;-----------------------------------------------------------------------
5065*c0909341SAndroid Build Coastguard Workercglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
5066*c0909341SAndroid Build Coastguard Worker%define base r5-ipred_paeth_ssse3_table
5067*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm                 ; wd = ctz(w), the jump-table index
5068*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
5069*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
5070*c0909341SAndroid Build Coastguard Worker    movd                 m5, [tlq]
5071*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0                 ; all-zero shuffle: topleft byte in every lane
5072*c0909341SAndroid Build Coastguard Worker    LEA                  r5, ipred_paeth_ssse3_table
5073*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [r5+wq*4]          ; signed 32-bit offset of the chosen .wN path
5074*c0909341SAndroid Build Coastguard Worker    movddup              m4, [base+ipred_paeth_shuf] ; low 8 bytes of the table in both halves
5075*c0909341SAndroid Build Coastguard Worker    add                  wq, r5                 ; offset + table address = entry point
5076*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; ---- w == 4: four rows per iteration, one row per dword ----
5077*c0909341SAndroid Build Coastguard Worker.w4:
5078*c0909341SAndroid Build Coastguard Worker    movd                 m6, [tlq+1]            ; top
5079*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q0000          ; the 4 top pixels repeated in every dword
5080*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]        ; offset of the fourth row
5081*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5082*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5083*c0909341SAndroid Build Coastguard Worker    por                  m7, m0                 ; ldiff
5084*c0909341SAndroid Build Coastguard Worker.w4_loop:
5085*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 4                  ; step down to the next 4 left pixels
5086*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]
5087*c0909341SAndroid Build Coastguard Worker    mova                 m1, [base+ipred_h_shuf] ; shuffle table defined outside this view
5088*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1                 ; left (presumably one pixel per dword, row 0 first; confirm against ipred_h_shuf)
5089*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
5090*c0909341SAndroid Build Coastguard Worker    movd   [dstq          ], m1                 ; row 0 = dword 0
5091*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m1, q1032          ; swap the two low dwords
5092*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq  ], m0                 ; row 1 = dword 1
5093*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1                 ; bring the high qword down
5094*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m1                 ; row 2 = dword 2
5095*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32
5096*c0909341SAndroid Build Coastguard Worker    movd   [dstq+r3       ], m1                 ; row 3 = dword 3
5097*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5098*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
5099*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
5100*c0909341SAndroid Build Coastguard Worker    RET
5101*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 8: two rows per iteration, one row per qword ----
5102*c0909341SAndroid Build Coastguard Worker.w8:
5103*c0909341SAndroid Build Coastguard Worker    movddup              m6, [tlq+1]            ; the 8 top pixels in both qwords
5104*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5105*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5106*c0909341SAndroid Build Coastguard Worker    por                  m7, m0                 ; ldiff
5107*c0909341SAndroid Build Coastguard Worker.w8_loop:
5108*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 2                  ; step down to the next 2 left pixels
5109*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]
5110*c0909341SAndroid Build Coastguard Worker    pshufb               m3, [base+ipred_paeth_shuf] ; left (presumably one pixel per qword, row 0 first; confirm against the table)
5111*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
5112*c0909341SAndroid Build Coastguard Worker    movq     [dstq        ], m1                 ; row 0 = low qword
5113*c0909341SAndroid Build Coastguard Worker    movhps   [dstq+strideq], m1                 ; row 1 = high qword
5114*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
5115*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
5116*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5117*c0909341SAndroid Build Coastguard Worker    RET
5118*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 16: one full-vector row per iteration ----
5119*c0909341SAndroid Build Coastguard Worker.w16:
5120*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+1]            ; 16 top pixels
5121*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5122*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5123*c0909341SAndroid Build Coastguard Worker    por                  m7, m0                 ; ldiff
5124*c0909341SAndroid Build Coastguard Worker.w16_loop:
5125*c0909341SAndroid Build Coastguard Worker    sub                 tlq, 1                  ; next left pixel
5126*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]              ; 4-byte load, only byte 0 is used
5127*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5128*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1                 ; left pixel in every byte
5129*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
5130*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m1
5131*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5132*c0909341SAndroid Build Coastguard Worker    sub                  hd, 1
5133*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5134*c0909341SAndroid Build Coastguard Worker    RET
5135*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 32: two 16-pixel halves per row; the first half's top and
; ---- ldiff and the second half's top are kept on the stack
5136*c0909341SAndroid Build Coastguard Worker.w32:
5137*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+1]
5138*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5139*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5140*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
5141*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m6                 ; top,   pixels  0..15
5142*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m7                 ; ldiff, pixels  0..15
5143*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+17]
5144*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5145*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5146*c0909341SAndroid Build Coastguard Worker    por                  m7, m0                 ; ldiff, pixels 16..31: stays live in m7
5147*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m6                 ; top,   pixels 16..31
5148*c0909341SAndroid Build Coastguard Worker.w32_loop:
5149*c0909341SAndroid Build Coastguard Worker    dec                 tlq                     ; next left pixel
5150*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]
5151*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5152*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1                 ; left pixel in every byte
5153*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp]
5154*c0909341SAndroid Build Coastguard Worker    PAETH                 6, [rsp+16]
5155*c0909341SAndroid Build Coastguard Worker    mova          [dstq   ], m1
5156*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+32]
5157*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
5158*c0909341SAndroid Build Coastguard Worker    mova          [dstq+16], m1
5159*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5160*c0909341SAndroid Build Coastguard Worker    dec                  hd
5161*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5162*c0909341SAndroid Build Coastguard Worker    RET
5163*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- w == 64: four 16-pixel chunks per row; top/ldiff pairs live at
; ---- [rsp+32*i] / [rsp+32*i+16], except the last ldiff which stays in m7
5164*c0909341SAndroid Build Coastguard Worker.w64:
5165*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+1]
5166*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5167*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5168*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
5169*c0909341SAndroid Build Coastguard Worker    mova           [rsp   ], m6                 ; top,   pixels  0..15
5170*c0909341SAndroid Build Coastguard Worker    mova           [rsp+16], m7                 ; ldiff, pixels  0..15
5171*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+17]
5172*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5173*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5174*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
5175*c0909341SAndroid Build Coastguard Worker    mova           [rsp+32], m6                 ; top,   pixels 16..31
5176*c0909341SAndroid Build Coastguard Worker    mova           [rsp+48], m7                 ; ldiff, pixels 16..31
5177*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+33]
5178*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5179*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5180*c0909341SAndroid Build Coastguard Worker    por                  m7, m0
5181*c0909341SAndroid Build Coastguard Worker    mova           [rsp+64], m6                 ; top,   pixels 32..47
5182*c0909341SAndroid Build Coastguard Worker    mova           [rsp+80], m7                 ; ldiff, pixels 32..47
5183*c0909341SAndroid Build Coastguard Worker    movu                 m6, [tlq+49]
5184*c0909341SAndroid Build Coastguard Worker    psubusb              m7, m5, m6
5185*c0909341SAndroid Build Coastguard Worker    psubusb              m0, m6, m5
5186*c0909341SAndroid Build Coastguard Worker    por                  m7, m0                 ; ldiff, pixels 48..63: stays live in m7
5187*c0909341SAndroid Build Coastguard Worker    mova           [rsp+96], m6                 ; top,   pixels 48..63
5188*c0909341SAndroid Build Coastguard Worker.w64_loop:
5189*c0909341SAndroid Build Coastguard Worker    dec                 tlq                     ; next left pixel
5190*c0909341SAndroid Build Coastguard Worker    movd                 m3, [tlq]
5191*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5192*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m1                 ; left pixel in every byte
5193*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp]
5194*c0909341SAndroid Build Coastguard Worker    PAETH                 6, [rsp+16]
5195*c0909341SAndroid Build Coastguard Worker    mova          [dstq   ], m1
5196*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+32]
5197*c0909341SAndroid Build Coastguard Worker    PAETH                 6, [rsp+48]
5198*c0909341SAndroid Build Coastguard Worker    mova          [dstq+16], m1
5199*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+64]
5200*c0909341SAndroid Build Coastguard Worker    PAETH                 6, [rsp+80]
5201*c0909341SAndroid Build Coastguard Worker    mova          [dstq+32], m1
5202*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+96]
5203*c0909341SAndroid Build Coastguard Worker    PAETH                 6, 7
5204*c0909341SAndroid Build Coastguard Worker    mova          [dstq+48], m1
5205*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
5206*c0909341SAndroid Build Coastguard Worker    dec                  hd
5207*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
5208*c0909341SAndroid Build Coastguard Worker    RET
5209*c0909341SAndroid Build Coastguard Worker
5210*c0909341SAndroid Build Coastguard Worker
; FILTER dst, src, tmp, shuf: one filter-intra evaluation.
;   %1 dst   register number receiving the packed 8-bit result
;   %2 src   register number holding the edge pixels; shuffled in place
;   %3 tmp   scratch register number (clobbered)
;   %4 shuf  pshufb control, either a register number or a memory operand
; After the shuffle each dword of src is broadcast in turn and multiplied
; against one tap vector with pmaddubsw: dword 0 with m2, dword 1 with m3,
; dword 2 with m4, dword 3 with m5. ipred_filter_8bpc loads m2-m5 from
; filterq+16*0 .. filterq+16*3. The four products and [base+pw_8] are
; summed, shifted right arithmetically by 4 and packed to bytes with
; unsigned saturation; both qwords of dst end up with the same 8 bytes.
; Requires `base` to be defined by the enclosing function.
; NOTE(review): pw_8 is defined outside this view; by its name it is
; presumably 8 in every word, the rounding term for the >> 4. Confirm.
5211*c0909341SAndroid Build Coastguard Worker%macro FILTER 4  ;dst, src, tmp, shuf
5212*c0909341SAndroid Build Coastguard Worker%ifnum %4
5213*c0909341SAndroid Build Coastguard Worker    pshufb               m%2, m%4
5214*c0909341SAndroid Build Coastguard Worker%else
5215*c0909341SAndroid Build Coastguard Worker    pshufb               m%2, %4
5216*c0909341SAndroid Build Coastguard Worker%endif
5217*c0909341SAndroid Build Coastguard Worker    pshufd               m%1, m%2, q0000           ;p0 p1
5218*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m%1, m2                   ; unsigned pixels * signed taps, pairs summed to words
5219*c0909341SAndroid Build Coastguard Worker    pshufd               m%3, m%2, q1111           ;p2 p3
5220*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m%3, m3
5221*c0909341SAndroid Build Coastguard Worker    paddw                m%1, [base+pw_8]          ; constant added ahead of the >> 4 below
5222*c0909341SAndroid Build Coastguard Worker    paddw                m%1, m%3
5223*c0909341SAndroid Build Coastguard Worker    pshufd               m%3, m%2, q2222           ;p4 p5
5224*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m%3, m4
5225*c0909341SAndroid Build Coastguard Worker    paddw                m%1, m%3
5226*c0909341SAndroid Build Coastguard Worker    pshufd               m%3, m%2, q3333           ;p6 __
5227*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m%3, m5
5228*c0909341SAndroid Build Coastguard Worker    paddw                m%1, m%3
5229*c0909341SAndroid Build Coastguard Worker    psraw                m%1, 4                    ; arithmetic shift: sums may be negative
5230*c0909341SAndroid Build Coastguard Worker    packuswb             m%1, m%1                  ; clamp to 0..255, same 8 bytes in both halves
5231*c0909341SAndroid Build Coastguard Worker%endmacro
5232*c0909341SAndroid Build Coastguard Worker
; ipred_filter_8bpc(dst, stride, tl, w, h, filter)
;   Filter-based intra predictor for 8-bit pixels. The picture block is built
;   from 4x2 pixel groups, each produced by one FILTER invocation from seven
;   neighbouring pixels p0..p6; groups computed earlier feed the later ones.
;
;   cglobal arguments (x86inc convention): 3 arguments are loaded into
;   registers (dst, stride, tl), 7 general-purpose registers and 8 xmm
;   registers are used; w, h and filter are fetched explicitly below through
;   their wm / hm / filterm forms.
;
;   Edge buffer layout, as given by the load annotations in this function:
;     [tl]      p0, [tl+1 ...] the row above ("top row"),
;     [tl-1]    p5, [tl-2] p6, and further bytes below tl are read for the
;               following row pairs
;   NOTE(review): p5/p6 are presumably the left-column pixels stored at
;   decreasing addresses below tl -- confirm against the caller.
;
;   Register roles after the prologue:
;     m0      packed neighbours for the next FILTER call (_ 6 5 0 1 2 3 4)
;     m2..m5  the four 16-byte tap vectors of the selected filter
;     r6      address of the section start; `base` makes [base+sym] resolve
;             relative to it
;   Each width path handles two rows per loop pass (sub hd, 2).
5233*c0909341SAndroid Build Coastguard Workercglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
5234*c0909341SAndroid Build Coastguard Worker%define base r6-$$
5235*c0909341SAndroid Build Coastguard Worker    LEA                   r6, $$
5236*c0909341SAndroid Build Coastguard Worker    tzcnt                 wd, wm                          ; wd = index of the lowest set bit of w (log2 for a power-of-two width)
    ; Zero-extend the 8-bit filter index: from the register's low byte when
    ; the argument already lives in that register, else from its memory slot.
5237*c0909341SAndroid Build Coastguard Worker%ifidn filterd, filterm
5238*c0909341SAndroid Build Coastguard Worker    movzx            filterd, filterb
5239*c0909341SAndroid Build Coastguard Worker%else
5240*c0909341SAndroid Build Coastguard Worker    movzx            filterd, byte filterm
5241*c0909341SAndroid Build Coastguard Worker%endif
5242*c0909341SAndroid Build Coastguard Worker    shl              filterd, 6                           ; 64 bytes per filter = the four 16-byte vectors loaded below
5243*c0909341SAndroid Build Coastguard Worker    lea              filterq, [base+filter_intra_taps+filterq]
5244*c0909341SAndroid Build Coastguard Worker    movq                  m0, [tlq-3]                     ;_ 6 5 0 1 2 3 4
    ; Jump table lookup: the entry is a signed 32-bit offset that is added to
    ; the table address itself (movsxd here, lea further down).
    ; NOTE(review): the table is defined outside this chunk; its indexing
    ; presumably maps tzcnt(w) to .w4/.w8/.w16/.w32 -- confirm.
5245*c0909341SAndroid Build Coastguard Worker    movsxd                wq, [base+ipred_filter_ssse3_table+wq*4]
5246*c0909341SAndroid Build Coastguard Worker    mova                  m2, [filterq+16*0]              ; taps applied to the p0 p1 pair
5247*c0909341SAndroid Build Coastguard Worker    mova                  m3, [filterq+16*1]              ; taps applied to the p2 p3 pair
5248*c0909341SAndroid Build Coastguard Worker    mova                  m4, [filterq+16*2]              ; taps applied to the p4 p5 pair
5249*c0909341SAndroid Build Coastguard Worker    mova                  m5, [filterq+16*3]              ; taps applied to the p6 __ pair
5250*c0909341SAndroid Build Coastguard Worker    lea                   wq, [base+ipred_filter_ssse3_table+wq]
5251*c0909341SAndroid Build Coastguard Worker    mov                   hd, hm                          ; 32-bit write, so the upper half of hq is zero for [tlq+hq]
5252*c0909341SAndroid Build Coastguard Worker    jmp                   wq
    ; ---- width 4: one 4x2 group per loop pass ----
5253*c0909341SAndroid Build Coastguard Worker.w4:
5254*c0909341SAndroid Build Coastguard Worker    mova                  m1, [base+filter_shuf1]         ; keep the shuffle control in a register (FILTER shuf argument 1)
    ; tlq = tl - 3 - h, so that [tlq+hq], read after hd has been decremented,
    ; addresses tl - 3 - (rows already written).
5255*c0909341SAndroid Build Coastguard Worker    sub                  tlq, 3
5256*c0909341SAndroid Build Coastguard Worker    sub                  tlq, hq
5257*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop_start                                    ; first pass uses the m0 loaded from [tl-3] above
5258*c0909341SAndroid Build Coastguard Worker.w4_loop:
5259*c0909341SAndroid Build Coastguard Worker    movd                  m0, [tlq+hq]                    ; next 4 edge bytes below tl (_ 6 5 0)
5260*c0909341SAndroid Build Coastguard Worker    punpckldq             m0, m6                          ; append the previously written second row as p1..p4
5261*c0909341SAndroid Build Coastguard Worker    lea                 dstq, [dstq+strideq*2]
5262*c0909341SAndroid Build Coastguard Worker.w4_loop_start:
5263*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 7, 1                     ; m6 = 4x2 result, m7 = scratch, shuffle taken from m1
5264*c0909341SAndroid Build Coastguard Worker    movd    [dstq+strideq*0], m6                          ; bytes 0-3 -> first row
5265*c0909341SAndroid Build Coastguard Worker    pshuflw               m6, m6, q1032                   ; swap the two dwords of the low qword
5266*c0909341SAndroid Build Coastguard Worker    movd    [dstq+strideq*1], m6                          ; former bytes 4-7 -> second row
5267*c0909341SAndroid Build Coastguard Worker    sub                   hd, 2
5268*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
5269*c0909341SAndroid Build Coastguard Worker    RET
5270*c0909341SAndroid Build Coastguard Worker
5271*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- width 8: two 4x2 groups per loop pass ----
; On entry m0 holds the neighbours of the left group (loaded from [tl-3] in
; the function prologue) and m2..m5 hold the filter taps.
; m6 carries the 8 pixels of the row above; seen from the right-hand group
; its byte 3 is p0 and bytes 4-7 are p1..p4, which is what the annotation on
; the load describes.
5272*c0909341SAndroid Build Coastguard Worker.w8:
5273*c0909341SAndroid Build Coastguard Worker    movq                  m6, [tlq+1]                   ;_ _ _ 0 1 2 3 4
    ; tlq = tl - 5 - h, so [tlq+hq] (read before hd is decremented) addresses
    ; tl - 5 - (rows written before this pass).
5274*c0909341SAndroid Build Coastguard Worker    sub                  tlq, 5
5275*c0909341SAndroid Build Coastguard Worker    sub                  tlq, hq
5276*c0909341SAndroid Build Coastguard Worker
5277*c0909341SAndroid Build Coastguard Worker.w8_loop:
    ; Left group -> m7 (bytes 0-3 first row, bytes 4-7 second row).
5278*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 1, [base+filter_shuf1]
    ; The right group's p5/p6 are the last pixels of the left group's two
    ; rows, i.e. bytes 3 and 7 of m7, placed here at bytes 11 and 15 of m6.
5279*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m6, m7                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    ; Right group -> m0 (m6 is consumed as the source and clobbered).
5280*c0909341SAndroid Build Coastguard Worker    FILTER                 0, 6, 1, [base+filter_shuf2]
5281*c0909341SAndroid Build Coastguard Worker
    ; Interleave dwords of m7 and m0: low qword = first row (8 px),
    ; high qword = second row (8 px).
5282*c0909341SAndroid Build Coastguard Worker    punpckldq             m6, m7, m0
5283*c0909341SAndroid Build Coastguard Worker    movq    [dstq+strideq*0], m6
5284*c0909341SAndroid Build Coastguard Worker    punpckhqdq            m6, m6                        ; second row into the low qword; it is also the row above for the next pass
5285*c0909341SAndroid Build Coastguard Worker    movq    [dstq+strideq*1], m6
5286*c0909341SAndroid Build Coastguard Worker
    ; Rebuild m0 for the next pass from 4 edge bytes plus the first 4 pixels
    ; of the row just written. The load is also executed on the final pass,
    ; where its result is not used.
5287*c0909341SAndroid Build Coastguard Worker    movd                  m0, [tlq+hq]                  ;_ 6 5 0
5288*c0909341SAndroid Build Coastguard Worker    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
5289*c0909341SAndroid Build Coastguard Worker
5290*c0909341SAndroid Build Coastguard Worker    lea                 dstq, [dstq+strideq*2]
5291*c0909341SAndroid Build Coastguard Worker    sub                   hd, 2
5292*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
5293*c0909341SAndroid Build Coastguard Worker    RET
5294*c0909341SAndroid Build Coastguard Worker
5295*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- width 16: four 4x2 groups per loop pass ----
; On entry m0 holds the neighbours of the leftmost group (loaded from [tl-3]
; in the function prologue) and m2..m5 hold the filter taps.
; m6 and m7 alternate as a sliding 16-byte window. After every group the
; window is advanced by 4 bytes (palignr ..., 4) and the group's second-row
; pixels are appended at the top (psrlq 32 moves result bytes 4-7 down to
; bytes 0-3 first). The window therefore always exposes the next group's
; p0..p4 at bytes 3..7, and after the fourth group it contains the complete
; second row.
5296*c0909341SAndroid Build Coastguard Worker.w16:
5297*c0909341SAndroid Build Coastguard Worker    movu                  m6, [tlq+1]                   ;top row
    ; tlq = tl - 5 - h, so [tlq+hq] (read before hd is decremented) addresses
    ; tl - 5 - (rows written before this pass).
5298*c0909341SAndroid Build Coastguard Worker    sub                  tlq, 5
5299*c0909341SAndroid Build Coastguard Worker    sub                  tlq, hq
5300*c0909341SAndroid Build Coastguard Worker
5301*c0909341SAndroid Build Coastguard Worker.w16_loop:
    ; Group 0 (columns 0-3) -> m7.
5302*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 1, [base+filter_shuf1]
    ; Input of group 1: low qword of the window | group 0 result.
5303*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5304*c0909341SAndroid Build Coastguard Worker    movd    [dstq+strideq*0], m7                        ; first-row pixels of group 0
5305*c0909341SAndroid Build Coastguard Worker    psrlq                 m7, 32                        ; second-row pixels down to bytes 0-3
5306*c0909341SAndroid Build Coastguard Worker    palignr               m7, m6, 4                     ; m7 = window bytes 4-15 followed by those 4 pixels
5307*c0909341SAndroid Build Coastguard Worker
    ; Group 1 (columns 4-7) -> m6.
5308*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 1, [base+filter_shuf2]
5309*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5310*c0909341SAndroid Build Coastguard Worker    movd  [dstq+4+strideq*0], m6
5311*c0909341SAndroid Build Coastguard Worker    psrlq                 m6, 32
5312*c0909341SAndroid Build Coastguard Worker    palignr               m6, m7, 4
5313*c0909341SAndroid Build Coastguard Worker
    ; Group 2 (columns 8-11) -> m7.
5314*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 1, [base+filter_shuf2]
5315*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5316*c0909341SAndroid Build Coastguard Worker    movd  [dstq+8+strideq*0], m7
5317*c0909341SAndroid Build Coastguard Worker    psrlq                 m7, 32
5318*c0909341SAndroid Build Coastguard Worker    palignr               m7, m6, 4
5319*c0909341SAndroid Build Coastguard Worker
    ; Group 3 (columns 12-15) -> m6; after the final window step m6 is the
    ; whole second row.
5320*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 1, [base+filter_shuf2]
5321*c0909341SAndroid Build Coastguard Worker    movd [dstq+12+strideq*0], m6
5322*c0909341SAndroid Build Coastguard Worker    psrlq                 m6, 32
5323*c0909341SAndroid Build Coastguard Worker    palignr               m6, m7, 4
    ; Aligned store. NOTE(review): assumes dst + stride is 16-byte aligned
    ; for this width -- confirm against the caller's guarantees.
5324*c0909341SAndroid Build Coastguard Worker    mova    [dstq+strideq*1], m6
5325*c0909341SAndroid Build Coastguard Worker
    ; m6 (the row just written) becomes the row above for the next pass;
    ; m0 is rebuilt from 4 edge bytes plus its first 4 pixels. The load is
    ; also executed on the final pass, where its result is not used.
5326*c0909341SAndroid Build Coastguard Worker    movd                  m0, [tlq+hq]                  ;_ 6 5 0
5327*c0909341SAndroid Build Coastguard Worker    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
5328*c0909341SAndroid Build Coastguard Worker
5329*c0909341SAndroid Build Coastguard Worker    lea                 dstq, [dstq+strideq*2]
5330*c0909341SAndroid Build Coastguard Worker    sub                   hd, 2
5331*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
5332*c0909341SAndroid Build Coastguard Worker    RET
5333*c0909341SAndroid Build Coastguard Worker
5334*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; ---- width 32: eight 4x2 groups per loop pass ----
; On entry m0 holds the neighbours of the leftmost group (loaded from [tl-3]
; in the function prologue) and m2..m5 hold the filter taps.
; The row above is consumed in two 16-byte halves. The first half lives in
; m6; the second half is fetched through filterq, which is free for reuse
; because the taps were already loaded into m2..m5 before the dispatch jump.
; As in the narrower paths, m6/m7 alternate as a sliding 16-byte window that
; is advanced by 4 bytes per group (psrlq 32 + palignr 4) while collecting
; the second-row pixels.
5335*c0909341SAndroid Build Coastguard Worker.w32:
5336*c0909341SAndroid Build Coastguard Worker    movu                  m6, [tlq+1]                   ;top row
5337*c0909341SAndroid Build Coastguard Worker    lea              filterq, [tlq+17]                  ; filterq -> bytes 16-31 of the row above
    ; tlq = tl - 5 - h, so [tlq+hq] (read before hd is decremented) addresses
    ; tl - 5 - (rows written before this pass).
5338*c0909341SAndroid Build Coastguard Worker    sub                  tlq, 5
5339*c0909341SAndroid Build Coastguard Worker    sub                  tlq, hq
5340*c0909341SAndroid Build Coastguard Worker
5341*c0909341SAndroid Build Coastguard Worker.w32_loop:
    ; Group 0 (columns 0-3) -> m7.
5342*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 1, [base+filter_shuf1]
5343*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5344*c0909341SAndroid Build Coastguard Worker    movd    [dstq+strideq*0], m7
5345*c0909341SAndroid Build Coastguard Worker    psrlq                 m7, 32                        ; second-row pixels down to bytes 0-3
5346*c0909341SAndroid Build Coastguard Worker    palignr               m7, m6, 4                     ; m7 = window bytes 4-15 followed by those 4 pixels
5347*c0909341SAndroid Build Coastguard Worker
    ; Group 1 (columns 4-7) -> m6.
5348*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 1, [base+filter_shuf2]
5349*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5350*c0909341SAndroid Build Coastguard Worker    movd  [dstq+4+strideq*0], m6
5351*c0909341SAndroid Build Coastguard Worker    psrlq                 m6, 32
5352*c0909341SAndroid Build Coastguard Worker    palignr               m6, m7, 4
5353*c0909341SAndroid Build Coastguard Worker
    ; Group 2 (columns 8-11) -> m7.
5354*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 1, [base+filter_shuf2]
5355*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5356*c0909341SAndroid Build Coastguard Worker    movd  [dstq+8+strideq*0], m7
5357*c0909341SAndroid Build Coastguard Worker    psrlq                 m7, 32
5358*c0909341SAndroid Build Coastguard Worker    palignr               m7, m6, 4
5359*c0909341SAndroid Build Coastguard Worker
    ; Group 3 (columns 12-15) -> m6. FILTER uses m1 as scratch, so the second
    ; half of the row above is loaded into m1 only afterwards.
5360*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 1, [base+filter_shuf2]
5361*c0909341SAndroid Build Coastguard Worker    movu                  m1, [filterq]
    ; Input of group 4 straddles both halves: the low dword of the window
    ; (ending in column 15 of the row above) followed by the low dword of m1
    ; (columns 16-19), then the group 3 result in the high qword.
5362*c0909341SAndroid Build Coastguard Worker    punpckldq             m0, m7, m1                    ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
5363*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m6                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5364*c0909341SAndroid Build Coastguard Worker    movd [dstq+12+strideq*0], m6
5365*c0909341SAndroid Build Coastguard Worker    psrlq                 m6, 32
5366*c0909341SAndroid Build Coastguard Worker    palignr               m6, m7, 4
    ; m6 now holds columns 0-15 of the second row. Aligned store.
    ; NOTE(review): assumes dst + stride is 16-byte aligned -- confirm.
5367*c0909341SAndroid Build Coastguard Worker    mova    [dstq+strideq*1], m6
5368*c0909341SAndroid Build Coastguard Worker
    ; NOTE(review): the FILTER call below names m6 as its scratch register
    ; and writes it before reading it, so this copy looks redundant --
    ; confirm before removing.
5369*c0909341SAndroid Build Coastguard Worker    mova                  m6, m1
5370*c0909341SAndroid Build Coastguard Worker
    ; Group 4 (columns 16-19) -> m7. Scratch is m6 here because m1 still
    ; holds the second half of the row above, which is needed just below.
5371*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 6, [base+filter_shuf2]
5372*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m1, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5373*c0909341SAndroid Build Coastguard Worker    movd [dstq+16+strideq*0], m7
5374*c0909341SAndroid Build Coastguard Worker    psrlq                 m7, 32
5375*c0909341SAndroid Build Coastguard Worker    palignr               m7, m1, 4                     ; restart the sliding window from the second half
5376*c0909341SAndroid Build Coastguard Worker
    ; Group 5 (columns 20-23) -> m6; m1 is free again and serves as scratch.
5377*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 1, [base+filter_shuf2]
5378*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5379*c0909341SAndroid Build Coastguard Worker    movd [dstq+20+strideq*0], m6
5380*c0909341SAndroid Build Coastguard Worker    psrlq                 m6, 32
5381*c0909341SAndroid Build Coastguard Worker    palignr               m6, m7, 4
5382*c0909341SAndroid Build Coastguard Worker
    ; Group 6 (columns 24-27) -> m7.
5383*c0909341SAndroid Build Coastguard Worker    FILTER                 7, 0, 1, [base+filter_shuf2]
5384*c0909341SAndroid Build Coastguard Worker    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
5385*c0909341SAndroid Build Coastguard Worker    movd [dstq+24+strideq*0], m7
5386*c0909341SAndroid Build Coastguard Worker    psrlq                 m7, 32
5387*c0909341SAndroid Build Coastguard Worker    palignr               m7, m6, 4
5388*c0909341SAndroid Build Coastguard Worker
    ; Group 7 (columns 28-31) -> m6; after the final window step m6 holds
    ; columns 16-31 of the second row.
5389*c0909341SAndroid Build Coastguard Worker    FILTER                 6, 0, 1, [base+filter_shuf2]
5390*c0909341SAndroid Build Coastguard Worker    movd [dstq+28+strideq*0], m6
5391*c0909341SAndroid Build Coastguard Worker    psrlq                 m6, 32
5392*c0909341SAndroid Build Coastguard Worker    palignr               m6, m7, 4
5393*c0909341SAndroid Build Coastguard Worker    mova [dstq+16+strideq*1], m6
5394*c0909341SAndroid Build Coastguard Worker
    ; Prepare the next pass: the second row just written becomes the row
    ; above. Its first half is reloaded from dst into m6, and filterq is
    ; pointed at its second half in dst. m0 is rebuilt from 4 edge bytes
    ; plus the first 4 pixels of that row. These steps are also executed on
    ; the final pass, where their results are not used.
5395*c0909341SAndroid Build Coastguard Worker    mova                  m6, [dstq+strideq*1]
5396*c0909341SAndroid Build Coastguard Worker    movd                  m0, [tlq+hq]                  ;_ 6 5 0
5397*c0909341SAndroid Build Coastguard Worker    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
5398*c0909341SAndroid Build Coastguard Worker    lea              filterq, [dstq+16+strideq*1]
5399*c0909341SAndroid Build Coastguard Worker    lea                 dstq, [dstq+strideq*2]
5400*c0909341SAndroid Build Coastguard Worker    sub                   hd, 2
5401*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
5402*c0909341SAndroid Build Coastguard Worker    RET
5403