xref: /aosp_15_r20/external/libdav1d/src/x86/pal.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2023, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2023, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
30*c0909341SAndroid Build Coastguard Worker
; Byte indices 0..63. The code below clamps these against a broadcast
; "last valid column" index with pminub to build pshufb masks for
; horizontal padding. Only the first 16 entries are emitted when
; ARCH_X86_64 is not set.
31*c0909341SAndroid Build Coastguard Workerconst pb_0to63,  db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
32*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
33*c0909341SAndroid Build Coastguard Worker                 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
34*c0909341SAndroid Build Coastguard Worker                 db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
35*c0909341SAndroid Build Coastguard Worker                 db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
36*c0909341SAndroid Build Coastguard Worker%endif
; pshufb mask for the 8-wide case: within each 8-byte row, bytes 0..3 are
; kept and bytes 4..7 are replaced by byte 3 (two rows per 16-byte vector).
37*c0909341SAndroid Build Coastguard Workerpal_idx_w8_padh: db  0,  1,  2,  3,  3,  3,  3,  3,  8,  9, 10, 11, 11, 11, 11, 11
38*c0909341SAndroid Build Coastguard Worker
; pmaddubsw multipliers: each adjacent source byte pair (a, b) becomes the
; 16-bit value a*1 + b*16.
39*c0909341SAndroid Build Coastguard Workerpb_1_16: times 4 db  1, 16
40*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; NOTE(review): pb_32 is not referenced by the SSSE3/AVX2 code visible in
; this part of the file; presumably used by the AVX-512 version - confirm.
41*c0909341SAndroid Build Coastguard Workerpb_32:   times 4 db 32
42*c0909341SAndroid Build Coastguard Worker%endif
43*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE <function name with cpu suffix>, <width>, <width>, ...
;
; Emits a table of 32-bit offsets, one per listed width N, each holding the
; distance from the biased table base to the function's local label .wN.
; The symbol <name>_table is defined as (table start - 2*4), so an index of
; 2 (scaled by 4 bytes) selects the first entry. The functions in this file
; index the table with tzcnt(bw), which is 2 for the smallest listed width
; of 4, and add the loaded offset back onto <name>_table to get the
; jump target.
44*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 2-*
45*c0909341SAndroid Build Coastguard Worker    %xdefine %1_table (%%table - 2*4)
46*c0909341SAndroid Build Coastguard Worker    %xdefine %%base mangle(private_prefix %+ _%1)
47*c0909341SAndroid Build Coastguard Worker    %%table:
    ; One entry per width argument; %rotate advances to the next width.
48*c0909341SAndroid Build Coastguard Worker    %rep %0 - 1
49*c0909341SAndroid Build Coastguard Worker        dd %%base %+ .w%2 - (%%table - 2*4)
50*c0909341SAndroid Build Coastguard Worker        %rotate 1
51*c0909341SAndroid Build Coastguard Worker    %endrep
52*c0909341SAndroid Build Coastguard Worker%endmacro
53*c0909341SAndroid Build Coastguard Worker
; Width dispatch tables (labels .w4 ... .w64) for each pal_idx_finish
; implementation. The AVX2 and AVX-512ICL tables are only emitted when
; ARCH_X86_64 is set, matching the guard around those implementations.
54*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_idx_finish_ssse3,     4, 8, 16, 32, 64
55*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
56*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_idx_finish_avx2,      4, 8, 16, 32, 64
57*c0909341SAndroid Build Coastguard WorkerJMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
58*c0909341SAndroid Build Coastguard Worker%endif
59*c0909341SAndroid Build Coastguard Worker
60*c0909341SAndroid Build Coastguard WorkerSECTION .text
61*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; pal_idx_finish, SSSE3 version
; Arguments (cglobal names): dst, src, bw, bh, w, h
;
; Packs pairs of source bytes into single destination bytes: every
; adjacent pair (a, b) read from src is written to dst as a + 16*b
; (pmaddubsw against pb_1_16, then packuswb). Each source row is bw bytes
; and produces bw/2 destination bytes.
; NOTE(review): presumably the source bytes are palette indices below 16,
; so each packed value fits in a byte without saturating - confirm.
;
; Padding performed while packing:
;   - horizontal: when w is smaller than bw, columns w..bw-1 are replaced
;     by a copy of column w-1 before packing (the .w4 path has no
;     horizontal padding; the .w8 path uses a fixed mask that keeps 4
;     columns).
;   - vertical: after h rows have been packed, the last packed row is
;     repeated for the remaining bh-h rows.
; NOTE(review): the row loops step by 4, 2 or 1 rows and the padding loops
; by 4 or 2 rows; presumably h and bh-h are always suitable multiples -
; confirm against the callers.
;
; Register roles after setup:
;   r6  = pal_idx_finish_ssse3_table (also the anchor for `base`)
;   bwq = jump target selected by tzcnt(bw)
;   bhd = bh - h (rows of vertical padding still to write)
;   m3  = pb_1_16 multipliers
;-----------------------------------------------------------------------
62*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
63*c0909341SAndroid Build Coastguard Workercglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
64*c0909341SAndroid Build Coastguard Worker%define base r6-pal_idx_finish_ssse3_table
65*c0909341SAndroid Build Coastguard Worker    LEA                  r6, pal_idx_finish_ssse3_table
    ; tzcnt(bw) is the jump-table index; bw is read from its argument slot.
66*c0909341SAndroid Build Coastguard Worker    tzcnt               bwd, bwm
67*c0909341SAndroid Build Coastguard Worker    movifnidn           bhd, bhm
68*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
69*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
70*c0909341SAndroid Build Coastguard Worker    movsxd              bwq, [r6+bwq*4]
71*c0909341SAndroid Build Coastguard Worker    movddup              m3, [base+pb_1_16]
72*c0909341SAndroid Build Coastguard Worker    add                 bwq, r6
    ; bhd = number of rows to pad vertically once the h real rows are done.
73*c0909341SAndroid Build Coastguard Worker    sub                 bhd, hd
74*c0909341SAndroid Build Coastguard Worker    jmp                 bwq
; ---- bw == 4: 16 source bytes = 4 rows per iteration -> 8 output bytes ----
75*c0909341SAndroid Build Coastguard Worker.w4:
76*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq]
77*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
78*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
79*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
80*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m0
81*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
82*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
83*c0909341SAndroid Build Coastguard Worker    jg .w4
84*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
85*c0909341SAndroid Build Coastguard Worker    jz .w4_end
    ; Broadcast word 3 of the low half (the last packed 2-byte row) to all
    ; four words, giving 4 copies of the last row.
86*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q3333
87*c0909341SAndroid Build Coastguard Worker.w4_padv:
88*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m0
89*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
90*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
91*c0909341SAndroid Build Coastguard Worker    jg .w4_padv
92*c0909341SAndroid Build Coastguard Worker.w4_end:
93*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 8: 32 source bytes = 4 rows per iteration -> 16 output bytes ----
; Horizontal padding (taken when w < 8): m2 keeps bytes 0..3 of each 8-byte
; row and replaces bytes 4..7 with byte 3.
94*c0909341SAndroid Build Coastguard Worker.w8_padh:
95*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
96*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m2
97*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
98*c0909341SAndroid Build Coastguard Worker.w8:
99*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+pal_idx_w8_padh]
100*c0909341SAndroid Build Coastguard Worker.w8_loop:
101*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+16*0]
102*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+16*1]
103*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
104*c0909341SAndroid Build Coastguard Worker    jl .w8_padh
105*c0909341SAndroid Build Coastguard Worker.w8_main:
106*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
107*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
108*c0909341SAndroid Build Coastguard Worker    add                srcq, 16*2
109*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
110*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
111*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
112*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
113*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
114*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
115*c0909341SAndroid Build Coastguard Worker    jz .w8_end
    ; Broadcast dword 3 (the last packed 4-byte row) to all four dwords.
116*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q3333
117*c0909341SAndroid Build Coastguard Worker.w8_padv:
118*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
119*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
120*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
121*c0909341SAndroid Build Coastguard Worker    jg .w8_padv
122*c0909341SAndroid Build Coastguard Worker.w8_end:
123*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 16: 32 source bytes = 2 rows per iteration -> 16 output bytes ----
; Horizontal padding: m4 (built by .setup_padh) clamps every column index
; to (w-1)&15, so columns past the last valid one repeat it.
124*c0909341SAndroid Build Coastguard Worker.w16_padh:
125*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
126*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
127*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
128*c0909341SAndroid Build Coastguard Worker.w16:
    ; The padding masks are only needed when w differs from bw.
129*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
130*c0909341SAndroid Build Coastguard Worker    je .w16_loop
131*c0909341SAndroid Build Coastguard Worker    call .setup_padh
132*c0909341SAndroid Build Coastguard Worker.w16_loop:
133*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+16*0]
134*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+16*1]
135*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
136*c0909341SAndroid Build Coastguard Worker    jl .w16_padh
137*c0909341SAndroid Build Coastguard Worker.w16_main:
138*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
139*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
140*c0909341SAndroid Build Coastguard Worker    add                srcq, 16*2
141*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
142*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
143*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
144*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
145*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
146*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
147*c0909341SAndroid Build Coastguard Worker    jz .w16_end
    ; Duplicate the high qword (the last packed 8-byte row) into both halves.
148*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
    ; Each iteration writes 32 bytes = 4 padded rows.
149*c0909341SAndroid Build Coastguard Worker.w16_padv:
150*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*0], m0
151*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*1], m0
152*c0909341SAndroid Build Coastguard Worker    add                dstq, 16*2
153*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
154*c0909341SAndroid Build Coastguard Worker    jg .w16_padv
155*c0909341SAndroid Build Coastguard Worker.w16_end:
156*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 32: 32 source bytes = 1 row per iteration -> 16 output bytes ----
; Horizontal padding, m0 = columns 0..15, m1 = columns 16..31:
;   w <= 16: m1 becomes a broadcast of m0's byte (w-1)&15 (mask m5), and
;            m0 itself is clamped with m4.
;   w  > 16: m0 is fully valid; only m1 is clamped with m4.
157*c0909341SAndroid Build Coastguard Worker.w32_padh:
158*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
159*c0909341SAndroid Build Coastguard Worker    jg .w32_padh2
160*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m5
161*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
162*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
163*c0909341SAndroid Build Coastguard Worker.w32_padh2:
164*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
165*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
166*c0909341SAndroid Build Coastguard Worker.w32:
167*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
168*c0909341SAndroid Build Coastguard Worker    je .w32_loop
169*c0909341SAndroid Build Coastguard Worker    call .setup_padh
170*c0909341SAndroid Build Coastguard Worker.w32_loop:
171*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+16*0]
172*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+16*1]
173*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
174*c0909341SAndroid Build Coastguard Worker    jl .w32_padh
175*c0909341SAndroid Build Coastguard Worker.w32_main:
176*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
177*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
178*c0909341SAndroid Build Coastguard Worker    add                srcq, 16*2
179*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
180*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
181*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
182*c0909341SAndroid Build Coastguard Worker    dec                  hd
183*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
184*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
185*c0909341SAndroid Build Coastguard Worker    jz .w32_end
    ; m0 already holds exactly the last packed row; write it 4 times per
    ; iteration.
186*c0909341SAndroid Build Coastguard Worker.w32_padv:
187*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*0], m0
188*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*1], m0
189*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*2], m0
190*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*3], m0
191*c0909341SAndroid Build Coastguard Worker    add                dstq, 16*4
192*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
193*c0909341SAndroid Build Coastguard Worker    jg .w32_padv
194*c0909341SAndroid Build Coastguard Worker.w32_end:
195*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 64: 64 source bytes = 1 row per iteration -> 32 output bytes ----
; .w64_padh handles w <= 32: only the first two source vectors are used and
; the second output vector (columns 32..63) is pure padding. These paths do
; their own multiply/pack and rejoin at .w64_main with m0/m1 ready to store.
;   w <= 16: m1 = broadcast of m0's byte (w-1)&15, m0 clamped with m4.
196*c0909341SAndroid Build Coastguard Worker.w64_padh:
197*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 16
198*c0909341SAndroid Build Coastguard Worker    jg .w64_padh2
199*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m0, m5
200*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
201*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
202*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
203*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
204*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
205*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
;   16 < w <= 32: m0 is fully valid, m1 is clamped with m4; m2 takes the
;   packed form of the clamped m1, then m1 is turned into a broadcast of its
;   last valid byte to form the padding half.
206*c0909341SAndroid Build Coastguard Worker.w64_padh2:
207*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
208*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
209*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m1, m3
210*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5
211*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
212*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m2
213*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
214*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
; .w64_padh3 handles 32 < w < 64; here m1 = columns 32..47 and
; m2 = columns 48..63, and m0 already holds the packed first half.
;   w <= 48: m2 = broadcast of m1's byte (w-1)&15, m1 clamped with m4.
;   w  > 48: only m2 is clamped with m4.
215*c0909341SAndroid Build Coastguard Worker.w64_padh3:
216*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 48
217*c0909341SAndroid Build Coastguard Worker    jg .w64_padh4
218*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1, m5
219*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
220*c0909341SAndroid Build Coastguard Worker    jmp .w64_main2
221*c0909341SAndroid Build Coastguard Worker.w64_padh4:
222*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
223*c0909341SAndroid Build Coastguard Worker    jmp .w64_main2
224*c0909341SAndroid Build Coastguard Worker.w64:
225*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
226*c0909341SAndroid Build Coastguard Worker    je .w64_loop
227*c0909341SAndroid Build Coastguard Worker    call .setup_padh
228*c0909341SAndroid Build Coastguard Worker.w64_loop:
229*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+16*0]
230*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+16*1]
    ; With w <= 32 the second half of the source row is never loaded.
231*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 32
232*c0909341SAndroid Build Coastguard Worker    jle .w64_padh
233*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3
234*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
235*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
236*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+16*2]
237*c0909341SAndroid Build Coastguard Worker    mova                 m2, [srcq+16*3]
238*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 64
239*c0909341SAndroid Build Coastguard Worker    jl .w64_padh3
240*c0909341SAndroid Build Coastguard Worker.w64_main2:
241*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3
242*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m3
243*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
    ; Common tail: m0 = packed columns 0..31, m1 = packed columns 32..63.
244*c0909341SAndroid Build Coastguard Worker.w64_main:
245*c0909341SAndroid Build Coastguard Worker    add                srcq, 16*4
246*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*0], m0
247*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*1], m1
248*c0909341SAndroid Build Coastguard Worker    add                dstq, 16*2
249*c0909341SAndroid Build Coastguard Worker    dec                  hd
250*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
251*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
252*c0909341SAndroid Build Coastguard Worker    jz .w64_end
    ; m0:m1 is the last packed row; each iteration writes it twice
    ; (2 padded rows), hence the step of 2.
253*c0909341SAndroid Build Coastguard Worker.w64_padv:
254*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*0], m0
255*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*1], m1
256*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*2], m0
257*c0909341SAndroid Build Coastguard Worker    movu        [dstq+16*3], m1
258*c0909341SAndroid Build Coastguard Worker    add                dstq, 16*4
259*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 2
260*c0909341SAndroid Build Coastguard Worker    jg .w64_padv
261*c0909341SAndroid Build Coastguard Worker.w64_end:
262*c0909341SAndroid Build Coastguard Worker    RET
; Local subroutine (entered via `call`, left via plain `ret`): builds the
; horizontal padding masks.
;   m5 = (w-1)&15 broadcast to every byte
;   m4 = min(0..15, m5), i.e. identity up to the last valid column of a
;        16-byte vector, then that column repeated
; Overwrites r6d and m0. pb_0to63 is loaded through `base` before r6 is
; overwritten.
263*c0909341SAndroid Build Coastguard Worker.setup_padh:
264*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+pb_0to63]
265*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq-1]
266*c0909341SAndroid Build Coastguard Worker    and                 r6d, 15
267*c0909341SAndroid Build Coastguard Worker    movd                 m5, r6d
    ; pshufb with an all-zero mask broadcasts byte 0 of m5.
268*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
269*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m0
270*c0909341SAndroid Build Coastguard Worker    pminub               m4, m5
271*c0909341SAndroid Build Coastguard Worker    ret
272*c0909341SAndroid Build Coastguard Worker
273*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
274*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; pal_idx_finish, AVX2 version (only assembled when ARCH_X86_64 is set)
; Arguments (cglobal names): dst, src, bw, bh, w, h
;
; Packs pairs of source bytes into single destination bytes: every
; adjacent pair (a, b) read from src is written to dst as a + 16*b
; (pmaddubsw against pb_1_16, then packuswb). Each source row is bw bytes
; and produces bw/2 destination bytes. Columns past w are filled with a
; copy of column w-1 (the .w4 path has no horizontal padding), and rows
; past h are filled with copies of the last packed row until bh rows
; have been written.
; NOTE(review): presumably the source bytes are palette indices below 16,
; so each packed value fits in a byte without saturating - confirm.
; NOTE(review): the row loops step by 4, 2 or 1 rows and the padding loops
; by 4 rows; presumably h and bh-h are always suitable multiples - confirm.
;
; Register roles after setup:
;   r6  = pal_idx_finish_avx2_table (anchor for `base`) until overwritten
;         in the .w64 setup
;   wd  = w - 1 (all width comparisons below are against bw-1 style limits)
;   bhd = bh - h (rows of vertical padding still to write)
;   m2  = pb_1_16 multipliers
;
; packuswb works per 128-bit lane, so in the ymm paths the packed qwords
; come out interleaved; vpermq puts them back in row/column order.
;-----------------------------------------------------------------------
275*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
276*c0909341SAndroid Build Coastguard Workercglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
277*c0909341SAndroid Build Coastguard Worker%define base r6-pal_idx_finish_avx2_table
278*c0909341SAndroid Build Coastguard Worker    lea                  r6, [pal_idx_finish_avx2_table]
    ; tzcnt(bw) is the jump-table index.
279*c0909341SAndroid Build Coastguard Worker    tzcnt               bwd, bwd
280*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
281*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
282*c0909341SAndroid Build Coastguard Worker    movsxd              bwq, [r6+bwq*4]
283*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [base+pb_1_16]
    ; From here on wd holds the index of the last valid column (w-1).
284*c0909341SAndroid Build Coastguard Worker    dec                  wd
285*c0909341SAndroid Build Coastguard Worker    add                 bwq, r6
    ; bhd = number of rows to pad vertically once the h real rows are done.
286*c0909341SAndroid Build Coastguard Worker    sub                 bhd, hd
287*c0909341SAndroid Build Coastguard Worker    jmp                 bwq
; ---- bw == 4: 16 source bytes = 4 rows per iteration -> 8 output bytes ----
288*c0909341SAndroid Build Coastguard Worker.w4:
289*c0909341SAndroid Build Coastguard Worker    mova                xm0, [srcq]
290*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
291*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
292*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm0
293*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xm0
294*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
295*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
296*c0909341SAndroid Build Coastguard Worker    jg .w4
297*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
298*c0909341SAndroid Build Coastguard Worker    jz .w4_end
    ; Broadcast word 3 of the low half (the last packed 2-byte row).
299*c0909341SAndroid Build Coastguard Worker    pshuflw             xm0, xm0, q3333
300*c0909341SAndroid Build Coastguard Worker.w4_padv:
301*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xm0
302*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
303*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
304*c0909341SAndroid Build Coastguard Worker    jg .w4_padv
305*c0909341SAndroid Build Coastguard Worker.w4_end:
306*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 8: 32 source bytes = 4 rows per iteration -> 16 output bytes ----
; Horizontal padding (taken when w-1 < 7): xm3 keeps bytes 0..3 of each
; 8-byte row and replaces bytes 4..7 with byte 3.
307*c0909341SAndroid Build Coastguard Worker.w8_padh:
308*c0909341SAndroid Build Coastguard Worker    pshufb              xm0, xm3
309*c0909341SAndroid Build Coastguard Worker    pshufb              xm1, xm3
310*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
311*c0909341SAndroid Build Coastguard Worker.w8:
312*c0909341SAndroid Build Coastguard Worker    mova                xm3, [base+pal_idx_w8_padh]
313*c0909341SAndroid Build Coastguard Worker.w8_loop:
314*c0909341SAndroid Build Coastguard Worker    mova                xm0, [srcq+16*0]
315*c0909341SAndroid Build Coastguard Worker    mova                xm1, [srcq+16*1]
316*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 7
317*c0909341SAndroid Build Coastguard Worker    jl .w8_padh
318*c0909341SAndroid Build Coastguard Worker.w8_main:
319*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm0, xm2
320*c0909341SAndroid Build Coastguard Worker    pmaddubsw           xm1, xm2
321*c0909341SAndroid Build Coastguard Worker    add                srcq, 16*2
322*c0909341SAndroid Build Coastguard Worker    packuswb            xm0, xm1
323*c0909341SAndroid Build Coastguard Worker    movu             [dstq], xm0
324*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
325*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
326*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
327*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
328*c0909341SAndroid Build Coastguard Worker    jz .w8_end
    ; Broadcast dword 3 (the last packed 4-byte row) to all four dwords.
329*c0909341SAndroid Build Coastguard Worker    pshufd              xm0, xm0, q3333
330*c0909341SAndroid Build Coastguard Worker.w8_padv:
331*c0909341SAndroid Build Coastguard Worker    movu             [dstq], xm0
332*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
333*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
334*c0909341SAndroid Build Coastguard Worker    jg .w8_padv
335*c0909341SAndroid Build Coastguard Worker.w8_end:
336*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 16: 64 source bytes = 4 rows per iteration -> 32 output bytes ----
; Horizontal padding: m3 = min(w-1, 0..15) in both lanes, so each 16-byte
; row has its columns past w-1 replaced by column w-1.
337*c0909341SAndroid Build Coastguard Worker.w16_padh:
338*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
339*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
340*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
341*c0909341SAndroid Build Coastguard Worker.w16:
    ; The padding mask is only built when w differs from bw.
342*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 15
343*c0909341SAndroid Build Coastguard Worker    je .w16_loop
344*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [base+pb_0to63]
345*c0909341SAndroid Build Coastguard Worker    movd                xm3, wd
346*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, xm3
347*c0909341SAndroid Build Coastguard Worker    pminub               m3, m0
348*c0909341SAndroid Build Coastguard Worker.w16_loop:
349*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+32*0]
350*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+32*1]
351*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 15
352*c0909341SAndroid Build Coastguard Worker    jl .w16_padh
353*c0909341SAndroid Build Coastguard Worker.w16_main:
354*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
355*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
356*c0909341SAndroid Build Coastguard Worker    add                srcq, 32*2
    ; After the lane-wise pack the qwords of m0 hold rows 0, 2, 1, 3;
    ; q3120 restores row order into m1 for the store, leaving m0 unpermuted.
357*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
358*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m0, q3120
359*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m1
360*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
361*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
362*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
363*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
364*c0909341SAndroid Build Coastguard Worker    jz .w16_end
    ; Qword 3 of the unpermuted m0 is the last row; broadcast it to get
    ; 4 copies.
365*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3333
366*c0909341SAndroid Build Coastguard Worker.w16_padv:
367*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
368*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
369*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
370*c0909341SAndroid Build Coastguard Worker    jg .w16_padv
371*c0909341SAndroid Build Coastguard Worker.w16_end:
372*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 32: 64 source bytes = 2 rows per iteration -> 32 output bytes ----
; Horizontal padding, m3 = min(w-1, 0..31); pshufb indexes within each
; 128-bit lane using the low 4 bits of each mask byte:
;   w-1 <= 15: the low lane (columns 0..15) is copied into the high lane
;              first, so the high lane's mask (all w-1) picks the last
;              valid column from it.
;   w-1  > 15: the low lane mask is the identity and the high lane is
;              clamped in place.
373*c0909341SAndroid Build Coastguard Worker.w32_padh:
374*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 15
375*c0909341SAndroid Build Coastguard Worker    jg .w32_padh2
376*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm0, 1
377*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm1, 1
378*c0909341SAndroid Build Coastguard Worker.w32_padh2:
379*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
380*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
381*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
382*c0909341SAndroid Build Coastguard Worker.w32:
383*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 31
384*c0909341SAndroid Build Coastguard Worker    je .w32_loop
385*c0909341SAndroid Build Coastguard Worker    movd                xm3, wd
386*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m3, xm3
387*c0909341SAndroid Build Coastguard Worker    pminub               m3, [base+pb_0to63]
388*c0909341SAndroid Build Coastguard Worker.w32_loop:
389*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+32*0]
390*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+32*1]
391*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 31
392*c0909341SAndroid Build Coastguard Worker    jl .w32_padh
393*c0909341SAndroid Build Coastguard Worker.w32_main:
394*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
395*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
396*c0909341SAndroid Build Coastguard Worker    add                srcq, 32*2
    ; After the lane-wise pack the qwords of m0 are
    ; row0.lo, row1.lo, row0.hi, row1.hi; q3120 restores order into m1.
397*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
398*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m0, q3120
399*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m1
400*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
401*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
402*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
403*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
404*c0909341SAndroid Build Coastguard Worker    jz .w32_end
    ; Qwords 1 and 3 of the unpermuted m0 form the last row; q3131 yields
    ; that row twice.
405*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3131
    ; Each iteration writes 64 bytes = 4 padded rows.
406*c0909341SAndroid Build Coastguard Worker.w32_padv:
407*c0909341SAndroid Build Coastguard Worker    movu        [dstq+32*0], m0
408*c0909341SAndroid Build Coastguard Worker    movu        [dstq+32*1], m0
409*c0909341SAndroid Build Coastguard Worker    add                dstq, 32*2
410*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
411*c0909341SAndroid Build Coastguard Worker    jg .w32_padv
412*c0909341SAndroid Build Coastguard Worker.w32_end:
413*c0909341SAndroid Build Coastguard Worker    RET
; ---- bw == 64: 64 source bytes = 1 row per iteration -> 32 output bytes ----
; Horizontal padding, m0 = columns 0..31, m1 = columns 32..63,
; m4 = (w-1)&31 broadcast, m3 = min(m4, 0..31); pshufb indexes within each
; 128-bit lane using the low 4 bits of each mask byte.
;   w-1 <= 15: both lanes of m1 get columns 0..15; m0 is that data clamped
;              with m3, m1 is a broadcast of the last valid column (m4).
415*c0909341SAndroid Build Coastguard Worker.w64_padh:
416*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 15
417*c0909341SAndroid Build Coastguard Worker    jg .w64_padh2
418*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m0, xm0, 1
419*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1, m3
420*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
421*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
;   16 <= w-1 <= 31: m1 gets columns 16..31 in both lanes (vperm2i128 0x11)
;              and is turned into a broadcast of the last valid column;
;              m0's high lane is clamped with m3.
422*c0909341SAndroid Build Coastguard Worker.w64_padh2:
423*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 31
424*c0909341SAndroid Build Coastguard Worker    jg .w64_padh3
425*c0909341SAndroid Build Coastguard Worker    vperm2i128           m1, m0, m0, 0x11
426*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
427*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
428*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
;   32 <= w-1 <= 47: m0 is fully valid; m1's low lane is copied into its
;              high lane, then m1 is clamped with m3.
;   w-1 >= 48: m0 is fully valid; m1 is clamped with m3 in place.
429*c0909341SAndroid Build Coastguard Worker.w64_padh3:
430*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 47
431*c0909341SAndroid Build Coastguard Worker    jg .w64_padh4
432*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm1, 1
433*c0909341SAndroid Build Coastguard Worker.w64_padh4:
434*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3
435*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
436*c0909341SAndroid Build Coastguard Worker.w64:
437*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 63
438*c0909341SAndroid Build Coastguard Worker    je .w64_loop
    ; r6 is reused as scratch here, so it no longer holds the table address;
    ; pb_0to63 below is therefore addressed directly instead of via `base`.
439*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
440*c0909341SAndroid Build Coastguard Worker    and                 r6d, 31
441*c0909341SAndroid Build Coastguard Worker    movd                xm4, r6d
442*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m4, xm4
443*c0909341SAndroid Build Coastguard Worker    pminub               m3, m4, [pb_0to63]
444*c0909341SAndroid Build Coastguard Worker.w64_loop:
445*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+32*0]
446*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+32*1]
447*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 63
448*c0909341SAndroid Build Coastguard Worker    jl .w64_padh
449*c0909341SAndroid Build Coastguard Worker.w64_main:
450*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2
451*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2
452*c0909341SAndroid Build Coastguard Worker    add                srcq, 32*2
    ; Unlike the narrower paths, the reordered row is kept in m0 itself, so
    ; it can be reused as-is for vertical padding.
453*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
454*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
455*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
456*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
457*c0909341SAndroid Build Coastguard Worker    dec                  hd
458*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
459*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
460*c0909341SAndroid Build Coastguard Worker    jz .w64_end
    ; m0 is the last packed row; write it 4 times per iteration.
461*c0909341SAndroid Build Coastguard Worker.w64_padv:
462*c0909341SAndroid Build Coastguard Worker    movu        [dstq+32*0], m0
463*c0909341SAndroid Build Coastguard Worker    movu        [dstq+32*1], m0
464*c0909341SAndroid Build Coastguard Worker    movu        [dstq+32*2], m0
465*c0909341SAndroid Build Coastguard Worker    movu        [dstq+32*3], m0
466*c0909341SAndroid Build Coastguard Worker    add                dstq, 32*4
467*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
468*c0909341SAndroid Build Coastguard Worker    jg .w64_padv
469*c0909341SAndroid Build Coastguard Worker.w64_end:
470*c0909341SAndroid Build Coastguard Worker    RET
470*c0909341SAndroid Build Coastguard Worker
; pal_idx_finish, avx512icl variant: shared prologue and per-width dispatch.
;
; Named arguments (see the cglobal line): dst, src, bw, bh, w, h.
; Every width handler that follows multiplies adjacent source byte pairs by
; the constant held in m4 (pmaddubsw) and narrows the 16-bit sums to bytes,
; so two source bytes produce one destination byte.
; NOTE(review): pb_1_16 is defined outside this chunk; its name suggests the
; byte pair (1, 16), i.e. dst[i] = src[2i] + 16*src[2i+1] -- confirm.
;
; Register state when control reaches a handler:
;   r6  = address of pal_idx_finish_avx512icl_table ("base" rebases constant
;         addresses on it)
;   m4  = dword at pb_1_16 broadcast to every dword lane
;   wd  = w - 1
;   bhd = bh - h (each handler tests it for zero and then counts it down by 4
;         in its .wN_padv loop)
471*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
472*c0909341SAndroid Build Coastguard Workercglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
473*c0909341SAndroid Build Coastguard Worker%define base r6-pal_idx_finish_avx512icl_table
474*c0909341SAndroid Build Coastguard Worker    lea                  r6, [pal_idx_finish_avx512icl_table]
    ; Jump-table index = number of trailing zero bits in bw
    ; (equals log2(bw) when bw is a power of two).
475*c0909341SAndroid Build Coastguard Worker    tzcnt               bwd, bwd
    ; Bring w and h into registers unless they already live there.
476*c0909341SAndroid Build Coastguard Worker    movifnidn            wd, wm
477*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
    ; Table entries are signed 32-bit values that are made absolute by adding
    ; the table address (r6) below.
478*c0909341SAndroid Build Coastguard Worker    movsxd              bwq, [r6+bwq*4]
479*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [base+pb_1_16]
    ; Handlers compare against "full width - 1", so keep w - 1 in wd.
480*c0909341SAndroid Build Coastguard Worker    dec                  wd
481*c0909341SAndroid Build Coastguard Worker    add                 bwq, r6
    ; bhd becomes the difference bh - h.
482*c0909341SAndroid Build Coastguard Worker    sub                 bhd, hd
483*c0909341SAndroid Build Coastguard Worker    jmp                 bwq
; .w4 handler of pal_idx_finish (avx512icl).
; Each loop iteration reads 16 source bytes with an aligned load, multiplies
; byte pairs by the constant in the low 128 bits of m4 (set up by the function
; prologue), packs the words to bytes and stores 8 destination bytes; hd is
; counted down by 4 per iteration. This path never reads wd, i.e. it performs
; no horizontal padding.
; When bhd (bh - h from the prologue) is non-zero, the last 2 output bytes are
; replicated and written 8 bytes at a time, counting bhd down by 4.
484*c0909341SAndroid Build Coastguard Worker.w4:
485*c0909341SAndroid Build Coastguard Worker    mova               xmm0, [srcq]
486*c0909341SAndroid Build Coastguard Worker    add                srcq, 16
487*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xm4
    ; Only the low 8 bytes of the packed result are stored.
488*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm0
489*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xmm0
490*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
491*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
492*c0909341SAndroid Build Coastguard Worker    jg .w4
    ; Vertical padding is needed only if bh - h is non-zero.
493*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
494*c0909341SAndroid Build Coastguard Worker    jz .w4_end
    ; Replicate word 3 of the low quadword (the final 2 bytes just stored)
    ; across the low 8 bytes.
495*c0909341SAndroid Build Coastguard Worker    pshuflw            xmm0, xmm0, q3333
496*c0909341SAndroid Build Coastguard Worker.w4_padv:
497*c0909341SAndroid Build Coastguard Worker    movq             [dstq], xmm0
498*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
499*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
500*c0909341SAndroid Build Coastguard Worker    jg .w4_padv
501*c0909341SAndroid Build Coastguard Worker.w4_end:
502*c0909341SAndroid Build Coastguard Worker    RET
; .w8 handler of pal_idx_finish (avx512icl); entry point is the .w8 label,
; .w8_padh is its out-of-line horizontal-padding step.
; Each loop iteration reads 32 source bytes (two aligned 16-byte loads),
; multiplies byte pairs by the constant in the low 128 bits of m4 (set up by
; the function prologue), packs to bytes and stores 16 destination bytes;
; hd is counted down by 4 per iteration.
; When wd < 7 (wd holds w - 1) both source vectors are first shuffled with the
; byte pattern loaded from pal_idx_w8_padh.
; NOTE(review): pal_idx_w8_padh is defined outside this chunk; presumably it
; replicates the last valid byte of each row -- confirm.
503*c0909341SAndroid Build Coastguard Worker.w8_padh:
504*c0909341SAndroid Build Coastguard Worker    pshufb             xmm0, xmm2
505*c0909341SAndroid Build Coastguard Worker    pshufb             xmm1, xmm2
506*c0909341SAndroid Build Coastguard Worker    jmp .w8_main
507*c0909341SAndroid Build Coastguard Worker.w8:
    ; xmm2 = shuffle control used by .w8_padh; loaded once, outside the loop.
508*c0909341SAndroid Build Coastguard Worker    mova               xmm2, [base+pal_idx_w8_padh]
509*c0909341SAndroid Build Coastguard Worker.w8_loop:
510*c0909341SAndroid Build Coastguard Worker    mova               xmm0, [srcq+16*0]
511*c0909341SAndroid Build Coastguard Worker    mova               xmm1, [srcq+16*1]
    ; Take the padding detour only when w - 1 is below 7.
512*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 7
513*c0909341SAndroid Build Coastguard Worker    jl .w8_padh
514*c0909341SAndroid Build Coastguard Worker.w8_main:
515*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm0, xm4
516*c0909341SAndroid Build Coastguard Worker    pmaddubsw          xmm1, xm4
517*c0909341SAndroid Build Coastguard Worker    add                srcq, 16*2
518*c0909341SAndroid Build Coastguard Worker    packuswb           xmm0, xmm1
519*c0909341SAndroid Build Coastguard Worker    movu             [dstq], xmm0
520*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
521*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
522*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
    ; Vertical padding is needed only if bh - h is non-zero.
523*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
524*c0909341SAndroid Build Coastguard Worker    jz .w8_end
    ; Replicate the highest dword of xmm0 (the final 4 bytes just stored)
    ; into all four dwords.
525*c0909341SAndroid Build Coastguard Worker    pshufd             xmm0, xmm0, q3333
526*c0909341SAndroid Build Coastguard Worker.w8_padv:
    ; 16 bytes per iteration, bhd counted down by 4.
527*c0909341SAndroid Build Coastguard Worker    movu             [dstq], xmm0
528*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
529*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
530*c0909341SAndroid Build Coastguard Worker    jg .w8_padv
531*c0909341SAndroid Build Coastguard Worker.w8_end:
532*c0909341SAndroid Build Coastguard Worker    RET
; .w16 handler of pal_idx_finish (avx512icl); entry point is the .w16 label,
; .w16_padh is its out-of-line horizontal-padding step.
; Each loop iteration reads 64 source bytes, multiplies byte pairs by m4
; (set up by the function prologue), truncates the 32 resulting words to
; bytes and stores 32 destination bytes; hd is counted down by 4.
; When wd != 15 (wd holds w - 1) a per-lane shuffle control m2 is built as
; min(first 16 bytes of pb_0to63, wd) and applied with pshufb before the
; multiply.
; NOTE(review): pb_0to63 is defined outside this chunk; its name suggests the
; byte sequence 0..63, in which case every byte past index wd in a 16-byte
; lane is replaced by byte wd of that lane -- confirm.
533*c0909341SAndroid Build Coastguard Worker.w16_padh:
    ; pshufb shuffles within each 128-bit lane independently.
534*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
535*c0909341SAndroid Build Coastguard Worker    jmp .w16_main
536*c0909341SAndroid Build Coastguard Worker.w16:
    ; Skip building the shuffle control when no horizontal padding is needed.
537*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 15
538*c0909341SAndroid Build Coastguard Worker    je .w16_loop
    ; Same 16 index bytes in all four lanes, clamped (unsigned) to wd.
539*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [base+pb_0to63]
540*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, wd
541*c0909341SAndroid Build Coastguard Worker    pminub               m2, m0
542*c0909341SAndroid Build Coastguard Worker.w16_loop:
543*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq]
544*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 15
545*c0909341SAndroid Build Coastguard Worker    jl .w16_padh
546*c0909341SAndroid Build Coastguard Worker.w16_main:
547*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
548*c0909341SAndroid Build Coastguard Worker    add                srcq, 64
    ; Keep the low byte of each word: 64 bytes of words -> 32 bytes.
549*c0909341SAndroid Build Coastguard Worker    vpmovwb             ym0, m0
550*c0909341SAndroid Build Coastguard Worker    movu             [dstq], ym0
551*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
552*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
553*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
    ; Vertical padding is needed only if bh - h is non-zero.
554*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
555*c0909341SAndroid Build Coastguard Worker    jz .w16_end
    ; Replicate the highest qword of ym0 (the final 8 bytes just stored)
    ; into all four qwords.
556*c0909341SAndroid Build Coastguard Worker    vpermq              ym0, ym0, q3333
557*c0909341SAndroid Build Coastguard Worker.w16_padv:
    ; 32 bytes per iteration, bhd counted down by 4.
558*c0909341SAndroid Build Coastguard Worker    movu             [dstq], ym0
559*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
560*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
561*c0909341SAndroid Build Coastguard Worker    jg .w16_padv
562*c0909341SAndroid Build Coastguard Worker.w16_end:
563*c0909341SAndroid Build Coastguard Worker    RET
; .w32 handler of pal_idx_finish (avx512icl); entry point is the .w32 label,
; .w32_padh is its out-of-line horizontal-padding step.
; Each loop iteration reads 128 source bytes (two zmm), multiplies byte pairs
; by m4 (set up by the function prologue), gathers one byte per word from the
; m0:m1 pair with vpermt2b and stores 64 destination bytes; hd is counted
; down by 4.
;   m2 = byte-permute control for horizontal padding (clamped indices)
;   m3 = m2 + m2 computed before clamping; vpermt2b control for the narrowing
; NOTE(review): pb_0to63 and pb_32 are defined outside this chunk; the names
; suggest bytes 0..63 and a dword of bytes equal to 32. With those values m3
; holds the even indices 0..126 (low byte of every word) and m2 clamps each
; 32-byte half of a zmm to its own byte wd -- confirm.
564*c0909341SAndroid Build Coastguard Worker.w32_padh:
    ; Full-width byte permute: result byte i = source byte m2[i].
565*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m2, m0
566*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m2, m1
567*c0909341SAndroid Build Coastguard Worker    jmp .w32_main
568*c0909341SAndroid Build Coastguard Worker.w32:
569*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+pb_0to63]
570*c0909341SAndroid Build Coastguard Worker    paddb                m3, m2, m2
    ; Skip the clamp setup when no horizontal padding is needed (wd = w - 1).
571*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 31
572*c0909341SAndroid Build Coastguard Worker    je .w32_loop
573*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, wd
    ; r6 is reused as scratch from here on, so "base" is no longer valid;
    ; pb_32 below is therefore addressed without it.
574*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0xff00
    ; k1 = 0xff00 selects dword elements 8..15, i.e. the upper 32 bytes.
575*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r6d
    ; Add the broadcast dword at pb_32 to the upper-half clamp values only.
576*c0909341SAndroid Build Coastguard Worker    vpaddd           m0{k1}, [pb_32] {1to16}
577*c0909341SAndroid Build Coastguard Worker    pminub               m2, m0
578*c0909341SAndroid Build Coastguard Worker.w32_loop:
579*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+64*0]
580*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+64*1]
581*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 31
582*c0909341SAndroid Build Coastguard Worker    jl .w32_padh
583*c0909341SAndroid Build Coastguard Worker.w32_main:
584*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
585*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4
586*c0909341SAndroid Build Coastguard Worker    add                srcq, 64*2
    ; Pick bytes from the 128-byte m0:m1 pair using the indices in m3.
587*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m3, m1
588*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
589*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
590*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
591*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
    ; Vertical padding is needed only if bh - h is non-zero.
592*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
593*c0909341SAndroid Build Coastguard Worker    jz .w32_end
    ; Replicate the highest 128-bit lane of m0 (the final 16 bytes just
    ; stored) into all four lanes.
594*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m0, q3333
595*c0909341SAndroid Build Coastguard Worker.w32_padv:
    ; 64 bytes per iteration, bhd counted down by 4.
596*c0909341SAndroid Build Coastguard Worker    movu             [dstq], m0
597*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
598*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
599*c0909341SAndroid Build Coastguard Worker    jg .w32_padv
600*c0909341SAndroid Build Coastguard Worker.w32_end:
601*c0909341SAndroid Build Coastguard Worker    RET
; .w64 handler of pal_idx_finish (avx512icl); entry point is the .w64 label,
; .w64_padh is its out-of-line horizontal-padding step.
; Each loop iteration reads 256 source bytes (four zmm), multiplies byte pairs
; by m4 (set up by the function prologue), narrows the m0:m1 and m2:m3 pairs
; with vpermt2b and stores 128 destination bytes; hd is counted down by 4.
;   m5 = byte-permute control for horizontal padding: min(pb_0to63, wd)
;   m6 = pb_0to63 + pb_0to63 (computed before clamping); vpermt2b control
; NOTE(review): pb_0to63 is defined outside this chunk; its name suggests the
; byte sequence 0..63, making m6 the even indices 0..126 (low byte of every
; word) and m5 a clamp of every byte index to wd = w - 1 -- confirm.
602*c0909341SAndroid Build Coastguard Worker.w64_padh:
    ; Apply the same clamped byte permute to all four source vectors.
603*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
604*c0909341SAndroid Build Coastguard Worker    jmp .w64_main
605*c0909341SAndroid Build Coastguard Worker.w64:
606*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pb_0to63]
607*c0909341SAndroid Build Coastguard Worker    paddb                m6, m5, m5
    ; Skip the clamp when no horizontal padding is needed.
608*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 63
609*c0909341SAndroid Build Coastguard Worker    je .w64_loop
610*c0909341SAndroid Build Coastguard Worker    vpbroadcastb         m0, wd
611*c0909341SAndroid Build Coastguard Worker    pminub               m5, m0
612*c0909341SAndroid Build Coastguard Worker.w64_loop:
613*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+64*0]
614*c0909341SAndroid Build Coastguard Worker    mova                 m1, [srcq+64*1]
615*c0909341SAndroid Build Coastguard Worker    mova                 m2, [srcq+64*2]
616*c0909341SAndroid Build Coastguard Worker    mova                 m3, [srcq+64*3]
617*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 63
618*c0909341SAndroid Build Coastguard Worker    jl .w64_padh
619*c0909341SAndroid Build Coastguard Worker.w64_main:
620*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m4}, m0, m1, m2, m3
621*c0909341SAndroid Build Coastguard Worker    add                srcq, 64*4
    ; Pick bytes from each 128-byte register pair using the indices in m6.
622*c0909341SAndroid Build Coastguard Worker    vpermt2b             m0, m6, m1
623*c0909341SAndroid Build Coastguard Worker    vpermt2b             m2, m6, m3
624*c0909341SAndroid Build Coastguard Worker    movu        [dstq+64*0], m0
625*c0909341SAndroid Build Coastguard Worker    movu        [dstq+64*1], m2
626*c0909341SAndroid Build Coastguard Worker    add                dstq, 64*2
627*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
628*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
    ; Vertical padding is needed only if bh - h is non-zero.
629*c0909341SAndroid Build Coastguard Worker    test                bhd, bhd
630*c0909341SAndroid Build Coastguard Worker    jz .w64_end
    ; Lane order 2,3,2,3: copy the upper 256 bits of m2 (the final 32 bytes
    ; just stored) into both halves.
631*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m2, q3232
632*c0909341SAndroid Build Coastguard Worker.w64_padv:
    ; 128 bytes per iteration, bhd counted down by 4.
633*c0909341SAndroid Build Coastguard Worker    movu        [dstq+64*0], m2
634*c0909341SAndroid Build Coastguard Worker    movu        [dstq+64*1], m2
635*c0909341SAndroid Build Coastguard Worker    add                dstq, 64*2
636*c0909341SAndroid Build Coastguard Worker    sub                 bhd, 4
637*c0909341SAndroid Build Coastguard Worker    jg .w64_padv
638*c0909341SAndroid Build Coastguard Worker.w64_end:
639*c0909341SAndroid Build Coastguard Worker    RET
640*c0909341SAndroid Build Coastguard Worker
641*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
642