xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jcphuff-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
19*dfc6aa5cSAndroid Build Coastguard Worker
20*dfc6aa5cSAndroid Build Coastguard Worker%include "jsimdext.inc"
21*dfc6aa5cSAndroid Build Coastguard Worker
22*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
23*dfc6aa5cSAndroid Build Coastguard Worker    SECTION     SEG_TEXT
24*dfc6aa5cSAndroid Build Coastguard Worker    BITS        64
25*dfc6aa5cSAndroid Build Coastguard Worker
26*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
27*dfc6aa5cSAndroid Build Coastguard Worker; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
28*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_encode_mcu_AC_refine_prepare_sse2()
29*dfc6aa5cSAndroid Build Coastguard Worker
; LOAD16 - gather 16 coefficients through the index table.
;
;   X0 lane i <- word [BLOCK + 2 * LUT[i]]        (i = 0..7)
;   X1 lane i <- word [BLOCK + 2 * LUT[i + 8]]    (i = 0..7)
;
; N0 and N1 are cleared first.  T0/T1 are scratch: the 32-bit index loads
; into T0d/T1d zero-extend into the full registers used for addressing.
; Every name used here (X0, X1, N0, N1, T0, T0d, T1, T1d, LUT, BLOCK) is
; resolved when the macro is expanded, so the invoking code must have them
; %define'd.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

%assign %%lane 0
%rep 8
    mov         T0d, INT [LUT + (%%lane    ) * SIZEOF_INT]
    mov         T1d, INT [LUT + (%%lane + 8) * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%endmacro
74*dfc6aa5cSAndroid Build Coastguard Worker
; LOAD15 - gather 8 coefficients into X0 and between 1 and 7 into X1.
;
;   X0 lane i <- word [BLOCK + 2 * LUT[i]]        (i = 0..7, always)
;   X1 lane 0 <- word [BLOCK + 2 * LUT[8]]        (always)
;   X1 lane i <- word [BLOCK + 2 * LUT[i + 8]]    (i = 1..6, only while
;                                                  LENEND >= i + 1)
;
; X1 is zeroed up front so lanes that are not loaded read as 0; N0 and N1
; are cleared as well.  The LENEND comparisons are signed (jl).  T0/T1 are
; scratch.  All register names are resolved at expansion time.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    ; Lane 0 of both vectors is loaded unconditionally.
    mov         T0d, INT [LUT + 0 * SIZEOF_INT]
    mov         T1d, INT [LUT + 8 * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Remaining seven lanes of X0.
%assign %%lane 1
%rep 7
    mov         T0d, INT [LUT + (%%lane) * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
%assign %%lane %%lane + 1
%endrep

    ; Lanes 1..6 of X1: stop at the first lane index that reaches LENEND.
%assign %%lane 1
%rep 6
    cmp         LENEND, %%lane + 1
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + (%%lane + 8) * SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%%.ELOAD15:
%endmacro
137*dfc6aa5cSAndroid Build Coastguard Worker
; LOAD8 - gather exactly 8 coefficients into X0.
;
;   X0 lane i <- word [BLOCK + 2 * LUT[i]]        (i = 0..7)
;
; N0 is cleared first.  T0 is scratch (T0d load zero-extends into T0).
; All register names are resolved at expansion time.
%macro LOAD8 0
    pxor        N0, N0

%assign %%lane 0
%rep 8
    mov         T0d, INT [LUT + (%%lane) * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%endmacro
165*dfc6aa5cSAndroid Build Coastguard Worker
; LOAD7 - gather between 1 and 7 coefficients into X0.
;
;   X0 lane 0 <- word [BLOCK + 2 * LUT[0]]        (always)
;   X0 lane i <- word [BLOCK + 2 * LUT[i]]        (i = 1..6, only while
;                                                  LENEND >= i + 1)
;
; X0 is zeroed up front so lanes that are not loaded read as 0; N0 is
; cleared as well.  The LENEND comparisons are signed (jl).  T1 is scratch.
; All register names are resolved at expansion time.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    ; Lane 0 is loaded unconditionally.
    mov         T1d, INT [LUT + 0 * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    ; Lanes 1..6: stop at the first lane index that reaches LENEND.
%assign %%lane 1
%rep 6
    cmp         LENEND, %%lane + 1
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + (%%lane) * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%%.ELOAD7:
%endmacro
204*dfc6aa5cSAndroid Build Coastguard Worker
; REDUCE0_MASK16 - helper for REDUCE0.
;   %1 = 32-bit general-purpose destination register
;   %2 = XMM scratch register (low 8 words)
;   %3 = XMM scratch register (high 8 words)
;   %4 = index of the first of 16 consecutive words at VALUES
; Leaves in %1 a 16-bit mask whose bit i is 1 when word (%4 + i) equals
; the corresponding word of ZERO.  The aligned loads (movdqa) require
; VALUES to be 16-byte aligned.
%macro REDUCE0_MASK16 4
    movdqa      %2, XMMWORD [VALUES + ((%4)     * 2)]
    movdqa      %3, XMMWORD [VALUES + ((%4 + 8) * 2)]
    pcmpeqw     %2, ZERO                ; 0xFFFF in every word equal to ZERO
    pcmpeqw     %3, ZERO
    packsswb    %2, %3                  ; 16 words -> 16 bytes (0xFF / 0x00)
    pmovmskb    %1, %2                  ; one mask bit per byte
%endmacro

; REDUCE0 - build a 64-bit bitmap of the 64 words at VALUES and store it
; at [r15].
;
; The four 16-bit "equals ZERO" masks are combined into one 64-bit value
; which is then inverted, so (with ZERO holding all zeroes, as the callers
; visible in this file arrange via pxor) bit k of the stored value is 1
; exactly when word k at VALUES is nonzero.
;
; Clobbers xmm0-xmm7, rax, rcx, rdx and rsi.  VALUES and ZERO are resolved
; at expansion time.
%macro REDUCE0 0
    REDUCE0_MASK16 eax, xmm0, xmm1,  0
    REDUCE0_MASK16 ecx, xmm2, xmm3, 16
    REDUCE0_MASK16 edx, xmm4, xmm5, 32
    REDUCE0_MASK16 esi, xmm6, xmm7, 48

    ; pmovmskb zero-extends into the full 64-bit register, so the masks can
    ; be shifted into position and merged directly.
    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax                     ; "is zero" bits -> "is nonzero" bits

    mov         MMWORD [r15], rax
%endmacro
246*dfc6aa5cSAndroid Build Coastguard Worker
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
;
; For each of the Sl coefficients c = block[jpeg_natural_order_start[k]]:
;   values[k]            = |c| >> Al   (logical shift)
;   values[k + DCTSIZE2] = values[k]   when c >= 0,
;                          ~values[k]  when c <  0
; values[] is then zero-padded, in groups of 8 words, up to DCTSIZE2/8
; groups, and REDUCE0 stores the resulting 64-bit bitmap at *zerobits.

; AC_FIRST_PREP16 - convert the 16 coefficients held in X0/X1 and store
; them at VALUES (absolute values) and VALUES + DCTSIZE2 words (sign-folded
; values).  N0/N1 must be zero on entry (the LOAD* macros clear them).
%macro AC_FIRST_PREP16 0
    pcmpgtw     N0, X0                  ; N = 0xFFFF where X < 0, else 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (X - 1) for negative lanes ...
    paddw       X1, N1
    pxor        X0, N0                  ; ... then complement: X = |X|
    pxor        X1, N1
    psrlw       X0, AL                  ; X = |X| >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N = X, complemented if negative
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
%endmacro

; AC_FIRST_PREP8 - same as AC_FIRST_PREP16 for the 8 coefficients in X0.
%macro AC_FIRST_PREP8 0
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
%endmacro

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    ; Build a 16-byte-aligned frame.  [rbp] holds the pre-alignment rsp and
    ; the 16 bytes just below rbp are used to preserve ZERO (xmm9).
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; reserve the xmm9 save slot
    ; collect_args comes from jsimdext.inc (not visible here); per the
    ; register list above it is expected to leave the arguments in r10-r15.
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve caller's xmm9

    movd        AL, r13d                ; shift count Al, taken before r13d
                                        ; is reused as LENEND
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full 16-element groups
    jz          .TAIL

    ; ---- Full groups of 16 coefficients ----
.LOOP16:
    LOAD16
    AC_FIRST_PREP16
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .LOOP16
    test        LEN, 15
    je          .PAD                    ; Sl is a multiple of 16: no tail

    ; ---- Tail of 1..15 coefficients ----
.TAIL:
    test        LEN, 8
    jz          .TAIL7                  ; fewer than 8 left
    test        LEN, 7
    jz          .TAIL8                  ; exactly 8 left

    ; 9..15 left
    LOAD15
    AC_FIRST_PREP16
    add         VALUES, 16*2
    jmp         .PAD

.TAIL8:
    LOAD8
    AC_FIRST_PREP8
    add         VALUES, 8*2
    jmp         .PAD

.TAIL7:
    LOAD7
    AC_FIRST_PREP8
    add         VALUES, 8*2

    ; ---- Zero-fill the remaining 8-word groups of values[] ----
.PAD:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3                    ; groups of 8 already written
    sub         K, DCTSIZE2/8           ; K = -(groups still to clear)
    jz          .DONE
    align       16
.PADLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K                       ; counts up towards zero
    jnz         .PADLOOP
.DONE:
    sub         VALUES, DCTSIZE2*2      ; rewind to the start of values[]

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]     ; restore caller's xmm9
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- saved unaligned rsp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
410*dfc6aa5cSAndroid Build Coastguard Worker
411*dfc6aa5cSAndroid Build Coastguard Worker;
412*dfc6aa5cSAndroid Build Coastguard Worker; Prepare data for jsimd_encode_mcu_AC_refine().
413*dfc6aa5cSAndroid Build Coastguard Worker;
414*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(int)
415*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
416*dfc6aa5cSAndroid Build Coastguard Worker;                                         const int *jpeg_natural_order_start,
417*dfc6aa5cSAndroid Build Coastguard Worker;                                         int Sl, int Al, JCOEF *absvalues,
418*dfc6aa5cSAndroid Build Coastguard Worker;                                         size_t *bits)
419*dfc6aa5cSAndroid Build Coastguard Worker;
420*dfc6aa5cSAndroid Build Coastguard Worker; r10 = const JCOEF *block
421*dfc6aa5cSAndroid Build Coastguard Worker; r11 = const int *jpeg_natural_order_start
422*dfc6aa5cSAndroid Build Coastguard Worker; r12 = int Sl
423*dfc6aa5cSAndroid Build Coastguard Worker; r13 = int Al
424*dfc6aa5cSAndroid Build Coastguard Worker; r14 = JCOEF *values
425*dfc6aa5cSAndroid Build Coastguard Worker; r15 = size_t *bits
426*dfc6aa5cSAndroid Build Coastguard Worker
427*dfc6aa5cSAndroid Build Coastguard Worker%define ZERO    xmm9
428*dfc6aa5cSAndroid Build Coastguard Worker%define ONE     xmm5
429*dfc6aa5cSAndroid Build Coastguard Worker%define X0      xmm0
430*dfc6aa5cSAndroid Build Coastguard Worker%define X1      xmm1
431*dfc6aa5cSAndroid Build Coastguard Worker%define N0      xmm2
432*dfc6aa5cSAndroid Build Coastguard Worker%define N1      xmm3
433*dfc6aa5cSAndroid Build Coastguard Worker%define AL      xmm4
434*dfc6aa5cSAndroid Build Coastguard Worker%define K       eax
435*dfc6aa5cSAndroid Build Coastguard Worker%define KK      r9d
436*dfc6aa5cSAndroid Build Coastguard Worker%define EOB     r8d
437*dfc6aa5cSAndroid Build Coastguard Worker%define SIGN    rdi
438*dfc6aa5cSAndroid Build Coastguard Worker%define LUT     r11
439*dfc6aa5cSAndroid Build Coastguard Worker%define T0      rcx
440*dfc6aa5cSAndroid Build Coastguard Worker%define T0d     ecx
441*dfc6aa5cSAndroid Build Coastguard Worker%define T1      rdx
442*dfc6aa5cSAndroid Build Coastguard Worker%define T1d     edx
443*dfc6aa5cSAndroid Build Coastguard Worker%define BLOCK   r10
444*dfc6aa5cSAndroid Build Coastguard Worker%define VALUES  r14
445*dfc6aa5cSAndroid Build Coastguard Worker%define LEN     r12d
446*dfc6aa5cSAndroid Build Coastguard Worker%define LENEND  r13d
447*dfc6aa5cSAndroid Build Coastguard Worker
; Summary of the code below:
;   1. For each group of coefficients (16 at a time, then a tail of 9..15,
;      exactly 8, or fewer than 8), compute abs(coef) >> Al per 16-bit lane
;      and store the result to VALUES.
;   2. Collect one mask bit per lane from N0/N1 into SIGN; each new group is
;      inserted at the top and older groups move toward bit 0.
;   3. Track in EOB the index of the highest lane seen so far whose shifted
;      magnitude equals 1 (EOB stays 0 if there is none).
;   4. Zero-fill VALUES up to DCTSIZE2 entries, store the complemented SIGN
;      at [r15 + SIZEOF_MMWORD], run REDUCE0, and return EOB in eax.
; LOAD16/LOAD15/LOAD8/LOAD7, REDUCE0 and collect_args/uncollect_args are
; macros defined outside this section.  Presumably the LOAD* macros gather
; coefficients from BLOCK through LUT into X0/X1 and clear N0/N1 -- confirm
; against their definitions.
448*dfc6aa5cSAndroid Build Coastguard Worker    align       32
449*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
450*dfc6aa5cSAndroid Build Coastguard Worker
451*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
; Prologue: build a 16-byte-aligned frame.  The pre-alignment rsp is kept at
; [rbp] so the epilogue can recover it with "pop rsp".
452*dfc6aa5cSAndroid Build Coastguard Worker    push        rbp
453*dfc6aa5cSAndroid Build Coastguard Worker    mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
454*dfc6aa5cSAndroid Build Coastguard Worker    sub         rsp, byte 4
455*dfc6aa5cSAndroid Build Coastguard Worker    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
456*dfc6aa5cSAndroid Build Coastguard Worker    mov         [rsp], rax
457*dfc6aa5cSAndroid Build Coastguard Worker    mov         rbp, rsp                     ; rbp = aligned rbp
; Reserve 16 bytes below rbp for the xmm9 spill slot before the argument
; macro uses the stack.
458*dfc6aa5cSAndroid Build Coastguard Worker    lea         rsp, [rbp - 16]
459*dfc6aa5cSAndroid Build Coastguard Worker    collect_args 6
460*dfc6aa5cSAndroid Build Coastguard Worker
; Save the caller's xmm9 (ZERO); it is reloaded just before returning.
; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI.
461*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [rbp - 16], ZERO
462*dfc6aa5cSAndroid Build Coastguard Worker
; Initialise accumulators and constants.
463*dfc6aa5cSAndroid Build Coastguard Worker    xor         SIGN, SIGN
464*dfc6aa5cSAndroid Build Coastguard Worker    xor         EOB, EOB
465*dfc6aa5cSAndroid Build Coastguard Worker    xor         KK, KK
; Al must be copied out of r13d here: r13d is reused as LENEND below.
466*dfc6aa5cSAndroid Build Coastguard Worker    movd        AL, r13d
467*dfc6aa5cSAndroid Build Coastguard Worker    pxor        ZERO, ZERO
; ONE = 0xFFFF per lane, then >> 15  =>  1 in every 16-bit lane.
468*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     ONE, ONE
469*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       ONE, 15
; K = Sl / 16 (number of full 16-coefficient groups), LENEND = Sl % 8.
; The jz consumes ZF from the shr: skip the main loop when K == 0.
470*dfc6aa5cSAndroid Build Coastguard Worker    mov         K, LEN
471*dfc6aa5cSAndroid Build Coastguard Worker    mov         LENEND, LEN
472*dfc6aa5cSAndroid Build Coastguard Worker    and         K, -16
473*dfc6aa5cSAndroid Build Coastguard Worker    and         LENEND, 7
474*dfc6aa5cSAndroid Build Coastguard Worker    shr         K, 4
475*dfc6aa5cSAndroid Build Coastguard Worker    jz          .ELOOPR16
; ---- Main loop: one full group of 16 coefficients per iteration ----------
476*dfc6aa5cSAndroid Build Coastguard Worker.BLOOPR16:
477*dfc6aa5cSAndroid Build Coastguard Worker    LOAD16
; N = (N > X) per lane; with N all-ones for a lane, (X + N) ^ N negates X,
; so X becomes the absolute value where the mask is set.
478*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     N0, X0
479*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     N1, X1
480*dfc6aa5cSAndroid Build Coastguard Worker    paddw       X0, N0
481*dfc6aa5cSAndroid Build Coastguard Worker    paddw       X1, N1
482*dfc6aa5cSAndroid Build Coastguard Worker    pxor        X0, N0
483*dfc6aa5cSAndroid Build Coastguard Worker    pxor        X1, N1
; Shift each lane right by Al and store the 16 results.
484*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       X0, AL
485*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       X1, AL
486*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + (0) * 2], X0
487*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + (8) * 2], X1
; X = lane mask of "value == 1"; pack both masks to bytes so pmovmskb yields
; one bit per coefficient.
488*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     X0, ONE
489*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     X1, ONE
490*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    N0, N1
491*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    X0, X1
492*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
493*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
494*dfc6aa5cSAndroid Build Coastguard Worker    shr         SIGN, 16                ; make room for 16 new sign bits
495*dfc6aa5cSAndroid Build Coastguard Worker    shl         T0, 48
496*dfc6aa5cSAndroid Build Coastguard Worker    or          SIGN, T0
; bsr sets ZF when the mask is empty; otherwise T1d = highest lane index
; in this group whose value equals 1.
497*dfc6aa5cSAndroid Build Coastguard Worker    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
498*dfc6aa5cSAndroid Build Coastguard Worker    jz          .CONTINUER16            ; if (idx) {
499*dfc6aa5cSAndroid Build Coastguard Worker    mov         EOB, KK
500*dfc6aa5cSAndroid Build Coastguard Worker    add         EOB, T1d                ; EOB = k + idx;
501*dfc6aa5cSAndroid Build Coastguard Worker.CONTINUER16:
502*dfc6aa5cSAndroid Build Coastguard Worker    add         VALUES, 16*2
503*dfc6aa5cSAndroid Build Coastguard Worker    add         LUT, 16*SIZEOF_INT
504*dfc6aa5cSAndroid Build Coastguard Worker    add         KK, 16
505*dfc6aa5cSAndroid Build Coastguard Worker    dec         K
506*dfc6aa5cSAndroid Build Coastguard Worker    jnz         .BLOOPR16
; Sl is a multiple of 16: no tail group, go straight to zero padding.
507*dfc6aa5cSAndroid Build Coastguard Worker    test        LEN, 15
508*dfc6aa5cSAndroid Build Coastguard Worker    je          .PADDINGR
; ---- Tail dispatch on Sl & 15 --------------------------------------------
;   bit 3 clear          -> fewer than 8 remain   (.TRYR7)
;   bit 3 set, low 3 = 0 -> exactly 8 remain      (.TRYR8)
;   otherwise            -> 9..15 remain          (fall through, LOAD15)
509*dfc6aa5cSAndroid Build Coastguard Worker.ELOOPR16:
510*dfc6aa5cSAndroid Build Coastguard Worker    test        LEN, 8
511*dfc6aa5cSAndroid Build Coastguard Worker    jz          .TRYR7
512*dfc6aa5cSAndroid Build Coastguard Worker    test        LEN, 7
513*dfc6aa5cSAndroid Build Coastguard Worker    jz          .TRYR8
514*dfc6aa5cSAndroid Build Coastguard Worker
; Tail of 9..15 coefficients: same processing as the main loop, on two
; registers.
515*dfc6aa5cSAndroid Build Coastguard Worker    LOAD15
516*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     N0, X0
517*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     N1, X1
518*dfc6aa5cSAndroid Build Coastguard Worker    paddw       X0, N0
519*dfc6aa5cSAndroid Build Coastguard Worker    paddw       X1, N1
520*dfc6aa5cSAndroid Build Coastguard Worker    pxor        X0, N0
521*dfc6aa5cSAndroid Build Coastguard Worker    pxor        X1, N1
522*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       X0, AL
523*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       X1, AL
524*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + (0) * 2], X0
525*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + (8) * 2], X1
526*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     X0, ONE
527*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     X1, ONE
528*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    N0, N1
529*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    X0, X1
530*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
531*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
532*dfc6aa5cSAndroid Build Coastguard Worker    shr         SIGN, 16                ; make room for 16 new sign bits
533*dfc6aa5cSAndroid Build Coastguard Worker    shl         T0, 48
534*dfc6aa5cSAndroid Build Coastguard Worker    or          SIGN, T0
535*dfc6aa5cSAndroid Build Coastguard Worker    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
536*dfc6aa5cSAndroid Build Coastguard Worker    jz          .CONTINUER15            ; if (idx) {
537*dfc6aa5cSAndroid Build Coastguard Worker    mov         EOB, KK
538*dfc6aa5cSAndroid Build Coastguard Worker    add         EOB, T1d                ; EOB = k + idx;
539*dfc6aa5cSAndroid Build Coastguard Worker.CONTINUER15:
540*dfc6aa5cSAndroid Build Coastguard Worker    add         VALUES, 16*2
541*dfc6aa5cSAndroid Build Coastguard Worker    jmp         .PADDINGR
; Tail of exactly 8 coefficients: a single register; the upper half of the
; packed masks comes from ZERO, and only 8 bits are shifted into SIGN.
542*dfc6aa5cSAndroid Build Coastguard Worker.TRYR8:
543*dfc6aa5cSAndroid Build Coastguard Worker    LOAD8
544*dfc6aa5cSAndroid Build Coastguard Worker
545*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     N0, X0
546*dfc6aa5cSAndroid Build Coastguard Worker    paddw       X0, N0
547*dfc6aa5cSAndroid Build Coastguard Worker    pxor        X0, N0
548*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       X0, AL
549*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + (0) * 2], X0
550*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     X0, ONE
551*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    N0, ZERO
552*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    X0, ZERO
553*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
554*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
555*dfc6aa5cSAndroid Build Coastguard Worker    shr         SIGN, 8                 ; make room for 8 new sign bits
556*dfc6aa5cSAndroid Build Coastguard Worker    shl         T0, 56
557*dfc6aa5cSAndroid Build Coastguard Worker    or          SIGN, T0
558*dfc6aa5cSAndroid Build Coastguard Worker    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
559*dfc6aa5cSAndroid Build Coastguard Worker    jz          .CONTINUER8             ; if (idx) {
560*dfc6aa5cSAndroid Build Coastguard Worker    mov         EOB, KK
561*dfc6aa5cSAndroid Build Coastguard Worker    add         EOB, T1d                ; EOB = k + idx;
562*dfc6aa5cSAndroid Build Coastguard Worker.CONTINUER8:
563*dfc6aa5cSAndroid Build Coastguard Worker    add         VALUES, 8*2
564*dfc6aa5cSAndroid Build Coastguard Worker    jmp         .PADDINGR
; Tail of fewer than 8 coefficients: identical to the 8-coefficient path
; except for the load macro; falls through into the padding code.
565*dfc6aa5cSAndroid Build Coastguard Worker.TRYR7:
566*dfc6aa5cSAndroid Build Coastguard Worker    LOAD7
567*dfc6aa5cSAndroid Build Coastguard Worker
568*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     N0, X0
569*dfc6aa5cSAndroid Build Coastguard Worker    paddw       X0, N0
570*dfc6aa5cSAndroid Build Coastguard Worker    pxor        X0, N0
571*dfc6aa5cSAndroid Build Coastguard Worker    psrlw       X0, AL
572*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + (0) * 2], X0
573*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     X0, ONE
574*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    N0, ZERO
575*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    X0, ZERO
576*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
577*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
578*dfc6aa5cSAndroid Build Coastguard Worker    shr         SIGN, 8                 ; make room for 8 new sign bits
579*dfc6aa5cSAndroid Build Coastguard Worker    shl         T0, 56
580*dfc6aa5cSAndroid Build Coastguard Worker    or          SIGN, T0
581*dfc6aa5cSAndroid Build Coastguard Worker    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
582*dfc6aa5cSAndroid Build Coastguard Worker    jz          .CONTINUER7             ; if (idx) {
583*dfc6aa5cSAndroid Build Coastguard Worker    mov         EOB, KK
584*dfc6aa5cSAndroid Build Coastguard Worker    add         EOB, T1d                ; EOB = k + idx;
585*dfc6aa5cSAndroid Build Coastguard Worker.CONTINUER7:
586*dfc6aa5cSAndroid Build Coastguard Worker    add         VALUES, 8*2
; ---- Zero padding ---------------------------------------------------------
; K = ceil(Sl / 8) - DCTSIZE2/8: zero when the output is already full,
; otherwise minus the number of 8-coefficient groups still to be cleared.
587*dfc6aa5cSAndroid Build Coastguard Worker.PADDINGR:
588*dfc6aa5cSAndroid Build Coastguard Worker    mov         K, LEN
589*dfc6aa5cSAndroid Build Coastguard Worker    add         K, 7
590*dfc6aa5cSAndroid Build Coastguard Worker    and         K, -8
591*dfc6aa5cSAndroid Build Coastguard Worker    shr         K, 3
592*dfc6aa5cSAndroid Build Coastguard Worker    sub         K, DCTSIZE2/8
593*dfc6aa5cSAndroid Build Coastguard Worker    jz          .EPADDINGR
594*dfc6aa5cSAndroid Build Coastguard Worker    align       16
; Each pass clears 8 output values and shifts SIGN down by the matching 8
; bits; K counts up toward zero.
595*dfc6aa5cSAndroid Build Coastguard Worker.ZEROLOOPR:
596*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      XMMWORD [VALUES + 0], ZERO
597*dfc6aa5cSAndroid Build Coastguard Worker    shr         SIGN, 8
598*dfc6aa5cSAndroid Build Coastguard Worker    add         VALUES, 8*2
599*dfc6aa5cSAndroid Build Coastguard Worker    inc         K
600*dfc6aa5cSAndroid Build Coastguard Worker    jnz         .ZEROLOOPR
; ---- Finish ---------------------------------------------------------------
; Complement the accumulated mask bits, rewind VALUES by the DCTSIZE2 words
; written above, and store SIGN at byte offset SIZEOF_MMWORD from r15 (bits).
601*dfc6aa5cSAndroid Build Coastguard Worker.EPADDINGR:
602*dfc6aa5cSAndroid Build Coastguard Worker    not         SIGN
603*dfc6aa5cSAndroid Build Coastguard Worker    sub         VALUES, DCTSIZE2*2
604*dfc6aa5cSAndroid Build Coastguard Worker    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN
605*dfc6aa5cSAndroid Build Coastguard Worker
; REDUCE0 is defined outside this section; presumably it derives the
; remaining bits[] word from the stored values -- confirm in the macro.
606*dfc6aa5cSAndroid Build Coastguard Worker    REDUCE0
607*dfc6aa5cSAndroid Build Coastguard Worker
; Return EOB, restore the caller's xmm9 and unwind the aligned frame.
608*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, EOB
609*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      ZERO, XMMWORD [rbp - 16]
610*dfc6aa5cSAndroid Build Coastguard Worker    uncollect_args 6
611*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsp, rbp                ; rsp <- aligned rbp
612*dfc6aa5cSAndroid Build Coastguard Worker    pop         rsp                     ; rsp <- pre-alignment rsp saved at [rbp]
613*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbp
614*dfc6aa5cSAndroid Build Coastguard Worker    ret
615*dfc6aa5cSAndroid Build Coastguard Worker
; Drop the register aliases defined for
; jsimd_encode_mcu_AC_refine_prepare_sse2 so that these names expand to
; nothing special in any code that follows.
616*dfc6aa5cSAndroid Build Coastguard Worker%undef ZERO
617*dfc6aa5cSAndroid Build Coastguard Worker%undef ONE
618*dfc6aa5cSAndroid Build Coastguard Worker%undef X0
619*dfc6aa5cSAndroid Build Coastguard Worker%undef X1
620*dfc6aa5cSAndroid Build Coastguard Worker%undef N0
621*dfc6aa5cSAndroid Build Coastguard Worker%undef N1
622*dfc6aa5cSAndroid Build Coastguard Worker%undef AL
623*dfc6aa5cSAndroid Build Coastguard Worker%undef K
624*dfc6aa5cSAndroid Build Coastguard Worker%undef KK
625*dfc6aa5cSAndroid Build Coastguard Worker%undef EOB
626*dfc6aa5cSAndroid Build Coastguard Worker%undef SIGN
627*dfc6aa5cSAndroid Build Coastguard Worker%undef LUT
628*dfc6aa5cSAndroid Build Coastguard Worker%undef T0
629*dfc6aa5cSAndroid Build Coastguard Worker%undef T0d
630*dfc6aa5cSAndroid Build Coastguard Worker%undef T1
631*dfc6aa5cSAndroid Build Coastguard Worker%undef T1d
632*dfc6aa5cSAndroid Build Coastguard Worker%undef BLOCK
633*dfc6aa5cSAndroid Build Coastguard Worker%undef VALUES
634*dfc6aa5cSAndroid Build Coastguard Worker%undef LEN
635*dfc6aa5cSAndroid Build Coastguard Worker%undef LENEND
636*dfc6aa5cSAndroid Build Coastguard Worker
637*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the
638*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this.
; The directive below pads the current position up to a 32-byte boundary.
639*dfc6aa5cSAndroid Build Coastguard Worker    align       32
640