xref: /aosp_15_r20/external/libjpeg-turbo/simd/i386/jchuff-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1*dfc6aa5cSAndroid Build Coastguard Worker;
2*dfc6aa5cSAndroid Build Coastguard Worker; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
3*dfc6aa5cSAndroid Build Coastguard Worker;
4*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
5*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2015, Matthieu Darbois.
6*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2018, Matthias Räncker.
7*dfc6aa5cSAndroid Build Coastguard Worker;
8*dfc6aa5cSAndroid Build Coastguard Worker; Based on the x86 SIMD extension for IJG JPEG library
9*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 1999-2006, MIYASAKA Masaru.
10*dfc6aa5cSAndroid Build Coastguard Worker; For conditions of distribution and use, see copyright notice in jsimdext.inc
11*dfc6aa5cSAndroid Build Coastguard Worker;
12*dfc6aa5cSAndroid Build Coastguard Worker; This file should be assembled with NASM (Netwide Assembler),
13*dfc6aa5cSAndroid Build Coastguard Worker; can *not* be assembled with Microsoft's MASM or any compatible
14*dfc6aa5cSAndroid Build Coastguard Worker; assembler (including Borland's Turbo Assembler).
15*dfc6aa5cSAndroid Build Coastguard Worker; NASM is available from http://nasm.sourceforge.net/ or
16*dfc6aa5cSAndroid Build Coastguard Worker; http://sourceforge.net/project/showfiles.php?group_id=6208
17*dfc6aa5cSAndroid Build Coastguard Worker;
18*dfc6aa5cSAndroid Build Coastguard Worker; This file contains an SSE2 implementation for Huffman coding of one block.
19*dfc6aa5cSAndroid Build Coastguard Worker; The following code is based on jchuff.c; see jchuff.c for more details.
20*dfc6aa5cSAndroid Build Coastguard Worker
21*dfc6aa5cSAndroid Build Coastguard Worker%include "jsimdext.inc"
22*dfc6aa5cSAndroid Build Coastguard Worker
; Assembly-side declaration of the encoder's working_state record.  Only the
; order and size of the fields are described here; the labels become byte
; offsets (working_state.next_output_byte, ...).
; NOTE(review): presumably this has to stay in sync with the C definition used
; by jchuff.c -- confirm whenever either side changes.
; NOTE(review): `resp` is not a built-in NASM reservation directive; it is
; presumably a pointer-sized reserve provided by jsimdext.inc -- confirm.
23*dfc6aa5cSAndroid Build Coastguard Workerstruc working_state
24*dfc6aa5cSAndroid Build Coastguard Worker.next_output_byte:   resp 1     ; => next byte to write in buffer
25*dfc6aa5cSAndroid Build Coastguard Worker.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
26*dfc6aa5cSAndroid Build Coastguard Worker.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
27*dfc6aa5cSAndroid Build Coastguard Worker.cur.free_bits       resd 1     ; # of bits available in it
28*dfc6aa5cSAndroid Build Coastguard Worker.cur.last_dc_val     resd 4     ; last DC coef for each component
29*dfc6aa5cSAndroid Build Coastguard Worker.cinfo:              resp 1     ; dump_buffer needs access to this
30*dfc6aa5cSAndroid Build Coastguard Workerendstruc
31*dfc6aa5cSAndroid Build Coastguard Worker
; Assembly-side declaration of a derived Huffman table: 256 dword codes
; (1024 bytes) followed by 256 one-byte code lengths, so
; c_derived_tbl.ehufsi is at byte offset 1024.  Both arrays are indexed by
; symbol value.
32*dfc6aa5cSAndroid Build Coastguard Workerstruc c_derived_tbl
33*dfc6aa5cSAndroid Build Coastguard Worker.ehufco:             resd 256   ; code for each symbol
34*dfc6aa5cSAndroid Build Coastguard Worker.ehufsi:             resb 256   ; length of code for each symbol
35*dfc6aa5cSAndroid Build Coastguard Worker; If no code has been allocated for a symbol S, ehufsi[S] contains 0
36*dfc6aa5cSAndroid Build Coastguard Workerendstruc
37*dfc6aa5cSAndroid Build Coastguard Worker
38*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
; Read-only constants for the Huffman encoder.  Layout, in order:
;   1. jpeg_mask_bits   - 16 qwords, entry n = (1 << n) - 1 for n = 0..15
;   2. 32768 bytes that mirror jpeg_nbits_table for negative indices
;   3. jpeg_nbits_table - 32768 bytes, entry v = bit length of v, v = 0..32767
; so jpeg_nbits_table can be indexed with any signed 16-bit value.
39*dfc6aa5cSAndroid Build Coastguard Worker    SECTION     SEG_CONST
40*dfc6aa5cSAndroid Build Coastguard Worker
41*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_DATA(jconst_huff_encode_one_block)
42*dfc6aa5cSAndroid Build Coastguard Worker
43*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jconst_huff_encode_one_block):
44*dfc6aa5cSAndroid Build Coastguard Worker
45*dfc6aa5cSAndroid Build Coastguard Worker    alignz      32
46*dfc6aa5cSAndroid Build Coastguard Worker
; Low-bit masks, one qword each (MASK_BITS() below scales its index by 8).
47*dfc6aa5cSAndroid Build Coastguard Workerjpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
48*dfc6aa5cSAndroid Build Coastguard Worker               dq 0x000f, 0x001f, 0x003f, 0x007f
49*dfc6aa5cSAndroid Build Coastguard Worker               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
50*dfc6aa5cSAndroid Build Coastguard Worker               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
51*dfc6aa5cSAndroid Build Coastguard Worker
; Negative half of the nbits table, emitted in descending order so that the
; byte at jpeg_nbits_table[-k - 1] equals the byte at jpeg_nbits_table[k]
; (index -1 -> 0, -2 -> 1, -3/-4 -> 2, ...).  In other words a one's-complement
; negative index yields the bit length of the complemented value.
52*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 14 db 15
53*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 13 db 14
54*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 12 db 13
55*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 11 db 12
56*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 10 db 11
57*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  9 db 10
58*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  8 db  9
59*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  7 db  8
60*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  6 db  7
61*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  5 db  6
62*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  4 db  5
63*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  3 db  4
64*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  2 db  3
65*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  1 db  2
66*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  0 db  1
67*dfc6aa5cSAndroid Build Coastguard Workertimes 1       db  0
; Non-negative half: entry v holds the number of significant bits in v
; (0 -> 0, 1 -> 1, 2..3 -> 2, 4..7 -> 3, ..., 16384..32767 -> 15).
68*dfc6aa5cSAndroid Build Coastguard Workerjpeg_nbits_table:
69*dfc6aa5cSAndroid Build Coastguard Workertimes 1       db  0
70*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  0 db  1
71*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  1 db  2
72*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  2 db  3
73*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  3 db  4
74*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  4 db  5
75*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  5 db  6
76*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  6 db  7
77*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  7 db  8
78*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  8 db  9
79*dfc6aa5cSAndroid Build Coastguard Workertimes 1 <<  9 db 10
80*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 10 db 11
81*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 11 db 12
82*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 12 db 13
83*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 13 db 14
84*dfc6aa5cSAndroid Build Coastguard Workertimes 1 << 14 db 15
85*dfc6aa5cSAndroid Build Coastguard Worker
86*dfc6aa5cSAndroid Build Coastguard Worker    alignz      32
87*dfc6aa5cSAndroid Build Coastguard Worker
; NBITS(x): address expression for jpeg_nbits_table + x.  With PIC the table
; address comes from the nbits_base register instead of an absolute symbol
; (NOTE(review): nbits_base is presumably loaded with the address of
; jpeg_nbits_table by the function body -- confirm at the GET_SYM call site).
; The expansion is not parenthesized, so it is only meant to be used inside an
; effective-address expression.
88*dfc6aa5cSAndroid Build Coastguard Worker%ifdef PIC
89*dfc6aa5cSAndroid Build Coastguard Worker%define NBITS(x)      nbits_base + x
90*dfc6aa5cSAndroid Build Coastguard Worker%else
91*dfc6aa5cSAndroid Build Coastguard Worker%define NBITS(x)      jpeg_nbits_table + x
92*dfc6aa5cSAndroid Build Coastguard Worker%endif
; MASK_BITS(x): address expression for jpeg_mask_bits[x] (8-byte entries),
; written relative to the NBITS base so that the same base serves both tables.
93*dfc6aa5cSAndroid Build Coastguard Worker%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
94*dfc6aa5cSAndroid Build Coastguard Worker
95*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
96*dfc6aa5cSAndroid Build Coastguard Worker    SECTION     SEG_TEXT
97*dfc6aa5cSAndroid Build Coastguard Worker    BITS        32
98*dfc6aa5cSAndroid Build Coastguard Worker
; Symbolic names for the MMX registers used by EMIT_QWORD.
; mm_nbits and mm_code_bits intentionally name the same register (mm3), so the
; two values cannot be live at the same time.
99*dfc6aa5cSAndroid Build Coastguard Worker%define mm_put_buffer     mm0
100*dfc6aa5cSAndroid Build Coastguard Worker%define mm_all_0xff       mm1
101*dfc6aa5cSAndroid Build Coastguard Worker%define mm_temp           mm2
102*dfc6aa5cSAndroid Build Coastguard Worker%define mm_nbits          mm3
103*dfc6aa5cSAndroid Build Coastguard Worker%define mm_code_bits      mm3
104*dfc6aa5cSAndroid Build Coastguard Worker%define mm_code           mm4
105*dfc6aa5cSAndroid Build Coastguard Worker%define mm_overflow_bits  mm5
106*dfc6aa5cSAndroid Build Coastguard Worker%define mm_save_nbits     mm6
107*dfc6aa5cSAndroid Build Coastguard Worker
108*dfc6aa5cSAndroid Build Coastguard Worker; Shorthand used to describe SIMD operations:
109*dfc6aa5cSAndroid Build Coastguard Worker; wN:  xmmN treated as eight signed 16-bit values
110*dfc6aa5cSAndroid Build Coastguard Worker; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
111*dfc6aa5cSAndroid Build Coastguard Worker; bN:  xmmN treated as 16 unsigned 8-bit values, or
112*dfc6aa5cSAndroid Build Coastguard Worker;      mmN treated as eight unsigned 8-bit values
113*dfc6aa5cSAndroid Build Coastguard Worker; bN[i]:  perform the same operation on all unsigned 8-bit values,
114*dfc6aa5cSAndroid Build Coastguard Worker;         i=0..15 (SSE register) or i=0..7 (MMX register)
115*dfc6aa5cSAndroid Build Coastguard Worker; Contents of SIMD registers are shown in memory order.
116*dfc6aa5cSAndroid Build Coastguard Worker
117*dfc6aa5cSAndroid Build Coastguard Worker; Fill the bit buffer to capacity with the leading bits from code, then output
118*dfc6aa5cSAndroid Build Coastguard Worker; the bit buffer and put the remaining bits from code into the bit buffer.
119*dfc6aa5cSAndroid Build Coastguard Worker;
120*dfc6aa5cSAndroid Build Coastguard Worker; Usage:
121*dfc6aa5cSAndroid Build Coastguard Worker; code - contains the bits to shift into the bit buffer (LSB-aligned)
122*dfc6aa5cSAndroid Build Coastguard Worker; %1 - temp register
123*dfc6aa5cSAndroid Build Coastguard Worker; %2 - low byte of temp register
124*dfc6aa5cSAndroid Build Coastguard Worker; %3 - second byte of temp register
125*dfc6aa5cSAndroid Build Coastguard Worker; %4-%8 (optional) - extra instructions to execute before the macro completes
126*dfc6aa5cSAndroid Build Coastguard Worker; %9 - the label to which to jump when the macro completes
127*dfc6aa5cSAndroid Build Coastguard Worker;
128*dfc6aa5cSAndroid Build Coastguard Worker; Upon completion, free_bits will be set to the number of remaining bits from
129*dfc6aa5cSAndroid Build Coastguard Worker; code, and put_buffer will contain those remaining bits.  temp and code will
130*dfc6aa5cSAndroid Build Coastguard Worker; be clobbered.
131*dfc6aa5cSAndroid Build Coastguard Worker;
132*dfc6aa5cSAndroid Build Coastguard Worker; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
133*dfc6aa5cSAndroid Build Coastguard Worker; macro in jchuff.c.
134*dfc6aa5cSAndroid Build Coastguard Worker
; Implementation notes for EMIT_QWORD (all derived from the body below):
; - The macro is declared with exactly 9 parameters, so an unused "extra
;   instruction" slot (%4-%8) can be left empty but not omitted.
; - It uses the names nbits, nbitsb, nbitsh, free_bits and buffer, which are
;   not defined in this part of the file and must be defined before the macro
;   is expanded.
; - NOTE(review): the arithmetic presumably expects free_bits to be negative on
;   entry (already decremented by nbits) -- confirm at the call sites.
; - Fast path (no byte of the completed qword is 0xFF): two byte-swapped dword
;   stores, buffer advances by 8.
; - Slow path: eight single-byte stores, each followed by a zero byte; buffer
;   advances by 2 after a 0xFF byte and by 1 otherwise, so up to 16 bytes may
;   be written.
; - %4 and %5 are expanded at different points in the two paths.  In both
;   paths they run while nbits and buffer are still in use, so they must leave
;   those registers intact.
; - Both paths end with "jmp %9"; the macro never falls through.
135*dfc6aa5cSAndroid Build Coastguard Worker%macro EMIT_QWORD 9
136*dfc6aa5cSAndroid Build Coastguard Worker%define %%temp   %1
137*dfc6aa5cSAndroid Build Coastguard Worker%define %%tempb  %2
138*dfc6aa5cSAndroid Build Coastguard Worker%define %%temph  %3
    ; Complete the 64-bit word: shift the pending bits left and OR in the
    ; leading bits of code; the overflow count is -free_bits.
139*dfc6aa5cSAndroid Build Coastguard Worker    add         nbits, free_bits             ; nbits += free_bits;
140*dfc6aa5cSAndroid Build Coastguard Worker    neg         free_bits                    ; free_bits = -free_bits;
141*dfc6aa5cSAndroid Build Coastguard Worker    movq        mm_temp, mm_code             ; temp = code;
142*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits              ; nbits --> MMX register
143*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
144*dfc6aa5cSAndroid Build Coastguard Worker    neg         free_bits                    ; free_bits = -free_bits;
145*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
146*dfc6aa5cSAndroid Build Coastguard Worker    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
147*dfc6aa5cSAndroid Build Coastguard Worker    add         free_bits, 64                ; free_bits += 64;
148*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
    ; When the caller's temp register is nbits_base, park its value in an MMX
    ; register so it can be restored once the temp is no longer needed.
149*dfc6aa5cSAndroid Build Coastguard Worker%ifidn %%temp, nbits_base
150*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_save_nbits, nbits_base    ; save nbits_base
151*dfc6aa5cSAndroid Build Coastguard Worker%endif
152*dfc6aa5cSAndroid Build Coastguard Worker    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
153*dfc6aa5cSAndroid Build Coastguard Worker    movq        mm_put_buffer, mm_code       ; put_buffer = code;
154*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
155*dfc6aa5cSAndroid Build Coastguard Worker    movq        mm_code, mm_code_bits        ; code = code_bits;
156*dfc6aa5cSAndroid Build Coastguard Worker    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
157*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
158*dfc6aa5cSAndroid Build Coastguard Worker    movd        %%temp, mm_code_bits         ; temp = code_bits;
159*dfc6aa5cSAndroid Build Coastguard Worker    bswap       %%temp                       ; temp = htonl(temp);
160*dfc6aa5cSAndroid Build Coastguard Worker    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
161*dfc6aa5cSAndroid Build Coastguard Worker    jnz         %%.SLOW                      ;   goto %%.SLOW
    ; Fast path: upper dword first, then lower dword, both in big-endian order.
162*dfc6aa5cSAndroid Build Coastguard Worker    mov         dword [buffer], %%temp       ; *(uint32_t *)buffer = temp;
163*dfc6aa5cSAndroid Build Coastguard Worker%ifidn %%temp, nbits_base
164*dfc6aa5cSAndroid Build Coastguard Worker    movd        nbits_base, mm_save_nbits    ; restore nbits_base
165*dfc6aa5cSAndroid Build Coastguard Worker%endif
166*dfc6aa5cSAndroid Build Coastguard Worker    %4
167*dfc6aa5cSAndroid Build Coastguard Worker    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
168*dfc6aa5cSAndroid Build Coastguard Worker    %5
169*dfc6aa5cSAndroid Build Coastguard Worker    bswap       nbits                        ; nbits = htonl(nbits);
170*dfc6aa5cSAndroid Build Coastguard Worker    mov         dword [buffer + 4], nbits    ; *(uint32_t *)(buffer + 4) = nbits;
171*dfc6aa5cSAndroid Build Coastguard Worker    lea         buffer, [buffer + 8]         ; buffer += 8;
172*dfc6aa5cSAndroid Build Coastguard Worker    %6
173*dfc6aa5cSAndroid Build Coastguard Worker    %7
174*dfc6aa5cSAndroid Build Coastguard Worker    %8
175*dfc6aa5cSAndroid Build Coastguard Worker    jmp %9                                   ; return
176*dfc6aa5cSAndroid Build Coastguard Worker%%.SLOW:
177*dfc6aa5cSAndroid Build Coastguard Worker    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
178*dfc6aa5cSAndroid Build Coastguard Worker    ; bytes in the qword.
    ; Pattern per byte: store it, compare with 0xFF (CF = 1 when it is below
    ; 0xFF), pre-write a 0 after it, then "sbb buffer, -2" adds 2 - CF, which
    ; keeps the stuffed zero only after a 0xFF byte.
179*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
180*dfc6aa5cSAndroid Build Coastguard Worker    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
181*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
182*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
183*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
184*dfc6aa5cSAndroid Build Coastguard Worker    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
185*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
186*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
187*dfc6aa5cSAndroid Build Coastguard Worker    shr         %%temp, 16                 ; temp >>= 16;
188*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
189*dfc6aa5cSAndroid Build Coastguard Worker    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
190*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
191*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
192*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
193*dfc6aa5cSAndroid Build Coastguard Worker    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
194*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
195*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    ; Upper dword done; repeat for the lower dword using nbits as the scratch
    ; register (the temp register is free again and can be restored).
196*dfc6aa5cSAndroid Build Coastguard Worker    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
197*dfc6aa5cSAndroid Build Coastguard Worker%ifidn %%temp, nbits_base
198*dfc6aa5cSAndroid Build Coastguard Worker    movd        nbits_base, mm_save_nbits  ; restore nbits_base
199*dfc6aa5cSAndroid Build Coastguard Worker%endif
200*dfc6aa5cSAndroid Build Coastguard Worker    bswap       nbits                      ; nbits = htonl(nbits)
201*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
202*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
203*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
204*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
205*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
206*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
207*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
208*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
209*dfc6aa5cSAndroid Build Coastguard Worker    shr         nbits, 16                  ; nbits >>= 16;
210*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
211*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
212*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
213*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
214*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
    ; %4 is expanded before the final compare, so nbitsh and buffer are still
    ; live here.
215*dfc6aa5cSAndroid Build Coastguard Worker    %4
216*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
217*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [buffer+1], 0         ; buffer[1] = 0;
218*dfc6aa5cSAndroid Build Coastguard Worker    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
219*dfc6aa5cSAndroid Build Coastguard Worker    %5
220*dfc6aa5cSAndroid Build Coastguard Worker    %6
221*dfc6aa5cSAndroid Build Coastguard Worker    %7
222*dfc6aa5cSAndroid Build Coastguard Worker    %8
223*dfc6aa5cSAndroid Build Coastguard Worker    jmp %9                                 ; return;
224*dfc6aa5cSAndroid Build Coastguard Worker%endmacro
225*dfc6aa5cSAndroid Build Coastguard Worker
; PUSH reg: push a 32-bit operand and add 4 to the assemble-time counter
; stack_offset.  The arg_* offsets are defined in terms of stack_offset, so
; [esp + arg_*] keeps addressing the same argument after the push.
; Usage: %1 - operand to push
226*dfc6aa5cSAndroid Build Coastguard Worker%macro PUSH 1
227*dfc6aa5cSAndroid Build Coastguard Worker    push        %1
228*dfc6aa5cSAndroid Build Coastguard Worker%assign stack_offset  stack_offset + 4
229*dfc6aa5cSAndroid Build Coastguard Worker%endmacro
230*dfc6aa5cSAndroid Build Coastguard Worker
; POP reg: counterpart of PUSH; pop a 32-bit operand and subtract 4 from the
; assemble-time counter stack_offset.
; Usage: %1 - operand to pop into
231*dfc6aa5cSAndroid Build Coastguard Worker%macro POP 1
232*dfc6aa5cSAndroid Build Coastguard Worker    pop         %1
233*dfc6aa5cSAndroid Build Coastguard Worker%assign stack_offset  stack_offset - 4
234*dfc6aa5cSAndroid Build Coastguard Worker%endmacro
235*dfc6aa5cSAndroid Build Coastguard Worker
236*dfc6aa5cSAndroid Build Coastguard Worker; If PIC is defined, load the address of a symbol defined in this file into a
237*dfc6aa5cSAndroid Build Coastguard Worker; register.  Equivalent to
238*dfc6aa5cSAndroid Build Coastguard Worker;   get_GOT     %1
239*dfc6aa5cSAndroid Build Coastguard Worker;   lea         %1, [GOTOFF(%1, %2)]
240*dfc6aa5cSAndroid Build Coastguard Worker; without using the GOT.
241*dfc6aa5cSAndroid Build Coastguard Worker;
242*dfc6aa5cSAndroid Build Coastguard Worker; Usage:
243*dfc6aa5cSAndroid Build Coastguard Worker; %1 - register into which to load the address of the symbol
244*dfc6aa5cSAndroid Build Coastguard Worker; %2 - symbol whose address should be loaded
245*dfc6aa5cSAndroid Build Coastguard Worker; %3 - optional multi-line macro to execute before the symbol address is loaded
246*dfc6aa5cSAndroid Build Coastguard Worker; %4 - optional multi-line macro to execute after the symbol address is loaded
247*dfc6aa5cSAndroid Build Coastguard Worker;
248*dfc6aa5cSAndroid Build Coastguard Worker; If PIC is not defined, then %3 and %4 are executed in order.
249*dfc6aa5cSAndroid Build Coastguard Worker
; Implementation notes for GET_SYM (derived from the body below):
; - PIC: "call %%.geteip" pushes the address of %%.ref; %%.geteip copies that
;   return address from [esp] into %1 and returns to %%.ref, where the
;   assemble-time constant (%2 - %%.ref) is added to turn it into the address
;   of %2.
; - %3 is expanded with one argument: 4 on the PIC path (the return address is
;   on the stack at that point) and 0 otherwise.  %3 is referenced
;   unconditionally in both branches.
; - On the PIC path %4 is expanded after %1 holds the address of %%.ref but
;   before the offset is added, so %4 must not modify %1.
; - Without PIC, %1 is not written at all.
250*dfc6aa5cSAndroid Build Coastguard Worker%macro GET_SYM 2-4
251*dfc6aa5cSAndroid Build Coastguard Worker%ifdef PIC
252*dfc6aa5cSAndroid Build Coastguard Worker    call        %%.geteip
253*dfc6aa5cSAndroid Build Coastguard Worker%%.ref:
254*dfc6aa5cSAndroid Build Coastguard Worker    %4
255*dfc6aa5cSAndroid Build Coastguard Worker    add         %1, %2 - %%.ref
256*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short %%.done
257*dfc6aa5cSAndroid Build Coastguard Worker    align       32
    ; Out-of-line helper: reached only through the call above.
258*dfc6aa5cSAndroid Build Coastguard Worker%%.geteip:
259*dfc6aa5cSAndroid Build Coastguard Worker    %3          4               ; must adjust stack pointer because of call
260*dfc6aa5cSAndroid Build Coastguard Worker    mov         %1, POINTER [esp]
261*dfc6aa5cSAndroid Build Coastguard Worker    ret
262*dfc6aa5cSAndroid Build Coastguard Worker    align       32
263*dfc6aa5cSAndroid Build Coastguard Worker%%.done:
264*dfc6aa5cSAndroid Build Coastguard Worker%else
265*dfc6aa5cSAndroid Build Coastguard Worker    %3          0
266*dfc6aa5cSAndroid Build Coastguard Worker    %4
267*dfc6aa5cSAndroid Build Coastguard Worker%endif
268*dfc6aa5cSAndroid Build Coastguard Worker%endmacro
269*dfc6aa5cSAndroid Build Coastguard Worker
270*dfc6aa5cSAndroid Build Coastguard Worker;
271*dfc6aa5cSAndroid Build Coastguard Worker; Encode a single block's worth of coefficients.
272*dfc6aa5cSAndroid Build Coastguard Worker;
273*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(JOCTET *)
274*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
275*dfc6aa5cSAndroid Build Coastguard Worker;                                  JCOEFPTR block, int last_dc_val,
276*dfc6aa5cSAndroid Build Coastguard Worker;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
277*dfc6aa5cSAndroid Build Coastguard Worker;
278*dfc6aa5cSAndroid Build Coastguard Worker; Stack layout:
279*dfc6aa5cSAndroid Build Coastguard Worker; Function args
280*dfc6aa5cSAndroid Build Coastguard Worker; Return address
281*dfc6aa5cSAndroid Build Coastguard Worker; Saved ebx
282*dfc6aa5cSAndroid Build Coastguard Worker; Saved ebp
283*dfc6aa5cSAndroid Build Coastguard Worker; Saved esi
284*dfc6aa5cSAndroid Build Coastguard Worker; Saved edi <-- esp_save
285*dfc6aa5cSAndroid Build Coastguard Worker; ...
286*dfc6aa5cSAndroid Build Coastguard Worker; esp_save
287*dfc6aa5cSAndroid Build Coastguard Worker; t_ 64*2 bytes (aligned to 128 bytes)
288*dfc6aa5cSAndroid Build Coastguard Worker;
289*dfc6aa5cSAndroid Build Coastguard Worker; esp is used (as t) to point into t_ (data in lower indices is not used once
290*dfc6aa5cSAndroid Build Coastguard Worker; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
291*dfc6aa5cSAndroid Build Coastguard Worker; us to find the rest of the data again.
292*dfc6aa5cSAndroid Build Coastguard Worker;
293*dfc6aa5cSAndroid Build Coastguard Worker; NOTES:
294*dfc6aa5cSAndroid Build Coastguard Worker; When shuffling data, we try to avoid pinsrw as much as possible, since it is
295*dfc6aa5cSAndroid Build Coastguard Worker; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
296*dfc6aa5cSAndroid Build Coastguard Worker; modern CPUs, so chains of pinsrw instructions (even with different outputs)
297*dfc6aa5cSAndroid Build Coastguard Worker; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
298*dfc6aa5cSAndroid Build Coastguard Worker; requires 2 µops (with memory operand) on Intel.  In either case, only one
299*dfc6aa5cSAndroid Build Coastguard Worker; pinsrw instruction can be decoded per cycle (and nothing else if they are
300*dfc6aa5cSAndroid Build Coastguard Worker; back-to-back), so out-of-order execution cannot be used to work around long
301*dfc6aa5cSAndroid Build Coastguard Worker; pinsrw chains (though for Sandy Bridge and later, this may be less of a
302*dfc6aa5cSAndroid Build Coastguard Worker; problem if the code runs from the µop cache.)
303*dfc6aa5cSAndroid Build Coastguard Worker;
304*dfc6aa5cSAndroid Build Coastguard Worker; We use tzcnt instead of bsf without checking for support.  The instruction is
305*dfc6aa5cSAndroid Build Coastguard Worker; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
306*dfc6aa5cSAndroid Build Coastguard Worker; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
307*dfc6aa5cSAndroid Build Coastguard Worker; an input dependency (although the behavior is not formally defined, Intel
308*dfc6aa5cSAndroid Build Coastguard Worker; CPUs usually leave the destination unmodified if the source is zero.)  This
309*dfc6aa5cSAndroid Build Coastguard Worker; can prevent out-of-order execution, so we clear the destination before
310*dfc6aa5cSAndroid Build Coastguard Worker; invoking tzcnt.
311*dfc6aa5cSAndroid Build Coastguard Worker;
312*dfc6aa5cSAndroid Build Coastguard Worker; Initial register allocation
313*dfc6aa5cSAndroid Build Coastguard Worker; eax - frame --> buffer
314*dfc6aa5cSAndroid Build Coastguard Worker; ebx - nbits_base (PIC) / emit_temp
315*dfc6aa5cSAndroid Build Coastguard Worker; ecx - dctbl --> size --> state
316*dfc6aa5cSAndroid Build Coastguard Worker; edx - block --> nbits
317*dfc6aa5cSAndroid Build Coastguard Worker; esi - code_temp --> state --> actbl
318*dfc6aa5cSAndroid Build Coastguard Worker; edi - index_temp --> free_bits
319*dfc6aa5cSAndroid Build Coastguard Worker; esp - t
320*dfc6aa5cSAndroid Build Coastguard Worker; ebp - index
321*dfc6aa5cSAndroid Build Coastguard Worker
; Symbolic register names for the first phase of the function.
; With PIC, nbits_base and emit_temp both name ebx; EMIT_QWORD handles this by
; saving/restoring the register when its temp argument is nbits_base.
; emit_tempb/emit_temph are the low and second bytes of emit_temp (bl/bh).
322*dfc6aa5cSAndroid Build Coastguard Worker%define frame       eax
323*dfc6aa5cSAndroid Build Coastguard Worker%ifdef PIC
324*dfc6aa5cSAndroid Build Coastguard Worker%define nbits_base  ebx
325*dfc6aa5cSAndroid Build Coastguard Worker%endif
326*dfc6aa5cSAndroid Build Coastguard Worker%define emit_temp   ebx
327*dfc6aa5cSAndroid Build Coastguard Worker%define emit_tempb  bl
328*dfc6aa5cSAndroid Build Coastguard Worker%define emit_temph  bh
329*dfc6aa5cSAndroid Build Coastguard Worker%define dctbl       ecx
330*dfc6aa5cSAndroid Build Coastguard Worker%define block       edx
331*dfc6aa5cSAndroid Build Coastguard Worker%define code_temp   esi
332*dfc6aa5cSAndroid Build Coastguard Worker%define index_temp  edi
333*dfc6aa5cSAndroid Build Coastguard Worker%define t           esp
334*dfc6aa5cSAndroid Build Coastguard Worker%define index       ebp
335*dfc6aa5cSAndroid Build Coastguard Worker
; Byte offset from the aligned scratch area t_ to the slot holding the saved
; frame pointer (the function stores it with "mov [t + save_frame], frame").
; NOTE(review): DCTSIZE2 and SIZEOF_WORD are presumably supplied by
; jsimdext.inc (64 and 2 according to the stack-layout comment) -- confirm.
336*dfc6aa5cSAndroid Build Coastguard Worker%assign save_frame  DCTSIZE2 * SIZEOF_WORD
337*dfc6aa5cSAndroid Build Coastguard Worker
338*dfc6aa5cSAndroid Build Coastguard Worker; Step 1: Re-arrange input data according to jpeg_natural_order
339*dfc6aa5cSAndroid Build Coastguard Worker; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
340*dfc6aa5cSAndroid Build Coastguard Worker; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
341*dfc6aa5cSAndroid Build Coastguard Worker; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
342*dfc6aa5cSAndroid Build Coastguard Worker; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
343*dfc6aa5cSAndroid Build Coastguard Worker; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
344*dfc6aa5cSAndroid Build Coastguard Worker; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
345*dfc6aa5cSAndroid Build Coastguard Worker; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
346*dfc6aa5cSAndroid Build Coastguard Worker; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
347*dfc6aa5cSAndroid Build Coastguard Worker
348*dfc6aa5cSAndroid Build Coastguard Worker    align       32
349*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
350*dfc6aa5cSAndroid Build Coastguard Worker
351*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_huff_encode_one_block_sse2):
352*dfc6aa5cSAndroid Build Coastguard Worker
353*dfc6aa5cSAndroid Build Coastguard Worker%assign stack_offset      0
354*dfc6aa5cSAndroid Build Coastguard Worker%define arg_state         4 + stack_offset
355*dfc6aa5cSAndroid Build Coastguard Worker%define arg_buffer        8 + stack_offset
356*dfc6aa5cSAndroid Build Coastguard Worker%define arg_block        12 + stack_offset
357*dfc6aa5cSAndroid Build Coastguard Worker%define arg_last_dc_val  16 + stack_offset
358*dfc6aa5cSAndroid Build Coastguard Worker%define arg_dctbl        20 + stack_offset
359*dfc6aa5cSAndroid Build Coastguard Worker%define arg_actbl        24 + stack_offset
360*dfc6aa5cSAndroid Build Coastguard Worker
361*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;X: X = code stream
362*dfc6aa5cSAndroid Build Coastguard Worker    mov         block, [esp + arg_block]
363*dfc6aa5cSAndroid Build Coastguard Worker    PUSH        ebx
364*dfc6aa5cSAndroid Build Coastguard Worker    PUSH        ebp
365*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
366*dfc6aa5cSAndroid Build Coastguard Worker    PUSH        esi
367*dfc6aa5cSAndroid Build Coastguard Worker    PUSH        edi
368*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
369*dfc6aa5cSAndroid Build Coastguard Worker    mov         frame, esp
370*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [frame - (save_frame + 4)]
371*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
372*dfc6aa5cSAndroid Build Coastguard Worker    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
373*dfc6aa5cSAndroid Build Coastguard Worker    mov         [t + save_frame], frame
374*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
375*dfc6aa5cSAndroid Build Coastguard Worker    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
376*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
377*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
378*dfc6aa5cSAndroid Build Coastguard Worker    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
379*dfc6aa5cSAndroid Build Coastguard Worker    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
380*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
381*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;A:      (Row 0, offset 1)
382*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
383*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
384*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
385*dfc6aa5cSAndroid Build Coastguard Worker
386*dfc6aa5cSAndroid Build Coastguard Worker    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
387*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
388*dfc6aa5cSAndroid Build Coastguard Worker    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
389*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
390*dfc6aa5cSAndroid Build Coastguard Worker    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
391*dfc6aa5cSAndroid Build Coastguard Worker    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
392*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
393*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
394*dfc6aa5cSAndroid Build Coastguard Worker    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
395*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
396*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
397*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 1, offset 1)
398*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
399*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
400*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
401*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
402*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
403*dfc6aa5cSAndroid Build Coastguard Worker
404*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
405*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
406*dfc6aa5cSAndroid Build Coastguard Worker
407*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
408*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
409*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
410*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
411*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 3, offset 1)
412*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
413*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
414*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
415*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
416*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
417*dfc6aa5cSAndroid Build Coastguard Worker
418*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
419*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
420*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
421*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
422*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
423*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
424*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 2, offset 1)
425*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
426*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
427*dfc6aa5cSAndroid Build Coastguard Worker    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
428*dfc6aa5cSAndroid Build Coastguard Worker
429*dfc6aa5cSAndroid Build Coastguard Worker; %1 - stack pointer adjustment
; GET_SYM_BEFORE: instruction sequence that is handed by name to GET_SYM (see
; the GET_SYM invocation that follows these macro definitions).
;
; Parameter:
;   %1 - byte offset added to the displacement of the t-relative store of
;        row C below (the "stack pointer adjustment" mentioned above).
;        NOTE(review): presumably this compensates for the stack pointer
;        having been moved at the point where GET_SYM expands this body --
;        confirm against the GET_SYM definition.
;
; What the body does:
;   * stores the finished row C (xmm2) to t[16..23], then turns xmm2 into a
;     per-word "== 0" mask;
;   * subtracts last_dc_val (read from the stack frame) from code_temp;
;   * packs the C/D zero masks to bytes, extracts their sign bits into
;     index_temp, extracts the sign bits of xmm0 into index, and merges the two
;     as index = index | (index_temp << 16);
;   * starts assembling row H in xmm3 from block[48..63].
;
; State left behind: xmm0 = block[56..63], xmm2 = 0, xmm3 = partially built
; row H (one lane is still filled in by GET_SYM_AFTER), index = 32 zero-flags.
; NOTE(review): index bits 0..15 are only meaningful if xmm0 still holds the
; packed A/B zero masks produced before the GET_SYM invocation, i.e. if
; GET_SYM does not modify xmm0 before expanding this body -- confirm.
;
; Preprocessor side effect: the %undef/%define directives at the end belong
; to the macro body, so the name index_temp stops being defined and free_bits
; becomes an alias for edi from the point of expansion onwards.
; NOTE(review): presumably index_temp's register is the one being reused --
; confirm against the earlier %define of index_temp.
430*dfc6aa5cSAndroid Build Coastguard Worker%macro GET_SYM_BEFORE 1
431*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
432*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;C: t[i+16] = w2[i];
433*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
434*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
435*dfc6aa5cSAndroid Build Coastguard Worker    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
436*dfc6aa5cSAndroid Build Coastguard Worker
    ; The masks are 0 or -1 per word, so signed saturation keeps them 0 or -1
    ; per byte and pmovmskb below yields exactly one flag bit per word.
437*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
438*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
439*dfc6aa5cSAndroid Build Coastguard Worker
    ; xmm3 is free again once the D mask has been packed into xmm2; it is
    ; reused for row H.  The integer index work is interleaved with it.
440*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
441*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
442*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
443*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
444*dfc6aa5cSAndroid Build Coastguard Worker    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
445*dfc6aa5cSAndroid Build Coastguard Worker    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
446*dfc6aa5cSAndroid Build Coastguard Worker    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
447*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
448*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
449*dfc6aa5cSAndroid Build Coastguard Worker    or          index, index_temp                         ;Z:     index |= index_temp;
    ; index_temp is dead from here on; edi gets the name free_bits.
450*dfc6aa5cSAndroid Build Coastguard Worker%undef index_temp
451*dfc6aa5cSAndroid Build Coastguard Worker%define free_bits  edi
452*dfc6aa5cSAndroid Build Coastguard Worker%endmacro
453*dfc6aa5cSAndroid Build Coastguard Worker
; GET_SYM_AFTER: second instruction sequence handed by name to GET_SYM in the
; invocation that follows; it takes no parameters.  Unlike GET_SYM_BEFORE, its
; t-relative store carries no extra displacement.
;
; What the body does:
;   * inverts index.  GET_SYM_BEFORE assembled index from sign bits of
;     "== 0" compare masks, so after the inversion a set bit marks a nonzero
;     word;
;   * completes row H in xmm3 by inserting block[47], adds the sign mask
;     (-1 for negative words, i.e. negative words are decremented by one),
;     stores the result to t[56..63] and turns xmm3 into a "== 0" mask;
;   * loads dctbl from the stack frame (arg_dctbl);
;   * starts rows E (xmm5), F (xmm1) and G (xmm4) from block[36..39],
;     block[44..47], block[48..51] and block[56..59];
;   * sets mm_all_0xff to all ones.
;
; Entry expectations: xmm2 == 0, xmm0 = block[56..63], xmm3 = partial row H
; and xmm5 = block[48..55], which is the state GET_SYM_BEFORE leaves behind.
; NOTE(review): this assumes GET_SYM itself does not modify those registers
; (or index) between the two macro bodies -- confirm against its definition.
454*dfc6aa5cSAndroid Build Coastguard Worker%macro GET_SYM_AFTER 0
455*dfc6aa5cSAndroid Build Coastguard Worker    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
456*dfc6aa5cSAndroid Build Coastguard Worker    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
457*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
458*dfc6aa5cSAndroid Build Coastguard Worker    not         index                                     ;Z:     index = ~index;
459*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
460*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 7, offset 1)
    ; With xmm2 == 0 on entry this computes (0 > w3[i]), i.e. a sign mask.
461*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
462*dfc6aa5cSAndroid Build Coastguard Worker    mov         dctbl, [frame + arg_dctbl]
463*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
464*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
465*dfc6aa5cSAndroid Build Coastguard Worker    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
466*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
467*dfc6aa5cSAndroid Build Coastguard Worker    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
468*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
    ; Comparing a register with itself for equality sets every bit.
469*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
470*dfc6aa5cSAndroid Build Coastguard Worker%endmacro
471*dfc6aa5cSAndroid Build Coastguard Worker
472*dfc6aa5cSAndroid Build Coastguard Worker    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
473*dfc6aa5cSAndroid Build Coastguard Worker
474*dfc6aa5cSAndroid Build Coastguard Worker    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
475*dfc6aa5cSAndroid Build Coastguard Worker    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
476*dfc6aa5cSAndroid Build Coastguard Worker    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
477*dfc6aa5cSAndroid Build Coastguard Worker    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
478*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
479*dfc6aa5cSAndroid Build Coastguard Worker    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
480*dfc6aa5cSAndroid Build Coastguard Worker    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
481*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     i.e. if code_temp is positive
482*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
483*dfc6aa5cSAndroid Build Coastguard Worker    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
484*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
485*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
486*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
487*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 6, offset 1)
488*dfc6aa5cSAndroid Build Coastguard Worker    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
489*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
490*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
491*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
492*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
493*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
494*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
495*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
496*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 5, offset 1)
497*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
498*dfc6aa5cSAndroid Build Coastguard Worker
499*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
500*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
501*dfc6aa5cSAndroid Build Coastguard Worker
502*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
503*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
504*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
505*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
506*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
507*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
508*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
509*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
510*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
511*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
512*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 4, offset 1)
513*dfc6aa5cSAndroid Build Coastguard Worker%undef block
514*dfc6aa5cSAndroid Build Coastguard Worker%define nbits  edx
515*dfc6aa5cSAndroid Build Coastguard Worker%define nbitsb  dl
516*dfc6aa5cSAndroid Build Coastguard Worker%define nbitsh  dh
517*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
518*dfc6aa5cSAndroid Build Coastguard Worker%undef code_temp
519*dfc6aa5cSAndroid Build Coastguard Worker%define state  esi
520*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
521*dfc6aa5cSAndroid Build Coastguard Worker    mov         state, [frame + arg_state]
522*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
523*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
524*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
525*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     code = dctbl->ehufco[nbits];
526*dfc6aa5cSAndroid Build Coastguard Worker%define size  ecx
527*dfc6aa5cSAndroid Build Coastguard Worker%define sizeb  cl
528*dfc6aa5cSAndroid Build Coastguard Worker%define sizeh  ch
529*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
530*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
531*dfc6aa5cSAndroid Build Coastguard Worker    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
532*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     size = dctbl->ehufsi[nbits];
533*dfc6aa5cSAndroid Build Coastguard Worker%undef dctbl
534*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
535*dfc6aa5cSAndroid Build Coastguard Worker
536*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
537*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
538*dfc6aa5cSAndroid Build Coastguard Worker
539*dfc6aa5cSAndroid Build Coastguard Worker    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
540*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
541*dfc6aa5cSAndroid Build Coastguard Worker    mov         free_bits, [state + working_state.cur.free_bits]
542*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     free_bits = state->cur.free_bits;
543*dfc6aa5cSAndroid Build Coastguard Worker%undef state
544*dfc6aa5cSAndroid Build Coastguard Worker%define actbl  esi
545*dfc6aa5cSAndroid Build Coastguard Worker    mov         actbl, [frame + arg_actbl]
546*dfc6aa5cSAndroid Build Coastguard Worker%define buffer  eax
547*dfc6aa5cSAndroid Build Coastguard Worker    mov         buffer, [frame + arg_buffer]
548*dfc6aa5cSAndroid Build Coastguard Worker%undef frame
549*dfc6aa5cSAndroid Build Coastguard Worker    jmp        .BEGIN
550*dfc6aa5cSAndroid Build Coastguard Worker
551*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
552*dfc6aa5cSAndroid Build Coastguard Worker
553*dfc6aa5cSAndroid Build Coastguard Worker    align       16
554*dfc6aa5cSAndroid Build Coastguard Worker; size <= 32, so this is not really a loop
555*dfc6aa5cSAndroid Build Coastguard Worker.BRLOOP1:                                                 ; .BRLOOP1:
556*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
557*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; nbits = actbl->ehufsi[0xf0];
558*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
559*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; code = actbl->ehufco[0xf0];
560*dfc6aa5cSAndroid Build Coastguard Worker    and         index, 0x7ffffff                          ; clear index if size == 32
561*dfc6aa5cSAndroid Build Coastguard Worker    sub         size, 16                                  ; size -= 16;
562*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
563*dfc6aa5cSAndroid Build Coastguard Worker    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
564*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ; nbits --> MMX register
565*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
566*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
567*dfc6aa5cSAndroid Build Coastguard Worker    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
568*dfc6aa5cSAndroid Build Coastguard Worker
569*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
570*dfc6aa5cSAndroid Build Coastguard Worker
571*dfc6aa5cSAndroid Build Coastguard Worker    align       16
572*dfc6aa5cSAndroid Build Coastguard Worker%ifdef PIC
573*dfc6aa5cSAndroid Build Coastguard Worker    times 6     nop
574*dfc6aa5cSAndroid Build Coastguard Worker%else
575*dfc6aa5cSAndroid Build Coastguard Worker    times 2     nop
576*dfc6aa5cSAndroid Build Coastguard Worker%endif
577*dfc6aa5cSAndroid Build Coastguard Worker.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
578*dfc6aa5cSAndroid Build Coastguard Worker; if size == 32, index remains unchanged.  Correct in .BRLOOP.
579*dfc6aa5cSAndroid Build Coastguard Worker    shr         index, sizeb                              ;   index >>= size;
580*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
581*dfc6aa5cSAndroid Build Coastguard Worker    cmp         size, 16                                  ;   if (size > 16)
582*dfc6aa5cSAndroid Build Coastguard Worker    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
583*dfc6aa5cSAndroid Build Coastguard Worker.ERLOOP1:                                                 ; .ERLOOP1:
584*dfc6aa5cSAndroid Build Coastguard Worker    movsx       nbits, word [t]                           ;   nbits = *t;
585*dfc6aa5cSAndroid Build Coastguard Worker%ifdef PIC
586*dfc6aa5cSAndroid Build Coastguard Worker    add         size, size                                ;   size += size;
587*dfc6aa5cSAndroid Build Coastguard Worker%else
588*dfc6aa5cSAndroid Build Coastguard Worker    lea         size, [size * 2]                          ;   size += size;
589*dfc6aa5cSAndroid Build Coastguard Worker%endif
590*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_temp, nbits                            ;   temp = nbits;
591*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
592*dfc6aa5cSAndroid Build Coastguard Worker    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
593*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;   nbits --> MMX register
594*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
595*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   code = actbl->ehufco[size-16];
596*dfc6aa5cSAndroid Build Coastguard Worker    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
597*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   size = actbl->ehufsi[size-16];
598*dfc6aa5cSAndroid Build Coastguard Worker.BEGIN:                                                   ; .BEGIN:
599*dfc6aa5cSAndroid Build Coastguard Worker    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
600*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
601*dfc6aa5cSAndroid Build Coastguard Worker    add         nbits, size                               ;   nbits += size;
602*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_code, mm_temp                          ;   code |= temp;
603*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
604*dfc6aa5cSAndroid Build Coastguard Worker    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
605*dfc6aa5cSAndroid Build Coastguard Worker    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
606*dfc6aa5cSAndroid Build Coastguard Worker    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
607*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;   nbits --> MMX register
608*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
609*dfc6aa5cSAndroid Build Coastguard Worker    inc         size                                      ;   ++size;
610*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
611*dfc6aa5cSAndroid Build Coastguard Worker    test        index, index
612*dfc6aa5cSAndroid Build Coastguard Worker    jnz         .BLOOP1                                   ; } while (index != 0);
613*dfc6aa5cSAndroid Build Coastguard Worker; Round 2
614*dfc6aa5cSAndroid Build Coastguard Worker; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
615*dfc6aa5cSAndroid Build Coastguard Worker.ELOOP1:                                                  ; .ELOOP1:
616*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
617*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
618*dfc6aa5cSAndroid Build Coastguard Worker    shl         size, 16                                  ; size <<= 16;
619*dfc6aa5cSAndroid Build Coastguard Worker    or          index, size                               ; index |= size;
620*dfc6aa5cSAndroid Build Coastguard Worker    not         index                                     ; index = ~index;
621*dfc6aa5cSAndroid Build Coastguard Worker    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
622*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; nbits = t + 1 + 64;
623*dfc6aa5cSAndroid Build Coastguard Worker    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
624*dfc6aa5cSAndroid Build Coastguard Worker    sub         nbits, t                                  ; nbits -= t;
625*dfc6aa5cSAndroid Build Coastguard Worker    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
626*dfc6aa5cSAndroid Build Coastguard Worker    tzcnt       size, index                               ; size = # of trailing 0 bits in index
627*dfc6aa5cSAndroid Build Coastguard Worker    inc         size                                      ; ++size;
628*dfc6aa5cSAndroid Build Coastguard Worker    test        index, index                              ; if (index == 0)
629*dfc6aa5cSAndroid Build Coastguard Worker    jz          .ELOOP2                                   ;   goto .ELOOP2;
630*dfc6aa5cSAndroid Build Coastguard Worker; NOTE: size == 32 cannot happen, since the last element is always 0.
631*dfc6aa5cSAndroid Build Coastguard Worker    shr         index, sizeb                              ; index >>= size;
632*dfc6aa5cSAndroid Build Coastguard Worker    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
633*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
634*dfc6aa5cSAndroid Build Coastguard Worker    cmp         size, 16                                  ; if (size <= 16)
635*dfc6aa5cSAndroid Build Coastguard Worker    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
636*dfc6aa5cSAndroid Build Coastguard Worker.BRLOOP2:                                                 ; do {
637*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
638*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   nbits = actbl->ehufsi[0xf0];
639*dfc6aa5cSAndroid Build Coastguard Worker    sub         size, 16                                  ;   size -= 16;
640*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
641*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   code = actbl->ehufco[0xf0];
642*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
643*dfc6aa5cSAndroid Build Coastguard Worker    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
644*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
645*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
646*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
647*dfc6aa5cSAndroid Build Coastguard Worker    cmp         size, 16                                  ;     if (size <= 16)
648*dfc6aa5cSAndroid Build Coastguard Worker    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
649*dfc6aa5cSAndroid Build Coastguard Worker    jmp        .BRLOOP2                                   ; } while (1);
650*dfc6aa5cSAndroid Build Coastguard Worker
651*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
652*dfc6aa5cSAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; .BLOOP2 / .ERLOOP2: encode loop driven by the bits of `index` (each set bit
; presumably marks one nonzero coefficient - TODO confirm against the code
; that builds `index`, which is outside this chunk).
;
; On the back edge into .BLOOP2, `size` is tzcnt(index) + 1 (computed at the
; bottom of this loop, or by the instruction slots passed to EMIT_QWORD at
; .EMIT_ERLOOP2), so shifting `index` right by `size` and advancing `t` by
; `size` words consumes the run of zero bits together with the set bit that
; ended it.  .ERLOOP2 is also a jump target of the .BRLOOP2 code, which
; handles the size > 16 case.
;
; NOTE(review): the register aliases (index, size, sizeb, nbits, t, free_bits,
; actbl, mm_*) and the NBITS() / MASK_BITS() address helpers are defined
; outside this chunk.
; ---------------------------------------------------------------------------
653*dfc6aa5cSAndroid Build Coastguard Worker    align      16
654*dfc6aa5cSAndroid Build Coastguard Worker.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
655*dfc6aa5cSAndroid Build Coastguard Worker    shr         index, sizeb                              ;   index >>= size;
656*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
657*dfc6aa5cSAndroid Build Coastguard Worker    cmp         size, 16                                  ;   if (size > 16)
658*dfc6aa5cSAndroid Build Coastguard Worker    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
659*dfc6aa5cSAndroid Build Coastguard Worker.ERLOOP2:                                                 ; .ERLOOP2:
660*dfc6aa5cSAndroid Build Coastguard Worker    movsx       nbits, word [t]                           ;   nbits = *t;
                                                          ;   The next "size += size" and "size * 8 + nbits" build
                                                          ;   size * 16 + nbits; the "- 16" in the two table lookups
                                                          ;   makes the effective index (size - 1) * 16 + nbits.
                                                          ;   With size = tzcnt(index) + 1 on the .BLOOP2 path, that
                                                          ;   is (length of the zero run) * 16 + nbits.
661*dfc6aa5cSAndroid Build Coastguard Worker    add         size, size                                ;   size += size;
662*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_temp, nbits                            ;   temp = nbits;
663*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
664*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;   nbits --> MMX register
665*dfc6aa5cSAndroid Build Coastguard Worker    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
666*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
667*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   code = actbl->ehufco[size-16];
668*dfc6aa5cSAndroid Build Coastguard Worker    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
669*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   size = actbl->ehufsi[size-16];
670*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
671*dfc6aa5cSAndroid Build Coastguard Worker    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
672*dfc6aa5cSAndroid Build Coastguard Worker    lea         nbits, [nbits + size]                     ;   nbits += size;
673*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_code, mm_temp                          ;   code |= temp;
674*dfc6aa5cSAndroid Build Coastguard Worker    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
675*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
676*dfc6aa5cSAndroid Build Coastguard Worker    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
                                                          ;   Fast path: the whole code still fits, so just
                                                          ;   shift it into put_buffer.
677*dfc6aa5cSAndroid Build Coastguard Worker    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
678*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;   nbits --> MMX register
679*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
680*dfc6aa5cSAndroid Build Coastguard Worker    inc         size                                      ;   ++size;
681*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
                                                          ;   The flags at this point come from "inc size", not
                                                          ;   from index, hence the explicit test below.
682*dfc6aa5cSAndroid Build Coastguard Worker    test        index, index
683*dfc6aa5cSAndroid Build Coastguard Worker    jnz         .BLOOP2                                   ; } while (index != 0);
; ---------------------------------------------------------------------------
; .ELOOP2: reached when `index` has no set bits left (fall-through from the
; loop above) and named as the final argument of EMIT_QWORD at .EMIT_ERLOOP2.
;
; Takes the byte offset of `t` within its DCTSIZE2 * SIZEOF_WORD-aligned block
; and resets `t` to the start of that block.  Unless the offset equals
; (DCTSIZE2 - 2) words, the code for symbol 0 (actbl->ehufco[0] with length
; actbl->ehufsi[0]) is appended to put_buffer.
; NOTE(review): symbol 0 is presumably the end-of-block code, as in jchuff.c -
; confirm there.
; ---------------------------------------------------------------------------
684*dfc6aa5cSAndroid Build Coastguard Worker.ELOOP2:                                                  ; .ELOOP2:
685*dfc6aa5cSAndroid Build Coastguard Worker    mov         nbits, t                                  ; nbits = t;
                                                          ; One word is added before masking.
                                                          ; NOTE(review): presumably t can still sit one word below
                                                          ; the aligned block here, and the +1 makes the mask below
                                                          ; land on the right base - confirm against the loop setup.
686*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
687*dfc6aa5cSAndroid Build Coastguard Worker    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
688*dfc6aa5cSAndroid Build Coastguard Worker    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
689*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
690*dfc6aa5cSAndroid Build Coastguard Worker    je          .EFN                                      ; {
691*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
692*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   code = actbl->ehufco[0];
693*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
694*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   nbits = actbl->ehufsi[0];
695*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
696*dfc6aa5cSAndroid Build Coastguard Worker    jg          .EFN_SKIP_EMIT_CODE                       ;   {
                                                          ;     No instruction slots are filled in; the macro is
                                                          ;     given `size` and its byte aliases (presumably as
                                                          ;     scratch - `size` is not read again on this path)
                                                          ;     and .EFN as its final argument.
697*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
698*dfc6aa5cSAndroid Build Coastguard Worker    align       16
699*dfc6aa5cSAndroid Build Coastguard Worker.EFN_SKIP_EMIT_CODE:                                      ;   } else {
700*dfc6aa5cSAndroid Build Coastguard Worker    movd        mm_nbits, nbits                           ;     nbits --> MMX register
701*dfc6aa5cSAndroid Build Coastguard Worker    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
702*dfc6aa5cSAndroid Build Coastguard Worker    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
; ---------------------------------------------------------------------------
; .EFN: function exit.  Reloads esp from the slot at [t + save_frame], writes
; put_buffer and free_bits back into the working_state structure, leaves MMX
; state, restores four registers and returns.
; On the paths visible in this chunk, `t` has just been masked down to the
; start of its aligned block ("and t, -DCTSIZE2 * SIZEOF_WORD" at .ELOOP2).
; NOTE(review): save_frame, arg_state and the POP macro are defined outside
; this chunk; the slot at save_frame presumably holds the stack pointer stored
; by the prologue - confirm there.
; ---------------------------------------------------------------------------
703*dfc6aa5cSAndroid Build Coastguard Worker.EFN:                                                     ; } }
704*dfc6aa5cSAndroid Build Coastguard Worker%define frame  esp
                                                          ; esp = value saved at t + save_frame
705*dfc6aa5cSAndroid Build Coastguard Worker    mov         frame, [t + save_frame]
706*dfc6aa5cSAndroid Build Coastguard Worker%define state  ecx
                                                          ; state = [esp + arg_state] (presumably the caller's
                                                          ; `state` argument - TODO confirm)
707*dfc6aa5cSAndroid Build Coastguard Worker    mov         state, [frame + arg_state]
708*dfc6aa5cSAndroid Build Coastguard Worker    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
709*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; state->cur.put_buffer.simd = put_buffer;
                                                          ; The MMX registers are no longer needed; emms marks
                                                          ; the x87 register stack empty again for later FPU use.
710*dfc6aa5cSAndroid Build Coastguard Worker    emms
711*dfc6aa5cSAndroid Build Coastguard Worker    mov         [state + working_state.cur.free_bits], free_bits
712*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; state->cur.free_bits = free_bits;
                                                          ; Restore edi, esi, ebp, ebx (presumably the reverse of
                                                          ; the prologue's pushes) and return.
713*dfc6aa5cSAndroid Build Coastguard Worker    POP         edi
714*dfc6aa5cSAndroid Build Coastguard Worker    POP         esi
715*dfc6aa5cSAndroid Build Coastguard Worker    POP         ebp
716*dfc6aa5cSAndroid Build Coastguard Worker    POP         ebx
717*dfc6aa5cSAndroid Build Coastguard Worker    ret
718*dfc6aa5cSAndroid Build Coastguard Worker
719*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
720*dfc6aa5cSAndroid Build Coastguard Worker
; .EMIT_BRLOOP1: out-of-line EMIT_QWORD invocation placed after the function's
; ret.  It passes emit_temp and its byte aliases as the first three arguments,
; leaves all five instruction slots empty, and names .ERLOOP1 as the final
; argument.
; NOTE(review): EMIT_QWORD, the jump that reaches this label and .ERLOOP1 are
; all outside this chunk; presumably the macro flushes put_buffer and then
; continues at its final argument - confirm against the macro definition.
721*dfc6aa5cSAndroid Build Coastguard Worker    align       16
722*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_BRLOOP1:
723*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
724*dfc6aa5cSAndroid Build Coastguard Worker      .ERLOOP1
725*dfc6aa5cSAndroid Build Coastguard Worker
726*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
727*dfc6aa5cSAndroid Build Coastguard Worker
; .EMIT_ERLOOP1: out-of-line EMIT_QWORD invocation placed after the function's
; ret.  The five instruction slots carry the sequence
;   xor size, size / tzcnt size, index / inc size / test index, index /
;   jnz .BLOOP1
; i.e. recompute size = tzcnt(index) + 1 and loop while index != 0, with
; .ELOOP1 as the final argument.  `size` and its byte aliases are passed as
; the first three arguments; `size` is rewritten by the first slot.
; NOTE(review): EMIT_QWORD, .BLOOP1, .ELOOP1 and the jump that reaches this
; label are outside this chunk; how the macro orders the slot instructions
; relative to its own code must be confirmed against its definition.
728*dfc6aa5cSAndroid Build Coastguard Worker    align       16
729*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_ERLOOP1:
730*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  size, sizeb, sizeh, \
731*dfc6aa5cSAndroid Build Coastguard Worker      { xor     size, size }, \
732*dfc6aa5cSAndroid Build Coastguard Worker      { tzcnt   size, index }, \
733*dfc6aa5cSAndroid Build Coastguard Worker      { inc     size }, \
734*dfc6aa5cSAndroid Build Coastguard Worker      { test    index, index }, \
735*dfc6aa5cSAndroid Build Coastguard Worker      { jnz     .BLOOP1 }, \
736*dfc6aa5cSAndroid Build Coastguard Worker      .ELOOP1
737*dfc6aa5cSAndroid Build Coastguard Worker
738*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739*dfc6aa5cSAndroid Build Coastguard Worker
; .EMIT_BRLOOP2: out-of-line EMIT_QWORD invocation placed after the function's
; ret.  The last two instruction slots carry "cmp size, 16" / "jle .ERLOOP2"
; and the final argument is .BRLOOP2, matching the "jle .ERLOOP2" /
; "jmp .BRLOOP2" pair that ends the .BRLOOP2 code.
; emit_temp (not `size`) is passed in the first three arguments, presumably
; because `size` is still live here - it is read by the "cmp size, 16" slot.
; NOTE(review): EMIT_QWORD and the jump that reaches this label are outside
; this chunk; confirm the macro's use of its arguments against its definition.
740*dfc6aa5cSAndroid Build Coastguard Worker    align       16
741*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_BRLOOP2:
742*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
743*dfc6aa5cSAndroid Build Coastguard Worker      { cmp     size, 16 }, \
744*dfc6aa5cSAndroid Build Coastguard Worker      { jle     .ERLOOP2 }, \
745*dfc6aa5cSAndroid Build Coastguard Worker      .BRLOOP2
746*dfc6aa5cSAndroid Build Coastguard Worker
747*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
748*dfc6aa5cSAndroid Build Coastguard Worker
; .EMIT_ERLOOP2: slow path taken from the .ERLOOP2 code via "jle .EMIT_ERLOOP2"
; when (free_bits -= nbits) <= 0.
; The five instruction slots repeat what the fast path does after that branch
; apart from the put_buffer update: xor size, size / tzcnt size, index /
; inc size / test index, index / jnz .BLOOP2.  The final argument, .ELOOP2, is
; the label the fast path falls through to when index == 0.
; `size` and its byte aliases are passed as the first three arguments; `size`
; is rewritten by the first slot.
; NOTE(review): EMIT_QWORD is defined outside this chunk; presumably it
; inserts the code, flushes put_buffer, and interleaves the slot instructions
; with its own - confirm against the macro definition.
749*dfc6aa5cSAndroid Build Coastguard Worker    align       16
750*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_ERLOOP2:
751*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  size, sizeb, sizeh, \
752*dfc6aa5cSAndroid Build Coastguard Worker      { xor     size, size }, \
753*dfc6aa5cSAndroid Build Coastguard Worker      { tzcnt   size, index }, \
754*dfc6aa5cSAndroid Build Coastguard Worker      { inc     size }, \
755*dfc6aa5cSAndroid Build Coastguard Worker      { test    index, index }, \
756*dfc6aa5cSAndroid Build Coastguard Worker      { jnz     .BLOOP2 }, \
757*dfc6aa5cSAndroid Build Coastguard Worker      .ELOOP2
758*dfc6aa5cSAndroid Build Coastguard Worker
759*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the
760*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this.
761*dfc6aa5cSAndroid Build Coastguard Worker    align       32
762