; xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jchuff-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.
20*dfc6aa5cSAndroid Build Coastguard Worker
21*dfc6aa5cSAndroid Build Coastguard Worker%include "jsimdext.inc"
22*dfc6aa5cSAndroid Build Coastguard Worker
; Field layout of the encoder working state that is passed to
; jsimd_huff_encode_one_block_sse2() as its `state` argument.  The offsets
; generated here have to match the C-side structure (presumably the
; working_state type in jchuff.c -- verify whenever either side changes).
; NOTE(review): resp is not a NASM built-in; it is presumably a pointer-sized
; reservation macro supplied by jsimdext.inc.
struc working_state
.next_output_byte:   resp 1     ; => next byte to write in buffer
.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
.cur.free_bits       resd 1     ; # of bits available in it
.cur.last_dc_val     resd 4     ; last DC coef for each component
.cinfo:              resp 1     ; dump_buffer needs access to this
endstruc
31*dfc6aa5cSAndroid Build Coastguard Worker
; Field layout of a derived Huffman table (the type of the dctbl and actbl
; arguments of jsimd_huff_encode_one_block_sse2()): 256 dword codes followed
; by 256 byte-sized code lengths, both indexed by symbol.
struc c_derived_tbl
.ehufco:             resd 256   ; code for each symbol
.ehufsi:             resb 256   ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc
37*dfc6aa5cSAndroid Build Coastguard Worker
; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15, stored as dwords: a mask
; that selects the n least significant bits of a value.
jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
               dd 0x000f, 0x001f, 0x003f, 0x007f
               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff

    alignz      32

; Bit-length lookup table that can be indexed with a negative as well as a
; non-negative offset relative to the jpeg_nbits_table label:
;   jpeg_nbits_table[i], i =      0..65535:  number of bits needed for i
;   jpeg_nbits_table[i], i = -32768..-1:     number of bits needed for -i - 1
; The 32768 bytes emitted before the label are the negative half (stored in
; mirrored order so that offset -1 is adjacent to the label); the 65536 bytes
; emitted after the label are the non-negative half.
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 <<  9 db 10
times 1 <<  8 db  9
times 1 <<  7 db  8
times 1 <<  6 db  7
times 1 <<  5 db  6
times 1 <<  4 db  5
times 1 <<  3 db  4
times 1 <<  2 db  3
times 1 <<  1 db  2
times 1 <<  0 db  1
times 1       db  0
jpeg_nbits_table:
times 1       db  0
times 1 <<  0 db  1
times 1 <<  1 db  2
times 1 <<  2 db  3
times 1 <<  3 db  4
times 1 <<  4 db  5
times 1 <<  5 db  6
times 1 <<  6 db  7
times 1 <<  7 db  8
times 1 <<  8 db  9
times 1 <<  9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15
times 1 << 15 db 16

    alignz      32

; Address-expression helpers.  Both expand to a bare "a + b" expression and
; are therefore meant to be used inside a memory operand ([...]).
; NBITS(x):     address of jpeg_nbits_table[x]; requires the nbits_base
;               register to hold the address of jpeg_nbits_table.
; MASK_BITS(x): address of the dword jpeg_mask_bits[x], reached through the
;               same nbits_base register by adding the (assembly-time
;               constant) distance between the two tables.
%define NBITS(x)      nbits_base + x
%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
92*dfc6aa5cSAndroid Build Coastguard Worker
; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; Shorthand used to describe SIMD operations:
; wN:  xmmN treated as eight signed 16-bit values
; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
; bN:  xmmN treated as 16 unsigned 8-bit values
; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
; Contents of SIMD registers are shown in memory order.

; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - the label to which to jump when the macro completes
; %2 (optional) - extra instructions to execute after nbits has been set
;
; Preconditions, as implied by the instruction sequence:
; xmm1 - compared byte-wise against the output qword in order to locate 0xFF
;        bytes, so it is expected to contain 0xFF in every byte.
; NOTE(review): the sequence appears to assume that nbits has already been
; subtracted from free_bits (leaving free_bits negative), so that
; nbits + free_bits is the number of bits that still fit into put_buffer and
; -free_bits is the number of bits of code left over -- confirm at the call
; sites.
;
; Upon completion, free_bits will be set to the number of remaining bits from
; code, and put_buffer will contain those remaining bits.  temp and code will
; be clobbered, as will nbits and xmm0.
;
; buffer is advanced by 8 bytes, plus 1 byte for every 0xFF byte that had to
; be stuffed.  Up to 16 bytes starting at the incoming buffer pointer may be
; written.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.

%macro EMIT_QWORD 1-2
    add         nbitsb, free_bitsb      ; nbits += free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    mov         tempd, code             ; temp = code;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbitsb, free_bitsb      ; nbits = free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    shr         tempd, nbitsb           ; temp >>= nbits;
    or          tempq, put_buffer       ; temp |= put_buffer;
    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
    bswap       tempq                   ; temp = 64-bit byte swap of temp, so
                                        ; that the most significant byte is
                                        ; stored first;
    mov         put_buffer, codeq       ; put_buffer = code;
    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
    %2
    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
                                        ; (speculative; will be overwritten if
                                        ; code contains any 0xFF bytes)
    add         free_bitsb, 64          ; free_bits += 64;
    add         bufferp, 8              ; buffer += 8;
    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
    jz          %1                      ;   return;
    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    ; bytes in the qword.  Each step stores one byte, unconditionally writes a
    ; zero behind it, and uses the carry from cmp/sbb to advance buffer by 1
    ; (byte < 0xFF) or by 2 (byte == 0xFF, keeping the stuffed zero).
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempd, 16               ; temp >>= 16;  (only 32 bits remain)
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    jmp         %1                      ; return;
%endmacro
179*dfc6aa5cSAndroid Build Coastguard Worker
;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
;                                  JCOEFPTR block, int last_dc_val,
;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel.  In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support.  The instruction is
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
; an input dependency (although the behavior is not formally defined, Intel
; CPUs usually leave the destination unmodified if the source is zero.)  This
; can prevent out-of-order execution, so we clear the destination before
; invoking tzcnt.
;
; Initial register allocation
; rax - buffer
; rbx - temp
; rcx - nbits
; rdx - block --> free_bits
; rsi - nbits_base
; rdi - t
; rbp - code
; r8  - dctbl --> code_temp
; r9  - actbl
; r10 - state
; r11 - index
; r12 - put_buffer
;
; NOTE(review): free_bits (taking over rdx from block) and code_temp (taking
; over r8 from dctbl) appear in the table above but are not defined in this
; group of aliases; they are presumably defined further down, at the point
; where block and dctbl are no longer needed.
;
; temp is kept in rbx because EMIT_QWORD addresses both its low byte (bl) and
; its second byte (bh).

%define buffer       rax
; bufferp is the alias used for pointer arithmetic on buffer (add/sbb).
; NOTE(review): raxp is not defined in this file; it is presumably a
; pointer-width register alias supplied by jsimdext.inc.
%ifdef WIN64
%define bufferp      rax
%else
%define bufferp      raxp
%endif
%define tempq        rbx
%define tempd        ebx
%define tempb        bl
%define temph        bh
%define nbitsq       rcx
%define nbits        ecx
%define nbitsb       cl
%define block        rdx
%define nbits_base   rsi
%define t            rdi
%define td           edi
%define codeq        rbp
%define code         ebp
%define dctbl        r8
%define actbl        r9
%define state        r10
%define index        r11
%define indexd       r11d
%define put_buffer   r12
%define put_bufferd  r12d
247*dfc6aa5cSAndroid Build Coastguard Worker
; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
257*dfc6aa5cSAndroid Build Coastguard Worker
258*dfc6aa5cSAndroid Build Coastguard Worker    align       32
259*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
260*dfc6aa5cSAndroid Build Coastguard Worker
261*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_huff_encode_one_block_sse2):
262*dfc6aa5cSAndroid Build Coastguard Worker
263*dfc6aa5cSAndroid Build Coastguard Worker%ifdef WIN64
264*dfc6aa5cSAndroid Build Coastguard Worker
265*dfc6aa5cSAndroid Build Coastguard Worker; rcx = working_state *state
266*dfc6aa5cSAndroid Build Coastguard Worker; rdx = JOCTET *buffer
267*dfc6aa5cSAndroid Build Coastguard Worker; r8 = JCOEFPTR block
268*dfc6aa5cSAndroid Build Coastguard Worker; r9 = int last_dc_val
269*dfc6aa5cSAndroid Build Coastguard Worker; [rax+48] = c_derived_tbl *dctbl
270*dfc6aa5cSAndroid Build Coastguard Worker; [rax+56] = c_derived_tbl *actbl
271*dfc6aa5cSAndroid Build Coastguard Worker
272*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;X: X = code stream
273*dfc6aa5cSAndroid Build Coastguard Worker    mov         buffer, rdx
274*dfc6aa5cSAndroid Build Coastguard Worker    mov         block, r8
275*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
276*dfc6aa5cSAndroid Build Coastguard Worker    push        rbx
277*dfc6aa5cSAndroid Build Coastguard Worker    push        rbp
278*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
279*dfc6aa5cSAndroid Build Coastguard Worker    push        rsi
280*dfc6aa5cSAndroid Build Coastguard Worker    push        rdi
281*dfc6aa5cSAndroid Build Coastguard Worker    push        r12
282*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
283*dfc6aa5cSAndroid Build Coastguard Worker    mov         state, rcx
284*dfc6aa5cSAndroid Build Coastguard Worker    movsx       code, word [block]                        ;Z:     code = block[0];
285*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
286*dfc6aa5cSAndroid Build Coastguard Worker    sub         code, r9d                                 ;Z:     code -= last_dc_val;
287*dfc6aa5cSAndroid Build Coastguard Worker    mov         dctbl, POINTER [rsp+6*8+4*8]
288*dfc6aa5cSAndroid Build Coastguard Worker    mov         actbl, POINTER [rsp+6*8+5*8]
289*dfc6aa5cSAndroid Build Coastguard Worker    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
290*dfc6aa5cSAndroid Build Coastguard Worker    lea         nbits_base, [rel jpeg_nbits_table]
291*dfc6aa5cSAndroid Build Coastguard Worker    add         rsp, -DCTSIZE2 * SIZEOF_WORD
292*dfc6aa5cSAndroid Build Coastguard Worker    mov         t, rsp
293*dfc6aa5cSAndroid Build Coastguard Worker
294*dfc6aa5cSAndroid Build Coastguard Worker%else
295*dfc6aa5cSAndroid Build Coastguard Worker
296*dfc6aa5cSAndroid Build Coastguard Worker; rdi = working_state *state
297*dfc6aa5cSAndroid Build Coastguard Worker; rsi = JOCTET *buffer
298*dfc6aa5cSAndroid Build Coastguard Worker; rdx = JCOEFPTR block
299*dfc6aa5cSAndroid Build Coastguard Worker; rcx = int last_dc_val
300*dfc6aa5cSAndroid Build Coastguard Worker; r8 = c_derived_tbl *dctbl
301*dfc6aa5cSAndroid Build Coastguard Worker; r9 = c_derived_tbl *actbl
302*dfc6aa5cSAndroid Build Coastguard Worker
303*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;X: X = code stream
304*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
305*dfc6aa5cSAndroid Build Coastguard Worker    push        rbx
306*dfc6aa5cSAndroid Build Coastguard Worker    push        rbp
307*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
308*dfc6aa5cSAndroid Build Coastguard Worker    push        r12
309*dfc6aa5cSAndroid Build Coastguard Worker    mov         state, rdi
310*dfc6aa5cSAndroid Build Coastguard Worker    mov         buffer, rsi
311*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
312*dfc6aa5cSAndroid Build Coastguard Worker    movsx       codeq, word [block]                       ;Z:     code = block[0];
313*dfc6aa5cSAndroid Build Coastguard Worker    lea         nbits_base, [rel jpeg_nbits_table]
314*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
315*dfc6aa5cSAndroid Build Coastguard Worker    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
316*dfc6aa5cSAndroid Build Coastguard Worker    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
317*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
318*dfc6aa5cSAndroid Build Coastguard Worker
319*dfc6aa5cSAndroid Build Coastguard Worker%endif
320*dfc6aa5cSAndroid Build Coastguard Worker
321*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
322*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
323*dfc6aa5cSAndroid Build Coastguard Worker    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
324*dfc6aa5cSAndroid Build Coastguard Worker    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
325*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
326*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;A:      (Row 0, offset 1)
327*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
328*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
329*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
330*dfc6aa5cSAndroid Build Coastguard Worker
331*dfc6aa5cSAndroid Build Coastguard Worker    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
332*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
333*dfc6aa5cSAndroid Build Coastguard Worker    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
334*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
335*dfc6aa5cSAndroid Build Coastguard Worker    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
336*dfc6aa5cSAndroid Build Coastguard Worker    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
337*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
338*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
339*dfc6aa5cSAndroid Build Coastguard Worker    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
340*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
341*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
342*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 1, offset 1)
343*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
344*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
345*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
346*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
347*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
348*dfc6aa5cSAndroid Build Coastguard Worker
349*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
350*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
351*dfc6aa5cSAndroid Build Coastguard Worker
352*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
353*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
354*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
355*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
356*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 3, offset 1)
357*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
358*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
359*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
360*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
361*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
362*dfc6aa5cSAndroid Build Coastguard Worker
363*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
364*dfc6aa5cSAndroid Build Coastguard Worker    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
365*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     i.e. if code is non-negative
366*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
367*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
368*dfc6aa5cSAndroid Build Coastguard Worker    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
369*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
370*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
371*dfc6aa5cSAndroid Build Coastguard Worker    movsxd      codeq, code                               ;Z:     sign extend code
372*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
373*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 2, offset 1)
374*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
375*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
376*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
377*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
378*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
379*dfc6aa5cSAndroid Build Coastguard Worker
380*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
381*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
382*dfc6aa5cSAndroid Build Coastguard Worker
383*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
384*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
385*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
386*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
387*dfc6aa5cSAndroid Build Coastguard Worker    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
388*dfc6aa5cSAndroid Build Coastguard Worker    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
389*dfc6aa5cSAndroid Build Coastguard Worker    shl         tempd, 16                                 ;Z:     temp <<= 16;
390*dfc6aa5cSAndroid Build Coastguard Worker    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
391*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
392*dfc6aa5cSAndroid Build Coastguard Worker    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
393*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
394*dfc6aa5cSAndroid Build Coastguard Worker    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
395*dfc6aa5cSAndroid Build Coastguard Worker    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
396*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
397*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
398*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 7, offset 1)
399*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
400*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
401*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
402*dfc6aa5cSAndroid Build Coastguard Worker    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
403*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
404*dfc6aa5cSAndroid Build Coastguard Worker    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
405*dfc6aa5cSAndroid Build Coastguard Worker    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
406*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     temp = dctbl->ehufco[nbits];
407*dfc6aa5cSAndroid Build Coastguard Worker    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
408*dfc6aa5cSAndroid Build Coastguard Worker    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
409*dfc6aa5cSAndroid Build Coastguard Worker    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
410*dfc6aa5cSAndroid Build Coastguard Worker    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
411*dfc6aa5cSAndroid Build Coastguard Worker    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
412*dfc6aa5cSAndroid Build Coastguard Worker    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
413*dfc6aa5cSAndroid Build Coastguard Worker    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
414*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
415*dfc6aa5cSAndroid Build Coastguard Worker    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
416*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
417*dfc6aa5cSAndroid Build Coastguard Worker    or          code, tempd                               ;Z:     code |= temp;
418*dfc6aa5cSAndroid Build Coastguard Worker    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
419*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
420*dfc6aa5cSAndroid Build Coastguard Worker    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
421*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
422*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 6, offset 1)
423*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
424*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
425*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
426*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
427*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
428*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
429*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 5, offset 1)
430*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
431*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
432*dfc6aa5cSAndroid Build Coastguard Worker
433*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
434*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
435*dfc6aa5cSAndroid Build Coastguard Worker
436*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
437*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
438*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
439*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
440*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
441*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
442*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
443*dfc6aa5cSAndroid Build Coastguard Worker    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
444*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;        (Row 4, offset 1)
445*dfc6aa5cSAndroid Build Coastguard Worker%undef block
446*dfc6aa5cSAndroid Build Coastguard Worker%define free_bitsq  rdx
447*dfc6aa5cSAndroid Build Coastguard Worker%define free_bitsd  edx
448*dfc6aa5cSAndroid Build Coastguard Worker%define free_bitsb  dl
449*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
450*dfc6aa5cSAndroid Build Coastguard Worker    shl         tempq, 48                                 ;Z:     temp <<= 48;
451*dfc6aa5cSAndroid Build Coastguard Worker    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
452*dfc6aa5cSAndroid Build Coastguard Worker    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
453*dfc6aa5cSAndroid Build Coastguard Worker    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
454*dfc6aa5cSAndroid Build Coastguard Worker    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
455*dfc6aa5cSAndroid Build Coastguard Worker    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
456*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
457*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
458*dfc6aa5cSAndroid Build Coastguard Worker
459*dfc6aa5cSAndroid Build Coastguard Worker    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
460*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;    w/ signed saturation
461*dfc6aa5cSAndroid Build Coastguard Worker
462*dfc6aa5cSAndroid Build Coastguard Worker    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
463*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
464*dfc6aa5cSAndroid Build Coastguard Worker%undef dctbl
465*dfc6aa5cSAndroid Build Coastguard Worker%define code_temp  r8d
466*dfc6aa5cSAndroid Build Coastguard Worker    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
467*dfc6aa5cSAndroid Build Coastguard Worker    mov         free_bitsd, [state+working_state.cur.free_bits]
468*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     free_bits = state->cur.free_bits;
469*dfc6aa5cSAndroid Build Coastguard Worker    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
470*dfc6aa5cSAndroid Build Coastguard Worker    shl         index, 32                                 ;Z:     index <<= 32;
471*dfc6aa5cSAndroid Build Coastguard Worker    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
472*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
473*dfc6aa5cSAndroid Build Coastguard Worker    or          index, tempq                              ;Z:     index |= temp;
474*dfc6aa5cSAndroid Build Coastguard Worker    not         index                                     ;Z:     index = ~index;
475*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
476*dfc6aa5cSAndroid Build Coastguard Worker    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
477*dfc6aa5cSAndroid Build Coastguard Worker    align       16
478*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
479*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
480*dfc6aa5cSAndroid Build Coastguard Worker
481*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
482*dfc6aa5cSAndroid Build Coastguard Worker
483*dfc6aa5cSAndroid Build Coastguard Worker    align       16
484*dfc6aa5cSAndroid Build Coastguard Worker.BRLOOP:                                                  ; do {
485*dfc6aa5cSAndroid Build Coastguard Worker    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
486*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
487*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   nbits = actbl->ehufsi[0xf0];
488*dfc6aa5cSAndroid Build Coastguard Worker    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
489*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   code = actbl->ehufco[0xf0];
490*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
491*dfc6aa5cSAndroid Build Coastguard Worker    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
492*dfc6aa5cSAndroid Build Coastguard Worker    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
493*dfc6aa5cSAndroid Build Coastguard Worker    mov         nbits, code_temp                          ;   nbits = code_temp;
494*dfc6aa5cSAndroid Build Coastguard Worker    or          put_buffer, codeq                         ;   put_buffer |= code;
495*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbits, 16                                 ;   if (nbits <= 16)
496*dfc6aa5cSAndroid Build Coastguard Worker    jle         .ERLOOP                                   ;     break;
497*dfc6aa5cSAndroid Build Coastguard Worker    jmp         .BRLOOP                                   ; } while (1);
498*dfc6aa5cSAndroid Build Coastguard Worker
499*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
500*dfc6aa5cSAndroid Build Coastguard Worker
501*dfc6aa5cSAndroid Build Coastguard Worker    align       16
502*dfc6aa5cSAndroid Build Coastguard Worker    times 5     nop
503*dfc6aa5cSAndroid Build Coastguard Worker.ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
504*dfc6aa5cSAndroid Build Coastguard Worker    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
505*dfc6aa5cSAndroid Build Coastguard Worker    or          put_buffer, codeq                         ; put_buffer |= code;
506*dfc6aa5cSAndroid Build Coastguard Worker.BLOOP_COND:                                              ; .BLOOP_COND:
507*dfc6aa5cSAndroid Build Coastguard Worker    test        index, index                              ; if (index != 0)
508*dfc6aa5cSAndroid Build Coastguard Worker    jz          .ELOOP                                    ; {
509*dfc6aa5cSAndroid Build Coastguard Worker.BLOOP:                                                   ;   do {
510*dfc6aa5cSAndroid Build Coastguard Worker    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
511*dfc6aa5cSAndroid Build Coastguard Worker    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
512*dfc6aa5cSAndroid Build Coastguard Worker    inc         nbits                                     ;     ++nbits;
513*dfc6aa5cSAndroid Build Coastguard Worker    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
514*dfc6aa5cSAndroid Build Coastguard Worker    shr         index, nbitsb                             ;     index >>= nbits;
515*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
516*dfc6aa5cSAndroid Build Coastguard Worker    cmp         nbits, 16                                 ;     if (nbits > 16)
517*dfc6aa5cSAndroid Build Coastguard Worker    jg          .BRLOOP                                   ;       goto .BRLOOP;
518*dfc6aa5cSAndroid Build Coastguard Worker.ERLOOP:                                                  ; .ERLOOP:
519*dfc6aa5cSAndroid Build Coastguard Worker    movsx       codeq, word [t]                           ;     code = *t;
520*dfc6aa5cSAndroid Build Coastguard Worker    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
521*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
522*dfc6aa5cSAndroid Build Coastguard Worker    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
523*dfc6aa5cSAndroid Build Coastguard Worker    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
524*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;     code_temp = actbl->ehufco[temp-16];
525*dfc6aa5cSAndroid Build Coastguard Worker    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
526*dfc6aa5cSAndroid Build Coastguard Worker    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
527*dfc6aa5cSAndroid Build Coastguard Worker    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
528*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;     nbits += actbl->ehufsi[temp-16];
529*dfc6aa5cSAndroid Build Coastguard Worker    or          code, code_temp                           ;     code |= code_temp;
530*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
531*dfc6aa5cSAndroid Build Coastguard Worker    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
532*dfc6aa5cSAndroid Build Coastguard Worker    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
533*dfc6aa5cSAndroid Build Coastguard Worker    or          put_buffer, codeq                         ;     put_buffer |= code;
534*dfc6aa5cSAndroid Build Coastguard Worker    test        index, index
535*dfc6aa5cSAndroid Build Coastguard Worker    jnz         .BLOOP                                    ;   } while (index != 0);
536*dfc6aa5cSAndroid Build Coastguard Worker.ELOOP:                                                   ; }  /* index != 0 */
537*dfc6aa5cSAndroid Build Coastguard Worker    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
538*dfc6aa5cSAndroid Build Coastguard Worker%ifdef WIN64
539*dfc6aa5cSAndroid Build Coastguard Worker    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
540*dfc6aa5cSAndroid Build Coastguard Worker%else
541*dfc6aa5cSAndroid Build Coastguard Worker    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
542*dfc6aa5cSAndroid Build Coastguard Worker%endif
543*dfc6aa5cSAndroid Build Coastguard Worker    je          .EFN                                      ; {
544*dfc6aa5cSAndroid Build Coastguard Worker    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
545*dfc6aa5cSAndroid Build Coastguard Worker                                                          ;   nbits = actbl->ehufsi[0];
546*dfc6aa5cSAndroid Build Coastguard Worker    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
547*dfc6aa5cSAndroid Build Coastguard Worker    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
548*dfc6aa5cSAndroid Build Coastguard Worker    jg          .EFN_SKIP_EMIT_CODE                       ;   {
549*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
550*dfc6aa5cSAndroid Build Coastguard Worker    align       16
551*dfc6aa5cSAndroid Build Coastguard Worker.EFN_SKIP_EMIT_CODE:                                      ;   } else {
552*dfc6aa5cSAndroid Build Coastguard Worker    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
553*dfc6aa5cSAndroid Build Coastguard Worker    or          put_buffer, codeq                         ;     put_buffer |= code;
554*dfc6aa5cSAndroid Build Coastguard Worker.EFN:                                                     ; } }
555*dfc6aa5cSAndroid Build Coastguard Worker    mov         [state + working_state.cur.put_buffer.simd], put_buffer
556*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; state->cur.put_buffer.simd = put_buffer;
557*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [state + working_state.cur.free_bits], free_bitsb
558*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; state->cur.free_bits = free_bits;
559*dfc6aa5cSAndroid Build Coastguard Worker%ifdef WIN64
560*dfc6aa5cSAndroid Build Coastguard Worker    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
561*dfc6aa5cSAndroid Build Coastguard Worker    pop         r12
562*dfc6aa5cSAndroid Build Coastguard Worker    pop         rdi
563*dfc6aa5cSAndroid Build Coastguard Worker    pop         rsi
564*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbp
565*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbx
566*dfc6aa5cSAndroid Build Coastguard Worker%else
567*dfc6aa5cSAndroid Build Coastguard Worker    pop         r12
568*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbp
569*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbx
570*dfc6aa5cSAndroid Build Coastguard Worker%endif
571*dfc6aa5cSAndroid Build Coastguard Worker    ret
572*dfc6aa5cSAndroid Build Coastguard Worker
573*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
574*dfc6aa5cSAndroid Build Coastguard Worker
575*dfc6aa5cSAndroid Build Coastguard Worker    align       16
576*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_BRLOOP_CODE:
577*dfc6aa5cSAndroid Build Coastguard Worker    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
578*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; insert code, flush buffer,
579*dfc6aa5cSAndroid Build Coastguard Worker                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
580*dfc6aa5cSAndroid Build Coastguard Worker
581*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the
582*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this.
583*dfc6aa5cSAndroid Build Coastguard Worker    align       32
584