;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.
%include "jsimdext.inc"

; Field layout of the encoder state as addressed from this file.
; NOTE(review): presumably mirrors the working_state structure used by
; jchuff.c -- confirm that both layouts stay in sync.
struc working_state
.next_output_byte:   resp 1     ; => next byte to write in buffer
.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
.cur.free_bits       resd 1     ; # of bits available in it
.cur.last_dc_val     resd 4     ; last DC coef for each component
.cinfo:              resp 1     ; dump_buffer needs access to this
endstruc

; Field layout of a derived Huffman table: 256 dword codes followed by 256
; byte code lengths.
struc c_derived_tbl
.ehufco:             resd 256   ; code for each symbol
.ehufsi:             resb 256   ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15, one dword per entry.
jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
               dd 0x000f, 0x001f, 0x003f, 0x007f
               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff

    alignz      32

; Bit-count lookup table, one byte per entry, laid out on both sides of the
; jpeg_nbits_table label so that it can be indexed with a signed offset:
;   offsets -32768..-1 (the bytes emitted before the label):
;     -1 -> 0, -2 -> 1, -4..-3 -> 2, ..., -32768..-16385 -> 15
;   offsets 0..65535 (the bytes emitted after the label):
;     0 -> 0, 1 -> 1, 2..3 -> 2, ..., 32768..65535 -> 16
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 <<  9 db 10
times 1 <<  8 db  9
times 1 <<  7 db  8
times 1 <<  6 db  7
times 1 <<  5 db  6
times 1 <<  4 db  5
times 1 <<  3 db  4
times 1 <<  2 db  3
times 1 <<  1 db  2
times 1 <<  0 db  1
times 1       db  0
jpeg_nbits_table:
times 1       db  0
times 1 <<  0 db  1
times 1 <<  1 db  2
times 1 <<  2 db  3
times 1 <<  3 db  4
times 1 <<  4 db  5
times 1 <<  5 db  6
times 1 <<  6 db  7
times 1 <<  7 db  8
times 1 <<  8 db  9
times 1 <<  9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15
times 1 << 15 db 16

    alignz      32

; Address expressions relative to nbits_base (a register alias defined
; further down in this file):
;   NBITS(x)     - byte entry x of the table that nbits_base points at
;   MASK_BITS(x) - dword entry x of jpeg_mask_bits, reached through the
;                  constant distance between the two tables
; Both expand to bare sums, so use them inside [] address brackets.
%define NBITS(x)      nbits_base + x
%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; Shorthand used to describe SIMD operations:
; wN:     xmmN treated as eight signed 16-bit values
; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
; bN:     xmmN treated as 16 unsigned 8-bit values
; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
; Contents of SIMD registers are shown in memory order.

; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - the label to which to jump when the macro completes
; %2 (optional) - extra instructions to execute after nbits has been set
;
; Upon completion, free_bits will be set to the number of remaining bits from
; code, and put_buffer will contain those remaining bits.  temp and code will
; be clobbered.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.
;
; Details that follow from the instruction sequence below:
; - free_bits leaves the macro as its entry value plus 64 (the two negations
;   cancel, then 64 is added).
; - put_buffer is reloaded with the whole of codeq, not a masked copy.
; - nbits (cl), xmm0 and the flags are overwritten as well.
; - xmm1 is compared against but never written here.  NOTE(review): it is
;   presumably expected to hold 0xFF in every byte on entry -- confirm at the
;   expansion sites.
; - Eight bytes are always stored at [buffer]; when 0xFF bytes are present,
;   stores can reach up to 16 bytes past the entry value of buffer.

%macro EMIT_QWORD 1-2
    add         nbitsb, free_bitsb      ; nbits += free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    mov         tempd, code             ; temp = code;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbitsb, free_bitsb      ; nbits = free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    shr         tempd, nbitsb           ; temp >>= nbits;
    or          tempq, put_buffer       ; temp |= put_buffer;
    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
    bswap       tempq                   ; temp = htonl(temp);
                                        ; (all 8 bytes are reversed, so the
                                        ; qword store below writes the most
                                        ; significant byte first)
    mov         put_buffer, codeq       ; put_buffer = code;
    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
    %2
    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
                                        ; (speculative; will be overwritten if
                                        ; code contains any 0xFF bytes)
    add         free_bitsb, 64          ; free_bits += 64;
    add         bufferp, 8              ; buffer += 8;
    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
    jz          %1                      ;   return;
    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    ; bytes in the qword.
    ; Each step stores one byte, writes a zero stuffing byte right after it,
    ; and then uses the carry from "cmp byte, 0xFF" so that buffer moves past
    ; the stuffing byte only when the byte just stored was 0xFF.  Byte 0 is
    ; already in place from the speculative store above.
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempd, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    jmp         %1                      ; return;
%endmacro

;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
;                                  JCOEFPTR block, int last_dc_val,
;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs.
; Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel.  In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support.  The instruction is
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
; an input dependency (although the behavior is not formally defined, Intel
; CPUs usually leave the destination unmodified if the source is zero.)  This
; can prevent out-of-order execution, so we clear the destination before
; invoking tzcnt.
;
; Initial register allocation
; rax - buffer
; rbx - temp
; rcx - nbits
; rdx - block --> free_bits
; rsi - nbits_base
; rdi - t
; rbp - code
; r8  - dctbl --> code_temp
; r9  - actbl
; r10 - state
; r11 - index
; r12 - put_buffer

; Symbolic names for the registers listed above.  The q/d/b/h suffixes select
; the 64-bit, 32-bit, low-byte and second-byte views of the same register.
%define buffer       rax
; bufferp is the view of buffer used for pointer arithmetic (add/sbb).
; NOTE(review): raxp is not defined in this file; presumably jsimdext.inc maps
; it to a pointer-sized view of rax on non-WIN64 targets -- confirm there.
%ifdef WIN64
%define bufferp      rax
%else
%define bufferp      raxp
%endif
%define tempq        rbx
%define tempd        ebx
%define tempb        bl
%define temph        bh
%define nbitsq       rcx
%define nbits        ecx
%define nbitsb       cl
%define block        rdx
%define nbits_base   rsi
%define t            rdi
%define td           edi
%define codeq        rbp
239*dfc6aa5cSAndroid Build Coastguard Worker%define code ebp 240*dfc6aa5cSAndroid Build Coastguard Worker%define dctbl r8 241*dfc6aa5cSAndroid Build Coastguard Worker%define actbl r9 242*dfc6aa5cSAndroid Build Coastguard Worker%define state r10 243*dfc6aa5cSAndroid Build Coastguard Worker%define index r11 244*dfc6aa5cSAndroid Build Coastguard Worker%define indexd r11d 245*dfc6aa5cSAndroid Build Coastguard Worker%define put_buffer r12 246*dfc6aa5cSAndroid Build Coastguard Worker%define put_bufferd r12d 247*dfc6aa5cSAndroid Build Coastguard Worker 248*dfc6aa5cSAndroid Build Coastguard Worker; Step 1: Re-arrange input data according to jpeg_natural_order 249*dfc6aa5cSAndroid Build Coastguard Worker; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 250*dfc6aa5cSAndroid Build Coastguard Worker; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05 251*dfc6aa5cSAndroid Build Coastguard Worker; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34 252*dfc6aa5cSAndroid Build Coastguard Worker; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28 253*dfc6aa5cSAndroid Build Coastguard Worker; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36 254*dfc6aa5cSAndroid Build Coastguard Worker; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51 255*dfc6aa5cSAndroid Build Coastguard Worker; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46 256*dfc6aa5cSAndroid Build Coastguard Worker; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63 257*dfc6aa5cSAndroid Build Coastguard Worker 258*dfc6aa5cSAndroid Build Coastguard Worker align 32 259*dfc6aa5cSAndroid Build Coastguard Worker GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) 260*dfc6aa5cSAndroid Build Coastguard Worker 261*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_huff_encode_one_block_sse2): 262*dfc6aa5cSAndroid Build Coastguard Worker 263*dfc6aa5cSAndroid Build Coastguard Worker%ifdef WIN64 264*dfc6aa5cSAndroid Build Coastguard Worker 265*dfc6aa5cSAndroid Build Coastguard Worker; rcx = working_state *state 266*dfc6aa5cSAndroid Build Coastguard 
Worker; rdx = JOCTET *buffer 267*dfc6aa5cSAndroid Build Coastguard Worker; r8 = JCOEFPTR block 268*dfc6aa5cSAndroid Build Coastguard Worker; r9 = int last_dc_val 269*dfc6aa5cSAndroid Build Coastguard Worker; [rax+48] = c_derived_tbl *dctbl 270*dfc6aa5cSAndroid Build Coastguard Worker; [rax+56] = c_derived_tbl *actbl 271*dfc6aa5cSAndroid Build Coastguard Worker 272*dfc6aa5cSAndroid Build Coastguard Worker ;X: X = code stream 273*dfc6aa5cSAndroid Build Coastguard Worker mov buffer, rdx 274*dfc6aa5cSAndroid Build Coastguard Worker mov block, r8 275*dfc6aa5cSAndroid Build Coastguard Worker movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 276*dfc6aa5cSAndroid Build Coastguard Worker push rbx 277*dfc6aa5cSAndroid Build Coastguard Worker push rbp 278*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 279*dfc6aa5cSAndroid Build Coastguard Worker push rsi 280*dfc6aa5cSAndroid Build Coastguard Worker push rdi 281*dfc6aa5cSAndroid Build Coastguard Worker push r12 282*dfc6aa5cSAndroid Build Coastguard Worker movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 283*dfc6aa5cSAndroid Build Coastguard Worker mov state, rcx 284*dfc6aa5cSAndroid Build Coastguard Worker movsx code, word [block] ;Z: code = block[0]; 285*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm4, xmm4 ;A: w4[i] = 0; 286*dfc6aa5cSAndroid Build Coastguard Worker sub code, r9d ;Z: code -= last_dc_val; 287*dfc6aa5cSAndroid Build Coastguard Worker mov dctbl, POINTER [rsp+6*8+4*8] 288*dfc6aa5cSAndroid Build Coastguard Worker mov actbl, POINTER [rsp+6*8+5*8] 289*dfc6aa5cSAndroid Build Coastguard Worker punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 290*dfc6aa5cSAndroid Build Coastguard Worker lea nbits_base, [rel jpeg_nbits_table] 291*dfc6aa5cSAndroid Build Coastguard Worker add rsp, -DCTSIZE2 * SIZEOF_WORD 292*dfc6aa5cSAndroid Build Coastguard Worker mov t, rsp 293*dfc6aa5cSAndroid Build Coastguard Worker 
294*dfc6aa5cSAndroid Build Coastguard Worker%else 295*dfc6aa5cSAndroid Build Coastguard Worker 296*dfc6aa5cSAndroid Build Coastguard Worker; rdi = working_state *state 297*dfc6aa5cSAndroid Build Coastguard Worker; rsi = JOCTET *buffer 298*dfc6aa5cSAndroid Build Coastguard Worker; rdx = JCOEFPTR block 299*dfc6aa5cSAndroid Build Coastguard Worker; rcx = int last_dc_val 300*dfc6aa5cSAndroid Build Coastguard Worker; r8 = c_derived_tbl *dctbl 301*dfc6aa5cSAndroid Build Coastguard Worker; r9 = c_derived_tbl *actbl 302*dfc6aa5cSAndroid Build Coastguard Worker 303*dfc6aa5cSAndroid Build Coastguard Worker ;X: X = code stream 304*dfc6aa5cSAndroid Build Coastguard Worker movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 305*dfc6aa5cSAndroid Build Coastguard Worker push rbx 306*dfc6aa5cSAndroid Build Coastguard Worker push rbp 307*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 308*dfc6aa5cSAndroid Build Coastguard Worker push r12 309*dfc6aa5cSAndroid Build Coastguard Worker mov state, rdi 310*dfc6aa5cSAndroid Build Coastguard Worker mov buffer, rsi 311*dfc6aa5cSAndroid Build Coastguard Worker movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 312*dfc6aa5cSAndroid Build Coastguard Worker movsx codeq, word [block] ;Z: code = block[0]; 313*dfc6aa5cSAndroid Build Coastguard Worker lea nbits_base, [rel jpeg_nbits_table] 314*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm4, xmm4 ;A: w4[i] = 0; 315*dfc6aa5cSAndroid Build Coastguard Worker sub codeq, rcx ;Z: code -= last_dc_val; 316*dfc6aa5cSAndroid Build Coastguard Worker punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 317*dfc6aa5cSAndroid Build Coastguard Worker lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_ 318*dfc6aa5cSAndroid Build Coastguard Worker 319*dfc6aa5cSAndroid Build Coastguard Worker%endif 320*dfc6aa5cSAndroid Build Coastguard Worker 321*dfc6aa5cSAndroid Build Coastguard Worker pshuflw xmm0, xmm0, 
11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 322*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 323*dfc6aa5cSAndroid Build Coastguard Worker punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 324*dfc6aa5cSAndroid Build Coastguard Worker punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13 325*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17 326*dfc6aa5cSAndroid Build Coastguard Worker ;A: (Row 0, offset 1) 327*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0); 328*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm0, xmm4 ;A: w0[i] += w4[i]; 329*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i]; 330*dfc6aa5cSAndroid Build Coastguard Worker 331*dfc6aa5cSAndroid Build Coastguard Worker movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- -- 332*dfc6aa5cSAndroid Build Coastguard Worker pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- -- 333*dfc6aa5cSAndroid Build Coastguard Worker pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12 334*dfc6aa5cSAndroid Build Coastguard Worker movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55 335*dfc6aa5cSAndroid Build Coastguard Worker movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12 336*dfc6aa5cSAndroid Build Coastguard Worker punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51 337*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12 338*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm4, xmm4 ;A: w4[i] = 0; 339*dfc6aa5cSAndroid Build Coastguard Worker psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- -- 340*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? 
-1 : 0); 341*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12 342*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 1, offset 1) 343*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0); 344*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm1, xmm4 ;B: w1[i] += w4[i]; 345*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i]; 346*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm4, xmm4 ;B: w4[i] = 0; 347*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0); 348*dfc6aa5cSAndroid Build Coastguard Worker 349*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i] 350*dfc6aa5cSAndroid Build Coastguard Worker ; w/ signed saturation 351*dfc6aa5cSAndroid Build Coastguard Worker 352*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- -- 353*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- -- 354*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 -- 355*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35 356*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 3, offset 1) 357*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0); 358*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm3, xmm4 ;D: w3[i] += w4[i]; 359*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i]; 360*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm4, xmm4 ;D: w4[i] = 0; 361*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? 
-1 : 0); 362*dfc6aa5cSAndroid Build Coastguard Worker 363*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51 364*dfc6aa5cSAndroid Build Coastguard Worker cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000, 365*dfc6aa5cSAndroid Build Coastguard Worker ;Z: i.e. if code is positive 366*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51 367*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51 368*dfc6aa5cSAndroid Build Coastguard Worker adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0); 369*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51 370*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51 371*dfc6aa5cSAndroid Build Coastguard Worker movsxd codeq, code ;Z: sign extend code 372*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27 373*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 2, offset 1) 374*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0); 375*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm2, xmm4 ;C: w2[i] += w4[i]; 376*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i]; 377*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm4, xmm4 ;C: w4[i] = 0; 378*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? 
-1 : 0); 379*dfc6aa5cSAndroid Build Coastguard Worker 380*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i] 381*dfc6aa5cSAndroid Build Coastguard Worker ; w/ signed saturation 382*dfc6aa5cSAndroid Build Coastguard Worker 383*dfc6aa5cSAndroid Build Coastguard Worker movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code); 384*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55 385*dfc6aa5cSAndroid Build Coastguard Worker pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i); 386*dfc6aa5cSAndroid Build Coastguard Worker pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i); 387*dfc6aa5cSAndroid Build Coastguard Worker movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63 388*dfc6aa5cSAndroid Build Coastguard Worker punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63 389*dfc6aa5cSAndroid Build Coastguard Worker shl tempd, 16 ;Z: temp <<= 16; 390*dfc6aa5cSAndroid Build Coastguard Worker psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 -- 391*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm2, xmm2 ;H: w2[i] = 0; 392*dfc6aa5cSAndroid Build Coastguard Worker or put_bufferd, tempd ;Z: put_buffer |= temp; 393*dfc6aa5cSAndroid Build Coastguard Worker pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 -- 394*dfc6aa5cSAndroid Build Coastguard Worker movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- -- 395*dfc6aa5cSAndroid Build Coastguard Worker unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59 396*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm0, xmm0 ;H: w0[i] = 0; 397*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 -- 398*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 7, offset 1) 399*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? 
-1 : 0); 400*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm3, xmm2 ;H: w3[i] += w2[i]; 401*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i]; 402*dfc6aa5cSAndroid Build Coastguard Worker movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- -- 403*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0); 404*dfc6aa5cSAndroid Build Coastguard Worker punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47 405*dfc6aa5cSAndroid Build Coastguard Worker mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4] 406*dfc6aa5cSAndroid Build Coastguard Worker ;Z: temp = dctbl->ehufco[nbits]; 407*dfc6aa5cSAndroid Build Coastguard Worker movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47 408*dfc6aa5cSAndroid Build Coastguard Worker psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- 409*dfc6aa5cSAndroid Build Coastguard Worker shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 410*dfc6aa5cSAndroid Build Coastguard Worker and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1; 411*dfc6aa5cSAndroid Build Coastguard Worker pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 -- 412*dfc6aa5cSAndroid Build Coastguard Worker pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58 413*dfc6aa5cSAndroid Build Coastguard Worker shl tempq, nbitsb ;Z: temp <<= nbits; 414*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 -- 415*dfc6aa5cSAndroid Build Coastguard Worker pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58 416*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 -- 417*dfc6aa5cSAndroid Build Coastguard Worker or code, tempd ;Z: code |= temp; 418*dfc6aa5cSAndroid Build Coastguard Worker movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58 419*dfc6aa5cSAndroid Build 
Coastguard Worker pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 -- 420*dfc6aa5cSAndroid Build Coastguard Worker pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58 421*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53 422*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 6, offset 1) 423*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm2, xmm2 ;G: w2[i] = 0; 424*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0); 425*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58 426*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm4, xmm0 ;G: w4[i] += w0[i]; 427*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i]; 428*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58 429*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 5, offset 1) 430*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0); 431*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59 432*dfc6aa5cSAndroid Build Coastguard Worker 433*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i] 434*dfc6aa5cSAndroid Build Coastguard Worker ; w/ signed saturation 435*dfc6aa5cSAndroid Build Coastguard Worker 436*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm0, xmm0 ;F: w0[i] = 0; 437*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59 438*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? 
-1 : 0); 439*dfc6aa5cSAndroid Build Coastguard Worker pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i); 440*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59 441*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm1, xmm2 ;F: w1[i] += w2[i]; 442*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i]; 443*dfc6aa5cSAndroid Build Coastguard Worker pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 444*dfc6aa5cSAndroid Build Coastguard Worker ; (Row 4, offset 1) 445*dfc6aa5cSAndroid Build Coastguard Worker%undef block 446*dfc6aa5cSAndroid Build Coastguard Worker%define free_bitsq rdx 447*dfc6aa5cSAndroid Build Coastguard Worker%define free_bitsd edx 448*dfc6aa5cSAndroid Build Coastguard Worker%define free_bitsb dl 449*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); 450*dfc6aa5cSAndroid Build Coastguard Worker shl tempq, 48 ;Z: temp <<= 48; 451*dfc6aa5cSAndroid Build Coastguard Worker pxor xmm2, xmm2 ;E: w2[i] = 0; 452*dfc6aa5cSAndroid Build Coastguard Worker pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0); 453*dfc6aa5cSAndroid Build Coastguard Worker paddw xmm5, xmm0 ;E: w5[i] += w0[i]; 454*dfc6aa5cSAndroid Build Coastguard Worker or tempq, put_buffer ;Z: temp |= put_buffer; 455*dfc6aa5cSAndroid Build Coastguard Worker movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i]; 456*dfc6aa5cSAndroid Build Coastguard Worker lea t, [dword t - 2] ;Z: t = &t[-1]; 457*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? 
-1 : 0); 458*dfc6aa5cSAndroid Build Coastguard Worker 459*dfc6aa5cSAndroid Build Coastguard Worker packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i] 460*dfc6aa5cSAndroid Build Coastguard Worker ; w/ signed saturation 461*dfc6aa5cSAndroid Build Coastguard Worker 462*dfc6aa5cSAndroid Build Coastguard Worker add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq] 463*dfc6aa5cSAndroid Build Coastguard Worker ;Z: nbits += dctbl->ehufsi[nbits]; 464*dfc6aa5cSAndroid Build Coastguard Worker%undef dctbl 465*dfc6aa5cSAndroid Build Coastguard Worker%define code_temp r8d 466*dfc6aa5cSAndroid Build Coastguard Worker pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i); 467*dfc6aa5cSAndroid Build Coastguard Worker mov free_bitsd, [state+working_state.cur.free_bits] 468*dfc6aa5cSAndroid Build Coastguard Worker ;Z: free_bits = state->cur.free_bits; 469*dfc6aa5cSAndroid Build Coastguard Worker pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF; 470*dfc6aa5cSAndroid Build Coastguard Worker shl index, 32 ;Z: index <<= 32; 471*dfc6aa5cSAndroid Build Coastguard Worker mov put_buffer, [state+working_state.cur.put_buffer.simd] 472*dfc6aa5cSAndroid Build Coastguard Worker ;Z: put_buffer = state->cur.put_buffer.simd; 473*dfc6aa5cSAndroid Build Coastguard Worker or index, tempq ;Z: index |= temp; 474*dfc6aa5cSAndroid Build Coastguard Worker not index ;Z: index = ~index; 475*dfc6aa5cSAndroid Build Coastguard Worker sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0) 476*dfc6aa5cSAndroid Build Coastguard Worker jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE; 477*dfc6aa5cSAndroid Build Coastguard Worker align 16 478*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_CODE: ;Z: .EMIT_CODE: 479*dfc6aa5cSAndroid Build Coastguard Worker EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND 480*dfc6aa5cSAndroid Build Coastguard Worker 481*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 482*dfc6aa5cSAndroid Build Coastguard Worker 
483*dfc6aa5cSAndroid Build Coastguard Worker align 16 484*dfc6aa5cSAndroid Build Coastguard Worker.BRLOOP: ; do { 485*dfc6aa5cSAndroid Build Coastguard Worker lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16; 486*dfc6aa5cSAndroid Build Coastguard Worker movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0] 487*dfc6aa5cSAndroid Build Coastguard Worker ; nbits = actbl->ehufsi[0xf0]; 488*dfc6aa5cSAndroid Build Coastguard Worker mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4] 489*dfc6aa5cSAndroid Build Coastguard Worker ; code = actbl->ehufco[0xf0]; 490*dfc6aa5cSAndroid Build Coastguard Worker sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) 491*dfc6aa5cSAndroid Build Coastguard Worker jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE; 492*dfc6aa5cSAndroid Build Coastguard Worker shl put_buffer, nbitsb ; put_buffer <<= nbits; 493*dfc6aa5cSAndroid Build Coastguard Worker mov nbits, code_temp ; nbits = code_temp; 494*dfc6aa5cSAndroid Build Coastguard Worker or put_buffer, codeq ; put_buffer |= code; 495*dfc6aa5cSAndroid Build Coastguard Worker cmp nbits, 16 ; if (nbits <= 16) 496*dfc6aa5cSAndroid Build Coastguard Worker jle .ERLOOP ; break; 497*dfc6aa5cSAndroid Build Coastguard Worker jmp .BRLOOP ; } while (1); 498*dfc6aa5cSAndroid Build Coastguard Worker 499*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 500*dfc6aa5cSAndroid Build Coastguard Worker 501*dfc6aa5cSAndroid Build Coastguard Worker align 16 502*dfc6aa5cSAndroid Build Coastguard Worker times 5 nop 503*dfc6aa5cSAndroid Build Coastguard Worker.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE: 504*dfc6aa5cSAndroid Build Coastguard Worker shl put_buffer, nbitsb ; put_buffer <<= nbits; 505*dfc6aa5cSAndroid Build Coastguard Worker or put_buffer, codeq ; put_buffer |= code; 506*dfc6aa5cSAndroid Build Coastguard Worker.BLOOP_COND: ; .BLOOP_COND: 507*dfc6aa5cSAndroid Build Coastguard Worker test index, index ; if (index != 0) 508*dfc6aa5cSAndroid Build Coastguard Worker jz 
.ELOOP ; { 509*dfc6aa5cSAndroid Build Coastguard Worker.BLOOP: ; do { 510*dfc6aa5cSAndroid Build Coastguard Worker xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */ 511*dfc6aa5cSAndroid Build Coastguard Worker tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index 512*dfc6aa5cSAndroid Build Coastguard Worker inc nbits ; ++nbits; 513*dfc6aa5cSAndroid Build Coastguard Worker lea t, [t + nbitsq * 2] ; t = &t[nbits]; 514*dfc6aa5cSAndroid Build Coastguard Worker shr index, nbitsb ; index >>= nbits; 515*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END: 516*dfc6aa5cSAndroid Build Coastguard Worker cmp nbits, 16 ; if (nbits > 16) 517*dfc6aa5cSAndroid Build Coastguard Worker jg .BRLOOP ; goto .BRLOOP; 518*dfc6aa5cSAndroid Build Coastguard Worker.ERLOOP: ; .ERLOOP: 519*dfc6aa5cSAndroid Build Coastguard Worker movsx codeq, word [t] ; code = *t; 520*dfc6aa5cSAndroid Build Coastguard Worker lea tempd, [nbitsq * 2] ; temp = nbits * 2; 521*dfc6aa5cSAndroid Build Coastguard Worker movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code); 522*dfc6aa5cSAndroid Build Coastguard Worker lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits; 523*dfc6aa5cSAndroid Build Coastguard Worker mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4] 524*dfc6aa5cSAndroid Build Coastguard Worker ; code_temp = actbl->ehufco[temp-16]; 525*dfc6aa5cSAndroid Build Coastguard Worker shl code_temp, nbitsb ; code_temp <<= nbits; 526*dfc6aa5cSAndroid Build Coastguard Worker and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1; 527*dfc6aa5cSAndroid Build Coastguard Worker add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)] 528*dfc6aa5cSAndroid Build Coastguard Worker ; nbits += actbl->ehufsi[temp-16]; 529*dfc6aa5cSAndroid Build Coastguard Worker or code, code_temp ; code |= code_temp; 530*dfc6aa5cSAndroid Build Coastguard Worker sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) 531*dfc6aa5cSAndroid Build
Coastguard Worker jle .EMIT_CODE ; goto .EMIT_CODE; 532*dfc6aa5cSAndroid Build Coastguard Worker shl put_buffer, nbitsb ; put_buffer <<= nbits; 533*dfc6aa5cSAndroid Build Coastguard Worker or put_buffer, codeq ; put_buffer |= code; 534*dfc6aa5cSAndroid Build Coastguard Worker test index, index 535*dfc6aa5cSAndroid Build Coastguard Worker jnz .BLOOP ; } while (index != 0); 536*dfc6aa5cSAndroid Build Coastguard Worker.ELOOP: ; } /* index != 0 */ 537*dfc6aa5cSAndroid Build Coastguard Worker sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]); 538*dfc6aa5cSAndroid Build Coastguard Worker%ifdef WIN64 539*dfc6aa5cSAndroid Build Coastguard Worker cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62) 540*dfc6aa5cSAndroid Build Coastguard Worker%else 541*dfc6aa5cSAndroid Build Coastguard Worker cmp td, -2 * SIZEOF_WORD ; if (t != -2) 542*dfc6aa5cSAndroid Build Coastguard Worker%endif 543*dfc6aa5cSAndroid Build Coastguard Worker je .EFN ; { 544*dfc6aa5cSAndroid Build Coastguard Worker movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] 545*dfc6aa5cSAndroid Build Coastguard Worker ; nbits = actbl->ehufsi[0]; 546*dfc6aa5cSAndroid Build Coastguard Worker mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0]; 547*dfc6aa5cSAndroid Build Coastguard Worker sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0) 548*dfc6aa5cSAndroid Build Coastguard Worker jg .EFN_SKIP_EMIT_CODE ; { 549*dfc6aa5cSAndroid Build Coastguard Worker EMIT_QWORD .EFN ; insert code, flush buffer 550*dfc6aa5cSAndroid Build Coastguard Worker align 16 551*dfc6aa5cSAndroid Build Coastguard Worker.EFN_SKIP_EMIT_CODE: ; } else { 552*dfc6aa5cSAndroid Build Coastguard Worker shl put_buffer, nbitsb ; put_buffer <<= nbits; 553*dfc6aa5cSAndroid Build Coastguard Worker or put_buffer, codeq ; put_buffer |= code; 554*dfc6aa5cSAndroid Build Coastguard Worker.EFN: ; } } 555*dfc6aa5cSAndroid Build Coastguard Worker mov [state + working_state.cur.put_buffer.simd], put_buffer 556*dfc6aa5cSAndroid Build Coastguard 
Worker ; state->cur.put_buffer.simd = put_buffer; 557*dfc6aa5cSAndroid Build Coastguard Worker mov byte [state + working_state.cur.free_bits], free_bitsb 558*dfc6aa5cSAndroid Build Coastguard Worker ; state->cur.free_bits = free_bits; 559*dfc6aa5cSAndroid Build Coastguard Worker%ifdef WIN64 560*dfc6aa5cSAndroid Build Coastguard Worker sub rsp, -DCTSIZE2 * SIZEOF_WORD 561*dfc6aa5cSAndroid Build Coastguard Worker pop r12 562*dfc6aa5cSAndroid Build Coastguard Worker pop rdi 563*dfc6aa5cSAndroid Build Coastguard Worker pop rsi 564*dfc6aa5cSAndroid Build Coastguard Worker pop rbp 565*dfc6aa5cSAndroid Build Coastguard Worker pop rbx 566*dfc6aa5cSAndroid Build Coastguard Worker%else 567*dfc6aa5cSAndroid Build Coastguard Worker pop r12 568*dfc6aa5cSAndroid Build Coastguard Worker pop rbp 569*dfc6aa5cSAndroid Build Coastguard Worker pop rbx 570*dfc6aa5cSAndroid Build Coastguard Worker%endif 571*dfc6aa5cSAndroid Build Coastguard Worker ret 572*dfc6aa5cSAndroid Build Coastguard Worker 573*dfc6aa5cSAndroid Build Coastguard Worker; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 574*dfc6aa5cSAndroid Build Coastguard Worker 575*dfc6aa5cSAndroid Build Coastguard Worker align 16 576*dfc6aa5cSAndroid Build Coastguard Worker.EMIT_BRLOOP_CODE: 577*dfc6aa5cSAndroid Build Coastguard Worker EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp } 578*dfc6aa5cSAndroid Build Coastguard Worker ; insert code, flush buffer, 579*dfc6aa5cSAndroid Build Coastguard Worker ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END 580*dfc6aa5cSAndroid Build Coastguard Worker 581*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the 582*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this. 583*dfc6aa5cSAndroid Build Coastguard Worker align 32 584