;
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; Each pass of .convloop reads two sample rows (8 samples from each row,
; starting at column start_col), subtracts the 0x80 bias from every sample,
; widens the resulting signed bytes to 32-bit integers, converts them to
; single-precision floats, and stores the 16 floats to the workspace.
; The loop runs DCTSIZE/2 times.
;
; The workspace is written with movaps, so it must be 16-byte aligned.
; Registers written here: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm7, flags;
; rbx is saved and restored.
;

; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably for the
                                        ; collect_args macro (jsimdext.inc,
                                        ; not visible here) - confirm before
                                        ; removing
    mov         rbp, rsp
    collect_args 3
    push        rbx                     ; rbx is used as a row pointer below

    ; Build the per-byte bias without touching memory:
    ; all-ones words -> 0xFF80 words -> saturating pack to 0x80 bytes.
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         rsi, r10                ; rsi = next pair of row pointers
    mov         eax, r11d               ; rax = start_col (32-bit write
                                        ; zero-extends into rax)
    mov         rdi, r12                ; rdi = workspace write pointer
    mov         rcx, DCTSIZE/2          ; rcx = row pairs left to process
.convloop:
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    ; movq loads 8 bytes from each of the two rows
    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

    psubb       xmm0, xmm7              ; xmm0=(01234567)
    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)

    ; Widen bytes to dwords with the sample in the top byte of each dword.
    ; The '*' positions are don't-care: for xmm2/xmm3 they come from
    ; whatever the register held before.  psrad below shifts them out and
    ; sign-extends the sample (assuming DWORD_BIT-BYTE_BIT is 24; both
    ; constants come from the include files).
    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)

    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)

    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)

    ; Aligned stores: four floats per store, 16 floats per iteration
    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

    add         rsi, byte 2*SIZEOF_JSAMPROW             ; next two rows
    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
    dec         rcx
    jnz         short .convloop

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; Each pass of .quantloop multiplies 16 workspace floats by the matching 16
; entries of the divisors table, converts the products to 32-bit integers
; (cvtps2dq rounds according to the current MXCSR rounding mode), narrows
; them to 16-bit values with signed saturation, and stores the 16 results to
; coef_block.  The loop runs DCTSIZE2/16 times.
;
; NOTE(review): the divisors table is multiplied, not divided, so it
; presumably holds reciprocals of the quantization values - confirm against
; the code that builds the table.
;
; workspace is read with movaps, divisors is read as a mulps memory operand,
; and coef_block is written with movdqa, so all three must be 16-byte
; aligned.
; Registers written here: rax, rdx, rsi, rdi, xmm0-xmm3, flags.
;

; r10 = JCOEFPTR coef_block
; r11 = FAST_FLOAT *divisors
; r12 = FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably for the
                                        ; collect_args macro (jsimdext.inc,
                                        ; not visible here) - confirm before
                                        ; removing
    mov         rbp, rsp
    collect_args 3

    mov         rsi, r12                ; rsi = workspace read pointer
    mov         rdx, r11                ; rdx = divisors read pointer
    mov         rdi, r10                ; rdi = coef_block write pointer
    mov         rax, DCTSIZE2/16        ; rax = groups of 16 values left
.quantloop:
    ; Load 16 floats and scale each by its table entry
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

    ; float -> int32
    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    ; int32 -> int16 with signed saturation; each register now holds 8 values
    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

    ; Advance all three pointers by 16 elements
    add         rsi, byte 16*SIZEOF_FAST_FLOAT
    add         rdx, byte 16*SIZEOF_FAST_FLOAT
    add         rdi, byte 16*SIZEOF_JCOEF
    dec         rax
    jnz         short .quantloop

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32