;
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Each loop iteration handles four sample rows; the loop runs DCTSIZE/4
; times.  For every row, 8 bytes starting at (row pointer + start_col *
; SIZEOF_JSAMPLE) are loaded, zero-extended from bytes to words, biased by
; adding 0xFF80 (-128) to each word, and stored as one XMMWORD in workspace.
;
; Arguments: read from the stack relative to ebp (see %defines below).
; Saved/restored: ebp, ebx, esi, edi
; Clobbered: eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags
; Alignment: the workspace stores use movdqa, so the addressed workspace
;            rows must be 16-byte aligned.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Loop-invariant constants.
    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; esi = cursor into the row-pointer array, eax = column offset,
    ; edi = cursor into workspace, ecx = remaining iterations.
    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    ; Interleave with zero to widen each byte to a word, then add the
    ; 0xFF80 bias to every word.
    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Advance by four row pointers and four rows of DCTSIZE DCTELEMs.
    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .convloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; The loop runs DCTSIZE2/32 times; every iteration consumes 32 DCTELEMs from
; workspace (four XMMWORDs) and produces 32 JCOEFs.  For each word the code
;   1. splits off the sign (arithmetic shift by WORD_BIT-1 yields a per-word
;      mask; xor/subtract with that mask negates negative words),
;   2. adds the matching CORRECTION entry,
;   3. takes the high word of the unsigned product with the RECIPROCAL entry,
;   4. takes the high word of the unsigned product with the SCALE entry,
;   5. re-applies the sign with the same xor/subtract mask sequence.
;
; The divisors pointer addresses three tables, selected by the macros below:
; RECIPROCAL at row offset DCTSIZE*0, CORRECTION at DCTSIZE*1 and SCALE at
; DCTSIZE*2.  One advancing pointer (edx) walks all three in parallel.
;
; Arguments: read from the stack relative to ebp (see %defines below).
; Saved/restored: ebp, esi, edi
; Clobbered: eax, edx, xmm0-xmm7, flags
; Alignment: workspace and coef_block are accessed with movdqa, and the
;            divisors tables are used as XMMWORD memory operands of
;            paddw/pmulhuw, so all of them must be 16-byte aligned.
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; esi = input cursor, edx = divisors cursor, edi = output cursor,
    ; eax = remaining iterations.
    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/32
    alignx      16, 7
.quantloop:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7

    ; xmm4-xmm7 become per-word sign masks and stay live until the sign is
    ; restored at the bottom of the loop.
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

    ; Re-apply the original sign using the saved masks.
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Advance all three cursors by 32 elements.
    add         esi, byte 32*SIZEOF_DCTELEM
    add         edx, byte 32*SIZEOF_DCTELEM
    add         edi, byte 32*SIZEOF_JCOEF
    dec         eax
    jnz         near .quantloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32