;
; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; In:     [ebp+8]   sample_data - array of row pointers; entries 0..7 are read
;         [ebp+12]  start_col   - index added to every row pointer
;         [ebp+16]  workspace   - destination of the four YMMWORD stores
; Saved:  ebp, ebx, esi, edi (pushed on entry, popped before ret)
; Used:   eax, edx, ymm0-ymm7 (not preserved)
;
; One 64-bit load is taken from each of the eight rows.  Rows are paired
; (0/1, 2/3, 4/5, 6/7) into the low/high 128-bit lanes of a ymm register,
; the bytes are zero-extended to words, 0xFF80 is added to every word, and
; the four resulting registers are stored to workspace.
;
; NOTE(review): YMMBLOCK, the JSAMPARRAY/JSAMPROW/POINTER/XMM_MMWORD size
; keywords and the SIZEOF_* constants come from the included .inc files;
; their expansions are not visible in this file.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)

    ; Rows 0 and 1: ebx/edx are reused as row pointers for every pair below.
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Rows 2 and 3
    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Rows 4 and 5
    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Rows 6 and 7
    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Even row stays in the low lane; odd row goes into the high lane.
    vinserti128 ymm0, ymm0, xmm1, 1
    vinserti128 ymm2, ymm2, xmm3, 1
    vinserti128 ymm4, ymm4, xmm5, 1
    vinserti128 ymm6, ymm6, xmm7, 1

    ; ymm1 is free from here on: row 1 already lives in ymm0's high lane.
    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
    vpunpcklbw  ymm0, ymm0, ymm1        ; zero-extend bytes -> words, per lane
    vpunpcklbw  ymm2, ymm2, ymm1
    vpunpcklbw  ymm4, ymm4, ymm1
    vpunpcklbw  ymm6, ymm6, ymm1

    ; ymm7 is free from here on: row 7 already lives in ymm6's high lane.
    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    vpaddw      ymm0, ymm0, ymm7        ; add 0xFF80 (-128 as a signed word)
    vpaddw      ymm2, ymm2, ymm7
    vpaddw      ymm4, ymm4, ymm7
    vpaddw      ymm6, ymm6, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6

    vzeroupper                          ; clear upper ymm halves before returning
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; In:     [ebp+8]   coef_block - destination of the four YMMWORD stores
;         [ebp+12]  divisors   - three tables addressed through the
;                                RECIPROCAL / CORRECTION / SCALE macros below
;         [ebp+16]  workspace  - source of the four YMMWORD loads
; Saved:  ebp, esi, edi (pushed on entry, popped before ret)
; Used:   edx, ymm0-ymm7 (not preserved)
;
; Per 16-bit element:
;   t      = |workspace| + correction
;   t      = high 16 bits of (t * reciprocal)   (unsigned multiply)
;   t      = high 16 bits of (t * scale)        (unsigned multiply)
;   result = t with the sign of the workspace value applied
;            (vpsignw also yields 0 where the workspace value is 0)
;
; NOTE(review): YMMBLOCK, DCTSIZE, SIZEOF_DCTELEM and the POINTER/JCOEFPTR
; size keywords come from the included .inc files; their expansions are not
; visible in this file.
;

; The three divisor tables are addressed at row offsets DCTSIZE*0, DCTSIZE*1
; and DCTSIZE*2 from the divisors base pointer (b).
%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]

    ; ymm4-ymm7 keep the signed inputs; they are needed again by vpsignw.
    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
    vpabsw      ymm0, ymm4              ; ymm0-ymm3 = absolute values
    vpabsw      ymm1, ymm5
    vpabsw      ymm2, ymm6
    vpabsw      ymm3, ymm7

    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]

    ; Re-apply the sign of the original inputs to the unsigned quotients.
    vpsignw     ymm0, ymm0, ymm4
    vpsignw     ymm1, ymm1, ymm5
    vpsignw     ymm2, ymm2, ymm6
    vpsignw     ymm3, ymm3, ymm7

    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3

    vzeroupper                          ; clear upper ymm halves before returning
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32