;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-operand helpers used by the second transpose phase.  Both expand to a
; single shufps that keeps two lanes of %1 in the low half of the result and
; takes the matching two lanes of %2 for the high half.  %1 is overwritten;
; %2 is only read.

%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44            ; imm 01_00_01_00: low pair of each
%endmacro

%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE            ; imm 11_10_11_10: high pair of each
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

; Constant table.  Every PD_* entry is one single-precision value repeated
; four times (one full xmm register), so it can be used directly as a packed
; memory operand.  PB_CENTERJSAMP is CENTERJSAMPLE repeated in all 16 bytes.

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4  dd  1.414213562373095048801689
PD_1_847        times 4  dd  1.847759065022573512256366
PD_1_082        times 4  dd  1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
; Alignment directive that closes the constant section above.
    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; The routine runs in two passes of DCTSIZE/4 iterations each, four lanes
; per iteration:
;   Pass 1 reads coef_block, multiplies by dct_table, performs the column
;          transform and stores the transposed result in the on-stack
;          workspace[].
;   Pass 2 reads workspace[], performs the row transform, rounds, adds
;          CENTERJSAMPLE and stores 8 bytes into each of four output rows
;          at byte offset output_col.
;
; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, flags; rbx is used
; but saved/restored here with push/pop.
; NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64 ABI;
; presumably collect_args/uncollect_args take care of that -- confirm in
; jsimdext.inc.
;

; Argument registers as referenced after "collect_args 4":
; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col

; Stack frame, addressed through the 16-byte-aligned rbp set up in the
; prologue:
;   [rbp + 0]         rsp value captured right after "push rbp"; the
;                     epilogue's "pop rsp" reads it back
;   wk(0) .. wk(1)    two xmmword scratch slots directly below rbp
;   workspace         DCTSIZE2 FAST_FLOATs directly below wk(0)
%define original_rbp  rbp + 0
%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ;       (address of saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [workspace]             ; reserve wk[] + workspace[]
    collect_args 4
    push        rbx

    ; ---- Pass 1: process columns from input, store into work array.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr
    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rcx, DCTSIZE/4          ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Cheap early-out: if the first dword of row 1 or row 2 is nonzero
    ; there is no point in running the full all-zero test below.
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; Full test: OR together the four coefficients of rows 1..7.
    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    packsswb    xmm1, xmm1              ; saturating pack: a nonzero word
                                        ; stays a nonzero byte
    movd        eax, xmm1               ; 4 packed bytes; upper rax zeroed
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    ; Broadcast each dequantized DC term across a whole register.
    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
%endif
.columnDCT:

    ; -- Even part

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

    ; Sign-extend each 16-bit coefficient to 32 bits (duplicate the word,
    ; then arithmetic-shift the dword right) and convert to float.
    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

    ; Dequantize.
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; Spill tmp2/tmp3; all eight xmm registers are needed for the odd part.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Swap: reload the spilled tmp2/tmp3 and park the half-transposed
    ; rows 6/7 in the same two scratch slots.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         rcx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block
    ;
    ; rsi has advanced by (DCTSIZE/4)*4 JCOEFs during the loop above; the
    ; "-8" below compensates for that (i.e. it assumes DCTSIZE == 8), so
    ; the base address is coef_block + DCTSIZE2*SIZEOF_JCOEF.

    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d               ; rax = output_col (zero-extended)
    mov         rcx, DCTSIZE/4          ; ctr
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> integer by magic-number addition: after adding
    ; PD_RNDINT_MAGIC the low 16 bits of each dword hold roundint(x/8);
    ; the upper 16 bits ("**") are garbage and are masked or shifted out.
    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1     ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1     ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1     ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1     ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2     ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2     ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2     ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2     ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    ; Saturate the 16-bit results to signed bytes, then add CENTERJSAMPLE
    ; to every byte (paddb wraps modulo 256).
    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword,
    ; which is the part movq stores.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; Fetch four row pointers from output_buf and store 8 samples into
    ; each row at byte offset rax*SIZEOF_JSAMPLE.
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         rdi, byte 4*SIZEOF_JSAMPROW
    dec         rcx                            ; ctr
    jnz         near .rowloop

    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved at [rbp + 0]
                                        ;        (address of saved rbp)
    pop         rbp                     ; restore caller's rbp
    ret
479*dfc6aa5cSAndroid Build Coastguard Worker 480*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the 481*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this. 482*dfc6aa5cSAndroid Build Coastguard Worker align 32 483