# MMX assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

	.text

# Evaluate sum of squares of signed 16-bit input samples
# long long sumsq_mmx_assist(signed short *in,int cnt);
#
# Syntax: GNU as, AT&T operand order, 32-bit x86 with MMX.
# In:     8(%ebp)  = in   pointer to the samples
#         12(%ebp) = cnt  number of samples
# Out:    %edx:%eax = 64-bit sum of squares
# Saved:  %ebp, %esi, %ecx, %ebx (pushed and restored here)
# Uses:   %mm0, %mm6, flags; emms is executed before returning
#
# Samples are consumed in groups of 8 (16 bytes).  A trailing group of
# fewer than 8 samples is NOT processed, and cnt < 8 (including zero or
# negative cnt) returns 0.
	.global sumsq_mmx_assist
	.type sumsq_mmx_assist,@function
	.align 16
sumsq_mmx_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%ecx
	pushl	%ebx

	movl	8(%ebp),%esi		# esi = in
	movl	12(%ebp),%ecx		# ecx = samples remaining
	xorl	%eax,%eax		# edx:eax = 0 (64-bit running sum)
	xorl	%edx,%edx

	# Each pmaddwd dword is the sum of two squares and can be as large
	# as 2 * (-32768)**2 = 2**31 (bit pattern 0x80000000).  That still
	# fits in 32 bits when read as unsigned, but adding two such dwords
	# with paddd can wrap (2**31 + 2**31 = 2**32), so every dword is
	# added to edx:eax individually with carry propagation instead.
1:	subl	$8,%ecx			# is a full group of 8 samples left?
	jl	2f			# no: done
	movq	(%esi),%mm0		# S0 S1 S2 S3
	pmaddwd	%mm0,%mm0		# (S0^2+S1^2) (S2^2+S3^2)
	movq	8(%esi),%mm6		# S4 S5 S6 S7
	pmaddwd	%mm6,%mm6		# (S4^2+S5^2) (S6^2+S7^2)

	movd	%mm0,%ebx		# S0^2+S1^2
	addl	%ebx,%eax
	adcl	$0,%edx
	psrlq	$32,%mm0
	movd	%mm0,%ebx		# S2^2+S3^2
	addl	%ebx,%eax
	adcl	$0,%edx

	movd	%mm6,%ebx		# S4^2+S5^2
	addl	%ebx,%eax
	adcl	$0,%edx
	psrlq	$32,%mm6
	movd	%mm6,%ebx		# S6^2+S7^2
	addl	%ebx,%eax
	adcl	$0,%edx

	addl	$16,%esi		# advance past the 8 samples just used
	jmp	1b

2:	emms				# leave MMX state so x87 code can run
	popl	%ebx
	popl	%ecx
	popl	%esi
	popl	%ebp
	ret

# Evaluate sum of squares of signed 16-bit input samples
# long sumsq_wd_mmx_assist(signed short *in,int cnt);
# Quick version, only safe for small numbers of small input values...
52*638691a0SAndroid Build Coastguard Worker .global sumsq_wd_mmx_assist 53*638691a0SAndroid Build Coastguard Worker .type sumsq_wd_mmx_assist,@function 54*638691a0SAndroid Build Coastguard Worker .align 16 55*638691a0SAndroid Build Coastguard Workersumsq_wd_mmx_assist: 56*638691a0SAndroid Build Coastguard Worker pushl %ebp 57*638691a0SAndroid Build Coastguard Worker movl %esp,%ebp 58*638691a0SAndroid Build Coastguard Worker pushl %esi 59*638691a0SAndroid Build Coastguard Worker 60*638691a0SAndroid Build Coastguard Worker movl 8(%ebp),%esi 61*638691a0SAndroid Build Coastguard Worker movl 12(%ebp),%ecx 62*638691a0SAndroid Build Coastguard Worker pxor %mm2,%mm2 # zero sum 63*638691a0SAndroid Build Coastguard Worker 64*638691a0SAndroid Build Coastguard Worker1: subl $8,%ecx 65*638691a0SAndroid Build Coastguard Worker jl 2f 66*638691a0SAndroid Build Coastguard Worker movq (%esi),%mm0 # S0 S1 S2 S3 67*638691a0SAndroid Build Coastguard Worker pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) 68*638691a0SAndroid Build Coastguard Worker movq 8(%esi),%mm1 69*638691a0SAndroid Build Coastguard Worker pmaddwd %mm1,%mm1 70*638691a0SAndroid Build Coastguard Worker paddd %mm1,%mm2 71*638691a0SAndroid Build Coastguard Worker paddd %mm0,%mm2 # accumulate 72*638691a0SAndroid Build Coastguard Worker 73*638691a0SAndroid Build Coastguard Worker addl $16,%esi 74*638691a0SAndroid Build Coastguard Worker jmp 1b 75*638691a0SAndroid Build Coastguard Worker 76*638691a0SAndroid Build Coastguard Worker2: movd %mm2,%eax # even sum 77*638691a0SAndroid Build Coastguard Worker psrlq $32,%mm2 78*638691a0SAndroid Build Coastguard Worker movd %mm2,%edx # odd sum 79*638691a0SAndroid Build Coastguard Worker addl %edx,%eax 80*638691a0SAndroid Build Coastguard Worker emms 81*638691a0SAndroid Build Coastguard Worker popl %esi 82*638691a0SAndroid Build Coastguard Worker popl %ebp 83*638691a0SAndroid Build Coastguard Worker ret 84