xref: /aosp_15_r20/external/fec/sumsq_mmx_assist.s (revision 638691a093b4f9473cd6ee8f3e0139deef159a86)
1*638691a0SAndroid Build Coastguard Worker# MMX assist routines for sumsq
2*638691a0SAndroid Build Coastguard Worker# Copyright 2001 Phil Karn, KA9Q
3*638691a0SAndroid Build Coastguard Worker# May be used under the terms of the GNU Public License (GPL)
4*638691a0SAndroid Build Coastguard Worker
5*638691a0SAndroid Build Coastguard Worker	.text
6*638691a0SAndroid Build Coastguard Worker
#-----------------------------------------------------------------------
# long long sumsq_mmx_assist(signed short *in, int cnt);
#
# Sum of squares of signed 16-bit input samples, accumulated in 64 bits.
#
# ABI:   32-bit x86, arguments on the stack (in at 8(%ebp), cnt at
#        12(%ebp) once the frame is set up).
# Out:   %edx:%eax = 64-bit sum of squares.
# Saves: %ebp, %esi, %ecx, %ebx are pushed on entry and restored on exit.
# Clobb: %mm0, %mm6, flags.  emms is executed before returning.
#
# Samples are consumed in groups of 8 (16 bytes).  A trailing group of
# fewer than 8 samples is NOT processed, so cnt < 8 yields 0.
# NOTE(review): presumably the C caller handles the leftover samples --
# confirm against the caller.
#
# Overflow note: pmaddwd produces S[2i]^2 + S[2i+1]^2 per 32-bit lane.
# The largest possible lane value is 2 * (-32768)^2 = 2^31, which is
# still exact when the lane is read as an unsigned 32-bit quantity.
# Adding two such lanes together in 32 bits (paddd) can reach 2^32 and
# wrap to 0, so every lane is added to the 64-bit accumulator on its own
# and the carry is propagated with adcl.
#-----------------------------------------------------------------------
	.global	sumsq_mmx_assist
	.type	sumsq_mmx_assist,@function
	.p2align 4
sumsq_mmx_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%ecx
	pushl	%ebx

	movl	8(%ebp),%esi		# esi = in (advances 16 bytes per pass)
	movl	12(%ebp),%ecx		# ecx = samples still to process
	xorl	%eax,%eax		# edx:eax = running 64-bit sum
	xorl	%edx,%edx

.Lsumsq_loop:
	subl	$8,%ecx			# claim the next 8 samples
	jl	.Lsumsq_done		# fewer than 8 left: stop

	movq	(%esi),%mm0		# S0 S1 S2 S3
	pmaddwd	%mm0,%mm0		# (S0^2+S1^2) (S2^2+S3^2)
	movq	8(%esi),%mm6		# S4 S5 S6 S7
	pmaddwd	%mm6,%mm6		# (S4^2+S5^2) (S6^2+S7^2)

	movd	%mm0,%ebx		# lane 0: S0^2+S1^2
	addl	%ebx,%eax
	adcl	$0,%edx
	psrlq	$32,%mm0
	movd	%mm0,%ebx		# lane 1: S2^2+S3^2
	addl	%ebx,%eax
	adcl	$0,%edx

	movd	%mm6,%ebx		# lane 2: S4^2+S5^2
	addl	%ebx,%eax
	adcl	$0,%edx
	psrlq	$32,%mm6
	movd	%mm6,%ebx		# lane 3: S6^2+S7^2
	addl	%ebx,%eax
	adcl	$0,%edx

	addl	$16,%esi		# step past the 8 samples just summed
	jmp	.Lsumsq_loop

.Lsumsq_done:
	emms				# leave MMX state so x87 code can run
	popl	%ebx
	popl	%ecx
	popl	%esi
	popl	%ebp
	ret
	.size	sumsq_mmx_assist, .-sumsq_mmx_assist
48*638691a0SAndroid Build Coastguard Worker
#-----------------------------------------------------------------------
# long sumsq_wd_mmx_assist(signed short *in, int cnt);
#
# Fast sum of squares of signed 16-bit input samples, kept entirely in
# 32-bit lanes.  Nothing widens the running totals, so the result is
# only meaningful while each lane total (and their final sum) fits in
# 32 bits -- i.e. for small numbers of small input values.
#
# ABI:   32-bit x86, arguments on the stack (in at 8(%ebp), cnt at
#        12(%ebp) once the frame is set up).
# Out:   %eax = sum of squares (32-bit).
# Saves: %ebp, %esi.
# Clobb: %ecx, %edx, %mm0, %mm1, %mm2, flags.  emms is executed before
#        returning.
#
# Samples are consumed in groups of 8 (16 bytes); a trailing group of
# fewer than 8 samples is not processed.
#-----------------------------------------------------------------------
	.global	sumsq_wd_mmx_assist
	.type	sumsq_wd_mmx_assist,@function
	.p2align 4
sumsq_wd_mmx_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi

	movl	12(%ebp),%ecx		# ecx = samples still to process
	movl	8(%ebp),%esi		# esi = in
	pxor	%mm2,%mm2		# mm2 = two 32-bit partial sums, zeroed

	subl	$8,%ecx			# is there a full group of 8?
	jl	.Lsumsq_wd_done

.Lsumsq_wd_loop:
	movq	(%esi),%mm0		# S0 S1 S2 S3
	movq	8(%esi),%mm1		# S4 S5 S6 S7
	pmaddwd	%mm0,%mm0		# (S0*S0+S1*S1) (S2*S2+S3*S3)
	pmaddwd	%mm1,%mm1		# (S4*S4+S5*S5) (S6*S6+S7*S7)
	paddd	%mm0,%mm2		# fold both results into the lanes
	paddd	%mm1,%mm2

	addl	$16,%esi		# step past the 8 samples just summed
	subl	$8,%ecx			# another full group available?
	jge	.Lsumsq_wd_loop

.Lsumsq_wd_done:
	movd	%mm2,%eax		# low lane total
	psrlq	$32,%mm2
	movd	%mm2,%edx		# high lane total
	addl	%edx,%eax		# eax = low + high
	emms				# leave MMX state so x87 code can run
	popl	%esi
	popl	%ebp
	ret
84