xref: /aosp_15_r20/external/compiler-rt/lib/builtins/i386/floatundisf.S (revision 7c3d14c8b49c529e04be81a3ce6f5cc23712e4c6)
1*7c3d14c8STreehugger Robot// This file is dual licensed under the MIT and the University of Illinois Open
2*7c3d14c8STreehugger Robot// Source Licenses. See LICENSE.TXT for details.
3*7c3d14c8STreehugger Robot
4*7c3d14c8STreehugger Robot#include "../assembly.h"
5*7c3d14c8STreehugger Robot
6*7c3d14c8STreehugger Robot// float __floatundisf(du_int a);
7*7c3d14c8STreehugger Robot
8*7c3d14c8STreehugger Robot// Note that there is a hardware instruction, fildll, that does most of what
9*7c3d14c8STreehugger Robot// this function needs to do.  However, because of our ia32 ABI, it will take
10*7c3d14c8STreehugger Robot// a write-small read-large stall, so the software implementation here is
11*7c3d14c8STreehugger Robot// actually several cycles faster.
12*7c3d14c8STreehugger Robot
13*7c3d14c8STreehugger Robot// This is a branch-free implementation.  A branchy implementation might be
14*7c3d14c8STreehugger Robot// faster for the common case if you know something a priori about the input
15*7c3d14c8STreehugger Robot// distribution.
16*7c3d14c8STreehugger Robot
17*7c3d14c8STreehugger Robot/* branch-free x87 implementation - one cycle slower than without x87.
18*7c3d14c8STreehugger Robot
19*7c3d14c8STreehugger Robot#ifdef __i386__
20*7c3d14c8STreehugger Robot
21*7c3d14c8STreehugger RobotCONST_SECTION
.balign 8
23*7c3d14c8STreehugger Robot
24*7c3d14c8STreehugger Robot		.quad	0x43f0000000000000
25*7c3d14c8STreehugger Robottwop64:	.quad	0x0000000000000000
26*7c3d14c8STreehugger Robot
27*7c3d14c8STreehugger Robot#define			TWOp64			twop64-0b(%ecx,%eax,8)
28*7c3d14c8STreehugger Robot
29*7c3d14c8STreehugger Robot.text
30*7c3d14c8STreehugger Robot.balign 4
31*7c3d14c8STreehugger RobotDEFINE_COMPILERRT_FUNCTION(__floatundisf)
32*7c3d14c8STreehugger Robot	movl		8(%esp),		%eax
33*7c3d14c8STreehugger Robot	movd		8(%esp),		%xmm1
34*7c3d14c8STreehugger Robot	movd		4(%esp),		%xmm0
35*7c3d14c8STreehugger Robot	punpckldq	%xmm1,			%xmm0
36*7c3d14c8STreehugger Robot	calll		0f
37*7c3d14c8STreehugger Robot0:	popl		%ecx
38*7c3d14c8STreehugger Robot	sarl		$31,			%eax
39*7c3d14c8STreehugger Robot	movq		%xmm0,			4(%esp)
40*7c3d14c8STreehugger Robot	fildll		4(%esp)
41*7c3d14c8STreehugger Robot	faddl		TWOp64
42*7c3d14c8STreehugger Robot	fstps		4(%esp)
43*7c3d14c8STreehugger Robot	flds		4(%esp)
44*7c3d14c8STreehugger Robot	ret
45*7c3d14c8STreehugger RobotEND_COMPILERRT_FUNCTION(__floatundisf)
46*7c3d14c8STreehugger Robot
47*7c3d14c8STreehugger Robot#endif // __i386__
48*7c3d14c8STreehugger Robot
49*7c3d14c8STreehugger Robot*/
50*7c3d14c8STreehugger Robot
51*7c3d14c8STreehugger Robot/* branch-free, x87-free implementation - faster at the expense of code size */
52*7c3d14c8STreehugger Robot
53*7c3d14c8STreehugger Robot#ifdef __i386__
54*7c3d14c8STreehugger Robot
CONST_SECTION

// Read-only constants for __floatundisf.  The function reaches them relative
// to local label 0, whose address it keeps in %ecx, through the TWOp52 and
// STICKY macros defined at the end of this block.

	.balign 16
// First quadword:  bit pattern of the double 0x1.0p52.
// Second quadword: mask covering the low 12 bits of a 64-bit integer.
// STICKY with %eax == -1 resolves to sticky-8, which is this second quadword:
// twop52 is 16-byte aligned and holds exactly 16 bytes, so the alignment
// directive in front of sticky inserts no padding.  Keep the two tables
// adjacent.
twop52:	.quad	0x4330000000000000
	.quad	0xfff

	.balign 16
// All-zero mask, selected by STICKY when %eax == 0.
sticky:	.quad	0
	.long	0x12			// not referenced anywhere in this file

	.balign 16
twelve:	.long	0			// not referenced anywhere in this file

// The double 0x1.0p52.  Requires %ecx == address of local label 0.
#define TWOp52 twop52-0b(%ecx)
// Indexes 8-byte entries from sticky by %eax: the low-12-bit mask for
// %eax == -1, zero for %eax == 0.  Requires %ecx == address of local label 0.
#define STICKY sticky-0b(%ecx,%eax,8)
73*7c3d14c8STreehugger Robot
.text
.balign 4
// float __floatundisf(du_int a)
//
// Converts the unsigned 64-bit integer a to single precision without
// branching, doing all of the arithmetic in integer and SSE2 registers.
//
// In:      4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
// Out:     st(0)   = (float)a
// Scratch: %eax, %ecx, %edx, %xmm0-%xmm3, flags.  The low argument word at
//          4(%esp) is overwritten with the bits of the result.
//
// Method: a value below 2^52 is ORed into the significand field of the double
// 0x1.0p52; subtracting 0x1.0p52 then yields that value exactly as a double.
// A value of 2^52 or more is first shifted right by 12 with its low 12 bits
// ORed back in as sticky bits, converted the same way, and finally rescaled
// by adding 12 to the exponent field of the float.  The only rounding step is
// the cvtsd2ss instruction.
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	// %ecx = address of local label 0; TWOp52 and STICKY are expressed
	// relative to it so the constants can be reached position-independently.
	calll	0f
0:	popl	%ecx

	// Build the 64-bit argument in the low quadword of %xmm0 from two 32-bit
	// loads, and keep a copy of the high word in %eax.
	movd	4(%esp), %xmm0		// low word, upper lanes zeroed
	movd	8(%esp), %xmm1		// high word, upper lanes zeroed
	movl	8(%esp), %eax
	punpckldq	%xmm1, %xmm0		// %xmm0 = high:low

	// %eax = -1 when a >= 2^52 (a "big" input), otherwise 0.  The addition
	// carries into the sign bit exactly when (high >> 1) >= 0x00080000.
	shrl	$1, %eax
	addl	$0x7ff80000, %eax
	sarl	$31, %eax

	// %xmm3 = shift count: 12 for a big input, 0 otherwise.
	movl	%eax, %edx
	andl	$12, %edx
	movd	%edx, %xmm3

	movsd	STICKY, %xmm1		// big ? 0xfff : 0
	movsd	TWOp52, %xmm2		// 0x1.0p52
	andpd	%xmm0, %xmm1		// big ? a & 0xfff : 0
	psrlq	%xmm3, %xmm0		// big ? a >> 12 : a
	orpd	%xmm2, %xmm1		// bits of 0x1.0p52 | (big ? a & 0xfff : 0)
	orpd	%xmm1, %xmm0		// bits of 0x1.0p52 | (big ? (a >> 12 | a & 0xfff) : a)
	subsd	%xmm2, %xmm0		// (double)(big ? (a >> 12 | a & 0xfff) : a), exact
	cvtsd2ss	%xmm0, %xmm0		// round once to single precision

	// Undo the pre-shift: add 12 (or 0) to the exponent field of the float.
	pslld	$23, %xmm3
	paddd	%xmm3, %xmm0		// bits of (float)a

	// Hand the result back on the x87 stack, bouncing it through the
	// argument slot.
	movd	%xmm0, 4(%esp)
	flds	4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)
104*7c3d14c8STreehugger Robot
105*7c3d14c8STreehugger Robot#endif // __i386__
106*7c3d14c8STreehugger Robot
107*7c3d14c8STreehugger RobotNO_EXEC_STACK_DIRECTIVE
108*7c3d14c8STreehugger Robot
109