xref: /aosp_15_r20/external/fdlibm/e_sqrt.c (revision 1e651e1ef2b613db2c4b29ae59c1de74cf0222ae)
1*1e651e1eSRoland Levillain /* @(#)e_sqrt.c 1.3 95/01/18 */
2*1e651e1eSRoland Levillain /*
3*1e651e1eSRoland Levillain  * ====================================================
4*1e651e1eSRoland Levillain  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
5*1e651e1eSRoland Levillain  *
6*1e651e1eSRoland Levillain  * Developed at SunSoft, a Sun Microsystems, Inc. business.
7*1e651e1eSRoland Levillain  * Permission to use, copy, modify, and distribute this
8*1e651e1eSRoland Levillain  * software is freely granted, provided that this notice
9*1e651e1eSRoland Levillain  * is preserved.
10*1e651e1eSRoland Levillain  * ====================================================
11*1e651e1eSRoland Levillain  */
12*1e651e1eSRoland Levillain 
13*1e651e1eSRoland Levillain /* __ieee754_sqrt(x)
14*1e651e1eSRoland Levillain  * Return correctly rounded sqrt.
15*1e651e1eSRoland Levillain  *           ------------------------------------------
16*1e651e1eSRoland Levillain  *	     |  Use the hardware sqrt if you have one |
17*1e651e1eSRoland Levillain  *           ------------------------------------------
18*1e651e1eSRoland Levillain  * Method:
19*1e651e1eSRoland Levillain  *   Bit by bit method using integer arithmetic. (Slow, but portable)
20*1e651e1eSRoland Levillain  *   1. Normalization
21*1e651e1eSRoland Levillain  *	Scale x to y in [1,4) with even powers of 2:
22*1e651e1eSRoland Levillain  *	find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
23*1e651e1eSRoland Levillain  *		sqrt(x) = 2^k * ieee_sqrt(y)
24*1e651e1eSRoland Levillain  *   2. Bit by bit computation
25*1e651e1eSRoland Levillain  *	Let q  = ieee_sqrt(y) truncated to i bit after binary point (q = 1),
26*1e651e1eSRoland Levillain  *	     i							 0
27*1e651e1eSRoland Levillain  *                                     i+1         2
28*1e651e1eSRoland Levillain  *	    s  = 2*q , and	y  =  2   * ( y - q  ).		(1)
29*1e651e1eSRoland Levillain  *	     i      i            i                 i
30*1e651e1eSRoland Levillain  *
31*1e651e1eSRoland Levillain  *	To compute q    from q , one checks whether
32*1e651e1eSRoland Levillain  *		    i+1       i
33*1e651e1eSRoland Levillain  *
34*1e651e1eSRoland Levillain  *			      -(i+1) 2
35*1e651e1eSRoland Levillain  *			(q + 2      ) <= y.			(2)
36*1e651e1eSRoland Levillain  *     			  i
37*1e651e1eSRoland Levillain  *							      -(i+1)
38*1e651e1eSRoland Levillain  *	If (2) is false, then q   = q ; otherwise q   = q  + 2      .
39*1e651e1eSRoland Levillain  *		 	       i+1   i             i+1   i
40*1e651e1eSRoland Levillain  *
41*1e651e1eSRoland Levillain  *	With some algebric manipulation, it is not difficult to see
42*1e651e1eSRoland Levillain  *	that (2) is equivalent to
43*1e651e1eSRoland Levillain  *                             -(i+1)
44*1e651e1eSRoland Levillain  *			s  +  2       <= y			(3)
45*1e651e1eSRoland Levillain  *			 i                i
46*1e651e1eSRoland Levillain  *
47*1e651e1eSRoland Levillain  *	The advantage of (3) is that s  and y  can be computed by
48*1e651e1eSRoland Levillain  *				      i      i
49*1e651e1eSRoland Levillain  *	the following recurrence formula:
50*1e651e1eSRoland Levillain  *	    if (3) is false
51*1e651e1eSRoland Levillain  *
52*1e651e1eSRoland Levillain  *	    s     =  s  ,	y    = y   ;			(4)
53*1e651e1eSRoland Levillain  *	     i+1      i		 i+1    i
54*1e651e1eSRoland Levillain  *
55*1e651e1eSRoland Levillain  *	    otherwise,
56*1e651e1eSRoland Levillain  *                         -i                     -(i+1)
57*1e651e1eSRoland Levillain  *	    s	  =  s  + 2  ,  y    = y  -  s  - 2  		(5)
58*1e651e1eSRoland Levillain  *           i+1      i          i+1    i     i
59*1e651e1eSRoland Levillain  *
60*1e651e1eSRoland Levillain  *	One may easily use induction to prove (4) and (5).
61*1e651e1eSRoland Levillain  *	Note. Since the left hand side of (3) contain only i+2 bits,
62*1e651e1eSRoland Levillain  *	      it does not necessary to do a full (53-bit) comparison
63*1e651e1eSRoland Levillain  *	      in (3).
64*1e651e1eSRoland Levillain  *   3. Final rounding
65*1e651e1eSRoland Levillain  *	After generating the 53 bits result, we compute one more bit.
66*1e651e1eSRoland Levillain  *	Together with the remainder, we can decide whether the
67*1e651e1eSRoland Levillain  *	result is exact, bigger than 1/2ulp, or less than 1/2ulp
68*1e651e1eSRoland Levillain  *	(it will never equal to 1/2ulp).
69*1e651e1eSRoland Levillain  *	The rounding mode can be detected by checking whether
70*1e651e1eSRoland Levillain  *	huge + tiny is equal to huge, and whether huge - tiny is
71*1e651e1eSRoland Levillain  *	equal to huge for some floating point number "huge" and "tiny".
72*1e651e1eSRoland Levillain  *
73*1e651e1eSRoland Levillain  * Special cases:
74*1e651e1eSRoland Levillain  *	sqrt(+-0) = +-0 	... exact
75*1e651e1eSRoland Levillain  *	sqrt(inf) = inf
76*1e651e1eSRoland Levillain  *	sqrt(-ve) = NaN		... with invalid signal
77*1e651e1eSRoland Levillain  *	sqrt(NaN) = NaN		... with invalid signal for signaling NaN
78*1e651e1eSRoland Levillain  *
79*1e651e1eSRoland Levillain  * Other methods : see the appended file at the end of the program below.
80*1e651e1eSRoland Levillain  *---------------
81*1e651e1eSRoland Levillain  */
82*1e651e1eSRoland Levillain 
83*1e651e1eSRoland Levillain #include "fdlibm.h"
84*1e651e1eSRoland Levillain 
85*1e651e1eSRoland Levillain #ifdef __STDC__
86*1e651e1eSRoland Levillain static	const double	one	= 1.0, tiny=1.0e-300;
87*1e651e1eSRoland Levillain #else
88*1e651e1eSRoland Levillain static	double	one	= 1.0, tiny=1.0e-300;
89*1e651e1eSRoland Levillain #endif
90*1e651e1eSRoland Levillain 
91*1e651e1eSRoland Levillain #ifdef __STDC__
__ieee754_sqrt(double x)92*1e651e1eSRoland Levillain 	double __ieee754_sqrt(double x)
93*1e651e1eSRoland Levillain #else
94*1e651e1eSRoland Levillain 	double __ieee754_sqrt(x)
95*1e651e1eSRoland Levillain 	double x;
96*1e651e1eSRoland Levillain #endif
97*1e651e1eSRoland Levillain {
98*1e651e1eSRoland Levillain 	double z;
99*1e651e1eSRoland Levillain 	int 	sign = (int)0x80000000;
100*1e651e1eSRoland Levillain 	unsigned r,t1,s1,ix1,q1;
101*1e651e1eSRoland Levillain 	int ix0,s0,q,m,t,i;
102*1e651e1eSRoland Levillain 
103*1e651e1eSRoland Levillain 	ix0 = __HI(x);			/* high word of x */
104*1e651e1eSRoland Levillain 	ix1 = __LO(x);		/* low word of x */
105*1e651e1eSRoland Levillain 
106*1e651e1eSRoland Levillain     /* take care of Inf and NaN */
107*1e651e1eSRoland Levillain 	if((ix0&0x7ff00000)==0x7ff00000) {
108*1e651e1eSRoland Levillain 	    return x*x+x;		/* ieee_sqrt(NaN)=NaN, ieee_sqrt(+inf)=+inf
109*1e651e1eSRoland Levillain 					   ieee_sqrt(-inf)=sNaN */
110*1e651e1eSRoland Levillain 	}
111*1e651e1eSRoland Levillain     /* take care of zero */
112*1e651e1eSRoland Levillain 	if(ix0<=0) {
113*1e651e1eSRoland Levillain 	    if(((ix0&(~sign))|ix1)==0) return x;/* ieee_sqrt(+-0) = +-0 */
114*1e651e1eSRoland Levillain 	    else if(ix0<0)
115*1e651e1eSRoland Levillain 		return (x-x)/(x-x);		/* ieee_sqrt(-ve) = sNaN */
116*1e651e1eSRoland Levillain 	}
117*1e651e1eSRoland Levillain     /* normalize x */
118*1e651e1eSRoland Levillain 	m = (ix0>>20);
119*1e651e1eSRoland Levillain 	if(m==0) {				/* subnormal x */
120*1e651e1eSRoland Levillain 	    while(ix0==0) {
121*1e651e1eSRoland Levillain 		m -= 21;
122*1e651e1eSRoland Levillain 		ix0 |= (ix1>>11); ix1 <<= 21;
123*1e651e1eSRoland Levillain 	    }
124*1e651e1eSRoland Levillain 	    for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1;
125*1e651e1eSRoland Levillain 	    m -= i-1;
126*1e651e1eSRoland Levillain 	    ix0 |= (ix1>>(32-i));
127*1e651e1eSRoland Levillain 	    ix1 <<= i;
128*1e651e1eSRoland Levillain 	}
129*1e651e1eSRoland Levillain 	m -= 1023;	/* unbias exponent */
130*1e651e1eSRoland Levillain 	ix0 = (ix0&0x000fffff)|0x00100000;
131*1e651e1eSRoland Levillain 	if(m&1){	/* odd m, double x to make it even */
132*1e651e1eSRoland Levillain 	    ix0 += ix0 + ((ix1&sign)>>31);
133*1e651e1eSRoland Levillain 	    ix1 += ix1;
134*1e651e1eSRoland Levillain 	}
135*1e651e1eSRoland Levillain 	m >>= 1;	/* m = [m/2] */
136*1e651e1eSRoland Levillain 
137*1e651e1eSRoland Levillain     /* generate ieee_sqrt(x) bit by bit */
138*1e651e1eSRoland Levillain 	ix0 += ix0 + ((ix1&sign)>>31);
139*1e651e1eSRoland Levillain 	ix1 += ix1;
140*1e651e1eSRoland Levillain 	q = q1 = s0 = s1 = 0;	/* [q,q1] = ieee_sqrt(x) */
141*1e651e1eSRoland Levillain 	r = 0x00200000;		/* r = moving bit from right to left */
142*1e651e1eSRoland Levillain 
143*1e651e1eSRoland Levillain 	while(r!=0) {
144*1e651e1eSRoland Levillain 	    t = s0+r;
145*1e651e1eSRoland Levillain 	    if(t<=ix0) {
146*1e651e1eSRoland Levillain 		s0   = t+r;
147*1e651e1eSRoland Levillain 		ix0 -= t;
148*1e651e1eSRoland Levillain 		q   += r;
149*1e651e1eSRoland Levillain 	    }
150*1e651e1eSRoland Levillain 	    ix0 += ix0 + ((ix1&sign)>>31);
151*1e651e1eSRoland Levillain 	    ix1 += ix1;
152*1e651e1eSRoland Levillain 	    r>>=1;
153*1e651e1eSRoland Levillain 	}
154*1e651e1eSRoland Levillain 
155*1e651e1eSRoland Levillain 	r = sign;
156*1e651e1eSRoland Levillain 	while(r!=0) {
157*1e651e1eSRoland Levillain 	    t1 = s1+r;
158*1e651e1eSRoland Levillain 	    t  = s0;
159*1e651e1eSRoland Levillain 	    if((t<ix0)||((t==ix0)&&(t1<=ix1))) {
160*1e651e1eSRoland Levillain 		s1  = t1+r;
161*1e651e1eSRoland Levillain 		if(((t1&sign)==sign)&&(s1&sign)==0) s0 += 1;
162*1e651e1eSRoland Levillain 		ix0 -= t;
163*1e651e1eSRoland Levillain 		if (ix1 < t1) ix0 -= 1;
164*1e651e1eSRoland Levillain 		ix1 -= t1;
165*1e651e1eSRoland Levillain 		q1  += r;
166*1e651e1eSRoland Levillain 	    }
167*1e651e1eSRoland Levillain 	    ix0 += ix0 + ((ix1&sign)>>31);
168*1e651e1eSRoland Levillain 	    ix1 += ix1;
169*1e651e1eSRoland Levillain 	    r>>=1;
170*1e651e1eSRoland Levillain 	}
171*1e651e1eSRoland Levillain 
172*1e651e1eSRoland Levillain     /* use floating add to find out rounding direction */
173*1e651e1eSRoland Levillain 	if((ix0|ix1)!=0) {
174*1e651e1eSRoland Levillain 	    z = one-tiny; /* trigger inexact flag */
175*1e651e1eSRoland Levillain 	    if (z>=one) {
176*1e651e1eSRoland Levillain 	        z = one+tiny;
177*1e651e1eSRoland Levillain 	        if (q1==(unsigned)0xffffffff) { q1=0; q += 1;}
178*1e651e1eSRoland Levillain 		else if (z>one) {
179*1e651e1eSRoland Levillain 		    if (q1==(unsigned)0xfffffffe) q+=1;
180*1e651e1eSRoland Levillain 		    q1+=2;
181*1e651e1eSRoland Levillain 		} else
182*1e651e1eSRoland Levillain 	            q1 += (q1&1);
183*1e651e1eSRoland Levillain 	    }
184*1e651e1eSRoland Levillain 	}
185*1e651e1eSRoland Levillain 	ix0 = (q>>1)+0x3fe00000;
186*1e651e1eSRoland Levillain 	ix1 =  q1>>1;
187*1e651e1eSRoland Levillain 	if ((q&1)==1) ix1 |= sign;
188*1e651e1eSRoland Levillain 	ix0 += (m <<20);
189*1e651e1eSRoland Levillain 	__HI(z) = ix0;
190*1e651e1eSRoland Levillain 	__LO(z) = ix1;
191*1e651e1eSRoland Levillain 	return z;
192*1e651e1eSRoland Levillain }
193*1e651e1eSRoland Levillain 
194*1e651e1eSRoland Levillain /*
195*1e651e1eSRoland Levillain Other methods  (use floating-point arithmetic)
196*1e651e1eSRoland Levillain -------------
197*1e651e1eSRoland Levillain (This is a copy of a drafted paper by Prof W. Kahan
198*1e651e1eSRoland Levillain and K.C. Ng, written in May, 1986)
199*1e651e1eSRoland Levillain 
200*1e651e1eSRoland Levillain 	Two algorithms are given here to implement ieee_sqrt(x)
201*1e651e1eSRoland Levillain 	(IEEE double precision arithmetic) in software.
202*1e651e1eSRoland Levillain 	Both supply ieee_sqrt(x) correctly rounded. The first algorithm (in
203*1e651e1eSRoland Levillain 	Section A) uses newton iterations and involves four divisions.
204*1e651e1eSRoland Levillain 	The second one uses reciproot iterations to avoid division, but
205*1e651e1eSRoland Levillain 	requires more multiplications. Both algorithms need the ability
206*1e651e1eSRoland Levillain 	to chop results of arithmetic operations instead of round them,
207*1e651e1eSRoland Levillain 	and the INEXACT flag to indicate when an arithmetic operation
208*1e651e1eSRoland Levillain 	is executed exactly with no roundoff error, all part of the
209*1e651e1eSRoland Levillain 	standard (IEEE 754-1985). The ability to perform shift, add,
210*1e651e1eSRoland Levillain 	subtract and logical AND operations upon 32-bit words is needed
211*1e651e1eSRoland Levillain 	too, though not part of the standard.
212*1e651e1eSRoland Levillain 
213*1e651e1eSRoland Levillain A.  ieee_sqrt(x) by Newton Iteration
214*1e651e1eSRoland Levillain 
215*1e651e1eSRoland Levillain    (1)	Initial approximation
216*1e651e1eSRoland Levillain 
217*1e651e1eSRoland Levillain 	Let x0 and x1 be the leading and the trailing 32-bit words of
218*1e651e1eSRoland Levillain 	a floating point number x (in IEEE double format) respectively
219*1e651e1eSRoland Levillain 
220*1e651e1eSRoland Levillain 	    1    11		     52				  ...widths
221*1e651e1eSRoland Levillain 	   ------------------------------------------------------
222*1e651e1eSRoland Levillain 	x: |s|	  e     |	      f				|
223*1e651e1eSRoland Levillain 	   ------------------------------------------------------
224*1e651e1eSRoland Levillain 	      msb    lsb  msb				      lsb ...order
225*1e651e1eSRoland Levillain 
226*1e651e1eSRoland Levillain 
227*1e651e1eSRoland Levillain 	     ------------------------  	     ------------------------
228*1e651e1eSRoland Levillain 	x0:  |s|   e    |    f1     |	 x1: |          f2           |
229*1e651e1eSRoland Levillain 	     ------------------------  	     ------------------------
230*1e651e1eSRoland Levillain 
231*1e651e1eSRoland Levillain 	By performing shifts and subtracts on x0 and x1 (both regarded
232*1e651e1eSRoland Levillain 	as integers), we obtain an 8-bit approximation of ieee_sqrt(x) as
233*1e651e1eSRoland Levillain 	follows.
234*1e651e1eSRoland Levillain 
235*1e651e1eSRoland Levillain 		k  := (x0>>1) + 0x1ff80000;
236*1e651e1eSRoland Levillain 		y0 := k - T1[31&(k>>15)].	... y ~ ieee_sqrt(x) to 8 bits
237*1e651e1eSRoland Levillain 	Here k is a 32-bit integer and T1[] is an integer array containing
238*1e651e1eSRoland Levillain 	correction terms. Now magically the floating value of y (y's
239*1e651e1eSRoland Levillain 	leading 32-bit word is y0, the value of its trailing word is 0)
240*1e651e1eSRoland Levillain 	approximates ieee_sqrt(x) to almost 8-bit.
241*1e651e1eSRoland Levillain 
242*1e651e1eSRoland Levillain 	Value of T1:
243*1e651e1eSRoland Levillain 	static int T1[32]= {
244*1e651e1eSRoland Levillain 	0,	1024,	3062,	5746,	9193,	13348,	18162,	23592,
245*1e651e1eSRoland Levillain 	29598,	36145,	43202,	50740,	58733,	67158,	75992,	85215,
246*1e651e1eSRoland Levillain 	83599,	71378,	60428,	50647,	41945,	34246,	27478,	21581,
247*1e651e1eSRoland Levillain 	16499,	12183,	8588,	5674,	3403,	1742,	661,	130,};
248*1e651e1eSRoland Levillain 
249*1e651e1eSRoland Levillain     (2)	Iterative refinement
250*1e651e1eSRoland Levillain 
251*1e651e1eSRoland Levillain 	Apply Heron's rule three times to y, we have y approximates
252*1e651e1eSRoland Levillain 	sqrt(x) to within 1 ulp (Unit in the Last Place):
253*1e651e1eSRoland Levillain 
254*1e651e1eSRoland Levillain 		y := (y+x/y)/2		... almost 17 sig. bits
255*1e651e1eSRoland Levillain 		y := (y+x/y)/2		... almost 35 sig. bits
256*1e651e1eSRoland Levillain 		y := y-(y-x/y)/2	... within 1 ulp
257*1e651e1eSRoland Levillain 
258*1e651e1eSRoland Levillain 
259*1e651e1eSRoland Levillain 	Remark 1.
260*1e651e1eSRoland Levillain 	    Another way to improve y to within 1 ulp is:
261*1e651e1eSRoland Levillain 
262*1e651e1eSRoland Levillain 		y := (y+x/y)		... almost 17 sig. bits to 2*ieee_sqrt(x)
263*1e651e1eSRoland Levillain 		y := y - 0x00100006	... almost 18 sig. bits to ieee_sqrt(x)
264*1e651e1eSRoland Levillain 
265*1e651e1eSRoland Levillain 				2
266*1e651e1eSRoland Levillain 			    (x-y )*y
267*1e651e1eSRoland Levillain 		y := y + 2* ----------	...within 1 ulp
268*1e651e1eSRoland Levillain 			       2
269*1e651e1eSRoland Levillain 			     3y  + x
270*1e651e1eSRoland Levillain 
271*1e651e1eSRoland Levillain 
272*1e651e1eSRoland Levillain 	This formula has one division fewer than the one above; however,
273*1e651e1eSRoland Levillain 	it requires more multiplications and additions. Also x must be
274*1e651e1eSRoland Levillain 	scaled in advance to avoid spurious overflow in evaluating the
275*1e651e1eSRoland Levillain 	expression 3y*y+x. Hence it is not recommended uless division
276*1e651e1eSRoland Levillain 	is slow. If division is very slow, then one should use the
277*1e651e1eSRoland Levillain 	reciproot algorithm given in section B.
278*1e651e1eSRoland Levillain 
279*1e651e1eSRoland Levillain     (3) Final adjustment
280*1e651e1eSRoland Levillain 
281*1e651e1eSRoland Levillain 	By twiddling y's last bit it is possible to force y to be
282*1e651e1eSRoland Levillain 	correctly rounded according to the prevailing rounding mode
283*1e651e1eSRoland Levillain 	as follows. Let r and i be copies of the rounding mode and
284*1e651e1eSRoland Levillain 	inexact flag before entering the square root program. Also we
285*1e651e1eSRoland Levillain 	use the expression y+-ulp for the next representable floating
286*1e651e1eSRoland Levillain 	numbers (up and down) of y. Note that y+-ulp = either fixed
287*1e651e1eSRoland Levillain 	point y+-1, or multiply y by ieee_nextafter(1,+-inf) in chopped
288*1e651e1eSRoland Levillain 	mode.
289*1e651e1eSRoland Levillain 
290*1e651e1eSRoland Levillain 		I := FALSE;	... reset INEXACT flag I
291*1e651e1eSRoland Levillain 		R := RZ;	... set rounding mode to round-toward-zero
292*1e651e1eSRoland Levillain 		z := x/y;	... chopped quotient, possibly inexact
293*1e651e1eSRoland Levillain 		If(not I) then {	... if the quotient is exact
294*1e651e1eSRoland Levillain 		    if(z=y) {
295*1e651e1eSRoland Levillain 		        I := i;	 ... restore inexact flag
296*1e651e1eSRoland Levillain 		        R := r;  ... restore rounded mode
297*1e651e1eSRoland Levillain 		        return ieee_sqrt(x):=y.
298*1e651e1eSRoland Levillain 		    } else {
299*1e651e1eSRoland Levillain 			z := z - ulp;	... special rounding
300*1e651e1eSRoland Levillain 		    }
301*1e651e1eSRoland Levillain 		}
302*1e651e1eSRoland Levillain 		i := TRUE;		... ieee_sqrt(x) is inexact
303*1e651e1eSRoland Levillain 		If (r=RN) then z=z+ulp	... rounded-to-nearest
304*1e651e1eSRoland Levillain 		If (r=RP) then {	... round-toward-+inf
305*1e651e1eSRoland Levillain 		    y = y+ulp; z=z+ulp;
306*1e651e1eSRoland Levillain 		}
307*1e651e1eSRoland Levillain 		y := y+z;		... chopped sum
308*1e651e1eSRoland Levillain 		y0:=y0-0x00100000;	... y := y/2 is correctly rounded.
309*1e651e1eSRoland Levillain 	        I := i;	 		... restore inexact flag
310*1e651e1eSRoland Levillain 	        R := r;  		... restore rounded mode
311*1e651e1eSRoland Levillain 	        return ieee_sqrt(x):=y.
312*1e651e1eSRoland Levillain 
313*1e651e1eSRoland Levillain     (4)	Special cases
314*1e651e1eSRoland Levillain 
315*1e651e1eSRoland Levillain 	Square root of +inf, +-0, or NaN is itself;
316*1e651e1eSRoland Levillain 	Square root of a negative number is NaN with invalid signal.
317*1e651e1eSRoland Levillain 
318*1e651e1eSRoland Levillain 
319*1e651e1eSRoland Levillain B.  ieee_sqrt(x) by Reciproot Iteration
320*1e651e1eSRoland Levillain 
321*1e651e1eSRoland Levillain    (1)	Initial approximation
322*1e651e1eSRoland Levillain 
323*1e651e1eSRoland Levillain 	Let x0 and x1 be the leading and the trailing 32-bit words of
324*1e651e1eSRoland Levillain 	a floating point number x (in IEEE double format) respectively
325*1e651e1eSRoland Levillain 	(see section A). By performing shifs and subtracts on x0 and y0,
326*1e651e1eSRoland Levillain 	we obtain a 7.8-bit approximation of 1/ieee_sqrt(x) as follows.
327*1e651e1eSRoland Levillain 
328*1e651e1eSRoland Levillain 	    k := 0x5fe80000 - (x0>>1);
329*1e651e1eSRoland Levillain 	    y0:= k - T2[63&(k>>14)].	... y ~ 1/ieee_sqrt(x) to 7.8 bits
330*1e651e1eSRoland Levillain 
331*1e651e1eSRoland Levillain 	Here k is a 32-bit integer and T2[] is an integer array
332*1e651e1eSRoland Levillain 	containing correction terms. Now magically the floating
333*1e651e1eSRoland Levillain 	value of y (y's leading 32-bit word is y0, the value of
334*1e651e1eSRoland Levillain 	its trailing word y1 is set to zero) approximates 1/ieee_sqrt(x)
335*1e651e1eSRoland Levillain 	to almost 7.8-bit.
336*1e651e1eSRoland Levillain 
337*1e651e1eSRoland Levillain 	Value of T2:
338*1e651e1eSRoland Levillain 	static int T2[64]= {
339*1e651e1eSRoland Levillain 	0x1500,	0x2ef8,	0x4d67,	0x6b02,	0x87be,	0xa395,	0xbe7a,	0xd866,
340*1e651e1eSRoland Levillain 	0xf14a,	0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
341*1e651e1eSRoland Levillain 	0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
342*1e651e1eSRoland Levillain 	0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
343*1e651e1eSRoland Levillain 	0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
344*1e651e1eSRoland Levillain 	0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
345*1e651e1eSRoland Levillain 	0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
346*1e651e1eSRoland Levillain 	0x1527f,0x1334a,0x11051,0xe951,	0xbe01,	0x8e0d,	0x5924,	0x1edd,};
347*1e651e1eSRoland Levillain 
348*1e651e1eSRoland Levillain     (2)	Iterative refinement
349*1e651e1eSRoland Levillain 
350*1e651e1eSRoland Levillain 	Apply Reciproot iteration three times to y and multiply the
351*1e651e1eSRoland Levillain 	result by x to get an approximation z that matches ieee_sqrt(x)
352*1e651e1eSRoland Levillain 	to about 1 ulp. To be exact, we will have
353*1e651e1eSRoland Levillain 		-1ulp < ieee_sqrt(x)-z<1.0625ulp.
354*1e651e1eSRoland Levillain 
355*1e651e1eSRoland Levillain 	... set rounding mode to Round-to-nearest
356*1e651e1eSRoland Levillain 	   y := y*(1.5-0.5*x*y*y)	... almost 15 sig. bits to 1/ieee_sqrt(x)
357*1e651e1eSRoland Levillain 	   y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/ieee_sqrt(x)
358*1e651e1eSRoland Levillain 	... special arrangement for better accuracy
359*1e651e1eSRoland Levillain 	   z := x*y			... 29 bits to ieee_sqrt(x), with z*y<1
360*1e651e1eSRoland Levillain 	   z := z + 0.5*z*(1-z*y)	... about 1 ulp to ieee_sqrt(x)
361*1e651e1eSRoland Levillain 
362*1e651e1eSRoland Levillain 	Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that
363*1e651e1eSRoland Levillain 	(a) the term z*y in the final iteration is always less than 1;
364*1e651e1eSRoland Levillain 	(b) the error in the final result is biased upward so that
365*1e651e1eSRoland Levillain 		-1 ulp < ieee_sqrt(x) - z < 1.0625 ulp
366*1e651e1eSRoland Levillain 	    instead of |ieee_sqrt(x)-z|<1.03125ulp.
367*1e651e1eSRoland Levillain 
368*1e651e1eSRoland Levillain     (3)	Final adjustment
369*1e651e1eSRoland Levillain 
370*1e651e1eSRoland Levillain 	By twiddling y's last bit it is possible to force y to be
371*1e651e1eSRoland Levillain 	correctly rounded according to the prevailing rounding mode
372*1e651e1eSRoland Levillain 	as follows. Let r and i be copies of the rounding mode and
373*1e651e1eSRoland Levillain 	inexact flag before entering the square root program. Also we
374*1e651e1eSRoland Levillain 	use the expression y+-ulp for the next representable floating
375*1e651e1eSRoland Levillain 	numbers (up and down) of y. Note that y+-ulp = either fixed
376*1e651e1eSRoland Levillain 	point y+-1, or multiply y by ieee_nextafter(1,+-inf) in chopped
377*1e651e1eSRoland Levillain 	mode.
378*1e651e1eSRoland Levillain 
379*1e651e1eSRoland Levillain 	R := RZ;		... set rounding mode to round-toward-zero
380*1e651e1eSRoland Levillain 	switch(r) {
381*1e651e1eSRoland Levillain 	    case RN:		... round-to-nearest
382*1e651e1eSRoland Levillain 	       if(x<= z*(z-ulp)...chopped) z = z - ulp; else
383*1e651e1eSRoland Levillain 	       if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;
384*1e651e1eSRoland Levillain 	       break;
385*1e651e1eSRoland Levillain 	    case RZ:case RM:	... round-to-zero or round-to--inf
386*1e651e1eSRoland Levillain 	       R:=RP;		... reset rounding mod to round-to-+inf
387*1e651e1eSRoland Levillain 	       if(x<z*z ... rounded up) z = z - ulp; else
388*1e651e1eSRoland Levillain 	       if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;
389*1e651e1eSRoland Levillain 	       break;
390*1e651e1eSRoland Levillain 	    case RP:		... round-to-+inf
391*1e651e1eSRoland Levillain 	       if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else
392*1e651e1eSRoland Levillain 	       if(x>z*z ...chopped) z = z+ulp;
393*1e651e1eSRoland Levillain 	       break;
394*1e651e1eSRoland Levillain 	}
395*1e651e1eSRoland Levillain 
396*1e651e1eSRoland Levillain 	Remark 3. The above comparisons can be done in fixed point. For
397*1e651e1eSRoland Levillain 	example, to compare x and w=z*z chopped, it suffices to compare
398*1e651e1eSRoland Levillain 	x1 and w1 (the trailing parts of x and w), regarding them as
399*1e651e1eSRoland Levillain 	two's complement integers.
400*1e651e1eSRoland Levillain 
401*1e651e1eSRoland Levillain 	...Is z an exact square root?
402*1e651e1eSRoland Levillain 	To determine whether z is an exact square root of x, let z1 be the
403*1e651e1eSRoland Levillain 	trailing part of z, and also let x0 and x1 be the leading and
404*1e651e1eSRoland Levillain 	trailing parts of x.
405*1e651e1eSRoland Levillain 
406*1e651e1eSRoland Levillain 	If ((z1&0x03ffffff)!=0)	... not exact if trailing 26 bits of z!=0
407*1e651e1eSRoland Levillain 	    I := 1;		... Raise Inexact flag: z is not exact
408*1e651e1eSRoland Levillain 	else {
409*1e651e1eSRoland Levillain 	    j := 1 - [(x0>>20)&1]	... j = ieee_logb(x) mod 2
410*1e651e1eSRoland Levillain 	    k := z1 >> 26;		... get z's 25-th and 26-th
411*1e651e1eSRoland Levillain 					    fraction bits
412*1e651e1eSRoland Levillain 	    I := i or (k&j) or ((k&(j+j+1))!=(x1&3));
413*1e651e1eSRoland Levillain 	}
414*1e651e1eSRoland Levillain 	R:= r		... restore rounded mode
415*1e651e1eSRoland Levillain 	return ieee_sqrt(x):=z.
416*1e651e1eSRoland Levillain 
417*1e651e1eSRoland Levillain 	If multiplication is cheaper then the foregoing red tape, the
418*1e651e1eSRoland Levillain 	Inexact flag can be evaluated by
419*1e651e1eSRoland Levillain 
420*1e651e1eSRoland Levillain 	    I := i;
421*1e651e1eSRoland Levillain 	    I := (z*z!=x) or I.
422*1e651e1eSRoland Levillain 
423*1e651e1eSRoland Levillain 	Note that z*z can overwrite I; this value must be sensed if it is
424*1e651e1eSRoland Levillain 	True.
425*1e651e1eSRoland Levillain 
426*1e651e1eSRoland Levillain 	Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be
427*1e651e1eSRoland Levillain 	zero.
428*1e651e1eSRoland Levillain 
429*1e651e1eSRoland Levillain 		    --------------------
430*1e651e1eSRoland Levillain 		z1: |        f2        |
431*1e651e1eSRoland Levillain 		    --------------------
432*1e651e1eSRoland Levillain 		bit 31		   bit 0
433*1e651e1eSRoland Levillain 
434*1e651e1eSRoland Levillain 	Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd
435*1e651e1eSRoland Levillain 	or even of ieee_logb(x) have the following relations:
436*1e651e1eSRoland Levillain 
437*1e651e1eSRoland Levillain 	-------------------------------------------------
438*1e651e1eSRoland Levillain 	bit 27,26 of z1		bit 1,0 of x1	logb(x)
439*1e651e1eSRoland Levillain 	-------------------------------------------------
440*1e651e1eSRoland Levillain 	00			00		odd and even
441*1e651e1eSRoland Levillain 	01			01		even
442*1e651e1eSRoland Levillain 	10			10		odd
443*1e651e1eSRoland Levillain 	10			00		even
444*1e651e1eSRoland Levillain 	11			01		even
445*1e651e1eSRoland Levillain 	-------------------------------------------------
446*1e651e1eSRoland Levillain 
447*1e651e1eSRoland Levillain     (4)	Special cases (see (4) of Section A).
448*1e651e1eSRoland Levillain 
449*1e651e1eSRoland Levillain  */
450*1e651e1eSRoland Levillain 
451