1*1e651e1eSRoland Levillain /* @(#)e_sqrt.c 1.3 95/01/18 */ 2*1e651e1eSRoland Levillain /* 3*1e651e1eSRoland Levillain * ==================================================== 4*1e651e1eSRoland Levillain * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. 5*1e651e1eSRoland Levillain * 6*1e651e1eSRoland Levillain * Developed at SunSoft, a Sun Microsystems, Inc. business. 7*1e651e1eSRoland Levillain * Permission to use, copy, modify, and distribute this 8*1e651e1eSRoland Levillain * software is freely granted, provided that this notice 9*1e651e1eSRoland Levillain * is preserved. 10*1e651e1eSRoland Levillain * ==================================================== 11*1e651e1eSRoland Levillain */ 12*1e651e1eSRoland Levillain 13*1e651e1eSRoland Levillain /* __ieee754_sqrt(x) 14*1e651e1eSRoland Levillain * Return correctly rounded sqrt. 15*1e651e1eSRoland Levillain * ------------------------------------------ 16*1e651e1eSRoland Levillain * | Use the hardware sqrt if you have one | 17*1e651e1eSRoland Levillain * ------------------------------------------ 18*1e651e1eSRoland Levillain * Method: 19*1e651e1eSRoland Levillain * Bit by bit method using integer arithmetic. (Slow, but portable) 20*1e651e1eSRoland Levillain * 1. Normalization 21*1e651e1eSRoland Levillain * Scale x to y in [1,4) with even powers of 2: 22*1e651e1eSRoland Levillain * find an integer k such that 1 <= (y=x*2^(2k)) < 4, then 23*1e651e1eSRoland Levillain * sqrt(x) = 2^k * ieee_sqrt(y) 24*1e651e1eSRoland Levillain * 2. Bit by bit computation 25*1e651e1eSRoland Levillain * Let q = ieee_sqrt(y) truncated to i bit after binary point (q = 1), 26*1e651e1eSRoland Levillain * i 0 27*1e651e1eSRoland Levillain * i+1 2 28*1e651e1eSRoland Levillain * s = 2*q , and y = 2 * ( y - q ). (1) 29*1e651e1eSRoland Levillain * i i i i 30*1e651e1eSRoland Levillain * 31*1e651e1eSRoland Levillain * To compute q from q , one checks whether 32*1e651e1eSRoland Levillain * i+1 i 33*1e651e1eSRoland Levillain * 34*1e651e1eSRoland Levillain * -(i+1) 2 35*1e651e1eSRoland Levillain * (q + 2 ) <= y. (2) 36*1e651e1eSRoland Levillain * i 37*1e651e1eSRoland Levillain * -(i+1) 38*1e651e1eSRoland Levillain * If (2) is false, then q = q ; otherwise q = q + 2 . 39*1e651e1eSRoland Levillain * i+1 i i+1 i 40*1e651e1eSRoland Levillain * 41*1e651e1eSRoland Levillain * With some algebric manipulation, it is not difficult to see 42*1e651e1eSRoland Levillain * that (2) is equivalent to 43*1e651e1eSRoland Levillain * -(i+1) 44*1e651e1eSRoland Levillain * s + 2 <= y (3) 45*1e651e1eSRoland Levillain * i i 46*1e651e1eSRoland Levillain * 47*1e651e1eSRoland Levillain * The advantage of (3) is that s and y can be computed by 48*1e651e1eSRoland Levillain * i i 49*1e651e1eSRoland Levillain * the following recurrence formula: 50*1e651e1eSRoland Levillain * if (3) is false 51*1e651e1eSRoland Levillain * 52*1e651e1eSRoland Levillain * s = s , y = y ; (4) 53*1e651e1eSRoland Levillain * i+1 i i+1 i 54*1e651e1eSRoland Levillain * 55*1e651e1eSRoland Levillain * otherwise, 56*1e651e1eSRoland Levillain * -i -(i+1) 57*1e651e1eSRoland Levillain * s = s + 2 , y = y - s - 2 (5) 58*1e651e1eSRoland Levillain * i+1 i i+1 i i 59*1e651e1eSRoland Levillain * 60*1e651e1eSRoland Levillain * One may easily use induction to prove (4) and (5). 61*1e651e1eSRoland Levillain * Note. Since the left hand side of (3) contain only i+2 bits, 62*1e651e1eSRoland Levillain * it does not necessary to do a full (53-bit) comparison 63*1e651e1eSRoland Levillain * in (3). 64*1e651e1eSRoland Levillain * 3. Final rounding 65*1e651e1eSRoland Levillain * After generating the 53 bits result, we compute one more bit. 66*1e651e1eSRoland Levillain * Together with the remainder, we can decide whether the 67*1e651e1eSRoland Levillain * result is exact, bigger than 1/2ulp, or less than 1/2ulp 68*1e651e1eSRoland Levillain * (it will never equal to 1/2ulp). 69*1e651e1eSRoland Levillain * The rounding mode can be detected by checking whether 70*1e651e1eSRoland Levillain * huge + tiny is equal to huge, and whether huge - tiny is 71*1e651e1eSRoland Levillain * equal to huge for some floating point number "huge" and "tiny". 72*1e651e1eSRoland Levillain * 73*1e651e1eSRoland Levillain * Special cases: 74*1e651e1eSRoland Levillain * sqrt(+-0) = +-0 ... exact 75*1e651e1eSRoland Levillain * sqrt(inf) = inf 76*1e651e1eSRoland Levillain * sqrt(-ve) = NaN ... with invalid signal 77*1e651e1eSRoland Levillain * sqrt(NaN) = NaN ... with invalid signal for signaling NaN 78*1e651e1eSRoland Levillain * 79*1e651e1eSRoland Levillain * Other methods : see the appended file at the end of the program below. 80*1e651e1eSRoland Levillain *--------------- 81*1e651e1eSRoland Levillain */ 82*1e651e1eSRoland Levillain 83*1e651e1eSRoland Levillain #include "fdlibm.h" 84*1e651e1eSRoland Levillain 85*1e651e1eSRoland Levillain #ifdef __STDC__ 86*1e651e1eSRoland Levillain static const double one = 1.0, tiny=1.0e-300; 87*1e651e1eSRoland Levillain #else 88*1e651e1eSRoland Levillain static double one = 1.0, tiny=1.0e-300; 89*1e651e1eSRoland Levillain #endif 90*1e651e1eSRoland Levillain 91*1e651e1eSRoland Levillain #ifdef __STDC__ __ieee754_sqrt(double x)92*1e651e1eSRoland Levillain double __ieee754_sqrt(double x) 93*1e651e1eSRoland Levillain #else 94*1e651e1eSRoland Levillain double __ieee754_sqrt(x) 95*1e651e1eSRoland Levillain double x; 96*1e651e1eSRoland Levillain #endif 97*1e651e1eSRoland Levillain { 98*1e651e1eSRoland Levillain double z; 99*1e651e1eSRoland Levillain int sign = (int)0x80000000; 100*1e651e1eSRoland Levillain unsigned r,t1,s1,ix1,q1; 101*1e651e1eSRoland Levillain int ix0,s0,q,m,t,i; 102*1e651e1eSRoland Levillain 103*1e651e1eSRoland Levillain ix0 = __HI(x); /* high word of x */ 104*1e651e1eSRoland Levillain ix1 = __LO(x); /* low word of x */ 105*1e651e1eSRoland Levillain 106*1e651e1eSRoland Levillain /* take care of Inf and NaN */ 107*1e651e1eSRoland Levillain if((ix0&0x7ff00000)==0x7ff00000) { 108*1e651e1eSRoland Levillain return x*x+x; /* ieee_sqrt(NaN)=NaN, ieee_sqrt(+inf)=+inf 109*1e651e1eSRoland Levillain ieee_sqrt(-inf)=sNaN */ 110*1e651e1eSRoland Levillain } 111*1e651e1eSRoland Levillain /* take care of zero */ 112*1e651e1eSRoland Levillain if(ix0<=0) { 113*1e651e1eSRoland Levillain if(((ix0&(~sign))|ix1)==0) return x;/* ieee_sqrt(+-0) = +-0 */ 114*1e651e1eSRoland Levillain else if(ix0<0) 115*1e651e1eSRoland Levillain return (x-x)/(x-x); /* ieee_sqrt(-ve) = sNaN */ 116*1e651e1eSRoland Levillain } 117*1e651e1eSRoland Levillain /* normalize x */ 118*1e651e1eSRoland Levillain m = (ix0>>20); 119*1e651e1eSRoland Levillain if(m==0) { /* subnormal x */ 120*1e651e1eSRoland Levillain while(ix0==0) { 121*1e651e1eSRoland Levillain m -= 21; 122*1e651e1eSRoland Levillain ix0 |= (ix1>>11); ix1 <<= 21; 123*1e651e1eSRoland Levillain } 124*1e651e1eSRoland Levillain for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1; 125*1e651e1eSRoland Levillain m -= i-1; 126*1e651e1eSRoland Levillain ix0 |= (ix1>>(32-i)); 127*1e651e1eSRoland Levillain ix1 <<= i; 128*1e651e1eSRoland Levillain } 129*1e651e1eSRoland Levillain m -= 1023; /* unbias exponent */ 130*1e651e1eSRoland Levillain ix0 = (ix0&0x000fffff)|0x00100000; 131*1e651e1eSRoland Levillain if(m&1){ /* odd m, double x to make it even */ 132*1e651e1eSRoland Levillain ix0 += ix0 + ((ix1&sign)>>31); 133*1e651e1eSRoland Levillain ix1 += ix1; 134*1e651e1eSRoland Levillain } 135*1e651e1eSRoland Levillain m >>= 1; /* m = [m/2] */ 136*1e651e1eSRoland Levillain 137*1e651e1eSRoland Levillain /* generate ieee_sqrt(x) bit by bit */ 138*1e651e1eSRoland Levillain ix0 += ix0 + ((ix1&sign)>>31); 139*1e651e1eSRoland Levillain ix1 += ix1; 140*1e651e1eSRoland Levillain q = q1 = s0 = s1 = 0; /* [q,q1] = ieee_sqrt(x) */ 141*1e651e1eSRoland Levillain r = 0x00200000; /* r = moving bit from right to left */ 142*1e651e1eSRoland Levillain 143*1e651e1eSRoland Levillain while(r!=0) { 144*1e651e1eSRoland Levillain t = s0+r; 145*1e651e1eSRoland Levillain if(t<=ix0) { 146*1e651e1eSRoland Levillain s0 = t+r; 147*1e651e1eSRoland Levillain ix0 -= t; 148*1e651e1eSRoland Levillain q += r; 149*1e651e1eSRoland Levillain } 150*1e651e1eSRoland Levillain ix0 += ix0 + ((ix1&sign)>>31); 151*1e651e1eSRoland Levillain ix1 += ix1; 152*1e651e1eSRoland Levillain r>>=1; 153*1e651e1eSRoland Levillain } 154*1e651e1eSRoland Levillain 155*1e651e1eSRoland Levillain r = sign; 156*1e651e1eSRoland Levillain while(r!=0) { 157*1e651e1eSRoland Levillain t1 = s1+r; 158*1e651e1eSRoland Levillain t = s0; 159*1e651e1eSRoland Levillain if((t<ix0)||((t==ix0)&&(t1<=ix1))) { 160*1e651e1eSRoland Levillain s1 = t1+r; 161*1e651e1eSRoland Levillain if(((t1&sign)==sign)&&(s1&sign)==0) s0 += 1; 162*1e651e1eSRoland Levillain ix0 -= t; 163*1e651e1eSRoland Levillain if (ix1 < t1) ix0 -= 1; 164*1e651e1eSRoland Levillain ix1 -= t1; 165*1e651e1eSRoland Levillain q1 += r; 166*1e651e1eSRoland Levillain } 167*1e651e1eSRoland Levillain ix0 += ix0 + ((ix1&sign)>>31); 168*1e651e1eSRoland Levillain ix1 += ix1; 169*1e651e1eSRoland Levillain r>>=1; 170*1e651e1eSRoland Levillain } 171*1e651e1eSRoland Levillain 172*1e651e1eSRoland Levillain /* use floating add to find out rounding direction */ 173*1e651e1eSRoland Levillain if((ix0|ix1)!=0) { 174*1e651e1eSRoland Levillain z = one-tiny; /* trigger inexact flag */ 175*1e651e1eSRoland Levillain if (z>=one) { 176*1e651e1eSRoland Levillain z = one+tiny; 177*1e651e1eSRoland Levillain if (q1==(unsigned)0xffffffff) { q1=0; q += 1;} 178*1e651e1eSRoland Levillain else if (z>one) { 179*1e651e1eSRoland Levillain if (q1==(unsigned)0xfffffffe) q+=1; 180*1e651e1eSRoland Levillain q1+=2; 181*1e651e1eSRoland Levillain } else 182*1e651e1eSRoland Levillain q1 += (q1&1); 183*1e651e1eSRoland Levillain } 184*1e651e1eSRoland Levillain } 185*1e651e1eSRoland Levillain ix0 = (q>>1)+0x3fe00000; 186*1e651e1eSRoland Levillain ix1 = q1>>1; 187*1e651e1eSRoland Levillain if ((q&1)==1) ix1 |= sign; 188*1e651e1eSRoland Levillain ix0 += (m <<20); 189*1e651e1eSRoland Levillain __HI(z) = ix0; 190*1e651e1eSRoland Levillain __LO(z) = ix1; 191*1e651e1eSRoland Levillain return z; 192*1e651e1eSRoland Levillain } 193*1e651e1eSRoland Levillain 194*1e651e1eSRoland Levillain /* 195*1e651e1eSRoland Levillain Other methods (use floating-point arithmetic) 196*1e651e1eSRoland Levillain ------------- 197*1e651e1eSRoland Levillain (This is a copy of a drafted paper by Prof W. Kahan 198*1e651e1eSRoland Levillain and K.C. Ng, written in May, 1986) 199*1e651e1eSRoland Levillain 200*1e651e1eSRoland Levillain Two algorithms are given here to implement ieee_sqrt(x) 201*1e651e1eSRoland Levillain (IEEE double precision arithmetic) in software. 202*1e651e1eSRoland Levillain Both supply ieee_sqrt(x) correctly rounded. The first algorithm (in 203*1e651e1eSRoland Levillain Section A) uses newton iterations and involves four divisions. 204*1e651e1eSRoland Levillain The second one uses reciproot iterations to avoid division, but 205*1e651e1eSRoland Levillain requires more multiplications. Both algorithms need the ability 206*1e651e1eSRoland Levillain to chop results of arithmetic operations instead of round them, 207*1e651e1eSRoland Levillain and the INEXACT flag to indicate when an arithmetic operation 208*1e651e1eSRoland Levillain is executed exactly with no roundoff error, all part of the 209*1e651e1eSRoland Levillain standard (IEEE 754-1985). The ability to perform shift, add, 210*1e651e1eSRoland Levillain subtract and logical AND operations upon 32-bit words is needed 211*1e651e1eSRoland Levillain too, though not part of the standard. 212*1e651e1eSRoland Levillain 213*1e651e1eSRoland Levillain A. ieee_sqrt(x) by Newton Iteration 214*1e651e1eSRoland Levillain 215*1e651e1eSRoland Levillain (1) Initial approximation 216*1e651e1eSRoland Levillain 217*1e651e1eSRoland Levillain Let x0 and x1 be the leading and the trailing 32-bit words of 218*1e651e1eSRoland Levillain a floating point number x (in IEEE double format) respectively 219*1e651e1eSRoland Levillain 220*1e651e1eSRoland Levillain 1 11 52 ...widths 221*1e651e1eSRoland Levillain ------------------------------------------------------ 222*1e651e1eSRoland Levillain x: |s| e | f | 223*1e651e1eSRoland Levillain ------------------------------------------------------ 224*1e651e1eSRoland Levillain msb lsb msb lsb ...order 225*1e651e1eSRoland Levillain 226*1e651e1eSRoland Levillain 227*1e651e1eSRoland Levillain ------------------------ ------------------------ 228*1e651e1eSRoland Levillain x0: |s| e | f1 | x1: | f2 | 229*1e651e1eSRoland Levillain ------------------------ ------------------------ 230*1e651e1eSRoland Levillain 231*1e651e1eSRoland Levillain By performing shifts and subtracts on x0 and x1 (both regarded 232*1e651e1eSRoland Levillain as integers), we obtain an 8-bit approximation of ieee_sqrt(x) as 233*1e651e1eSRoland Levillain follows. 234*1e651e1eSRoland Levillain 235*1e651e1eSRoland Levillain k := (x0>>1) + 0x1ff80000; 236*1e651e1eSRoland Levillain y0 := k - T1[31&(k>>15)]. ... y ~ ieee_sqrt(x) to 8 bits 237*1e651e1eSRoland Levillain Here k is a 32-bit integer and T1[] is an integer array containing 238*1e651e1eSRoland Levillain correction terms. Now magically the floating value of y (y's 239*1e651e1eSRoland Levillain leading 32-bit word is y0, the value of its trailing word is 0) 240*1e651e1eSRoland Levillain approximates ieee_sqrt(x) to almost 8-bit. 241*1e651e1eSRoland Levillain 242*1e651e1eSRoland Levillain Value of T1: 243*1e651e1eSRoland Levillain static int T1[32]= { 244*1e651e1eSRoland Levillain 0, 1024, 3062, 5746, 9193, 13348, 18162, 23592, 245*1e651e1eSRoland Levillain 29598, 36145, 43202, 50740, 58733, 67158, 75992, 85215, 246*1e651e1eSRoland Levillain 83599, 71378, 60428, 50647, 41945, 34246, 27478, 21581, 247*1e651e1eSRoland Levillain 16499, 12183, 8588, 5674, 3403, 1742, 661, 130,}; 248*1e651e1eSRoland Levillain 249*1e651e1eSRoland Levillain (2) Iterative refinement 250*1e651e1eSRoland Levillain 251*1e651e1eSRoland Levillain Apply Heron's rule three times to y, we have y approximates 252*1e651e1eSRoland Levillain sqrt(x) to within 1 ulp (Unit in the Last Place): 253*1e651e1eSRoland Levillain 254*1e651e1eSRoland Levillain y := (y+x/y)/2 ... almost 17 sig. bits 255*1e651e1eSRoland Levillain y := (y+x/y)/2 ... almost 35 sig. bits 256*1e651e1eSRoland Levillain y := y-(y-x/y)/2 ... within 1 ulp 257*1e651e1eSRoland Levillain 258*1e651e1eSRoland Levillain 259*1e651e1eSRoland Levillain Remark 1. 260*1e651e1eSRoland Levillain Another way to improve y to within 1 ulp is: 261*1e651e1eSRoland Levillain 262*1e651e1eSRoland Levillain y := (y+x/y) ... almost 17 sig. bits to 2*ieee_sqrt(x) 263*1e651e1eSRoland Levillain y := y - 0x00100006 ... almost 18 sig. bits to ieee_sqrt(x) 264*1e651e1eSRoland Levillain 265*1e651e1eSRoland Levillain 2 266*1e651e1eSRoland Levillain (x-y )*y 267*1e651e1eSRoland Levillain y := y + 2* ---------- ...within 1 ulp 268*1e651e1eSRoland Levillain 2 269*1e651e1eSRoland Levillain 3y + x 270*1e651e1eSRoland Levillain 271*1e651e1eSRoland Levillain 272*1e651e1eSRoland Levillain This formula has one division fewer than the one above; however, 273*1e651e1eSRoland Levillain it requires more multiplications and additions. Also x must be 274*1e651e1eSRoland Levillain scaled in advance to avoid spurious overflow in evaluating the 275*1e651e1eSRoland Levillain expression 3y*y+x. Hence it is not recommended uless division 276*1e651e1eSRoland Levillain is slow. If division is very slow, then one should use the 277*1e651e1eSRoland Levillain reciproot algorithm given in section B. 278*1e651e1eSRoland Levillain 279*1e651e1eSRoland Levillain (3) Final adjustment 280*1e651e1eSRoland Levillain 281*1e651e1eSRoland Levillain By twiddling y's last bit it is possible to force y to be 282*1e651e1eSRoland Levillain correctly rounded according to the prevailing rounding mode 283*1e651e1eSRoland Levillain as follows. Let r and i be copies of the rounding mode and 284*1e651e1eSRoland Levillain inexact flag before entering the square root program. Also we 285*1e651e1eSRoland Levillain use the expression y+-ulp for the next representable floating 286*1e651e1eSRoland Levillain numbers (up and down) of y. Note that y+-ulp = either fixed 287*1e651e1eSRoland Levillain point y+-1, or multiply y by ieee_nextafter(1,+-inf) in chopped 288*1e651e1eSRoland Levillain mode. 289*1e651e1eSRoland Levillain 290*1e651e1eSRoland Levillain I := FALSE; ... reset INEXACT flag I 291*1e651e1eSRoland Levillain R := RZ; ... set rounding mode to round-toward-zero 292*1e651e1eSRoland Levillain z := x/y; ... chopped quotient, possibly inexact 293*1e651e1eSRoland Levillain If(not I) then { ... if the quotient is exact 294*1e651e1eSRoland Levillain if(z=y) { 295*1e651e1eSRoland Levillain I := i; ... restore inexact flag 296*1e651e1eSRoland Levillain R := r; ... restore rounded mode 297*1e651e1eSRoland Levillain return ieee_sqrt(x):=y. 298*1e651e1eSRoland Levillain } else { 299*1e651e1eSRoland Levillain z := z - ulp; ... special rounding 300*1e651e1eSRoland Levillain } 301*1e651e1eSRoland Levillain } 302*1e651e1eSRoland Levillain i := TRUE; ... ieee_sqrt(x) is inexact 303*1e651e1eSRoland Levillain If (r=RN) then z=z+ulp ... rounded-to-nearest 304*1e651e1eSRoland Levillain If (r=RP) then { ... round-toward-+inf 305*1e651e1eSRoland Levillain y = y+ulp; z=z+ulp; 306*1e651e1eSRoland Levillain } 307*1e651e1eSRoland Levillain y := y+z; ... chopped sum 308*1e651e1eSRoland Levillain y0:=y0-0x00100000; ... y := y/2 is correctly rounded. 309*1e651e1eSRoland Levillain I := i; ... restore inexact flag 310*1e651e1eSRoland Levillain R := r; ... restore rounded mode 311*1e651e1eSRoland Levillain return ieee_sqrt(x):=y. 312*1e651e1eSRoland Levillain 313*1e651e1eSRoland Levillain (4) Special cases 314*1e651e1eSRoland Levillain 315*1e651e1eSRoland Levillain Square root of +inf, +-0, or NaN is itself; 316*1e651e1eSRoland Levillain Square root of a negative number is NaN with invalid signal. 317*1e651e1eSRoland Levillain 318*1e651e1eSRoland Levillain 319*1e651e1eSRoland Levillain B. ieee_sqrt(x) by Reciproot Iteration 320*1e651e1eSRoland Levillain 321*1e651e1eSRoland Levillain (1) Initial approximation 322*1e651e1eSRoland Levillain 323*1e651e1eSRoland Levillain Let x0 and x1 be the leading and the trailing 32-bit words of 324*1e651e1eSRoland Levillain a floating point number x (in IEEE double format) respectively 325*1e651e1eSRoland Levillain (see section A). By performing shifs and subtracts on x0 and y0, 326*1e651e1eSRoland Levillain we obtain a 7.8-bit approximation of 1/ieee_sqrt(x) as follows. 327*1e651e1eSRoland Levillain 328*1e651e1eSRoland Levillain k := 0x5fe80000 - (x0>>1); 329*1e651e1eSRoland Levillain y0:= k - T2[63&(k>>14)]. ... y ~ 1/ieee_sqrt(x) to 7.8 bits 330*1e651e1eSRoland Levillain 331*1e651e1eSRoland Levillain Here k is a 32-bit integer and T2[] is an integer array 332*1e651e1eSRoland Levillain containing correction terms. Now magically the floating 333*1e651e1eSRoland Levillain value of y (y's leading 32-bit word is y0, the value of 334*1e651e1eSRoland Levillain its trailing word y1 is set to zero) approximates 1/ieee_sqrt(x) 335*1e651e1eSRoland Levillain to almost 7.8-bit. 336*1e651e1eSRoland Levillain 337*1e651e1eSRoland Levillain Value of T2: 338*1e651e1eSRoland Levillain static int T2[64]= { 339*1e651e1eSRoland Levillain 0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866, 340*1e651e1eSRoland Levillain 0xf14a, 0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f, 341*1e651e1eSRoland Levillain 0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d, 342*1e651e1eSRoland Levillain 0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0, 343*1e651e1eSRoland Levillain 0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989, 344*1e651e1eSRoland Levillain 0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd, 345*1e651e1eSRoland Levillain 0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e, 346*1e651e1eSRoland Levillain 0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd,}; 347*1e651e1eSRoland Levillain 348*1e651e1eSRoland Levillain (2) Iterative refinement 349*1e651e1eSRoland Levillain 350*1e651e1eSRoland Levillain Apply Reciproot iteration three times to y and multiply the 351*1e651e1eSRoland Levillain result by x to get an approximation z that matches ieee_sqrt(x) 352*1e651e1eSRoland Levillain to about 1 ulp. To be exact, we will have 353*1e651e1eSRoland Levillain -1ulp < ieee_sqrt(x)-z<1.0625ulp. 354*1e651e1eSRoland Levillain 355*1e651e1eSRoland Levillain ... set rounding mode to Round-to-nearest 356*1e651e1eSRoland Levillain y := y*(1.5-0.5*x*y*y) ... almost 15 sig. bits to 1/ieee_sqrt(x) 357*1e651e1eSRoland Levillain y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/ieee_sqrt(x) 358*1e651e1eSRoland Levillain ... special arrangement for better accuracy 359*1e651e1eSRoland Levillain z := x*y ... 29 bits to ieee_sqrt(x), with z*y<1 360*1e651e1eSRoland Levillain z := z + 0.5*z*(1-z*y) ... about 1 ulp to ieee_sqrt(x) 361*1e651e1eSRoland Levillain 362*1e651e1eSRoland Levillain Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that 363*1e651e1eSRoland Levillain (a) the term z*y in the final iteration is always less than 1; 364*1e651e1eSRoland Levillain (b) the error in the final result is biased upward so that 365*1e651e1eSRoland Levillain -1 ulp < ieee_sqrt(x) - z < 1.0625 ulp 366*1e651e1eSRoland Levillain instead of |ieee_sqrt(x)-z|<1.03125ulp. 367*1e651e1eSRoland Levillain 368*1e651e1eSRoland Levillain (3) Final adjustment 369*1e651e1eSRoland Levillain 370*1e651e1eSRoland Levillain By twiddling y's last bit it is possible to force y to be 371*1e651e1eSRoland Levillain correctly rounded according to the prevailing rounding mode 372*1e651e1eSRoland Levillain as follows. Let r and i be copies of the rounding mode and 373*1e651e1eSRoland Levillain inexact flag before entering the square root program. Also we 374*1e651e1eSRoland Levillain use the expression y+-ulp for the next representable floating 375*1e651e1eSRoland Levillain numbers (up and down) of y. Note that y+-ulp = either fixed 376*1e651e1eSRoland Levillain point y+-1, or multiply y by ieee_nextafter(1,+-inf) in chopped 377*1e651e1eSRoland Levillain mode. 378*1e651e1eSRoland Levillain 379*1e651e1eSRoland Levillain R := RZ; ... set rounding mode to round-toward-zero 380*1e651e1eSRoland Levillain switch(r) { 381*1e651e1eSRoland Levillain case RN: ... round-to-nearest 382*1e651e1eSRoland Levillain if(x<= z*(z-ulp)...chopped) z = z - ulp; else 383*1e651e1eSRoland Levillain if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp; 384*1e651e1eSRoland Levillain break; 385*1e651e1eSRoland Levillain case RZ:case RM: ... round-to-zero or round-to--inf 386*1e651e1eSRoland Levillain R:=RP; ... reset rounding mod to round-to-+inf 387*1e651e1eSRoland Levillain if(x<z*z ... rounded up) z = z - ulp; else 388*1e651e1eSRoland Levillain if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp; 389*1e651e1eSRoland Levillain break; 390*1e651e1eSRoland Levillain case RP: ... round-to-+inf 391*1e651e1eSRoland Levillain if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else 392*1e651e1eSRoland Levillain if(x>z*z ...chopped) z = z+ulp; 393*1e651e1eSRoland Levillain break; 394*1e651e1eSRoland Levillain } 395*1e651e1eSRoland Levillain 396*1e651e1eSRoland Levillain Remark 3. The above comparisons can be done in fixed point. For 397*1e651e1eSRoland Levillain example, to compare x and w=z*z chopped, it suffices to compare 398*1e651e1eSRoland Levillain x1 and w1 (the trailing parts of x and w), regarding them as 399*1e651e1eSRoland Levillain two's complement integers. 400*1e651e1eSRoland Levillain 401*1e651e1eSRoland Levillain ...Is z an exact square root? 402*1e651e1eSRoland Levillain To determine whether z is an exact square root of x, let z1 be the 403*1e651e1eSRoland Levillain trailing part of z, and also let x0 and x1 be the leading and 404*1e651e1eSRoland Levillain trailing parts of x. 405*1e651e1eSRoland Levillain 406*1e651e1eSRoland Levillain If ((z1&0x03ffffff)!=0) ... not exact if trailing 26 bits of z!=0 407*1e651e1eSRoland Levillain I := 1; ... Raise Inexact flag: z is not exact 408*1e651e1eSRoland Levillain else { 409*1e651e1eSRoland Levillain j := 1 - [(x0>>20)&1] ... j = ieee_logb(x) mod 2 410*1e651e1eSRoland Levillain k := z1 >> 26; ... get z's 25-th and 26-th 411*1e651e1eSRoland Levillain fraction bits 412*1e651e1eSRoland Levillain I := i or (k&j) or ((k&(j+j+1))!=(x1&3)); 413*1e651e1eSRoland Levillain } 414*1e651e1eSRoland Levillain R:= r ... restore rounded mode 415*1e651e1eSRoland Levillain return ieee_sqrt(x):=z. 416*1e651e1eSRoland Levillain 417*1e651e1eSRoland Levillain If multiplication is cheaper then the foregoing red tape, the 418*1e651e1eSRoland Levillain Inexact flag can be evaluated by 419*1e651e1eSRoland Levillain 420*1e651e1eSRoland Levillain I := i; 421*1e651e1eSRoland Levillain I := (z*z!=x) or I. 422*1e651e1eSRoland Levillain 423*1e651e1eSRoland Levillain Note that z*z can overwrite I; this value must be sensed if it is 424*1e651e1eSRoland Levillain True. 425*1e651e1eSRoland Levillain 426*1e651e1eSRoland Levillain Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be 427*1e651e1eSRoland Levillain zero. 428*1e651e1eSRoland Levillain 429*1e651e1eSRoland Levillain -------------------- 430*1e651e1eSRoland Levillain z1: | f2 | 431*1e651e1eSRoland Levillain -------------------- 432*1e651e1eSRoland Levillain bit 31 bit 0 433*1e651e1eSRoland Levillain 434*1e651e1eSRoland Levillain Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd 435*1e651e1eSRoland Levillain or even of ieee_logb(x) have the following relations: 436*1e651e1eSRoland Levillain 437*1e651e1eSRoland Levillain ------------------------------------------------- 438*1e651e1eSRoland Levillain bit 27,26 of z1 bit 1,0 of x1 logb(x) 439*1e651e1eSRoland Levillain ------------------------------------------------- 440*1e651e1eSRoland Levillain 00 00 odd and even 441*1e651e1eSRoland Levillain 01 01 even 442*1e651e1eSRoland Levillain 10 10 odd 443*1e651e1eSRoland Levillain 10 00 even 444*1e651e1eSRoland Levillain 11 01 even 445*1e651e1eSRoland Levillain ------------------------------------------------- 446*1e651e1eSRoland Levillain 447*1e651e1eSRoland Levillain (4) Special cases (see (4) of Section A). 448*1e651e1eSRoland Levillain 449*1e651e1eSRoland Levillain */ 450*1e651e1eSRoland Levillain 451