1*c9945492SAndroid Build Coastguard Worker #include <stdint.h>
2*c9945492SAndroid Build Coastguard Worker #include <math.h>
3*c9945492SAndroid Build Coastguard Worker #include "libm.h"
4*c9945492SAndroid Build Coastguard Worker #include "sqrt_data.h"
5*c9945492SAndroid Build Coastguard Worker
6*c9945492SAndroid Build Coastguard Worker #define FENV_SUPPORT 1
7*c9945492SAndroid Build Coastguard Worker
8*c9945492SAndroid Build Coastguard Worker /* returns a*b*2^-32 - e, with error 0 <= e < 1. */
mul32(uint32_t a,uint32_t b)9*c9945492SAndroid Build Coastguard Worker static inline uint32_t mul32(uint32_t a, uint32_t b)
10*c9945492SAndroid Build Coastguard Worker {
11*c9945492SAndroid Build Coastguard Worker return (uint64_t)a*b >> 32;
12*c9945492SAndroid Build Coastguard Worker }
13*c9945492SAndroid Build Coastguard Worker
14*c9945492SAndroid Build Coastguard Worker /* returns a*b*2^-64 - e, with error 0 <= e < 3. */
mul64(uint64_t a,uint64_t b)15*c9945492SAndroid Build Coastguard Worker static inline uint64_t mul64(uint64_t a, uint64_t b)
16*c9945492SAndroid Build Coastguard Worker {
17*c9945492SAndroid Build Coastguard Worker uint64_t ahi = a>>32;
18*c9945492SAndroid Build Coastguard Worker uint64_t alo = a&0xffffffff;
19*c9945492SAndroid Build Coastguard Worker uint64_t bhi = b>>32;
20*c9945492SAndroid Build Coastguard Worker uint64_t blo = b&0xffffffff;
21*c9945492SAndroid Build Coastguard Worker return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
22*c9945492SAndroid Build Coastguard Worker }
23*c9945492SAndroid Build Coastguard Worker
sqrt(double x)24*c9945492SAndroid Build Coastguard Worker double sqrt(double x)
25*c9945492SAndroid Build Coastguard Worker {
26*c9945492SAndroid Build Coastguard Worker uint64_t ix, top, m;
27*c9945492SAndroid Build Coastguard Worker
28*c9945492SAndroid Build Coastguard Worker /* special case handling. */
29*c9945492SAndroid Build Coastguard Worker ix = asuint64(x);
30*c9945492SAndroid Build Coastguard Worker top = ix >> 52;
31*c9945492SAndroid Build Coastguard Worker if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
32*c9945492SAndroid Build Coastguard Worker /* x < 0x1p-1022 or inf or nan. */
33*c9945492SAndroid Build Coastguard Worker if (ix * 2 == 0)
34*c9945492SAndroid Build Coastguard Worker return x;
35*c9945492SAndroid Build Coastguard Worker if (ix == 0x7ff0000000000000)
36*c9945492SAndroid Build Coastguard Worker return x;
37*c9945492SAndroid Build Coastguard Worker if (ix > 0x7ff0000000000000)
38*c9945492SAndroid Build Coastguard Worker return __math_invalid(x);
39*c9945492SAndroid Build Coastguard Worker /* x is subnormal, normalize it. */
40*c9945492SAndroid Build Coastguard Worker ix = asuint64(x * 0x1p52);
41*c9945492SAndroid Build Coastguard Worker top = ix >> 52;
42*c9945492SAndroid Build Coastguard Worker top -= 52;
43*c9945492SAndroid Build Coastguard Worker }
44*c9945492SAndroid Build Coastguard Worker
45*c9945492SAndroid Build Coastguard Worker /* argument reduction:
46*c9945492SAndroid Build Coastguard Worker x = 4^e m; with integer e, and m in [1, 4)
47*c9945492SAndroid Build Coastguard Worker m: fixed point representation [2.62]
48*c9945492SAndroid Build Coastguard Worker 2^e is the exponent part of the result. */
49*c9945492SAndroid Build Coastguard Worker int even = top & 1;
50*c9945492SAndroid Build Coastguard Worker m = (ix << 11) | 0x8000000000000000;
51*c9945492SAndroid Build Coastguard Worker if (even) m >>= 1;
52*c9945492SAndroid Build Coastguard Worker top = (top + 0x3ff) >> 1;
53*c9945492SAndroid Build Coastguard Worker
54*c9945492SAndroid Build Coastguard Worker /* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
55*c9945492SAndroid Build Coastguard Worker
56*c9945492SAndroid Build Coastguard Worker initial estimate:
57*c9945492SAndroid Build Coastguard Worker 7bit table lookup (1bit exponent and 6bit significand).
58*c9945492SAndroid Build Coastguard Worker
59*c9945492SAndroid Build Coastguard Worker iterative approximation:
60*c9945492SAndroid Build Coastguard Worker using 2 goldschmidt iterations with 32bit int arithmetics
61*c9945492SAndroid Build Coastguard Worker and a final iteration with 64bit int arithmetics.
62*c9945492SAndroid Build Coastguard Worker
63*c9945492SAndroid Build Coastguard Worker details:
64*c9945492SAndroid Build Coastguard Worker
65*c9945492SAndroid Build Coastguard Worker the relative error (e = r0 sqrt(m)-1) of a linear estimate
66*c9945492SAndroid Build Coastguard Worker (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
67*c9945492SAndroid Build Coastguard Worker a table lookup is faster and needs one less iteration
68*c9945492SAndroid Build Coastguard Worker 6 bit lookup table (128b) gives |e| < 0x1.f9p-8
69*c9945492SAndroid Build Coastguard Worker 7 bit lookup table (256b) gives |e| < 0x1.fdp-9
70*c9945492SAndroid Build Coastguard Worker for single and double prec 6bit is enough but for quad
71*c9945492SAndroid Build Coastguard Worker prec 7bit is needed (or modified iterations). to avoid
72*c9945492SAndroid Build Coastguard Worker one more iteration >=13bit table would be needed (16k).
73*c9945492SAndroid Build Coastguard Worker
74*c9945492SAndroid Build Coastguard Worker a newton-raphson iteration for r is
75*c9945492SAndroid Build Coastguard Worker w = r*r
76*c9945492SAndroid Build Coastguard Worker u = 3 - m*w
77*c9945492SAndroid Build Coastguard Worker r = r*u/2
78*c9945492SAndroid Build Coastguard Worker can use a goldschmidt iteration for s at the end or
79*c9945492SAndroid Build Coastguard Worker s = m*r
80*c9945492SAndroid Build Coastguard Worker
81*c9945492SAndroid Build Coastguard Worker first goldschmidt iteration is
82*c9945492SAndroid Build Coastguard Worker s = m*r
83*c9945492SAndroid Build Coastguard Worker u = 3 - s*r
84*c9945492SAndroid Build Coastguard Worker r = r*u/2
85*c9945492SAndroid Build Coastguard Worker s = s*u/2
86*c9945492SAndroid Build Coastguard Worker next goldschmidt iteration is
87*c9945492SAndroid Build Coastguard Worker u = 3 - s*r
88*c9945492SAndroid Build Coastguard Worker r = r*u/2
89*c9945492SAndroid Build Coastguard Worker s = s*u/2
90*c9945492SAndroid Build Coastguard Worker and at the end r is not computed only s.
91*c9945492SAndroid Build Coastguard Worker
92*c9945492SAndroid Build Coastguard Worker they use the same amount of operations and converge at the
93*c9945492SAndroid Build Coastguard Worker same quadratic rate, i.e. if
94*c9945492SAndroid Build Coastguard Worker r1 sqrt(m) - 1 = e, then
95*c9945492SAndroid Build Coastguard Worker r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
96*c9945492SAndroid Build Coastguard Worker the advantage of goldschmidt is that the mul for s and r
97*c9945492SAndroid Build Coastguard Worker are independent (computed in parallel), however it is not
98*c9945492SAndroid Build Coastguard Worker "self synchronizing": it only uses the input m in the
99*c9945492SAndroid Build Coastguard Worker first iteration so rounding errors accumulate. at the end
100*c9945492SAndroid Build Coastguard Worker or when switching to larger precision arithmetics rounding
101*c9945492SAndroid Build Coastguard Worker errors dominate so the first iteration should be used.
102*c9945492SAndroid Build Coastguard Worker
103*c9945492SAndroid Build Coastguard Worker the fixed point representations are
104*c9945492SAndroid Build Coastguard Worker m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
105*c9945492SAndroid Build Coastguard Worker and after switching to 64 bit
106*c9945492SAndroid Build Coastguard Worker m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */
107*c9945492SAndroid Build Coastguard Worker
108*c9945492SAndroid Build Coastguard Worker static const uint64_t three = 0xc0000000;
109*c9945492SAndroid Build Coastguard Worker uint64_t r, s, d, u, i;
110*c9945492SAndroid Build Coastguard Worker
111*c9945492SAndroid Build Coastguard Worker i = (ix >> 46) % 128;
112*c9945492SAndroid Build Coastguard Worker r = (uint32_t)__rsqrt_tab[i] << 16;
113*c9945492SAndroid Build Coastguard Worker /* |r sqrt(m) - 1| < 0x1.fdp-9 */
114*c9945492SAndroid Build Coastguard Worker s = mul32(m>>32, r);
115*c9945492SAndroid Build Coastguard Worker /* |s/sqrt(m) - 1| < 0x1.fdp-9 */
116*c9945492SAndroid Build Coastguard Worker d = mul32(s, r);
117*c9945492SAndroid Build Coastguard Worker u = three - d;
118*c9945492SAndroid Build Coastguard Worker r = mul32(r, u) << 1;
119*c9945492SAndroid Build Coastguard Worker /* |r sqrt(m) - 1| < 0x1.7bp-16 */
120*c9945492SAndroid Build Coastguard Worker s = mul32(s, u) << 1;
121*c9945492SAndroid Build Coastguard Worker /* |s/sqrt(m) - 1| < 0x1.7bp-16 */
122*c9945492SAndroid Build Coastguard Worker d = mul32(s, r);
123*c9945492SAndroid Build Coastguard Worker u = three - d;
124*c9945492SAndroid Build Coastguard Worker r = mul32(r, u) << 1;
125*c9945492SAndroid Build Coastguard Worker /* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
126*c9945492SAndroid Build Coastguard Worker r = r << 32;
127*c9945492SAndroid Build Coastguard Worker s = mul64(m, r);
128*c9945492SAndroid Build Coastguard Worker d = mul64(s, r);
129*c9945492SAndroid Build Coastguard Worker u = (three<<32) - d;
130*c9945492SAndroid Build Coastguard Worker s = mul64(s, u); /* repr: 3.61 */
131*c9945492SAndroid Build Coastguard Worker /* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
132*c9945492SAndroid Build Coastguard Worker s = (s - 2) >> 9; /* repr: 12.52 */
133*c9945492SAndroid Build Coastguard Worker /* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
134*c9945492SAndroid Build Coastguard Worker
135*c9945492SAndroid Build Coastguard Worker /* s < sqrt(m) < s + 0x1.09p-52,
136*c9945492SAndroid Build Coastguard Worker compute nearest rounded result:
137*c9945492SAndroid Build Coastguard Worker the nearest result to 52 bits is either s or s+0x1p-52,
138*c9945492SAndroid Build Coastguard Worker we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. */
139*c9945492SAndroid Build Coastguard Worker uint64_t d0, d1, d2;
140*c9945492SAndroid Build Coastguard Worker double y, t;
141*c9945492SAndroid Build Coastguard Worker d0 = (m << 42) - s*s;
142*c9945492SAndroid Build Coastguard Worker d1 = s - d0;
143*c9945492SAndroid Build Coastguard Worker d2 = d1 + s + 1;
144*c9945492SAndroid Build Coastguard Worker s += d1 >> 63;
145*c9945492SAndroid Build Coastguard Worker s &= 0x000fffffffffffff;
146*c9945492SAndroid Build Coastguard Worker s |= top << 52;
147*c9945492SAndroid Build Coastguard Worker y = asdouble(s);
148*c9945492SAndroid Build Coastguard Worker if (FENV_SUPPORT) {
149*c9945492SAndroid Build Coastguard Worker /* handle rounding modes and inexact exception:
150*c9945492SAndroid Build Coastguard Worker only (s+1)^2 == 2^42 m case is exact otherwise
151*c9945492SAndroid Build Coastguard Worker add a tiny value to cause the fenv effects. */
152*c9945492SAndroid Build Coastguard Worker uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
153*c9945492SAndroid Build Coastguard Worker tiny |= (d1^d2) & 0x8000000000000000;
154*c9945492SAndroid Build Coastguard Worker t = asdouble(tiny);
155*c9945492SAndroid Build Coastguard Worker y = eval_as_double(y + t);
156*c9945492SAndroid Build Coastguard Worker }
157*c9945492SAndroid Build Coastguard Worker return y;
158*c9945492SAndroid Build Coastguard Worker }
159