1*412f47f9SXin Li /*
2*412f47f9SXin Li * Double-precision erf(x) function.
3*412f47f9SXin Li *
4*412f47f9SXin Li * Copyright (c) 2020, Arm Limited.
5*412f47f9SXin Li * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li */
7*412f47f9SXin Li
8*412f47f9SXin Li #include "math_config.h"
9*412f47f9SXin Li #include <math.h>
10*412f47f9SXin Li #include <stdint.h>
11*412f47f9SXin Li
12*412f47f9SXin Li #define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
13*412f47f9SXin Li #define C 0x1.b0ac16p-1
14*412f47f9SXin Li #define PA __erf_data.erf_poly_A
15*412f47f9SXin Li #define NA __erf_data.erf_ratio_N_A
16*412f47f9SXin Li #define DA __erf_data.erf_ratio_D_A
17*412f47f9SXin Li #define NB __erf_data.erf_ratio_N_B
18*412f47f9SXin Li #define DB __erf_data.erf_ratio_D_B
19*412f47f9SXin Li #define PC __erf_data.erfc_poly_C
20*412f47f9SXin Li #define PD __erf_data.erfc_poly_D
21*412f47f9SXin Li #define PE __erf_data.erfc_poly_E
22*412f47f9SXin Li #define PF __erf_data.erfc_poly_F
23*412f47f9SXin Li
24*412f47f9SXin Li /* Top 32 bits of a double. */
25*412f47f9SXin Li static inline uint32_t
top32(double x)26*412f47f9SXin Li top32 (double x)
27*412f47f9SXin Li {
28*412f47f9SXin Li return asuint64 (x) >> 32;
29*412f47f9SXin Li }
30*412f47f9SXin Li
31*412f47f9SXin Li /* Fast erf implementation using a mix of
32*412f47f9SXin Li rational and polynomial approximations.
33*412f47f9SXin Li Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */
34*412f47f9SXin Li double
erf(double x)35*412f47f9SXin Li erf (double x)
36*412f47f9SXin Li {
37*412f47f9SXin Li /* Get top word and sign. */
38*412f47f9SXin Li uint32_t ix = top32 (x);
39*412f47f9SXin Li uint32_t ia = ix & 0x7fffffff;
40*412f47f9SXin Li uint32_t sign = ix >> 31;
41*412f47f9SXin Li
42*412f47f9SXin Li /* Normalized and subnormal cases */
43*412f47f9SXin Li if (ia < 0x3feb0000)
44*412f47f9SXin Li { /* a = |x| < 0.84375. */
45*412f47f9SXin Li
46*412f47f9SXin Li if (ia < 0x3e300000)
47*412f47f9SXin Li { /* a < 2^(-28). */
48*412f47f9SXin Li if (ia < 0x00800000)
49*412f47f9SXin Li { /* a < 2^(-1015). */
50*412f47f9SXin Li double y = fma (TwoOverSqrtPiMinusOne, x, x);
51*412f47f9SXin Li return check_uflow (y);
52*412f47f9SXin Li }
53*412f47f9SXin Li return x + TwoOverSqrtPiMinusOne * x;
54*412f47f9SXin Li }
55*412f47f9SXin Li
56*412f47f9SXin Li double x2 = x * x;
57*412f47f9SXin Li
58*412f47f9SXin Li if (ia < 0x3fe00000)
59*412f47f9SXin Li { /* a < 0.5 - Use polynomial approximation. */
60*412f47f9SXin Li double r1 = fma (x2, PA[1], PA[0]);
61*412f47f9SXin Li double r2 = fma (x2, PA[3], PA[2]);
62*412f47f9SXin Li double r3 = fma (x2, PA[5], PA[4]);
63*412f47f9SXin Li double r4 = fma (x2, PA[7], PA[6]);
64*412f47f9SXin Li double r5 = fma (x2, PA[9], PA[8]);
65*412f47f9SXin Li double x4 = x2 * x2;
66*412f47f9SXin Li double r = r5;
67*412f47f9SXin Li r = fma (x4, r, r4);
68*412f47f9SXin Li r = fma (x4, r, r3);
69*412f47f9SXin Li r = fma (x4, r, r2);
70*412f47f9SXin Li r = fma (x4, r, r1);
71*412f47f9SXin Li return fma (r, x, x); /* This fma is crucial for accuracy. */
72*412f47f9SXin Li }
73*412f47f9SXin Li else
74*412f47f9SXin Li { /* 0.5 <= a < 0.84375 - Use rational approximation. */
75*412f47f9SXin Li double x4, x8, r1n, r2n, r1d, r2d, r3d;
76*412f47f9SXin Li
77*412f47f9SXin Li r1n = fma (x2, NA[1], NA[0]);
78*412f47f9SXin Li x4 = x2 * x2;
79*412f47f9SXin Li r2n = fma (x2, NA[3], NA[2]);
80*412f47f9SXin Li x8 = x4 * x4;
81*412f47f9SXin Li r1d = fma (x2, DA[0], 1.0);
82*412f47f9SXin Li r2d = fma (x2, DA[2], DA[1]);
83*412f47f9SXin Li r3d = fma (x2, DA[4], DA[3]);
84*412f47f9SXin Li double P = r1n + x4 * r2n + x8 * NA[4];
85*412f47f9SXin Li double Q = r1d + x4 * r2d + x8 * r3d;
86*412f47f9SXin Li return fma (P / Q, x, x);
87*412f47f9SXin Li }
88*412f47f9SXin Li }
89*412f47f9SXin Li else if (ia < 0x3ff40000)
90*412f47f9SXin Li { /* 0.84375 <= |x| < 1.25. */
91*412f47f9SXin Li double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
92*412f47f9SXin Li double a = fabs (x) - 1.0;
93*412f47f9SXin Li r1n = fma (a, NB[1], NB[0]);
94*412f47f9SXin Li a2 = a * a;
95*412f47f9SXin Li r1d = fma (a, DB[0], 1.0);
96*412f47f9SXin Li a4 = a2 * a2;
97*412f47f9SXin Li r2n = fma (a, NB[3], NB[2]);
98*412f47f9SXin Li a6 = a4 * a2;
99*412f47f9SXin Li r2d = fma (a, DB[2], DB[1]);
100*412f47f9SXin Li r3n = fma (a, NB[5], NB[4]);
101*412f47f9SXin Li r3d = fma (a, DB[4], DB[3]);
102*412f47f9SXin Li r4n = NB[6];
103*412f47f9SXin Li r4d = DB[5];
104*412f47f9SXin Li double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
105*412f47f9SXin Li double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
106*412f47f9SXin Li if (sign)
107*412f47f9SXin Li return -C - P / Q;
108*412f47f9SXin Li else
109*412f47f9SXin Li return C + P / Q;
110*412f47f9SXin Li }
111*412f47f9SXin Li else if (ia < 0x40000000)
112*412f47f9SXin Li { /* 1.25 <= |x| < 2.0. */
113*412f47f9SXin Li double a = fabs (x);
114*412f47f9SXin Li a = a - 1.25;
115*412f47f9SXin Li
116*412f47f9SXin Li double r1 = fma (a, PC[1], PC[0]);
117*412f47f9SXin Li double r2 = fma (a, PC[3], PC[2]);
118*412f47f9SXin Li double r3 = fma (a, PC[5], PC[4]);
119*412f47f9SXin Li double r4 = fma (a, PC[7], PC[6]);
120*412f47f9SXin Li double r5 = fma (a, PC[9], PC[8]);
121*412f47f9SXin Li double r6 = fma (a, PC[11], PC[10]);
122*412f47f9SXin Li double r7 = fma (a, PC[13], PC[12]);
123*412f47f9SXin Li double r8 = fma (a, PC[15], PC[14]);
124*412f47f9SXin Li
125*412f47f9SXin Li double a2 = a * a;
126*412f47f9SXin Li
127*412f47f9SXin Li double r = r8;
128*412f47f9SXin Li r = fma (a2, r, r7);
129*412f47f9SXin Li r = fma (a2, r, r6);
130*412f47f9SXin Li r = fma (a2, r, r5);
131*412f47f9SXin Li r = fma (a2, r, r4);
132*412f47f9SXin Li r = fma (a2, r, r3);
133*412f47f9SXin Li r = fma (a2, r, r2);
134*412f47f9SXin Li r = fma (a2, r, r1);
135*412f47f9SXin Li
136*412f47f9SXin Li if (sign)
137*412f47f9SXin Li return -1.0 + r;
138*412f47f9SXin Li else
139*412f47f9SXin Li return 1.0 - r;
140*412f47f9SXin Li }
141*412f47f9SXin Li else if (ia < 0x400a0000)
142*412f47f9SXin Li { /* 2 <= |x| < 3.25. */
143*412f47f9SXin Li double a = fabs (x);
144*412f47f9SXin Li a = fma (0.5, a, -1.0);
145*412f47f9SXin Li
146*412f47f9SXin Li double r1 = fma (a, PD[1], PD[0]);
147*412f47f9SXin Li double r2 = fma (a, PD[3], PD[2]);
148*412f47f9SXin Li double r3 = fma (a, PD[5], PD[4]);
149*412f47f9SXin Li double r4 = fma (a, PD[7], PD[6]);
150*412f47f9SXin Li double r5 = fma (a, PD[9], PD[8]);
151*412f47f9SXin Li double r6 = fma (a, PD[11], PD[10]);
152*412f47f9SXin Li double r7 = fma (a, PD[13], PD[12]);
153*412f47f9SXin Li double r8 = fma (a, PD[15], PD[14]);
154*412f47f9SXin Li double r9 = fma (a, PD[17], PD[16]);
155*412f47f9SXin Li
156*412f47f9SXin Li double a2 = a * a;
157*412f47f9SXin Li
158*412f47f9SXin Li double r = r9;
159*412f47f9SXin Li r = fma (a2, r, r8);
160*412f47f9SXin Li r = fma (a2, r, r7);
161*412f47f9SXin Li r = fma (a2, r, r6);
162*412f47f9SXin Li r = fma (a2, r, r5);
163*412f47f9SXin Li r = fma (a2, r, r4);
164*412f47f9SXin Li r = fma (a2, r, r3);
165*412f47f9SXin Li r = fma (a2, r, r2);
166*412f47f9SXin Li r = fma (a2, r, r1);
167*412f47f9SXin Li
168*412f47f9SXin Li if (sign)
169*412f47f9SXin Li return -1.0 + r;
170*412f47f9SXin Li else
171*412f47f9SXin Li return 1.0 - r;
172*412f47f9SXin Li }
173*412f47f9SXin Li else if (ia < 0x40100000)
174*412f47f9SXin Li { /* 3.25 <= |x| < 4.0. */
175*412f47f9SXin Li double a = fabs (x);
176*412f47f9SXin Li a = a - 3.25;
177*412f47f9SXin Li
178*412f47f9SXin Li double r1 = fma (a, PE[1], PE[0]);
179*412f47f9SXin Li double r2 = fma (a, PE[3], PE[2]);
180*412f47f9SXin Li double r3 = fma (a, PE[5], PE[4]);
181*412f47f9SXin Li double r4 = fma (a, PE[7], PE[6]);
182*412f47f9SXin Li double r5 = fma (a, PE[9], PE[8]);
183*412f47f9SXin Li double r6 = fma (a, PE[11], PE[10]);
184*412f47f9SXin Li double r7 = fma (a, PE[13], PE[12]);
185*412f47f9SXin Li
186*412f47f9SXin Li double a2 = a * a;
187*412f47f9SXin Li
188*412f47f9SXin Li double r = r7;
189*412f47f9SXin Li r = fma (a2, r, r6);
190*412f47f9SXin Li r = fma (a2, r, r5);
191*412f47f9SXin Li r = fma (a2, r, r4);
192*412f47f9SXin Li r = fma (a2, r, r3);
193*412f47f9SXin Li r = fma (a2, r, r2);
194*412f47f9SXin Li r = fma (a2, r, r1);
195*412f47f9SXin Li
196*412f47f9SXin Li if (sign)
197*412f47f9SXin Li return -1.0 + r;
198*412f47f9SXin Li else
199*412f47f9SXin Li return 1.0 - r;
200*412f47f9SXin Li }
201*412f47f9SXin Li else if (ia < 0x4017a000)
202*412f47f9SXin Li { /* 4 <= |x| < 5.90625. */
203*412f47f9SXin Li double a = fabs (x);
204*412f47f9SXin Li a = fma (0.5, a, -2.0);
205*412f47f9SXin Li
206*412f47f9SXin Li double r1 = fma (a, PF[1], PF[0]);
207*412f47f9SXin Li double r2 = fma (a, PF[3], PF[2]);
208*412f47f9SXin Li double r3 = fma (a, PF[5], PF[4]);
209*412f47f9SXin Li double r4 = fma (a, PF[7], PF[6]);
210*412f47f9SXin Li double r5 = fma (a, PF[9], PF[8]);
211*412f47f9SXin Li double r6 = fma (a, PF[11], PF[10]);
212*412f47f9SXin Li double r7 = fma (a, PF[13], PF[12]);
213*412f47f9SXin Li double r8 = fma (a, PF[15], PF[14]);
214*412f47f9SXin Li double r9 = PF[16];
215*412f47f9SXin Li
216*412f47f9SXin Li double a2 = a * a;
217*412f47f9SXin Li
218*412f47f9SXin Li double r = r9;
219*412f47f9SXin Li r = fma (a2, r, r8);
220*412f47f9SXin Li r = fma (a2, r, r7);
221*412f47f9SXin Li r = fma (a2, r, r6);
222*412f47f9SXin Li r = fma (a2, r, r5);
223*412f47f9SXin Li r = fma (a2, r, r4);
224*412f47f9SXin Li r = fma (a2, r, r3);
225*412f47f9SXin Li r = fma (a2, r, r2);
226*412f47f9SXin Li r = fma (a2, r, r1);
227*412f47f9SXin Li
228*412f47f9SXin Li if (sign)
229*412f47f9SXin Li return -1.0 + r;
230*412f47f9SXin Li else
231*412f47f9SXin Li return 1.0 - r;
232*412f47f9SXin Li }
233*412f47f9SXin Li else
234*412f47f9SXin Li {
235*412f47f9SXin Li /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
236*412f47f9SXin Li if (unlikely (ia >= 0x7ff00000))
237*412f47f9SXin Li return (double) (1.0 - (sign << 1)) + 1.0 / x;
238*412f47f9SXin Li
239*412f47f9SXin Li if (sign)
240*412f47f9SXin Li return -1.0;
241*412f47f9SXin Li else
242*412f47f9SXin Li return 1.0;
243*412f47f9SXin Li }
244*412f47f9SXin Li }
245