1*412f47f9SXin Li /*
2*412f47f9SXin Li * Single-precision erf(x) function.
3*412f47f9SXin Li *
4*412f47f9SXin Li * Copyright (c) 2020, Arm Limited.
5*412f47f9SXin Li * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li */
7*412f47f9SXin Li
8*412f47f9SXin Li #include <stdint.h>
9*412f47f9SXin Li #include <math.h>
10*412f47f9SXin Li #include "math_config.h"
11*412f47f9SXin Li
12*412f47f9SXin Li #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
13*412f47f9SXin Li #define A __erff_data.erff_poly_A
14*412f47f9SXin Li #define B __erff_data.erff_poly_B
15*412f47f9SXin Li
16*412f47f9SXin Li /* Top 12 bits of a float. */
17*412f47f9SXin Li static inline uint32_t
top12(float x)18*412f47f9SXin Li top12 (float x)
19*412f47f9SXin Li {
20*412f47f9SXin Li return asuint (x) >> 20;
21*412f47f9SXin Li }
22*412f47f9SXin Li
23*412f47f9SXin Li /* Efficient implementation of erff
24*412f47f9SXin Li using either a pure polynomial approximation or
25*412f47f9SXin Li the exponential of a polynomial.
26*412f47f9SXin Li Worst-case error is 1.09ulps at 0x1.c111acp-1. */
27*412f47f9SXin Li float
erff(float x)28*412f47f9SXin Li erff (float x)
29*412f47f9SXin Li {
30*412f47f9SXin Li float r, x2, u;
31*412f47f9SXin Li
32*412f47f9SXin Li /* Get top word. */
33*412f47f9SXin Li uint32_t ix = asuint (x);
34*412f47f9SXin Li uint32_t sign = ix >> 31;
35*412f47f9SXin Li uint32_t ia12 = top12 (x) & 0x7ff;
36*412f47f9SXin Li
37*412f47f9SXin Li /* Limit of both intervals is 0.875 for performance reasons but coefficients
38*412f47f9SXin Li computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
39*412f47f9SXin Li from 0.94 to 1.1ulps. */
40*412f47f9SXin Li if (ia12 < 0x3f6)
41*412f47f9SXin Li { /* a = |x| < 0.875. */
42*412f47f9SXin Li
43*412f47f9SXin Li /* Tiny and subnormal cases. */
44*412f47f9SXin Li if (unlikely (ia12 < 0x318))
45*412f47f9SXin Li { /* |x| < 2^(-28). */
46*412f47f9SXin Li if (unlikely (ia12 < 0x040))
47*412f47f9SXin Li { /* |x| < 2^(-119). */
48*412f47f9SXin Li float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
49*412f47f9SXin Li return check_uflowf (y);
50*412f47f9SXin Li }
51*412f47f9SXin Li return x + TwoOverSqrtPiMinusOne * x;
52*412f47f9SXin Li }
53*412f47f9SXin Li
54*412f47f9SXin Li x2 = x * x;
55*412f47f9SXin Li
56*412f47f9SXin Li /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */
57*412f47f9SXin Li r = A[5];
58*412f47f9SXin Li r = fmaf (r, x2, A[4]);
59*412f47f9SXin Li r = fmaf (r, x2, A[3]);
60*412f47f9SXin Li r = fmaf (r, x2, A[2]);
61*412f47f9SXin Li r = fmaf (r, x2, A[1]);
62*412f47f9SXin Li r = fmaf (r, x2, A[0]);
63*412f47f9SXin Li r = fmaf (r, x, x);
64*412f47f9SXin Li }
65*412f47f9SXin Li else if (ia12 < 0x408)
66*412f47f9SXin Li { /* |x| < 4.0 - Use a custom Estrin scheme. */
67*412f47f9SXin Li
68*412f47f9SXin Li float a = fabsf (x);
69*412f47f9SXin Li /* Start with Estrin scheme on high order (small magnitude) coefficients. */
70*412f47f9SXin Li r = fmaf (B[6], a, B[5]);
71*412f47f9SXin Li u = fmaf (B[4], a, B[3]);
72*412f47f9SXin Li x2 = x * x;
73*412f47f9SXin Li r = fmaf (r, x2, u);
74*412f47f9SXin Li /* Then switch to pure Horner scheme. */
75*412f47f9SXin Li r = fmaf (r, a, B[2]);
76*412f47f9SXin Li r = fmaf (r, a, B[1]);
77*412f47f9SXin Li r = fmaf (r, a, B[0]);
78*412f47f9SXin Li r = fmaf (r, a, a);
79*412f47f9SXin Li /* Single precision exponential with ~0.5ulps,
80*412f47f9SXin Li ensures erff has max. rel. error
81*412f47f9SXin Li < 1ulp on [0.921875, 4.0],
82*412f47f9SXin Li < 1.1ulps on [0.875, 4.0]. */
83*412f47f9SXin Li r = expf (-r);
84*412f47f9SXin Li /* Explicit copysign (calling copysignf increases latency). */
85*412f47f9SXin Li if (sign)
86*412f47f9SXin Li r = -1.0f + r;
87*412f47f9SXin Li else
88*412f47f9SXin Li r = 1.0f - r;
89*412f47f9SXin Li }
90*412f47f9SXin Li else
91*412f47f9SXin Li { /* |x| >= 4.0. */
92*412f47f9SXin Li
93*412f47f9SXin Li /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
94*412f47f9SXin Li if (unlikely (ia12 >= 0x7f8))
95*412f47f9SXin Li return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
96*412f47f9SXin Li
97*412f47f9SXin Li /* Explicit copysign (calling copysignf increases latency). */
98*412f47f9SXin Li if (sign)
99*412f47f9SXin Li r = -1.0f;
100*412f47f9SXin Li else
101*412f47f9SXin Li r = 1.0f;
102*412f47f9SXin Li }
103*412f47f9SXin Li return r;
104*412f47f9SXin Li }
105