xref: /aosp_15_r20/external/arm-optimized-routines/math/erff.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Single-precision erf(x) function.
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2020, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include <stdint.h>
9*412f47f9SXin Li #include <math.h>
10*412f47f9SXin Li #include "math_config.h"
11*412f47f9SXin Li 
12*412f47f9SXin Li #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
13*412f47f9SXin Li #define A __erff_data.erff_poly_A
14*412f47f9SXin Li #define B __erff_data.erff_poly_B
15*412f47f9SXin Li 
16*412f47f9SXin Li /* Top 12 bits of a float.  */
17*412f47f9SXin Li static inline uint32_t
top12(float x)18*412f47f9SXin Li top12 (float x)
19*412f47f9SXin Li {
20*412f47f9SXin Li   return asuint (x) >> 20;
21*412f47f9SXin Li }
22*412f47f9SXin Li 
23*412f47f9SXin Li /* Efficient implementation of erff
24*412f47f9SXin Li    using either a pure polynomial approximation or
25*412f47f9SXin Li    the exponential of a polynomial.
26*412f47f9SXin Li    Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
27*412f47f9SXin Li float
erff(float x)28*412f47f9SXin Li erff (float x)
29*412f47f9SXin Li {
30*412f47f9SXin Li   float r, x2, u;
31*412f47f9SXin Li 
32*412f47f9SXin Li   /* Get top word.  */
33*412f47f9SXin Li   uint32_t ix = asuint (x);
34*412f47f9SXin Li   uint32_t sign = ix >> 31;
35*412f47f9SXin Li   uint32_t ia12 = top12 (x) & 0x7ff;
36*412f47f9SXin Li 
37*412f47f9SXin Li   /* Limit of both intervals is 0.875 for performance reasons but coefficients
38*412f47f9SXin Li      computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
39*412f47f9SXin Li      from 0.94 to 1.1ulps.  */
40*412f47f9SXin Li   if (ia12 < 0x3f6)
41*412f47f9SXin Li     { /* a = |x| < 0.875.  */
42*412f47f9SXin Li 
43*412f47f9SXin Li       /* Tiny and subnormal cases.  */
44*412f47f9SXin Li       if (unlikely (ia12 < 0x318))
45*412f47f9SXin Li 	{ /* |x| < 2^(-28).  */
46*412f47f9SXin Li 	  if (unlikely (ia12 < 0x040))
47*412f47f9SXin Li 	    { /* |x| < 2^(-119).  */
48*412f47f9SXin Li 	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
49*412f47f9SXin Li 	      return check_uflowf (y);
50*412f47f9SXin Li 	    }
51*412f47f9SXin Li 	  return x + TwoOverSqrtPiMinusOne * x;
52*412f47f9SXin Li 	}
53*412f47f9SXin Li 
54*412f47f9SXin Li       x2 = x * x;
55*412f47f9SXin Li 
56*412f47f9SXin Li       /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
57*412f47f9SXin Li       r = A[5];
58*412f47f9SXin Li       r = fmaf (r, x2, A[4]);
59*412f47f9SXin Li       r = fmaf (r, x2, A[3]);
60*412f47f9SXin Li       r = fmaf (r, x2, A[2]);
61*412f47f9SXin Li       r = fmaf (r, x2, A[1]);
62*412f47f9SXin Li       r = fmaf (r, x2, A[0]);
63*412f47f9SXin Li       r = fmaf (r, x, x);
64*412f47f9SXin Li     }
65*412f47f9SXin Li   else if (ia12 < 0x408)
66*412f47f9SXin Li     { /* |x| < 4.0 - Use a custom Estrin scheme.  */
67*412f47f9SXin Li 
68*412f47f9SXin Li       float a = fabsf (x);
69*412f47f9SXin Li       /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
70*412f47f9SXin Li       r = fmaf (B[6], a, B[5]);
71*412f47f9SXin Li       u = fmaf (B[4], a, B[3]);
72*412f47f9SXin Li       x2 = x * x;
73*412f47f9SXin Li       r = fmaf (r, x2, u);
74*412f47f9SXin Li       /* Then switch to pure Horner scheme.  */
75*412f47f9SXin Li       r = fmaf (r, a, B[2]);
76*412f47f9SXin Li       r = fmaf (r, a, B[1]);
77*412f47f9SXin Li       r = fmaf (r, a, B[0]);
78*412f47f9SXin Li       r = fmaf (r, a, a);
79*412f47f9SXin Li       /* Single precision exponential with ~0.5ulps,
80*412f47f9SXin Li 	 ensures erff has max. rel. error
81*412f47f9SXin Li 	 < 1ulp on [0.921875, 4.0],
82*412f47f9SXin Li 	 < 1.1ulps on [0.875, 4.0].  */
83*412f47f9SXin Li       r = expf (-r);
84*412f47f9SXin Li       /* Explicit copysign (calling copysignf increases latency).  */
85*412f47f9SXin Li       if (sign)
86*412f47f9SXin Li 	r = -1.0f + r;
87*412f47f9SXin Li       else
88*412f47f9SXin Li 	r = 1.0f - r;
89*412f47f9SXin Li     }
90*412f47f9SXin Li   else
91*412f47f9SXin Li     { /* |x| >= 4.0.  */
92*412f47f9SXin Li 
93*412f47f9SXin Li       /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
94*412f47f9SXin Li       if (unlikely (ia12 >= 0x7f8))
95*412f47f9SXin Li 	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
96*412f47f9SXin Li 
97*412f47f9SXin Li       /* Explicit copysign (calling copysignf increases latency).  */
98*412f47f9SXin Li       if (sign)
99*412f47f9SXin Li 	r = -1.0f;
100*412f47f9SXin Li       else
101*412f47f9SXin Li 	r = 1.0f;
102*412f47f9SXin Li     }
103*412f47f9SXin Li   return r;
104*412f47f9SXin Li }
105