xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/erfc_1u8.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Double-precision erfc(x) function.
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2023, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "math_config.h"
9*412f47f9SXin Li #include "pl_sig.h"
10*412f47f9SXin Li #include "pl_test.h"
11*412f47f9SXin Li 
12*412f47f9SXin Li #define Shift 0x1p45
13*412f47f9SXin Li #define P20 0x1.5555555555555p-2 /* 1/3.  */
14*412f47f9SXin Li #define P21 0x1.5555555555555p-1 /* 2/3.  */
15*412f47f9SXin Li 
16*412f47f9SXin Li #define P40 0x1.999999999999ap-4  /* 1/10.  */
17*412f47f9SXin Li #define P41 0x1.999999999999ap-2  /* 2/5.  */
18*412f47f9SXin Li #define P42 0x1.11111111111111p-3 /* 2/15.  */
19*412f47f9SXin Li 
20*412f47f9SXin Li #define P50 0x1.5555555555555p-3 /* 1/6.  */
21*412f47f9SXin Li #define P51 0x1.c71c71c71c71cp-3 /* 2/9.  */
22*412f47f9SXin Li #define P52 0x1.6c16c16c16c17p-5 /* 2/45.  */
23*412f47f9SXin Li 
24*412f47f9SXin Li /* Qi = (i+1) / i.  */
25*412f47f9SXin Li #define Q5 0x1.3333333333333p0
26*412f47f9SXin Li #define Q6 0x1.2aaaaaaaaaaabp0
27*412f47f9SXin Li #define Q7 0x1.2492492492492p0
28*412f47f9SXin Li #define Q8 0x1.2p0
29*412f47f9SXin Li #define Q9 0x1.1c71c71c71c72p0
30*412f47f9SXin Li 
31*412f47f9SXin Li /* Ri = -2 * i / ((i+1)*(i+2)).  */
32*412f47f9SXin Li #define R5 -0x1.e79e79e79e79ep-3
33*412f47f9SXin Li #define R6 -0x1.b6db6db6db6dbp-3
34*412f47f9SXin Li #define R7 -0x1.8e38e38e38e39p-3
35*412f47f9SXin Li #define R8 -0x1.6c16c16c16c17p-3
36*412f47f9SXin Li #define R9 -0x1.4f2094f2094f2p-3
37*412f47f9SXin Li 
38*412f47f9SXin Li /* Fast erfc approximation based on series expansion near x rounded to
39*412f47f9SXin Li    nearest multiple of 1/128.
40*412f47f9SXin Li    Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
41*412f47f9SXin Li 
42*412f47f9SXin Li    erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
43*412f47f9SXin Li 
44*412f47f9SXin Li    poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
45*412f47f9SXin Li 		+ (2/15 r^4 - 2/5 r^2 + 1/10) d^4
46*412f47f9SXin Li 		- r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
47*412f47f9SXin Li 		+ p6(r) d^6 + ... + p10(r) d^10
48*412f47f9SXin Li 
49*412f47f9SXin Li    Polynomials p6(r) to p10(r) are computed using recurrence relation
50*412f47f9SXin Li 
51*412f47f9SXin Li    2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
52*412f47f9SXin Li    with p0 = 1, and p1(r) = -r.
53*412f47f9SXin Li 
54*412f47f9SXin Li    Values of erfc(r) and scale(r) are read from lookup tables. Stored values
55*412f47f9SXin Li    are scaled to avoid hitting the subnormal range.
56*412f47f9SXin Li 
57*412f47f9SXin Li    Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
58*412f47f9SXin Li 
59*412f47f9SXin Li    Maximum measured error: 1.71 ULP
60*412f47f9SXin Li    erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
61*412f47f9SXin Li 			   want 0x1.e15fcbea3e7adp-608.  */
62*412f47f9SXin Li double
erfc(double x)63*412f47f9SXin Li erfc (double x)
64*412f47f9SXin Li {
65*412f47f9SXin Li   /* Get top words and sign.  */
66*412f47f9SXin Li   uint64_t ix = asuint64 (x);
67*412f47f9SXin Li   uint64_t ia = ix & 0x7fffffffffffffff;
68*412f47f9SXin Li   double a = asdouble (ia);
69*412f47f9SXin Li   uint64_t sign = ix & ~0x7fffffffffffffff;
70*412f47f9SXin Li 
71*412f47f9SXin Li   /* erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2.  */
72*412f47f9SXin Li   if (unlikely (ia >= 0x7ff0000000000000))
73*412f47f9SXin Li     return asdouble (sign >> 1) + 1.0 / x; /* Special cases.  */
74*412f47f9SXin Li 
75*412f47f9SXin Li   /* Return early for large enough negative values.  */
76*412f47f9SXin Li   if (x < -6.0)
77*412f47f9SXin Li     return 2.0;
78*412f47f9SXin Li 
79*412f47f9SXin Li   /* For |x| < 3487.0/128.0, the following approximation holds.  */
80*412f47f9SXin Li   if (likely (ia < 0x403b3e0000000000))
81*412f47f9SXin Li     {
82*412f47f9SXin Li       /* |x| < 0x1p-511 => accurate to 0.5 ULP.  */
83*412f47f9SXin Li       if (unlikely (ia < asuint64 (0x1p-511)))
84*412f47f9SXin Li 	return 1.0 - x;
85*412f47f9SXin Li 
86*412f47f9SXin Li       /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
87*412f47f9SXin Li 	 to 2/sqrt(pi), when x reduced to r = 0.  */
88*412f47f9SXin Li       double z = a + Shift;
89*412f47f9SXin Li       uint64_t i = asuint64 (z);
90*412f47f9SXin Li       double r = z - Shift;
91*412f47f9SXin Li       /* These values are scaled by 2^128.  */
92*412f47f9SXin Li       double erfcr = __erfc_data.tab[i].erfc;
93*412f47f9SXin Li       double scale = __erfc_data.tab[i].scale;
94*412f47f9SXin Li 
95*412f47f9SXin Li       /* erfc(x) ~ erfc(r) - scale * d * poly (r, d).  */
96*412f47f9SXin Li       double d = a - r;
97*412f47f9SXin Li       double d2 = d * d;
98*412f47f9SXin Li       double r2 = r * r;
99*412f47f9SXin Li       /* Compute p_i as a regular (low-order) polynomial.  */
100*412f47f9SXin Li       double p1 = -r;
101*412f47f9SXin Li       double p2 = fma (P21, r2, -P20);
102*412f47f9SXin Li       double p3 = -r * fma (P20, r2, -0.5);
103*412f47f9SXin Li       double p4 = fma (fma (P42, r2, -P41), r2, P40);
104*412f47f9SXin Li       double p5 = -r * fma (fma (P52, r2, -P51), r2, P50);
105*412f47f9SXin Li       /* Compute p_i using recurrence relation:
106*412f47f9SXin Li 	 p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}.  */
107*412f47f9SXin Li       double p6 = fma (Q5 * r, p5, p4) * R5;
108*412f47f9SXin Li       double p7 = fma (Q6 * r, p6, p5) * R6;
109*412f47f9SXin Li       double p8 = fma (Q7 * r, p7, p6) * R7;
110*412f47f9SXin Li       double p9 = fma (Q8 * r, p8, p7) * R8;
111*412f47f9SXin Li       double p10 = fma (Q9 * r, p9, p8) * R9;
112*412f47f9SXin Li       /* Compute polynomial in d using pairwise Horner scheme.  */
113*412f47f9SXin Li       double p90 = fma (p10, d, p9);
114*412f47f9SXin Li       double p78 = fma (p8, d, p7);
115*412f47f9SXin Li       double p56 = fma (p6, d, p5);
116*412f47f9SXin Li       double p34 = fma (p4, d, p3);
117*412f47f9SXin Li       double p12 = fma (p2, d, p1);
118*412f47f9SXin Li       double y = fma (p90, d2, p78);
119*412f47f9SXin Li       y = fma (y, d2, p56);
120*412f47f9SXin Li       y = fma (y, d2, p34);
121*412f47f9SXin Li       y = fma (y, d2, p12);
122*412f47f9SXin Li 
123*412f47f9SXin Li       y = fma (-fma (y, d2, d), scale, erfcr);
124*412f47f9SXin Li 
125*412f47f9SXin Li       /* Handle sign and scale back in a single fma.  */
126*412f47f9SXin Li       double off = asdouble (sign >> 1);
127*412f47f9SXin Li       double fac = asdouble (asuint64 (0x1p-128) | sign);
128*412f47f9SXin Li       y = fma (y, fac, off);
129*412f47f9SXin Li 
130*412f47f9SXin Li       if (unlikely (x > 26.0))
131*412f47f9SXin Li 	{
132*412f47f9SXin Li 	  /* The underflow exception needs to be signaled explicitly when
133*412f47f9SXin Li 	     result gets into the subnormal range.  */
134*412f47f9SXin Li 	  if (unlikely (y < 0x1p-1022))
135*412f47f9SXin Li 	    force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
136*412f47f9SXin Li 	  /* Set errno to ERANGE if result rounds to 0.  */
137*412f47f9SXin Li 	  return __math_check_uflow (y);
138*412f47f9SXin Li 	}
139*412f47f9SXin Li 
140*412f47f9SXin Li       return y;
141*412f47f9SXin Li     }
142*412f47f9SXin Li   /* Above the threshold (x > 3487.0/128.0) erfc is constant and needs to raise
143*412f47f9SXin Li      underflow exception for positive x.  */
144*412f47f9SXin Li   return __math_uflow (0);
145*412f47f9SXin Li }
146*412f47f9SXin Li 
147*412f47f9SXin Li PL_SIG (S, D, 1, erfc, -6.0, 28.0)
148*412f47f9SXin Li PL_TEST_ULP (erfc, 1.21)
149*412f47f9SXin Li PL_TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000)
150*412f47f9SXin Li PL_TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000)
151*412f47f9SXin Li PL_TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000)
152*412f47f9SXin Li PL_TEST_INTERVAL (erfc, 28.0, inf, 40000)
153*412f47f9SXin Li PL_TEST_INTERVAL (erfc, -6.0, -inf, 40000)
154