xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/erfcf_1u7.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Single-precision erfc(x) function.
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2023, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "math_config.h"
9*412f47f9SXin Li #include "pl_sig.h"
10*412f47f9SXin Li #include "pl_test.h"
11*412f47f9SXin Li 
12*412f47f9SXin Li #define Shift 0x1p17f
13*412f47f9SXin Li #define OneThird 0x1.555556p-2f
14*412f47f9SXin Li #define TwoThird 0x1.555556p-1f
15*412f47f9SXin Li 
16*412f47f9SXin Li #define TwoOverFifteen 0x1.111112p-3f
17*412f47f9SXin Li #define TwoOverFive 0x1.99999ap-2f
18*412f47f9SXin Li #define Tenth 0x1.99999ap-4f
19*412f47f9SXin Li 
20*412f47f9SXin Li #define SignMask 0x7fffffff
21*412f47f9SXin Li 
22*412f47f9SXin Li /* Fast erfcf approximation based on series expansion near x rounded to
23*412f47f9SXin Li    nearest multiple of 1/64.
24*412f47f9SXin Li    Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
25*412f47f9SXin Li 
26*412f47f9SXin Li    erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
27*412f47f9SXin Li 
28*412f47f9SXin Li    poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
29*412f47f9SXin Li 		+ (2/15 r^4 - 2/5 r^2 + 1/10) d^4
30*412f47f9SXin Li 
31*412f47f9SXin Li    Values of erfc(r) and scale are read from lookup tables. Stored values
32*412f47f9SXin Li    are scaled to avoid hitting the subnormal range.
33*412f47f9SXin Li 
34*412f47f9SXin Li    Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
35*412f47f9SXin Li 
36*412f47f9SXin Li    Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
37*412f47f9SXin Li    erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
38*412f47f9SXin Li 		       want 0x1.f51216p-120.  */
39*412f47f9SXin Li float
erfcf(float x)40*412f47f9SXin Li erfcf (float x)
41*412f47f9SXin Li {
42*412f47f9SXin Li   /* Get top words and sign.  */
43*412f47f9SXin Li   uint32_t ix = asuint (x);
44*412f47f9SXin Li   uint32_t ia = ix & SignMask;
45*412f47f9SXin Li   uint32_t sign = ix & ~SignMask;
46*412f47f9SXin Li 
47*412f47f9SXin Li   /* |x| < 0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328).  */
48*412f47f9SXin Li   if (unlikely (ia < 0x32800000))
49*412f47f9SXin Li     return 1.0f - x; /* Small case.  */
50*412f47f9SXin Li 
51*412f47f9SXin Li   /* For |x| < 10.0625, the following approximation holds.  */
52*412f47f9SXin Li   if (likely (ia < 0x41210000))
53*412f47f9SXin Li     {
54*412f47f9SXin Li       /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
55*412f47f9SXin Li 	 to 2/sqrt(pi), when x reduced to r = 0.  */
56*412f47f9SXin Li       float a = asfloat (ia);
57*412f47f9SXin Li       float z = a + Shift;
58*412f47f9SXin Li       uint32_t i = asuint (z) - asuint (Shift);
59*412f47f9SXin Li       float r = z - Shift;
60*412f47f9SXin Li 
61*412f47f9SXin Li       /* These values are scaled by 2^-47.  */
62*412f47f9SXin Li       float erfcr = __erfcf_data.tab[i].erfc;
63*412f47f9SXin Li       float scale = __erfcf_data.tab[i].scale;
64*412f47f9SXin Li 
65*412f47f9SXin Li       /* erfc(x) ~ erfc(r) - scale * d * poly (r, d).  */
66*412f47f9SXin Li       float d = a - r;
67*412f47f9SXin Li       float d2 = d * d;
68*412f47f9SXin Li       float r2 = r * r;
69*412f47f9SXin Li       float p1 = -r;
70*412f47f9SXin Li       float p2 = fmaf (TwoThird, r2, -OneThird);
71*412f47f9SXin Li       float p3 = -r * fmaf (OneThird, r2, -0.5f);
72*412f47f9SXin Li       float p4 = fmaf (fmaf (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
73*412f47f9SXin Li       float y = fmaf (p4, d, p3);
74*412f47f9SXin Li       y = fmaf (y, d, p2);
75*412f47f9SXin Li       y = fmaf (y, d, p1);
76*412f47f9SXin Li       y = fmaf (-fmaf (y, d2, d), scale, erfcr);
77*412f47f9SXin Li       /* Handle sign and scale back in a single fma.  */
78*412f47f9SXin Li       float off = asfloat (sign >> 1);
79*412f47f9SXin Li       float fac = asfloat (asuint (0x1p-47f) | sign);
80*412f47f9SXin Li       y = fmaf (y, fac, off);
81*412f47f9SXin Li       /* The underflow exception needs to be signaled explicitly when
82*412f47f9SXin Li 	 result gets into subormnal range.  */
83*412f47f9SXin Li       if (x >= 0x1.2639cp+3f)
84*412f47f9SXin Li 	force_eval_float (opt_barrier_float (0x1p-123f) * 0x1p-123f);
85*412f47f9SXin Li       return y;
86*412f47f9SXin Li     }
87*412f47f9SXin Li 
88*412f47f9SXin Li   /* erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2.  */
89*412f47f9SXin Li   if (unlikely (ia >= 0x7f800000))
90*412f47f9SXin Li     return asfloat (sign >> 1) + 1.0f / x; /* Special cases.  */
91*412f47f9SXin Li 
92*412f47f9SXin Li   /* Above this threshold erfcf is constant and needs to raise underflow
93*412f47f9SXin Li      exception for positive x.  */
94*412f47f9SXin Li   return sign ? 2.0f : __math_uflowf (0);
95*412f47f9SXin Li }
96*412f47f9SXin Li 
97*412f47f9SXin Li PL_SIG (S, F, 1, erfc, -4.0, 10.0)
98*412f47f9SXin Li PL_TEST_ULP (erfcf, 1.14)
99*412f47f9SXin Li PL_TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000)
100*412f47f9SXin Li PL_TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000)
101*412f47f9SXin Li PL_TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000)
102*412f47f9SXin Li PL_TEST_INTERVAL (erfcf, 10.0625, inf, 40000)
103*412f47f9SXin Li PL_TEST_INTERVAL (erfcf, -4.0, -inf, 40000)
104