xref: /aosp_15_r20/external/arm-optimized-routines/math/tgamma128.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Implementation of the true gamma function (as opposed to lgamma)
3*412f47f9SXin Li  * for 128-bit long double.
4*412f47f9SXin Li  *
5*412f47f9SXin Li  * Copyright (c) 2006-2024, Arm Limited.
6*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
7*412f47f9SXin Li  */
8*412f47f9SXin Li 
9*412f47f9SXin Li /*
10*412f47f9SXin Li  * This module implements the float128 gamma function under the name
11*412f47f9SXin Li  * tgamma128. It's expected to be suitable for integration into system
12*412f47f9SXin Li  * maths libraries under the standard name tgammal, if long double is
13*412f47f9SXin Li  * 128-bit. Such a library will probably want to check the error
14*412f47f9SXin Li  * handling and optimize the initial process of extracting the
15*412f47f9SXin Li  * exponent, which is done here by simple and portable (but
16*412f47f9SXin Li  * potentially slower) methods.
17*412f47f9SXin Li  */
18*412f47f9SXin Li 
19*412f47f9SXin Li #include <float.h>
20*412f47f9SXin Li #include <math.h>
21*412f47f9SXin Li #include <stdbool.h>
22*412f47f9SXin Li #include <stddef.h>
23*412f47f9SXin Li 
24*412f47f9SXin Li /* Only binary128 format is supported.  */
25*412f47f9SXin Li #if LDBL_MANT_DIG == 113
26*412f47f9SXin Li 
27*412f47f9SXin Li #include "tgamma128.h"
28*412f47f9SXin Li 
29*412f47f9SXin Li #define lenof(x) (sizeof(x)/sizeof(*(x)))
30*412f47f9SXin Li 
31*412f47f9SXin Li /*
32*412f47f9SXin Li  * Helper routine to evaluate a polynomial via Horner's rule
33*412f47f9SXin Li  */
poly(const long double * coeffs,size_t n,long double x)34*412f47f9SXin Li static long double poly(const long double *coeffs, size_t n, long double x)
35*412f47f9SXin Li {
36*412f47f9SXin Li     long double result = coeffs[--n];
37*412f47f9SXin Li 
38*412f47f9SXin Li     while (n > 0)
39*412f47f9SXin Li         result = (result * x) + coeffs[--n];
40*412f47f9SXin Li 
41*412f47f9SXin Li     return result;
42*412f47f9SXin Li }
43*412f47f9SXin Li 
44*412f47f9SXin Li /*
45*412f47f9SXin Li  * Compute sin(pi*x) / pi, for use in the reflection formula that
46*412f47f9SXin Li  * relates gamma(-x) and gamma(x).
47*412f47f9SXin Li  */
sin_pi_x_over_pi(long double x)48*412f47f9SXin Li static long double sin_pi_x_over_pi(long double x)
49*412f47f9SXin Li {
50*412f47f9SXin Li     int quo;
51*412f47f9SXin Li     long double fracpart = remquol(x, 0.5L, &quo);
52*412f47f9SXin Li 
53*412f47f9SXin Li     long double sign = 1.0L;
54*412f47f9SXin Li     if (quo & 2)
55*412f47f9SXin Li         sign = -sign;
56*412f47f9SXin Li     quo &= 1;
57*412f47f9SXin Li 
58*412f47f9SXin Li     if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
59*412f47f9SXin Li         /* For numbers this size, sin(pi*x) is so close to pi*x that
60*412f47f9SXin Li          * sin(pi*x)/pi is indistinguishable from x in float128 */
61*412f47f9SXin Li         return sign * fracpart;
62*412f47f9SXin Li     }
63*412f47f9SXin Li 
64*412f47f9SXin Li     if (quo == 0) {
65*412f47f9SXin Li         return sign * sinl(pi*fracpart) / pi;
66*412f47f9SXin Li     } else {
67*412f47f9SXin Li         return sign * cosl(pi*fracpart) / pi;
68*412f47f9SXin Li     }
69*412f47f9SXin Li }
70*412f47f9SXin Li 
71*412f47f9SXin Li /* Return tgamma(x) on the assumption that x >= 8. */
tgamma_large(long double x,bool negative,long double negadjust)72*412f47f9SXin Li static long double tgamma_large(long double x,
73*412f47f9SXin Li                                 bool negative, long double negadjust)
74*412f47f9SXin Li {
75*412f47f9SXin Li     /*
76*412f47f9SXin Li      * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
77*412f47f9SXin Li      * where K is a correction factor computed as a polynomial in 1/x.
78*412f47f9SXin Li      *
79*412f47f9SXin Li      * (Vaguely inspired by the form of the Lanczos approximation, but
80*412f47f9SXin Li      * I tried the Lanczos approximation itself and it suffers badly
81*412f47f9SXin Li      * from big cancellation leading to loss of significance.)
82*412f47f9SXin Li      */
83*412f47f9SXin Li     long double t = 1/x;
84*412f47f9SXin Li     long double p = poly(coeffs_large, lenof(coeffs_large), t);
85*412f47f9SXin Li 
86*412f47f9SXin Li     /*
87*412f47f9SXin Li      * To avoid overflow in cases where x^(x-0.5) does overflow
88*412f47f9SXin Li      * but gamma(x) does not, we split x^(x-0.5) in half and
89*412f47f9SXin Li      * multiply back up _after_ multiplying the shrinking factor
90*412f47f9SXin Li      * of exp(-(x-0.5)).
91*412f47f9SXin Li      *
92*412f47f9SXin Li      * Note that computing x-0.5 and (x-0.5)/2 is exact for the
93*412f47f9SXin Li      * relevant range of x, so the only sources of error are pow
94*412f47f9SXin Li      * and exp themselves, plus the multiplications.
95*412f47f9SXin Li      */
96*412f47f9SXin Li     long double powhalf = powl(x, (x-0.5L)/2.0L);
97*412f47f9SXin Li     long double expret = expl(-(x-0.5L));
98*412f47f9SXin Li 
99*412f47f9SXin Li     if (!negative) {
100*412f47f9SXin Li         return (expret * powhalf) * powhalf * p;
101*412f47f9SXin Li     } else {
102*412f47f9SXin Li         /*
103*412f47f9SXin Li          * Apply the reflection formula as commented below, but
104*412f47f9SXin Li          * carefully: negadjust has magnitude less than 1, so it can
105*412f47f9SXin Li          * turn a case where gamma(+x) would overflow into a case
106*412f47f9SXin Li          * where gamma(-x) doesn't underflow. Not only that, but the
107*412f47f9SXin Li          * FP format has greater range in the tiny domain due to
108*412f47f9SXin Li          * denormals. For both reasons, it's not good enough to
109*412f47f9SXin Li          * compute the positive result and then adjust it.
110*412f47f9SXin Li          */
111*412f47f9SXin Li         long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
112*412f47f9SXin Li         return ret / powhalf;
113*412f47f9SXin Li     }
114*412f47f9SXin Li }
115*412f47f9SXin Li 
116*412f47f9SXin Li /* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
tgamma_tiny(long double x,bool negative,long double negadjust)117*412f47f9SXin Li static long double tgamma_tiny(long double x,
118*412f47f9SXin Li                                bool negative, long double negadjust)
119*412f47f9SXin Li {
120*412f47f9SXin Li     /*
121*412f47f9SXin Li      * For x near zero, we use a polynomial approximation to
122*412f47f9SXin Li      * g = 1/(x*gamma(x)), and then return 1/(g*x).
123*412f47f9SXin Li      */
124*412f47f9SXin Li     long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
125*412f47f9SXin Li     if (!negative)
126*412f47f9SXin Li         return 1.0L / (g*x);
127*412f47f9SXin Li     else
128*412f47f9SXin Li         return g / negadjust;
129*412f47f9SXin Li }
130*412f47f9SXin Li 
131*412f47f9SXin Li /* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
tgamma_ultratiny(long double x,bool negative,long double negadjust)132*412f47f9SXin Li static long double tgamma_ultratiny(long double x, bool negative,
133*412f47f9SXin Li                                     long double negadjust)
134*412f47f9SXin Li {
135*412f47f9SXin Li     /* On this interval, gamma can't even be distinguished from 1/x,
136*412f47f9SXin Li      * so we skip the polynomial evaluation in tgamma_tiny, partly to
137*412f47f9SXin Li      * save time and partly to avoid the tiny intermediate values
138*412f47f9SXin Li      * setting the underflow exception flag. */
139*412f47f9SXin Li     if (!negative)
140*412f47f9SXin Li         return 1.0L / x;
141*412f47f9SXin Li     else
142*412f47f9SXin Li         return 1.0L / negadjust;
143*412f47f9SXin Li }
144*412f47f9SXin Li 
145*412f47f9SXin Li /* Return tgamma(x) on the assumption that 1 <= x <= 2. */
tgamma_central(long double x)146*412f47f9SXin Li static long double tgamma_central(long double x)
147*412f47f9SXin Li {
148*412f47f9SXin Li     /*
149*412f47f9SXin Li      * In this central interval, our strategy is to finding the
150*412f47f9SXin Li      * difference between x and the point where gamma has a minimum,
151*412f47f9SXin Li      * and approximate based on that.
152*412f47f9SXin Li      */
153*412f47f9SXin Li 
154*412f47f9SXin Li     /* The difference between the input x and the minimum x. The first
155*412f47f9SXin Li      * subtraction is expected to be exact, since x and min_hi have
156*412f47f9SXin Li      * the same exponent (unless x=2, in which case it will still be
157*412f47f9SXin Li      * exact). */
158*412f47f9SXin Li     long double t = (x - min_x_hi) - min_x_lo;
159*412f47f9SXin Li 
160*412f47f9SXin Li     /*
161*412f47f9SXin Li      * Now use two different polynomials for the intervals [1,m] and
162*412f47f9SXin Li      * [m,2].
163*412f47f9SXin Li      */
164*412f47f9SXin Li     long double p;
165*412f47f9SXin Li     if (t < 0)
166*412f47f9SXin Li         p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
167*412f47f9SXin Li     else
168*412f47f9SXin Li         p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
169*412f47f9SXin Li 
170*412f47f9SXin Li     return (min_y_lo + p * (t*t)) + min_y_hi;
171*412f47f9SXin Li }
172*412f47f9SXin Li 
tgamma128(long double x)173*412f47f9SXin Li long double tgamma128(long double x)
174*412f47f9SXin Li {
175*412f47f9SXin Li     /*
176*412f47f9SXin Li      * Start by extracting the number's sign and exponent, and ruling
177*412f47f9SXin Li      * out cases of non-normalized numbers.
178*412f47f9SXin Li      *
179*412f47f9SXin Li      * For an implementation integrated into a system libm, it would
180*412f47f9SXin Li      * almost certainly be quicker to do this by direct bitwise access
181*412f47f9SXin Li      * to the input float128 value, using whatever is the local idiom
182*412f47f9SXin Li      * for knowing its endianness.
183*412f47f9SXin Li      *
184*412f47f9SXin Li      * Integration into a system libc may also need to worry about
185*412f47f9SXin Li      * setting errno, if that's the locally preferred way to report
186*412f47f9SXin Li      * math.h errors.
187*412f47f9SXin Li      */
188*412f47f9SXin Li     int sign = signbit(x);
189*412f47f9SXin Li     int exponent;
190*412f47f9SXin Li     switch (fpclassify(x)) {
191*412f47f9SXin Li       case FP_NAN:
192*412f47f9SXin Li         return x+x; /* propagate QNaN, make SNaN throw an exception */
193*412f47f9SXin Li       case FP_ZERO:
194*412f47f9SXin Li         return 1/x; /* divide by zero on purpose to indicate a pole */
195*412f47f9SXin Li       case FP_INFINITE:
196*412f47f9SXin Li         if (sign) {
197*412f47f9SXin Li             return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
198*412f47f9SXin Li                          * IEEE invalid operation exception to indicate that */
199*412f47f9SXin Li         }
200*412f47f9SXin Li         return x;     /* but gamma(+inf) is just +inf with no error */
201*412f47f9SXin Li       case FP_SUBNORMAL:
202*412f47f9SXin Li         exponent = -16384;
203*412f47f9SXin Li         break;
204*412f47f9SXin Li       default:
205*412f47f9SXin Li         frexpl(x, &exponent);
206*412f47f9SXin Li         exponent--;
207*412f47f9SXin Li         break;
208*412f47f9SXin Li     }
209*412f47f9SXin Li 
210*412f47f9SXin Li     bool negative = false;
211*412f47f9SXin Li     long double negadjust = 0.0L;
212*412f47f9SXin Li 
213*412f47f9SXin Li     if (sign) {
214*412f47f9SXin Li         /*
215*412f47f9SXin Li          * Euler's reflection formula is
216*412f47f9SXin Li          *
217*412f47f9SXin Li          *    gamma(1-x) gamma(x) = pi/sin(pi*x)
218*412f47f9SXin Li          *
219*412f47f9SXin Li          *                        pi
220*412f47f9SXin Li          * => gamma(x) = --------------------
221*412f47f9SXin Li          *               gamma(1-x) sin(pi*x)
222*412f47f9SXin Li          *
223*412f47f9SXin Li          * But computing 1-x is going to lose a lot of accuracy when x
224*412f47f9SXin Li          * is very small, so instead we transform using the recurrence
225*412f47f9SXin Li          * gamma(t+1)=t gamma(t). Setting t=-x, this gives us
226*412f47f9SXin Li          * gamma(1-x) = -x gamma(-x), so we now have
227*412f47f9SXin Li          *
228*412f47f9SXin Li          *                         pi
229*412f47f9SXin Li          *    gamma(x) = ----------------------
230*412f47f9SXin Li          *               -x gamma(-x) sin(pi*x)
231*412f47f9SXin Li          *
232*412f47f9SXin Li          * which relates gamma(x) to gamma(-x), which is much nicer,
233*412f47f9SXin Li          * since x can be turned into -x without rounding.
234*412f47f9SXin Li          */
235*412f47f9SXin Li         negadjust = sin_pi_x_over_pi(x);
236*412f47f9SXin Li         negative = true;
237*412f47f9SXin Li         x = -x;
238*412f47f9SXin Li 
239*412f47f9SXin Li         /*
240*412f47f9SXin Li          * Now the ultimate answer we want is
241*412f47f9SXin Li          *
242*412f47f9SXin Li          *    1 / (gamma(x) * x * negadjust)
243*412f47f9SXin Li          *
244*412f47f9SXin Li          * where x is the positive value we've just turned it into.
245*412f47f9SXin Li          *
246*412f47f9SXin Li          * For some of the cases below, we'll compute gamma(x)
247*412f47f9SXin Li          * normally and then compute this adjusted value afterwards.
248*412f47f9SXin Li          * But for others, we can implement the reciprocal operation
249*412f47f9SXin Li          * in this formula by _avoiding_ an inversion that the
250*412f47f9SXin Li          * sub-case was going to do anyway.
251*412f47f9SXin Li          */
252*412f47f9SXin Li 
253*412f47f9SXin Li         if (negadjust == 0) {
254*412f47f9SXin Li             /*
255*412f47f9SXin Li              * Special case for negative integers. Applying the
256*412f47f9SXin Li              * reflection formula would cause division by zero, but
257*412f47f9SXin Li              * standards would prefer we treat this error case as an
258*412f47f9SXin Li              * invalid operation and return NaN instead. (Possibly
259*412f47f9SXin Li              * because otherwise you'd have to decide which sign of
260*412f47f9SXin Li              * infinity to return, and unlike the x=0 case, there's no
261*412f47f9SXin Li              * sign of zero available to disambiguate.)
262*412f47f9SXin Li              */
263*412f47f9SXin Li             return negadjust / negadjust;
264*412f47f9SXin Li         }
265*412f47f9SXin Li     }
266*412f47f9SXin Li 
267*412f47f9SXin Li     /*
268*412f47f9SXin Li      * Split the positive domain into various cases. For cases where
269*412f47f9SXin Li      * we do the negative-number adjustment the usual way, we'll leave
270*412f47f9SXin Li      * the answer in 'g' and drop out of the if statement.
271*412f47f9SXin Li      */
272*412f47f9SXin Li     long double g;
273*412f47f9SXin Li 
274*412f47f9SXin Li     if (exponent >= 11) {
275*412f47f9SXin Li         /*
276*412f47f9SXin Li          * gamma of any positive value this large overflows, and gamma
277*412f47f9SXin Li          * of any negative value underflows.
278*412f47f9SXin Li          */
279*412f47f9SXin Li         if (!negative) {
280*412f47f9SXin Li             long double huge = 0x1p+12288L;
281*412f47f9SXin Li             return huge * huge; /* provoke an overflow */
282*412f47f9SXin Li         } else {
283*412f47f9SXin Li             long double tiny = 0x1p-12288L;
284*412f47f9SXin Li             return tiny * tiny * negadjust; /* underflow, of the right sign */
285*412f47f9SXin Li         }
286*412f47f9SXin Li     } else if (exponent >= 3) {
287*412f47f9SXin Li         /* Negative-number adjustment happens inside here */
288*412f47f9SXin Li         return tgamma_large(x, negative, negadjust);
289*412f47f9SXin Li     } else if (exponent < -113) {
290*412f47f9SXin Li         /* Negative-number adjustment happens inside here */
291*412f47f9SXin Li         return tgamma_ultratiny(x, negative, negadjust);
292*412f47f9SXin Li     } else if (exponent < -5) {
293*412f47f9SXin Li         /* Negative-number adjustment happens inside here */
294*412f47f9SXin Li         return tgamma_tiny(x, negative, negadjust);
295*412f47f9SXin Li     } else if (exponent == 0) {
296*412f47f9SXin Li         g = tgamma_central(x);
297*412f47f9SXin Li     } else if (exponent < 0) {
298*412f47f9SXin Li         /*
299*412f47f9SXin Li          * For x in [1/32,1) we range-reduce upwards to the interval
300*412f47f9SXin Li          * [1,2), using the inverse of the normal recurrence formula:
301*412f47f9SXin Li          * gamma(x) = gamma(x+1)/x.
302*412f47f9SXin Li          */
303*412f47f9SXin Li         g = tgamma_central(1+x) / x;
304*412f47f9SXin Li     } else {
305*412f47f9SXin Li         /*
306*412f47f9SXin Li          * For x in [2,8) we range-reduce downwards to the interval
307*412f47f9SXin Li          * [1,2) by repeated application of the recurrence formula.
308*412f47f9SXin Li          *
309*412f47f9SXin Li          * Actually multiplying (x-1) by (x-2) by (x-3) and so on
310*412f47f9SXin Li          * would introduce multiple ULPs of rounding error. We can get
311*412f47f9SXin Li          * better accuracy by writing x = (k+1/2) + t, where k is an
312*412f47f9SXin Li          * integer and |t|<1/2, and expanding out the obvious factor
313*412f47f9SXin Li          * (x-1)(x-2)...(x-k+1) as a polynomial in t.
314*412f47f9SXin Li          */
315*412f47f9SXin Li         long double mult;
316*412f47f9SXin Li         int i = x;
317*412f47f9SXin Li         if (i == 2) { /* x in [2,3) */
318*412f47f9SXin Li             mult = (x-1);
319*412f47f9SXin Li         } else {
320*412f47f9SXin Li             long double t = x - (i + 0.5L);
321*412f47f9SXin Li             switch (i) {
322*412f47f9SXin Li                 /* E.g. for x=3.5+t, we want
323*412f47f9SXin Li                  * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */
324*412f47f9SXin Li               case 3:
325*412f47f9SXin Li                 mult = 3.75L+t*(4.0L+t);
326*412f47f9SXin Li                 break;
327*412f47f9SXin Li               case 4:
328*412f47f9SXin Li                 mult = 13.125L+t*(17.75L+t*(7.5L+t));
329*412f47f9SXin Li                 break;
330*412f47f9SXin Li               case 5:
331*412f47f9SXin Li                 mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t)));
332*412f47f9SXin Li                 break;
333*412f47f9SXin Li               case 6:
334*412f47f9SXin Li                 mult = 324.84375L+t*(570.5625L+t*(376.250L+t*(
335*412f47f9SXin Li                     117.5L+t*(17.5L+t))));
336*412f47f9SXin Li                 break;
337*412f47f9SXin Li               case 7:
338*412f47f9SXin Li                 mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
339*412f47f9SXin Li                     1140.0L+t*(231.25L+t*(24.0L+t)))));
340*412f47f9SXin Li                 break;
341*412f47f9SXin Li             }
342*412f47f9SXin Li         }
343*412f47f9SXin Li 
344*412f47f9SXin Li         g = tgamma_central(x - (i-1)) * mult;
345*412f47f9SXin Li     }
346*412f47f9SXin Li 
347*412f47f9SXin Li     if (!negative) {
348*412f47f9SXin Li         /* Positive domain: return g unmodified */
349*412f47f9SXin Li         return g;
350*412f47f9SXin Li     } else {
351*412f47f9SXin Li         /* Negative domain: apply the reflection formula as commented above */
352*412f47f9SXin Li         return 1.0L / (g * x * negadjust);
353*412f47f9SXin Li     }
354*412f47f9SXin Li }
355*412f47f9SXin Li 
356*412f47f9SXin Li #endif
357