xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/sv_erfinv_25u.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1 /*
2  * Double-precision inverse error function (SVE variant).
3  *
4  * Copyright (c) 2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 #include "sv_math.h"
8 #include "pl_test.h"
9 #include "math_config.h"
10 #include "pl_sig.h"
11 #include "poly_sve_f64.h"
12 #define SV_LOG_INLINE_POLY_ORDER 4
13 #include "sv_log_inline.h"
14 
15 const static struct data
16 {
17   /*  We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
18       coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
19       of the denominator. P is interleaved P_17 and P_37, similar for Q.  */
20   double P[7][2], Q[7][2];
21   double P_57[9], Q_57[9], tailshift, P37_0;
22   struct sv_log_inline_data log_tbl;
23 } data = {
24   .P37_0 = -0x1.f3596123109edp-7,
25   .tailshift = -0.87890625,
26   .P = { { 0x1.007ce8f01b2e8p+4, 0x1.60b8fe375999ep-2 },
27 	 { -0x1.6b23cc5c6c6d7p+6, -0x1.779bb9bef7c0fp+1 },
28 	 { 0x1.74e5f6ceb3548p+7, 0x1.786ea384470a2p+3 },
29 	 { -0x1.5200bb15cc6bbp+7, -0x1.6a7c1453c85d3p+4 },
30 	 { 0x1.05d193233a849p+6, 0x1.31f0fc5613142p+4 },
31 	 { -0x1.148c5474ee5e1p+3, -0x1.5ea6c007d4dbbp+2 },
32 	 { 0x1.689181bbafd0cp-3, 0x1.e66f265ce9e5p-3 } },
33   .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
34 	 { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
35 	 { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
36 	 { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
37 	 { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
38 	 { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
39 	 { 0x1p+0, -0x1.4075c56404eecp+3 } },
40   .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2,
41 	    0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3,
42 	    0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 },
43   .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2,
44 	    0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3,
45 	    0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2 },
46   .log_tbl = SV_LOG_CONSTANTS
47 };
48 
49 static inline svfloat64_t
special(svbool_t pg,svfloat64_t x,const struct data * d)50 special (svbool_t pg, svfloat64_t x, const struct data *d)
51 {
52   /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
53      By using log here, instead of log1p, we return finite values for both
54      these inputs, and values outside [-1, 1]. This is non-compliant, but is an
55      acceptable optimisation at Ofast. To get correct behaviour for all finite
56      values use the log1p_inline helper on -abs(x) - note that erfinv(inf)
57      will still be finite.  */
58   svfloat64_t ax = svabs_x (pg, x);
59   svfloat64_t t
60       = svneg_x (pg, sv_log_inline (pg, svsubr_x (pg, ax, 1), &d->log_tbl));
61   t = svdivr_x (pg, svsqrt_x (pg, t), 1);
62   svuint64_t sign
63       = sveor_x (pg, svreinterpret_u64 (ax), svreinterpret_u64 (x));
64   svfloat64_t ts
65       = svreinterpret_f64 (svorr_x (pg, sign, svreinterpret_u64 (t)));
66 
67   svfloat64_t q = svadd_x (pg, t, d->Q_57[8]);
68   for (int i = 7; i >= 0; i--)
69     q = svmad_x (pg, q, t, d->Q_57[i]);
70 
71   return svdiv_x (pg, sv_horner_8_f64_x (pg, t, d->P_57), svmul_x (pg, ts, q));
72 }
73 
74 static inline svfloat64_t
lookup(const double * c,svuint64_t idx)75 lookup (const double *c, svuint64_t idx)
76 {
77   svfloat64_t x = svld1rq_f64 (svptrue_b64 (), c);
78   return svtbl (x, idx);
79 }
80 
81 static inline svfloat64_t
notails(svbool_t pg,svfloat64_t x,const struct data * d)82 notails (svbool_t pg, svfloat64_t x, const struct data *d)
83 {
84   svfloat64_t t = svmad_x (pg, x, x, -0.5625);
85   svfloat64_t p = svmla_x (pg, sv_f64 (d->P[5][0]), t, d->P[6][0]);
86   svfloat64_t q = svadd_x (pg, t, d->Q[5][0]);
87   for (int i = 4; i >= 0; i--)
88     {
89       p = svmad_x (pg, t, p, d->P[i][0]);
90       q = svmad_x (pg, t, q, d->Q[i][0]);
91     }
92   p = svmul_x (pg, p, x);
93   return svdiv_x (pg, p, q);
94 }
95 
96 /* Vector implementation of Blair et al's rational approximation to inverse
97    error function in double precision. Largest observed error is 24.75 ULP:
98    _ZGVsMxv_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
99 					want 0x1.ea0547268660cp+0.  */
SV_NAME_D1(erfinv)100 svfloat64_t SV_NAME_D1 (erfinv) (svfloat64_t x, svbool_t pg)
101 {
102   const struct data *d = ptr_barrier (&data);
103   /* Calculate inverse error using algorithm described in
104      J. M. Blair, C. A. Edwards, and J. H. Johnson,
105      "Rational Chebyshev approximations for the inverse of the error function",
106      Math. Comp. 30, pp. 827--830 (1976).
107      https://doi.org/10.1090/S0025-5718-1976-0421040-7.
108 
109      Algorithm has 3 intervals:
110      - 'Normal' region [-0.75, 0.75]
111      - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
112      - Extreme tail [-1, -0.9375] U [0.9375, 1]
113      Normal and tail are both rational approximation of similar order on
114      shifted input - these are typically performed in parallel using gather
115      loads to obtain correct coefficients depending on interval.  */
116 
117   svbool_t no_tail = svacle (pg, x, 0.75);
118   if (unlikely (!svptest_any (pg, svnot_z (pg, no_tail))))
119     return notails (pg, x, d);
120 
121   svbool_t is_tail = svnot_z (pg, no_tail);
122   svbool_t extreme_tail = svacgt (pg, x, 0.9375);
123   svuint64_t idx = svdup_n_u64_z (is_tail, 1);
124 
125   svfloat64_t t = svsel_f64 (is_tail, sv_f64 (d->tailshift), sv_f64 (-0.5625));
126   t = svmla_x (pg, t, x, x);
127 
128   svfloat64_t p = lookup (&d->P[6][0], idx);
129   svfloat64_t q
130       = svmla_x (pg, lookup (&d->Q[6][0], idx), svdup_n_f64_z (is_tail, 1), t);
131   for (int i = 5; i >= 0; i--)
132     {
133       p = svmla_x (pg, lookup (&d->P[i][0], idx), p, t);
134       q = svmla_x (pg, lookup (&d->Q[i][0], idx), q, t);
135     }
136   p = svmad_m (is_tail, p, t, d->P37_0);
137   p = svmul_x (pg, p, x);
138 
139   if (likely (svptest_any (pg, extreme_tail)))
140     return svsel (extreme_tail, special (pg, x, d), svdiv_x (pg, p, q));
141   return svdiv_x (pg, p, q);
142 }
143 
144 PL_SIG (SV, D, 1, erfinv, -0.99, 0.99)
145 
146 PL_TEST_ULP (SV_NAME_D1 (erfinv), 24.5)
147 /* Test with control lane in each interval.  */
148 #define TEST_INTERVAL(lo, hi, n)                                              \
149   PL_TEST_INTERVAL_C (SV_NAME_D1 (erfinv), lo, hi, n, 0.5)                    \
150   PL_TEST_INTERVAL_C (SV_NAME_D1 (erfinv), lo, hi, n, 0.8)                    \
151   PL_TEST_INTERVAL_C (SV_NAME_D1 (erfinv), lo, hi, n, 0.95)
152 TEST_INTERVAL (0, 1, 100000)
153 TEST_INTERVAL (-0, -1, 100000)
154