/*
 * Double-precision SVE pow(x, y) function.
 *
 * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"

/* This version shares a similar algorithm with the AOR scalar pow.

   The core computation consists of computing pow(x, y) as

     exp (y * log (x)).

   The algorithms for exp and log are very similar to the scalar exp and log.
   The log relies on table lookup for 3 variables and an order-8 polynomial.
   It returns a high and a low contribution that are then passed to exp,
   to minimise the loss of accuracy in both routines.
   The exp is based on 8-bit table lookup for scale and an order-4 polynomial.
   The SVE algorithm drops the tail in the exp computation at the price of
   a lower accuracy, slightly above 1 ULP.
   The SVE algorithm also drops the special treatment of small (< 2^-65) and
   large (> 2^63) finite values of |y|, as they only affect
   non-round-to-nearest modes.

   Maximum measured error is 1.04 ULP:
   SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
     got 0x1.f7116284221fcp-1
    want 0x1.f7116284221fdp-1. */
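
/* A minimal scalar sketch (illustrative only; the helper name is
   hypothetical) of the hi/lo scheme described above: log(x) is produced
   as loghi + loglo, and the product y * log(x) is handed to exp as
   ehi + elo, where the rounding error of y * loghi is recovered exactly
   with a fused multiply-add. */
static inline double
sv_pow_hilo_sketch (double y, double loghi, double loglo, double *elo)
{
  double ehi = y * loghi;
  /* emi == ehi - y * loghi, the negated rounding error of the product. */
  double emi = __builtin_fma (-y, loghi, ehi);
  *elo = y * loglo - emi;
  return ehi;
}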

/* Data is defined in v_pow_log_data.c. */
#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
#define A __v_pow_log_data.poly
#define Off 0x3fe6955500000000

/* Data is defined in v_pow_exp_data.c. */
#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
#define C __v_pow_exp_data.poly
#define SmallExp 0x3c9 /* top12(0x1p-54). */
#define BigExp 0x408 /* top12(512.). */
#define ThresExp 0x03f /* BigExp - SmallExp. */
#define HugeExp 0x409 /* top12(1024.). */
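
/* Scalar analogue (an illustrative sketch; the helper name is
   hypothetical) of the top12 screening used below: the top 12 bits of a
   double hold its sign and biased exponent, so a single unsigned
   subtract and compare tests small <= abstop < small + thres, i.e. it
   classifies |x| against two powers of two at once. With small ==
   SmallExp and thres == ThresExp this is the complement of the uoflow
   test in sv_exp_inline. */
static inline int
top12_between_sketch (uint64_t ix, uint64_t small, uint64_t thres)
{
  uint64_t abstop = (ix >> 52) & 0x7ff;
  /* Wraps around when abstop < small, so the compare then fails too. */
  return abstop - small < thres;
}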

/* Constants associated with pow. */
#define SmallPowX 0x001 /* top12(0x1p-126). */
#define BigPowX 0x7ff /* top12(INFINITY). */
#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
#define BigPowY 0x43e /* top12(0x1.749p62). */
#define ThresPowY 0x080 /* BigPowY - SmallPowY. */

/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
{
  return svcmpeq (pg, svrintz_z (pg, x), x);
}

/* Check if x is real, not integer valued. */
static inline svbool_t
sv_isnotint (svbool_t pg, svfloat64_t x)
{
  return svcmpne (pg, svrintz_z (pg, x), x);
}

/* Check if x is an odd integer. */
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
  svfloat64_t y = svmul_x (pg, x, 0.5);
  return sv_isnotint (pg, y);
}
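
/* Note: sv_isodd reports "odd" for any x whose half is not an integer
   (e.g. x = 2.5), so its result is only meaningful on lanes where x is
   already known to be integral; below, lanes with negative x and
   non-integer y are overwritten with NaN anyway. */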

/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
   the bit representation of a non-zero finite floating-point value. */
static inline int
checkint (uint64_t iy)
{
  int e = iy >> 52 & 0x7ff;
  if (e < 0x3ff)
    return 0;
  if (e > 0x3ff + 52)
    return 2;
  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
    return 0;
  if (iy & (1ULL << (0x3ff + 52 - e)))
    return 1;
  return 2;
}
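
/* For instance: checkint (asuint64 (2.5)) == 0 (fractional mantissa
   bits set), checkint (asuint64 (3.0)) == 1 (the units bit of the
   mantissa is set) and checkint (asuint64 (4.0)) == 2; any value with
   e > 0x3ff + 52, i.e. |y| >= 2^53, is an even integer. */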

/* Top 12 bits (sign and exponent of each double float lane). */
static inline svuint64_t
sv_top12 (svfloat64_t x)
{
  return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
}

/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline int
zeroinfnan (uint64_t i)
{
  return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
}
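
/* The trick above works on the bit pattern: 2 * i shifts out the sign
   bit, and subtracting 1 makes 2 * i == 0 (i.e. +/-0) wrap around to
   UINT64_MAX, so a single unsigned compare catches zero, infinity and
   nan together. */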

/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
		  2 * asuint64 (INFINITY) - 1);
}

/* Handle cases that may overflow or underflow when computing the result that
   is scale*(1+TMP) without intermediate rounding. The bit representation of
   scale is in SBITS, however it has a computed exponent that may have
   overflowed into the sign bit, so that needs to be adjusted before using it
   as a double. (int32_t)KI is the k used in the argument reduction and
   exponent adjustment of scale; positive k here means the result may
   overflow and negative k means the result may underflow. */
static inline double
specialcase (double tmp, uint64_t sbits, uint64_t ki)
{
  double scale;
  if ((ki & 0x80000000) == 0)
    {
      /* k > 0, the exponent of scale might have overflowed by <= 460. */
      sbits -= 1009ull << 52;
      scale = asdouble (sbits);
      return 0x1p1009 * (scale + scale * tmp);
    }
  /* k < 0, need special care in the subnormal range. */
  sbits += 1022ull << 52;
  /* Note: sbits is signed scale. */
  scale = asdouble (sbits);
  double y = scale + scale * tmp;
  return 0x1p-1022 * y;
}
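
/* In the k > 0 branch above, subtracting 1009ull << 52 divides scale by
   2^1009 so its exponent is back in range; scale + scale * tmp is then
   computed without intermediate overflow, and the final multiply by
   0x1p1009 restores the magnitude, overflowing (if at all) only in the
   last operation. The k < 0 branch mirrors this towards the subnormal
   range with a factor of 2^1022. */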

/* Scalar fallback for special cases of SVE pow's exp. */
static inline svfloat64_t
sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
		     svfloat64_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      double sx1 = svclastb (p, 0, x1);
      uint64_t su1 = svclastb (p, 0, u1);
      uint64_t su2 = svclastb (p, 0, u2);
      double elem = specialcase (sx1, su1, su2);
      svfloat64_t y2 = sv_f64 (elem);
      y = svsel (p, y2, y);
      p = svpnext_b64 (cmp, p);
    }
  return y;
}
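
/* The loop above serialises over the active lanes of CMP: svpfirst
   selects the first active lane, svclastb extracts that lane's scalar
   values, and svpnext_b64 advances to the next active lane, so each
   special lane is patched with the scalar specialcase result while all
   other lanes keep their vector result. */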

/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
   15 additional bits of precision. IX is the bit representation of x, but
   normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
{
  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center. */
  svuint64_t tmp = svsub_x (pg, ix, Off);
  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
			  sv_u64 (N_LOG - 1));
  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
  svfloat64_t z = svreinterpret_f64 (iz);
  svfloat64_t kd = svcvt_f64_x (pg, k);

  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
  /* SVE lookup requires 3 separate lookup tables, as opposed to the scalar
     version, which uses an array of structures. We also do the lookup earlier
     in the code to make sure it finishes as early as possible. */
  svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
  svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
  svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);

  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
  svfloat64_t r = svmad_x (pg, z, invc, -1.0);
  /* k*Ln2 + log(c) + r. */
  svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
  svfloat64_t t2 = svadd_x (pg, t1, r);
  svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
  svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);

  /* Evaluation is optimized assuming superscalar pipelined execution. */
  svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
  svfloat64_t ar2 = svmul_x (pg, r, ar);
  svfloat64_t ar3 = svmul_x (pg, r, ar2);
  /* k*Ln2 + log(c) + r + A[0]*r*r. */
  svfloat64_t hi = svadd_x (pg, t2, ar2);
  svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
  svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
  /* p = log1p(r) - r - A[0]*r*r. */
  /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
     A[6])))). */
  svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
  svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
  svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
  svfloat64_t p = svmla_x (pg, a34, ar2, a56);
  p = svmla_x (pg, a12, ar2, p);
  p = svmul_x (pg, ar3, p);
  svfloat64_t lo = svadd_x (
      pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
  svfloat64_t y = svadd_x (pg, hi, lo);
  *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
  return y;
}
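
/* Scalar sketch (hypothetical helper, shown for clarity) of the fast
   two-sum on the return path above: y = hi + lo is the rounded sum and
   (hi - y) + lo recovers its rounding error, the ~15 extra bits carried
   in TAIL. Valid when |hi| dominates |lo|. */
static inline double
fast_two_sum_sketch (double hi, double lo, double *tail)
{
  double y = hi + lo;
  *tail = (hi - y) + lo;
  return y;
}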

/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
   The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
	       svuint64_t sign_bias)
{
  /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
     and other cases of large values of x (scale * (1 + TMP) oflow). */
  svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff);
  /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
  svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);

  /* The conditions special, uflow and oflow are all expressed as uoflow &&
     something, hence do not bother computing anything if no lane in uoflow
     is true. */
  svbool_t special = svpfalse_b ();
  svbool_t uflow = svpfalse_b ();
  svbool_t oflow = svpfalse_b ();
  if (unlikely (svptest_any (pg, uoflow)))
    {
      /* |x| is tiny (|x| <= 0x1p-54). */
      uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
      uflow = svand_z (pg, uoflow, uflow);
      /* |x| is huge (|x| >= 1024). */
      oflow = svcmpge (pg, abstop, HugeExp);
      oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
      /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can
	 overflow or underflow. */
      special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
    }

  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
  svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
  /* z - kd is in [-1, 1] in non-nearest rounding modes. */
  svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
  svfloat64_t kd = svadd_x (pg, z, shift);
  svuint64_t ki = svreinterpret_u64 (kd);
  kd = svsub_x (pg, kd, shift);
  svfloat64_t r = x;
  r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
  r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
  /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
  r = svadd_x (pg, r, xtail);
  /* 2^(k/N) ~= scale. */
  svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
  svuint64_t top
      = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
  /* This is only a valid scale when -1023*N < k < 1024*N. */
  svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
  sbits = svadd_x (pg, sbits, top);
  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
  svfloat64_t r2 = svmul_x (pg, r, r);
  svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
  tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
  tmp = svmla_x (pg, r, r2, tmp);
  svfloat64_t scale = svreinterpret_f64 (sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
     is no spurious underflow here even without fma. */
  z = svmla_x (pg, scale, scale, tmp);

  /* Update result with special and large cases. */
  if (unlikely (svptest_any (pg, special)))
    z = sv_call_specialcase (tmp, sbits, ki, z, special);

  /* Handle underflow and overflow. */
  svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
  svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
  svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
  svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
  res_uoflow = svreinterpret_f64 (
      svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
  z = svsel (oflow, res_uoflow, z);
  /* Avoid spurious underflow for tiny x. */
  svfloat64_t res_spurious_uflow
      = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
  z = svsel (uflow, res_spurious_uflow, z);

  return z;
}
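
/* Note on the scale construction above: after the shift trick, the low
   bits of ki hold k = round(N * x / ln2). The low V_POW_EXP_TABLE_BITS
   bits of k index the table entry for 2^(idx/N), and the remaining bits
   of k, plus the sign_bias bit, are shifted into the exponent field of
   that entry, so scale ~= +/-2^(k/N) as noted above. */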

static inline double
pow_sc (double x, double y)
{
  uint64_t ix = asuint64 (x);
  uint64_t iy = asuint64 (y);
  /* Special cases: |x| or |y| is 0, inf or nan. */
  if (unlikely (zeroinfnan (iy)))
    {
      if (2 * iy == 0)
	return issignaling_inline (x) ? x + y : 1.0;
      if (ix == asuint64 (1.0))
	return issignaling_inline (y) ? x + y : 1.0;
      if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY))
	return x + y;
      if (2 * ix == 2 * asuint64 (1.0))
	return 1.0;
      if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
	return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      double_t x2 = x * x;
      if (ix >> 63 && checkint (iy) == 1)
	x2 = -x2;
      return (iy >> 63) ? 1 / x2 : x2;
    }
  return x;
}
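
/* These branches implement the usual special-case conventions for pow:
   e.g. pow (x, +/-0) == 1 for any x (quiet nan included), pow (1, y) == 1
   for any y, pow (+/-1, +/-inf) == 1, pow (x, -inf) == 0 for |x| > 1, and
   a zero or infinite x raised to an odd-integer y keeps the sign of x. */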

svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
  /* This preamble handles the special-case conditions used in the final
     scalar fallbacks. It also updates ix and sign_bias, which are used in
     the core computation too, i.e. exp (y * log (x)). */
  svuint64_t vix0 = svreinterpret_u64 (x);
  svuint64_t viy0 = svreinterpret_u64 (y);
  svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);

  /* Negative x cases. */
  svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
  svbool_t xisneg = svcmpeq (pg, sign_bit, 1);

  /* Set sign_bias and ix depending on sign of x and nature of y. */
  svbool_t yisnotint_xisneg = svpfalse_b ();
  svuint64_t sign_bias = sv_u64 (0);
  svuint64_t vix = vix0;
  svuint64_t vtopx1 = vtopx0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y. */
      yisnotint_xisneg = sv_isnotint (xisneg, y);
      svbool_t yisint_xisneg = sv_isint (xisneg, y);
      svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
      /* ix set to abs(ix) if y is integer. */
      vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
      vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
      /* Set to SignBias if x is negative and y is odd. */
      sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
    }

  /* Special cases of x or y: zero, inf and nan. */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t special = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126. */
  svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
  svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
  if (unlikely (svptest_any (pg, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative. */
      svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);

      svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
      vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
      vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52);
      vix = svsel (topx_is_null, vix_norm, vix);
    }
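
  /* On the normalisation above: multiplying a subnormal x by 0x1p52 makes
     it normal with biased exponent e + 52, and subtracting 52ULL << 52
     from the bits then yields the representation x would have if the
     exponent range were unbounded, with the negative exponent borrowing
     into the sign bit, which is the form sv_log_inline expects. */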

  /* y_hi = log(ix, &y_lo). */
  svfloat64_t vlo;
  svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);

  /* z = exp(y_hi, y_lo, sign_bias). */
  svfloat64_t vehi = svmul_x (pg, y, vhi);
  svfloat64_t velo = svmul_x (pg, y, vlo);
  svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
  velo = svsub_x (pg, velo, vemi);
  svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);

  /* Cases of finite y and finite negative x. */
  vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);

  /* Cases of zero/inf/nan x or y. */
  if (unlikely (svptest_any (pg, special)))
    vz = sv_call2_f64 (pow_sc, x, y, vz, special);

  return vz;
}

PL_SIG (SV, D, 2, pow)
PL_TEST_ULP (SV_NAME_D2 (pow), 0.55)
/* Wide intervals spanning the whole domain but shared between x and y. */
#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n)                               \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n)                 \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, inf, 1000)
/* x~1 or y~1. */
SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* Around estimated argmaxes of ULP error. */
SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real, not integer. */
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real, not integer. */
SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y. */
SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y. */
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)