/*
 * Double-precision SVE pow(x, y) function.
 *
 * Copyright (c) 2022-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"

/* This version shares a similar algorithm with the AOR scalar pow.

   The core computation consists of computing pow(x, y) as

     exp (y * log (x)).

   The algorithms for exp and log are very similar to the scalar exp and log.
   The log relies on table lookup for 3 variables and an order-8 polynomial.
   It returns a high and a low contribution that are then passed to exp,
   to minimise the loss of accuracy in both routines.
   The exp is based on 8-bit table lookup for scale and an order-4 polynomial.
   The SVE algorithm drops the tail in the exp computation at the price of
   a lower accuracy, slightly above 1 ULP.
   The SVE algorithm also drops the special treatment of small (< 2^-65) and
   large (> 2^63) finite values of |y|, as they only affect
   non-round-to-nearest modes.

   Maximum measured error is 1.04 ULP:
   SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
     got 0x1.f7116284221fcp-1
    want 0x1.f7116284221fdp-1. */
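
/* A minimal scalar sketch (illustrative only; the helper name is
   hypothetical) of the hi/lo scheme described above: log(x) is produced
   as loghi + loglo, and the product y * log(x) is handed to exp as
   ehi + elo, where the rounding error of y * loghi is recovered exactly
   with a fused multiply-add. */
static inline double
sv_pow_hilo_sketch (double y, double loghi, double loglo, double *elo)
{
  double ehi = y * loghi;
  /* emi == ehi - y * loghi, the negated rounding error of the product. */
  double emi = __builtin_fma (-y, loghi, ehi);
  *elo = y * loglo - emi;
  return ehi;
}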

/* Data is defined in v_pow_log_data.c. */
#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
#define A __v_pow_log_data.poly
#define Off 0x3fe6955500000000

/* Data is defined in v_pow_exp_data.c. */
#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
#define C __v_pow_exp_data.poly
#define SmallExp 0x3c9 /* top12(0x1p-54). */
#define BigExp 0x408 /* top12(512.). */
#define ThresExp 0x03f /* BigExp - SmallExp. */
#define HugeExp 0x409 /* top12(1024.). */
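
/* Scalar analogue (an illustrative sketch; the helper name is
   hypothetical) of the top12 screening used below: the top 12 bits of a
   double hold its sign and biased exponent, so a single unsigned
   subtract and compare tests small <= abstop < small + thres, i.e. it
   classifies |x| against two powers of two at once. With small ==
   SmallExp and thres == ThresExp this is the complement of the uoflow
   test in sv_exp_inline. */
static inline int
top12_between_sketch (uint64_t ix, uint64_t small, uint64_t thres)
{
  uint64_t abstop = (ix >> 52) & 0x7ff;
  /* Wraps around when abstop < small, so the compare then fails too. */
  return abstop - small < thres;
}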

/* Constants associated with pow. */
#define SmallPowX 0x001 /* top12(0x1p-126). */
#define BigPowX 0x7ff /* top12(INFINITY). */
#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
#define BigPowY 0x43e /* top12(0x1.749p62). */
#define ThresPowY 0x080 /* BigPowY - SmallPowY. */

/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
{
  return svcmpeq (pg, svrintz_z (pg, x), x);
}

/* Check if x is real, not integer valued. */
static inline svbool_t
sv_isnotint (svbool_t pg, svfloat64_t x)
{
  return svcmpne (pg, svrintz_z (pg, x), x);
}

/* Check if x is an odd integer. */
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
  svfloat64_t y = svmul_x (pg, x, 0.5);
  return sv_isnotint (pg, y);
}
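
/* Note: sv_isodd reports "odd" for any x whose half is not an integer
   (e.g. x = 2.5), so its result is only meaningful on lanes where x is
   already known to be integral; below, lanes with negative x and
   non-integer y are overwritten with NaN anyway. */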

/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
   the bit representation of a non-zero finite floating-point value. */
static inline int
checkint (uint64_t iy)
{
  int e = iy >> 52 & 0x7ff;
  if (e < 0x3ff)
    return 0;
  if (e > 0x3ff + 52)
    return 2;
  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
    return 0;
  if (iy & (1ULL << (0x3ff + 52 - e)))
    return 1;
  return 2;
}
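
/* For instance: checkint (asuint64 (2.5)) == 0 (fractional mantissa
   bits set), checkint (asuint64 (3.0)) == 1 (the units bit of the
   mantissa is set) and checkint (asuint64 (4.0)) == 2; any value with
   e > 0x3ff + 52, i.e. |y| >= 2^53, is an even integer. */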

/* Top 12 bits (sign and exponent of each double float lane). */
static inline svuint64_t
sv_top12 (svfloat64_t x)
{
  return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
}

/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline int
zeroinfnan (uint64_t i)
{
  return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
}
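
/* The trick above works on the bit pattern: 2 * i shifts out the sign
   bit, and subtracting 1 makes 2 * i == 0 (i.e. +/-0) wrap around to
   UINT64_MAX, so a single unsigned compare catches zero, infinity and
   nan together. */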

/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
		  2 * asuint64 (INFINITY) - 1);
}

/* Handle cases that may overflow or underflow when computing the result that
   is scale*(1+TMP) without intermediate rounding. The bit representation of
   scale is in SBITS, however it has a computed exponent that may have
   overflowed into the sign bit, so that needs to be adjusted before using it
   as a double. (int32_t)KI is the k used in the argument reduction and
   exponent adjustment of scale; positive k here means the result may
   overflow and negative k means the result may underflow. */
static inline double
specialcase (double tmp, uint64_t sbits, uint64_t ki)
{
  double scale;
  if ((ki & 0x80000000) == 0)
    {
      /* k > 0, the exponent of scale might have overflowed by <= 460. */
      sbits -= 1009ull << 52;
      scale = asdouble (sbits);
      return 0x1p1009 * (scale + scale * tmp);
    }
  /* k < 0, need special care in the subnormal range. */
  sbits += 1022ull << 52;
  /* Note: sbits is signed scale. */
  scale = asdouble (sbits);
  double y = scale + scale * tmp;
  return 0x1p-1022 * y;
}
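
/* In the k > 0 branch above, subtracting 1009ull << 52 divides scale by
   2^1009 so its exponent is back in range; scale + scale * tmp is then
   computed without intermediate overflow, and the final multiply by
   0x1p1009 restores the magnitude, overflowing (if at all) only in the
   last operation. The k < 0 branch mirrors this towards the subnormal
   range with a factor of 2^1022. */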

/* Scalar fallback for special cases of SVE pow's exp. */
static inline svfloat64_t
sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
		     svfloat64_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      double sx1 = svclastb (p, 0, x1);
      uint64_t su1 = svclastb (p, 0, u1);
      uint64_t su2 = svclastb (p, 0, u2);
      double elem = specialcase (sx1, su1, su2);
      svfloat64_t y2 = sv_f64 (elem);
      y = svsel (p, y2, y);
      p = svpnext_b64 (cmp, p);
    }
  return y;
}
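
/* The loop above serialises over the active lanes of CMP: svpfirst
   selects the first active lane, svclastb extracts that lane's scalar
   values, and svpnext_b64 advances to the next active lane, so each
   special lane is patched with the scalar specialcase result while all
   other lanes keep their vector result. */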

/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
   15 additional bits of precision. IX is the bit representation of x, but
   normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
{
  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center. */
  svuint64_t tmp = svsub_x (pg, ix, Off);
  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
			  sv_u64 (N_LOG - 1));
  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
  svfloat64_t z = svreinterpret_f64 (iz);
  svfloat64_t kd = svcvt_f64_x (pg, k);

  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
  /* SVE lookup requires 3 separate lookup tables, as opposed to the scalar
     version, which uses an array of structures. We also do the lookup earlier
     in the code to make sure it finishes as early as possible. */
  svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
  svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
  svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);

  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
  svfloat64_t r = svmad_x (pg, z, invc, -1.0);
  /* k*Ln2 + log(c) + r. */
  svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
  svfloat64_t t2 = svadd_x (pg, t1, r);
  svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
  svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);

  /* Evaluation is optimized assuming superscalar pipelined execution. */
  svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
  svfloat64_t ar2 = svmul_x (pg, r, ar);
  svfloat64_t ar3 = svmul_x (pg, r, ar2);
  /* k*Ln2 + log(c) + r + A[0]*r*r. */
  svfloat64_t hi = svadd_x (pg, t2, ar2);
  svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
  svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
  /* p = log1p(r) - r - A[0]*r*r. */
  /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
     A[6])))). */
  svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
  svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
  svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
  svfloat64_t p = svmla_x (pg, a34, ar2, a56);
  p = svmla_x (pg, a12, ar2, p);
  p = svmul_x (pg, ar3, p);
  svfloat64_t lo = svadd_x (
      pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
  svfloat64_t y = svadd_x (pg, hi, lo);
  *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
  return y;
}
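
/* Scalar sketch (hypothetical helper, shown for clarity) of the fast
   two-sum on the return path above: y = hi + lo is the rounded sum and
   (hi - y) + lo recovers its rounding error, the ~15 extra bits carried
   in TAIL. Valid when |hi| dominates |lo|. */
static inline double
fast_two_sum_sketch (double hi, double lo, double *tail)
{
  double y = hi + lo;
  *tail = (hi - y) + lo;
  return y;
}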

/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
   The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
	       svuint64_t sign_bias)
{
  /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
     and other cases of large values of x (scale * (1 + TMP) oflow). */
  svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff);
  /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
  svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);

  /* The conditions special, uflow and oflow are all expressed as uoflow &&
     something, hence do not bother computing anything if no lane in uoflow
     is true. */
  svbool_t special = svpfalse_b ();
  svbool_t uflow = svpfalse_b ();
  svbool_t oflow = svpfalse_b ();
  if (unlikely (svptest_any (pg, uoflow)))
    {
      /* |x| is tiny (|x| <= 0x1p-54). */
      uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
      uflow = svand_z (pg, uoflow, uflow);
      /* |x| is huge (|x| >= 1024). */
      oflow = svcmpge (pg, abstop, HugeExp);
      oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
      /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can
	 overflow or underflow. */
      special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
    }

  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
  svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
  /* z - kd is in [-1, 1] in non-nearest rounding modes. */
  svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
  svfloat64_t kd = svadd_x (pg, z, shift);
  svuint64_t ki = svreinterpret_u64 (kd);
  kd = svsub_x (pg, kd, shift);
  svfloat64_t r = x;
  r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
  r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
  /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
  r = svadd_x (pg, r, xtail);
  /* 2^(k/N) ~= scale. */
  svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
  svuint64_t top
      = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
  /* This is only a valid scale when -1023*N < k < 1024*N. */
  svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
  sbits = svadd_x (pg, sbits, top);
  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
  svfloat64_t r2 = svmul_x (pg, r, r);
  svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
  tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
  tmp = svmla_x (pg, r, r2, tmp);
  svfloat64_t scale = svreinterpret_f64 (sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
     is no spurious underflow here even without fma. */
  z = svmla_x (pg, scale, scale, tmp);

  /* Update result with special and large cases. */
  if (unlikely (svptest_any (pg, special)))
    z = sv_call_specialcase (tmp, sbits, ki, z, special);

  /* Handle underflow and overflow. */
  svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
  svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
  svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
  svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
  res_uoflow = svreinterpret_f64 (
      svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
  z = svsel (oflow, res_uoflow, z);
  /* Avoid spurious underflow for tiny x. */
  svfloat64_t res_spurious_uflow
      = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
  z = svsel (uflow, res_spurious_uflow, z);

  return z;
}
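
/* Note on the scale construction above: after the shift trick, the low
   bits of ki hold k = round(N * x / ln2). The low V_POW_EXP_TABLE_BITS
   bits of k index the table entry for 2^(idx/N), and the remaining bits
   of k, plus the sign_bias bit, are shifted into the exponent field of
   that entry, so scale ~= +/-2^(k/N) as noted above. */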

static inline double
pow_sc (double x, double y)
{
  uint64_t ix = asuint64 (x);
  uint64_t iy = asuint64 (y);
  /* Special cases: |x| or |y| is 0, inf or nan. */
  if (unlikely (zeroinfnan (iy)))
    {
      if (2 * iy == 0)
	return issignaling_inline (x) ? x + y : 1.0;
      if (ix == asuint64 (1.0))
	return issignaling_inline (y) ? x + y : 1.0;
      if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY))
	return x + y;
      if (2 * ix == 2 * asuint64 (1.0))
	return 1.0;
      if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
	return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      double_t x2 = x * x;
      if (ix >> 63 && checkint (iy) == 1)
	x2 = -x2;
      return (iy >> 63) ? 1 / x2 : x2;
    }
  return x;
}
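
/* These branches implement the usual special-case conventions for pow:
   e.g. pow (x, +/-0) == 1 for any x (quiet nan included), pow (1, y) == 1
   for any y, pow (+/-1, +/-inf) == 1, pow (x, -inf) == 0 for |x| > 1, and
   a zero or infinite x raised to an odd-integer y keeps the sign of x. */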

svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
  /* This preamble handles the special-case conditions used in the final
     scalar fallbacks. It also updates ix and sign_bias, which are used in
     the core computation too, i.e. exp (y * log (x)). */
  svuint64_t vix0 = svreinterpret_u64 (x);
  svuint64_t viy0 = svreinterpret_u64 (y);
  svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);

  /* Negative x cases. */
  svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
  svbool_t xisneg = svcmpeq (pg, sign_bit, 1);

  /* Set sign_bias and ix depending on sign of x and nature of y. */
  svbool_t yisnotint_xisneg = svpfalse_b ();
  svuint64_t sign_bias = sv_u64 (0);
  svuint64_t vix = vix0;
  svuint64_t vtopx1 = vtopx0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y. */
      yisnotint_xisneg = sv_isnotint (xisneg, y);
      svbool_t yisint_xisneg = sv_isint (xisneg, y);
      svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
      /* ix set to abs(ix) if y is integer. */
      vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
      vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
      /* Set to SignBias if x is negative and y is odd. */
      sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
    }

  /* Special cases of x or y: zero, inf and nan. */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t special = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126. */
  svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
  svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
  if (unlikely (svptest_any (pg, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative. */
      svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);

      svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
      vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
      vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52);
      vix = svsel (topx_is_null, vix_norm, vix);
    }
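
  /* On the normalisation above: multiplying a subnormal x by 0x1p52 makes
     it normal with biased exponent e + 52, and subtracting 52ULL << 52
     from the bits then yields the representation x would have if the
     exponent range were unbounded, with the negative exponent borrowing
     into the sign bit, which is the form sv_log_inline expects. */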

  /* y_hi = log(ix, &y_lo). */
  svfloat64_t vlo;
  svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);

  /* z = exp(y_hi, y_lo, sign_bias). */
  svfloat64_t vehi = svmul_x (pg, y, vhi);
  svfloat64_t velo = svmul_x (pg, y, vlo);
  svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
  velo = svsub_x (pg, velo, vemi);
  svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);

  /* Cases of finite y and finite negative x. */
  vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);

  /* Cases of zero/inf/nan x or y. */
  if (unlikely (svptest_any (pg, special)))
    vz = sv_call2_f64 (pow_sc, x, y, vz, special);

  return vz;
}

PL_SIG (SV, D, 2, pow)
PL_TEST_ULP (SV_NAME_D2 (pow), 0.55)
/* Wide intervals spanning the whole domain but shared between x and y. */
#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n)                               \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n)                 \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, inf, 1000)
/* x~1 or y~1. */
SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* Around estimated argmaxes of ULP error. */
SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real, not integer. */
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real, not integer. */
SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y. */
SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y. */
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)