xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/sv_powf_2u6.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Single-precision SVE powf function.
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2023-2024, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "sv_math.h"
9*412f47f9SXin Li #include "pl_sig.h"
10*412f47f9SXin Li #include "pl_test.h"
11*412f47f9SXin Li 
/* The following data is used in the SVE pow core computation
   and special case detection.  */
#define Tinvc __v_powf_data.invc
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
/* Bias added to the exp2 index so the final scale's sign flips when
   x < 0 and y is an odd integer (lands in the float sign bit after the
   shift in sv_powf_core_ext).  */
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
/* Round-to-int trick: adding then subtracting 0x1.8p52 rounds a double
   to an integer while exposing the low bits via reinterpretation.  */
#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000.  */
20*412f47f9SXin Li 
/* Overall ULP error bound for pow is 2.6 ulp
   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
static const struct data
{
  double log_poly[4];
  double exp_poly[3];
  float uflow_bound, oflow_bound, small_bound;
  uint32_t sign_bias, sign_mask, subnormal_bias, off;
} data = {
  /* rel err: 1.5 * 2^-30. Each coefficient is pre-multiplied by the value
     of V_POWF_EXP2_N.  */
  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
		-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
  /* rel err: 1.69 * 2^-34.  */
  .exp_poly = {
    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
    0x1.62e42ff0c52d6p-6,  /* A2 / V_POWF_EXP2_N.  */
  },
  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
  .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N.  */
  .small_bound = 0x1p-126f, /* Smallest normal float.  */
  .off = 0x3f35d000,
  .sign_bias = SignBias,
  .sign_mask = 0x80000000,
  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
};
48*412f47f9SXin Li 
/* Broadcast the i-th log2 / exp2 polynomial coefficient to all
   double-precision lanes.  */
#define A(i) sv_f64 (d->log_poly[i])
#define C(i) sv_f64 (d->exp_poly[i])
51*412f47f9SXin Li 
52*412f47f9SXin Li /* Check if x is an integer.  */
53*412f47f9SXin Li static inline svbool_t
svisint(svbool_t pg,svfloat32_t x)54*412f47f9SXin Li svisint (svbool_t pg, svfloat32_t x)
55*412f47f9SXin Li {
56*412f47f9SXin Li   return svcmpeq (pg, svrintz_z (pg, x), x);
57*412f47f9SXin Li }
58*412f47f9SXin Li 
59*412f47f9SXin Li /* Check if x is real not integer valued.  */
60*412f47f9SXin Li static inline svbool_t
svisnotint(svbool_t pg,svfloat32_t x)61*412f47f9SXin Li svisnotint (svbool_t pg, svfloat32_t x)
62*412f47f9SXin Li {
63*412f47f9SXin Li   return svcmpne (pg, svrintz_z (pg, x), x);
64*412f47f9SXin Li }
65*412f47f9SXin Li 
66*412f47f9SXin Li /* Check if x is an odd integer.  */
67*412f47f9SXin Li static inline svbool_t
svisodd(svbool_t pg,svfloat32_t x)68*412f47f9SXin Li svisodd (svbool_t pg, svfloat32_t x)
69*412f47f9SXin Li {
70*412f47f9SXin Li   svfloat32_t y = svmul_x (pg, x, 0.5f);
71*412f47f9SXin Li   return svisnotint (pg, y);
72*412f47f9SXin Li }
73*412f47f9SXin Li 
74*412f47f9SXin Li /* Check if zero, inf or nan.  */
75*412f47f9SXin Li static inline svbool_t
sv_zeroinfnan(svbool_t pg,svuint32_t i)76*412f47f9SXin Li sv_zeroinfnan (svbool_t pg, svuint32_t i)
77*412f47f9SXin Li {
78*412f47f9SXin Li   return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
79*412f47f9SXin Li 		  2u * 0x7f800000 - 1);
80*412f47f9SXin Li }
81*412f47f9SXin Li 
/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
   the bit representation of a non-zero finite floating-point value.  */
static inline int
checkint (uint32_t iy)
{
  /* Biased exponent field.  */
  int e = (iy >> 23) & 0xff;
  if (e < 0x7f)
    /* |y| < 1: cannot be a (non-zero) integer.  */
    return 0;
  if (e > 0x7f + 23)
    /* |y| >= 2^24: every representable value is an even integer.  */
    return 2;
  /* Bits below the binary point must all be zero for an integer.  */
  uint32_t frac_mask = (1u << (0x7f + 23 - e)) - 1;
  if (iy & frac_mask)
    return 0;
  /* The units bit (frac_mask + 1) decides odd vs even.  */
  return (iy & (frac_mask + 1)) ? 1 : 2;
}
98*412f47f9SXin Li 
/* Check if zero, inf or nan.  */
static inline int
zeroinfnan (uint32_t ix)
{
  /* Doubling discards the sign; subtracting 1 wraps +-0 to UINT32_MAX, so 0,
     inf and nan are exactly the values at or above 2 * 0x7f800000 - 1.  */
  uint32_t ia2 = 2 * ix;
  return ia2 - 1 >= 2u * 0x7f800000 - 1;
}
105*412f47f9SXin Li 
/* A scalar subroutine used to fix main power special cases. Similar to the
   preamble of scalar powf except that we do not update ix and sign_bias. This
   is done in the preamble of the SVE powf.  */
static inline float
powf_specialcase (float x, float y, float z)
{
  uint32_t ix = asuint (x);
  uint32_t iy = asuint (y);
  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
  if (unlikely (zeroinfnan (iy)))
    {
      /* x^+-0 = 1, except a signaling nan x must still raise invalid.  */
      if (2 * iy == 0)
	return issignalingf_inline (x) ? x + y : 1.0f;
      /* 1^y = 1 for any y, except a signaling nan y must raise invalid.  */
      if (ix == 0x3f800000)
	return issignalingf_inline (y) ? x + y : 1.0f;
      /* Either operand is nan (and x != 1, y != 0): propagate it.  */
      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
	return x + y;
      /* (-1)^+-inf = 1.  */
      if (2 * ix == 2 * 0x3f800000)
	return 1.0f;
      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
	return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
      /* Remaining cases (|x|<1, y==-inf or |x|>1, y==inf): y*y gives +inf.  */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      /* x is 0 or inf: x*x yields the correctly signed 0 or inf magnitude.  */
      float_t x2 = x * x;
      /* Negative x raised to an odd integer power keeps the sign.  */
      if (ix & 0x80000000 && checkint (iy) == 1)
	x2 = -x2;
      /* Negative y inverts the result (0 -> inf with divide-by-zero,
	 inf -> 0).  */
      return iy & 0x80000000 ? 1 / x2 : x2;
    }
  /* We need a return here in case x<0 and y is integer, but all other tests
   need to be run.  */
  return z;
}
140*412f47f9SXin Li 
/* Scalar fallback for special case routines with custom signature.
   For every lane selected by cmp, replace the corresponding lane of y with
   powf_specialcase (x1, x2, y) evaluated on that lane's scalars.  */
static inline svfloat32_t
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
{
  /* p selects exactly one active lane of cmp at a time, starting with the
     first.  */
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      /* Extract the selected lane from each input vector.  */
      float sx1 = svclastb (p, 0, x1);
      float sx2 = svclastb (p, 0, x2);
      float elem = svclastb (p, 0, y);
      elem = powf_specialcase (sx1, sx2, elem);
      /* Insert the fixed-up scalar back into the selected lane of y.  */
      svfloat32_t y2 = sv_f32 (elem);
      y = svsel (p, y2, y);
      /* Advance p to the next active lane of cmp.  */
      p = svpnext_b32 (cmp, p);
    }
  return y;
}
158*412f47f9SXin Li 
/* Compute core for half of the lanes in double precision.
   Returns pow(x, y) ~= 2^(y * log2(x)) for lanes selected by pg, and stores
   y * log2(x) (scaled by V_POWF_EXP2_N) through pylogx so the caller can
   detect exp2 underflow/overflow.  i indexes the log table, z is the
   normalized significand, k the exponent and sign_bias the odd-power sign
   adjustment.  */
static inline svfloat64_t
sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
		  svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
		  const struct data *d)
{
  /* Table lookups for the log2 approximation: 1/c and log2(c).  */
  svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
  svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);

  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k.  */
  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
  svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));

  /* Polynomial to approximate log1p(r)/ln2 (Horner form).  */
  svfloat64_t logx = A (0);
  logx = svmla_x (pg, A (1), r, logx);
  logx = svmla_x (pg, A (2), r, logx);
  logx = svmla_x (pg, A (3), r, logx);
  logx = svmla_x (pg, y0, r, logx);
  /* Scaled y * log2(x), also reported back to the caller.  */
  *pylogx = svmul_x (pg, y, logx);

  /* Round to int via the Shift trick; ki holds the rounded value's bits.
     z - kd is in [-1, 1] in non-nearest rounding modes.  */
  svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
  svuint64_t ki = svreinterpret_u64 (kd);
  kd = svsub_x (pg, kd, Shift);

  /* Reduced exp2 argument.  */
  r = svsub_x (pg, *pylogx, kd);

  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
  svuint64_t t
      = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
  /* Fold exponent and sign bias into the scale s via its exponent field.  */
  svuint64_t ski = svadd_x (pg, ki, sign_bias);
  t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
  svfloat64_t s = svreinterpret_f64 (t);

  svfloat64_t p = C (0);
  p = svmla_x (pg, C (1), p, r);
  p = svmla_x (pg, C (2), p, r);
  /* s * (p * r + 1) computed as p * (s * r) + s.  */
  p = svmla_x (pg, s, p, svmul_x (pg, s, r));

  return p;
}
201*412f47f9SXin Li 
/* Widen vector to double precision and compute core on both halves of the
   vector. Lower cost of promotion by considering all lanes active.  */
static inline svfloat32_t
sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
	      svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
	      const struct data *d)
{
  const svbool_t ptrue = svptrue_b64 ();

  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
     order to perform core computation in double precision.  */
  const svbool_t pg_lo = svunpklo (pg);
  const svbool_t pg_hi = svunpkhi (pg);
  /* Floats are unpacked via their bit patterns (svunpklo/hi operate on
     integers), then converted to double.  */
  svfloat64_t y_lo = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
  svfloat64_t y_hi = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
  svfloat32_t z = svreinterpret_f32 (iz);
  svfloat64_t z_lo = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
  svfloat64_t z_hi = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
  svuint64_t i_lo = svunpklo (i);
  svuint64_t i_hi = svunpkhi (i);
  svint64_t k_lo = svunpklo (k);
  svint64_t k_hi = svunpkhi (k);
  svuint64_t sign_bias_lo = svunpklo (sign_bias);
  svuint64_t sign_bias_hi = svunpkhi (sign_bias);

  /* Compute each part in double precision.  */
  svfloat64_t ylogx_lo, ylogx_hi;
  svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
				     sign_bias_lo, &ylogx_lo, d);
  svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
				     sign_bias_hi, &ylogx_hi, d);

  /* Convert back to single-precision and interleave.  */
  svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
  svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
  *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
  svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
  svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
  return svuzp1 (lo_32, hi_32);
}
246*412f47f9SXin Li 
/* Implementation of SVE powf.
   Provides the same accuracy as AdvSIMD powf, since it relies on the same
   algorithm. The theoretical maximum error is under 2.60 ULPs.
   Maximum measured error is 2.56 ULPs:
   SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
						   want 0x1.fd4b06p+127.  */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  svuint32_t vix0 = svreinterpret_u32 (x);
  svuint32_t viy0 = svreinterpret_u32 (y);

  /* Negative x cases.  */
  svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
  svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);

  /* Set sign_bias and ix depending on sign of x and nature of y.  */
  svbool_t yisnotint_xisneg = svpfalse_b ();
  svuint32_t sign_bias = sv_u32 (0);
  svuint32_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y, restricted to lanes where x is negative.  */
      yisnotint_xisneg = svisnotint (xisneg, y);
      svbool_t yisint_xisneg = svisint (xisneg, y);
      svbool_t yisodd_xisneg = svisodd (xisneg, y);
      /* ix set to abs(ix) if y is integer.  */
      vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
      /* Set to SignBias if x is negative and y is odd.  */
      sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
    }

  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t cmp = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126.  */
  svbool_t xsmall = svaclt (pg, x, d->small_bound);
  if (unlikely (svptest_any (pg, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative.  */
      svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
      vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
      vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
      vix = svsel (xsmall, vix_norm, vix);
    }
  /* Part of core computation carried in working precision: split the bit
     pattern into log-table index i, significand iz and exponent k.  */
  svuint32_t tmp = svsub_x (pg, vix, d->off);
  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
			  V_POWF_LOG2_N - 1);
  svuint32_t top = svand_x (pg, tmp, 0xff800000);
  svuint32_t iz = svsub_x (pg, vix, top);
  svint32_t k
      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));

  /* Compute core in extended precision and return intermediate ylogx results
     to handle cases of underflow and overflow in exp.  */
  svfloat32_t ylogx;
  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);

  /* Handle exp special cases of underflow and overflow.  */
  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
  svfloat32_t ret_oflow
      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);

  /* Cases of finite y and finite negative x.  */
  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);

  /* Fix up any lane with a zero/inf/nan operand via the scalar fallback.  */
  if (unlikely (svptest_any (pg, cmp)))
    return sv_call_powf_sc (x, y, ret, cmp);

  return ret;
}
325*412f47f9SXin Li 
/* Test-harness registration: signature, ULP threshold and test intervals.  */
PL_SIG (SV, F, 2, pow)
PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
/* Wide intervals spanning the whole domain but shared between x and y.
   Each invocation expands to four intervals covering all sign
   combinations of x and y.  */
#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                               \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                  \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)                \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)                \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
/* x~1 or y~1.  */
SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* around estimated argmaxs of ULP error.  */
SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer.  */
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer.  */
SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y.  */
SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y.  */
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
361