xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/sv_powif.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
/*
 * Single-precision SVE powi(x, n) function.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "sv_math.h"
9*412f47f9SXin Li 
10*412f47f9SXin Li /* Optimized single-precision vector powi (float base, integer power).
11*412f47f9SXin Li    powi is developed for environments in which accuracy is of much less
12*412f47f9SXin Li    importance than performance, hence we provide no estimate for worst-case
13*412f47f9SXin Li    error.  */
14*412f47f9SXin Li svfloat32_t
_ZGVsMxvv_powi(svfloat32_t as,svint32_t ns,svbool_t p)15*412f47f9SXin Li _ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p)
16*412f47f9SXin Li {
17*412f47f9SXin Li   /* Compute powi by successive squaring, right to left.  */
18*412f47f9SXin Li   svfloat32_t acc = sv_f32 (1.f);
19*412f47f9SXin Li   svbool_t want_recip = svcmplt (p, ns, 0);
20*412f47f9SXin Li   svuint32_t ns_abs = svreinterpret_u32 (svabs_x (p, ns));
21*412f47f9SXin Li 
22*412f47f9SXin Li   /* We use a max to avoid needing to check whether any lane != 0 on each
23*412f47f9SXin Li      iteration.  */
24*412f47f9SXin Li   uint32_t max_n = svmaxv (p, ns_abs);
25*412f47f9SXin Li 
26*412f47f9SXin Li   svfloat32_t c = as;
27*412f47f9SXin Li   /* Successively square c, and use merging predication (_m) to determine
28*412f47f9SXin Li      whether or not to perform the multiplication or keep the previous
29*412f47f9SXin Li      iteration.  */
30*412f47f9SXin Li   while (true)
31*412f47f9SXin Li     {
32*412f47f9SXin Li       svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1), 1);
33*412f47f9SXin Li       acc = svmul_m (px, acc, c);
34*412f47f9SXin Li       max_n >>= 1;
35*412f47f9SXin Li       if (max_n == 0)
36*412f47f9SXin Li 	break;
37*412f47f9SXin Li 
38*412f47f9SXin Li       ns_abs = svlsr_x (p, ns_abs, 1);
39*412f47f9SXin Li       c = svmul_x (p, c, c);
40*412f47f9SXin Li     }
41*412f47f9SXin Li 
42*412f47f9SXin Li   /* Negative powers are handled by computing the abs(n) version and then
43*412f47f9SXin Li      taking the reciprocal.  */
44*412f47f9SXin Li   if (svptest_any (want_recip, want_recip))
45*412f47f9SXin Li     acc = svdivr_m (want_recip, acc, 1.0f);
46*412f47f9SXin Li 
47*412f47f9SXin Li   return acc;
48*412f47f9SXin Li }
49