xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/sv_sincosf_common.h (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Core approximation for single-precision vector sincos
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2023, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "sv_math.h"
9*412f47f9SXin Li 
/* Constants shared by the single-precision SVE sincos helpers.
     poly_sin  - c0..c2 in sin(r) ~= r + r^3 * (c0 + c1 r^2 + c2 r^4).
     poly_cos  - c0..c2 in cos(r) ~= 1 - r^2/2 + r^4 * (c0 + c1 r^2 + c2 r^4).
     pio2      - pi/2 split into three terms of decreasing magnitude, which
		 are subtracted one after another during argument reduction.
     inv_pio2  - 2/pi, used to compute the quadrant index.
     shift     - 1.5 * 2^23; added and then subtracted to round to an integer.
     range_val - 2^20, the magnitude threshold tested by check_ge_rangeval.  */
static const struct sv_sincosf_data
{
  float poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
} sv_sincosf_data = {
  /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4].  */
  .poly_sin = { -0x1.555546p-3f, 0x1.11076p-7f, -0x1.994eb4p-13f },
  /* Generated using Remez, even coeffs only, in [-pi/4, pi/4].  */
  .poly_cos = { 0x1.55554ap-5f, -0x1.6c0c1ap-10f, 0x1.99e0eep-16f },
  .pio2 = { 0x1.921fb6p+0f, -0x1.777a5cp-25f, -0x1.ee59dap-50f },
  .inv_pio2 = 0x1.45f306p-1f,
  .shift = 0x1.8p23f,
  .range_val = 0x1p20f,
};
23*412f47f9SXin Li 
24*412f47f9SXin Li static inline svbool_t
check_ge_rangeval(svbool_t pg,svfloat32_t x,const struct sv_sincosf_data * d)25*412f47f9SXin Li check_ge_rangeval (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d)
26*412f47f9SXin Li {
27*412f47f9SXin Li   svbool_t in_bounds = svaclt (pg, x, d->range_val);
28*412f47f9SXin Li   return svnot_z (pg, in_bounds);
29*412f47f9SXin Li }
30*412f47f9SXin Li 
31*412f47f9SXin Li /* Single-precision vector function allowing calculation of both sin and cos in
32*412f47f9SXin Li    one function call, using shared argument reduction and separate low-order
33*412f47f9SXin Li    polynomials.
34*412f47f9SXin Li    Worst-case error for sin is 1.67 ULP:
35*412f47f9SXin Li    sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
36*412f47f9SXin Li    Worst-case error for cos is 1.81 ULP:
37*412f47f9SXin Li    sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6.  */
38*412f47f9SXin Li static inline svfloat32x2_t
sv_sincosf_inline(svbool_t pg,svfloat32_t x,const struct sv_sincosf_data * d)39*412f47f9SXin Li sv_sincosf_inline (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d)
40*412f47f9SXin Li {
41*412f47f9SXin Li   /* n = rint ( x / (pi/2) ).  */
42*412f47f9SXin Li   svfloat32_t q = svmla_x (pg, sv_f32 (d->shift), x, d->inv_pio2);
43*412f47f9SXin Li   q = svsub_x (pg, q, d->shift);
44*412f47f9SXin Li   svint32_t n = svcvt_s32_x (pg, q);
45*412f47f9SXin Li 
46*412f47f9SXin Li   /* Reduce x such that r is in [ -pi/4, pi/4 ].  */
47*412f47f9SXin Li   svfloat32_t r = x;
48*412f47f9SXin Li   r = svmls_x (pg, r, q, d->pio2[0]);
49*412f47f9SXin Li   r = svmls_x (pg, r, q, d->pio2[1]);
50*412f47f9SXin Li   r = svmls_x (pg, r, q, d->pio2[2]);
51*412f47f9SXin Li 
52*412f47f9SXin Li   /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
53*412f47f9SXin Li   svfloat32_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r, r2);
54*412f47f9SXin Li   svfloat32_t s = svmla_x (pg, sv_f32 (d->poly_sin[1]), r2, d->poly_sin[2]);
55*412f47f9SXin Li   s = svmad_x (pg, r2, s, d->poly_sin[0]);
56*412f47f9SXin Li   s = svmla_x (pg, r, r3, s);
57*412f47f9SXin Li 
58*412f47f9SXin Li   /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
59*412f47f9SXin Li   svfloat32_t r4 = svmul_x (pg, r2, r2);
60*412f47f9SXin Li   svfloat32_t p = svmla_x (pg, sv_f32 (d->poly_cos[1]), r2, d->poly_cos[2]);
61*412f47f9SXin Li   svfloat32_t c = svmad_x (pg, sv_f32 (d->poly_cos[0]), r2, -0.5);
62*412f47f9SXin Li   c = svmla_x (pg, c, r4, p);
63*412f47f9SXin Li   c = svmad_x (pg, r2, c, 1);
64*412f47f9SXin Li 
65*412f47f9SXin Li   svuint32_t un = svreinterpret_u32 (n);
66*412f47f9SXin Li   /* If odd quadrant, swap cos and sin.  */
67*412f47f9SXin Li   svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 31), 0);
68*412f47f9SXin Li   svfloat32_t ss = svsel (swap, s, c);
69*412f47f9SXin Li   svfloat32_t cc = svsel (swap, c, s);
70*412f47f9SXin Li 
71*412f47f9SXin Li   /* Fix signs according to quadrant.
72*412f47f9SXin Li      ss = asfloat(asuint(ss) ^ ((n       & 2) << 30))
73*412f47f9SXin Li      cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)).  */
74*412f47f9SXin Li   svuint32_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 30);
75*412f47f9SXin Li   svuint32_t cos_sign = svlsl_x (
76*412f47f9SXin Li       pg, svand_x (pg, svreinterpret_u32 (svadd_x (pg, n, 1)), 2), 30);
77*412f47f9SXin Li   ss = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ss), sin_sign));
78*412f47f9SXin Li   cc = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (cc), cos_sign));
79*412f47f9SXin Li 
80*412f47f9SXin Li   return svcreate2 (ss, cc);
81*412f47f9SXin Li }
82