/*
 * Core approximation for single-precision vector sincos
 *
 * Copyright (c) 2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
7*412f47f9SXin Li
8*412f47f9SXin Li #include "v_math.h"
9*412f47f9SXin Li
10*412f47f9SXin Li const static struct v_sincosf_data
11*412f47f9SXin Li {
12*412f47f9SXin Li float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
13*412f47f9SXin Li } v_sincosf_data = {
14*412f47f9SXin Li .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */
15*412f47f9SXin Li V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) },
16*412f47f9SXin Li .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */
17*412f47f9SXin Li V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) },
18*412f47f9SXin Li .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) },
19*412f47f9SXin Li .inv_pio2 = V4 (0x1.45f306p-1f),
20*412f47f9SXin Li .shift = V4 (0x1.8p23),
21*412f47f9SXin Li .range_val = V4 (0x1p20),
22*412f47f9SXin Li };
23*412f47f9SXin Li
24*412f47f9SXin Li static inline uint32x4_t
check_ge_rangeval(float32x4_t x,const struct v_sincosf_data * d)25*412f47f9SXin Li check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d)
26*412f47f9SXin Li {
27*412f47f9SXin Li return vcagtq_f32 (x, d->range_val);
28*412f47f9SXin Li }
29*412f47f9SXin Li
30*412f47f9SXin Li /* Single-precision vector function allowing calculation of both sin and cos in
31*412f47f9SXin Li one function call, using shared argument reduction and separate low-order
32*412f47f9SXin Li polynomials.
33*412f47f9SXin Li Worst-case error for sin is 1.67 ULP:
34*412f47f9SXin Li v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
35*412f47f9SXin Li Worst-case error for cos is 1.81 ULP:
36*412f47f9SXin Li v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
37*412f47f9SXin Li static inline float32x4x2_t
v_sincosf_inline(float32x4_t x,const struct v_sincosf_data * d)38*412f47f9SXin Li v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d)
39*412f47f9SXin Li {
40*412f47f9SXin Li /* n = rint ( x / (pi/2) ). */
41*412f47f9SXin Li float32x4_t shift = d->shift;
42*412f47f9SXin Li float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2);
43*412f47f9SXin Li q = vsubq_f32 (q, shift);
44*412f47f9SXin Li int32x4_t n = vcvtq_s32_f32 (q);
45*412f47f9SXin Li
46*412f47f9SXin Li /* Reduce x such that r is in [ -pi/4, pi/4 ]. */
47*412f47f9SXin Li float32x4_t r = x;
48*412f47f9SXin Li r = vfmsq_f32 (r, q, d->pio2[0]);
49*412f47f9SXin Li r = vfmsq_f32 (r, q, d->pio2[1]);
50*412f47f9SXin Li r = vfmsq_f32 (r, q, d->pio2[2]);
51*412f47f9SXin Li
52*412f47f9SXin Li /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
53*412f47f9SXin Li float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2);
54*412f47f9SXin Li float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]);
55*412f47f9SXin Li s = vfmaq_f32 (d->poly_sin[0], r2, s);
56*412f47f9SXin Li s = vfmaq_f32 (r, r3, s);
57*412f47f9SXin Li
58*412f47f9SXin Li /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
59*412f47f9SXin Li float32x4_t r4 = vmulq_f32 (r2, r2);
60*412f47f9SXin Li float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]);
61*412f47f9SXin Li float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]);
62*412f47f9SXin Li c = vfmaq_f32 (c, r4, p);
63*412f47f9SXin Li c = vfmaq_f32 (v_f32 (1), c, r2);
64*412f47f9SXin Li
65*412f47f9SXin Li /* If odd quadrant, swap cos and sin. */
66*412f47f9SXin Li uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1));
67*412f47f9SXin Li float32x4_t ss = vbslq_f32 (swap, c, s);
68*412f47f9SXin Li float32x4_t cc = vbslq_f32 (swap, s, c);
69*412f47f9SXin Li
70*412f47f9SXin Li /* Fix signs according to quadrant.
71*412f47f9SXin Li ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
72*412f47f9SXin Li cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)). */
73*412f47f9SXin Li uint32x4_t sin_sign
74*412f47f9SXin Li = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30);
75*412f47f9SXin Li uint32x4_t cos_sign = vshlq_n_u32 (
76*412f47f9SXin Li vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)),
77*412f47f9SXin Li 30);
78*412f47f9SXin Li ss = vreinterpretq_f32_u32 (
79*412f47f9SXin Li veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign));
80*412f47f9SXin Li cc = vreinterpretq_f32_u32 (
81*412f47f9SXin Li veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign));
82*412f47f9SXin Li
83*412f47f9SXin Li return (float32x4x2_t){ ss, cc };
84*412f47f9SXin Li }
85