xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/v_sincos_common.h (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Core approximation for double-precision vector sincos
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2023, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "v_math.h"
9*412f47f9SXin Li #include "poly_advsimd_f64.h"
10*412f47f9SXin Li 
11*412f47f9SXin Li static const struct v_sincos_data
12*412f47f9SXin Li {
13*412f47f9SXin Li   float64x2_t sin_poly[7], cos_poly[6], pio2[3];
14*412f47f9SXin Li   float64x2_t inv_pio2, shift, range_val;
15*412f47f9SXin Li } v_sincos_data = {
16*412f47f9SXin Li   .inv_pio2 = V2 (0x1.45f306dc9c882p-1),
17*412f47f9SXin Li   .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26),
18*412f47f9SXin Li 	    V2 (0x1.1a62633145c07p-54) },
19*412f47f9SXin Li   .shift = V2 (0x1.8p52),
20*412f47f9SXin Li   .sin_poly = { /* Computed using Remez in [-pi/2, pi/2].  */
21*412f47f9SXin Li 	        V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
22*412f47f9SXin Li 		V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
23*412f47f9SXin Li 		V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
24*412f47f9SXin Li 		V2 (-0x1.9e9540300a1p-41) },
25*412f47f9SXin Li   .cos_poly = { /* Computed using Remez in [-pi/4, pi/4].  */
26*412f47f9SXin Li 	        V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10),
27*412f47f9SXin Li 		V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22),
28*412f47f9SXin Li 		V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) },
29*412f47f9SXin Li   .range_val = V2 (0x1p23), };
30*412f47f9SXin Li 
31*412f47f9SXin Li static inline uint64x2_t
check_ge_rangeval(float64x2_t x,const struct v_sincos_data * d)32*412f47f9SXin Li check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d)
33*412f47f9SXin Li {
34*412f47f9SXin Li   return vcagtq_f64 (x, d->range_val);
35*412f47f9SXin Li }
36*412f47f9SXin Li 
37*412f47f9SXin Li /* Double-precision vector function allowing calculation of both sin and cos in
38*412f47f9SXin Li    one function call, using shared argument reduction and separate polynomials.
39*412f47f9SXin Li    Largest observed error is for sin, 3.22 ULP:
40*412f47f9SXin Li    v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
41*412f47f9SXin Li 				       want -0x1.ffe9537d5dbb4p-3.  */
42*412f47f9SXin Li static inline float64x2x2_t
v_sincos_inline(float64x2_t x,const struct v_sincos_data * d)43*412f47f9SXin Li v_sincos_inline (float64x2_t x, const struct v_sincos_data *d)
44*412f47f9SXin Li {
45*412f47f9SXin Li   /* q = nearest integer to 2 * x / pi.  */
46*412f47f9SXin Li   float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift);
47*412f47f9SXin Li   int64x2_t n = vcvtq_s64_f64 (q);
48*412f47f9SXin Li 
49*412f47f9SXin Li   /* Use q to reduce x to r in [-pi/4, pi/4], by:
50*412f47f9SXin Li      r = x - q * pi/2, in extended precision.  */
51*412f47f9SXin Li   float64x2_t r = x;
52*412f47f9SXin Li   r = vfmsq_f64 (r, q, d->pio2[0]);
53*412f47f9SXin Li   r = vfmsq_f64 (r, q, d->pio2[1]);
54*412f47f9SXin Li   r = vfmsq_f64 (r, q, d->pio2[2]);
55*412f47f9SXin Li 
56*412f47f9SXin Li   float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
57*412f47f9SXin Li 
58*412f47f9SXin Li   /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
59*412f47f9SXin Li   float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly);
60*412f47f9SXin Li   s = vfmaq_f64 (r, r3, s);
61*412f47f9SXin Li 
62*412f47f9SXin Li   /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
63*412f47f9SXin Li   float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
64*412f47f9SXin Li   c = vfmaq_f64 (v_f64 (-0.5), r2, c);
65*412f47f9SXin Li   c = vfmaq_f64 (v_f64 (1), r2, c);
66*412f47f9SXin Li 
67*412f47f9SXin Li   /* If odd quadrant, swap cos and sin.  */
68*412f47f9SXin Li   uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
69*412f47f9SXin Li   float64x2_t ss = vbslq_f64 (swap, c, s);
70*412f47f9SXin Li   float64x2_t cc = vbslq_f64 (swap, s, c);
71*412f47f9SXin Li 
72*412f47f9SXin Li   /* Fix signs according to quadrant.
73*412f47f9SXin Li      ss = asdouble(asuint64(ss) ^ ((n       & 2) << 62))
74*412f47f9SXin Li      cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)).  */
75*412f47f9SXin Li   uint64x2_t sin_sign
76*412f47f9SXin Li       = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
77*412f47f9SXin Li   uint64x2_t cos_sign = vshlq_n_u64 (
78*412f47f9SXin Li       vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
79*412f47f9SXin Li       62);
80*412f47f9SXin Li   ss = vreinterpretq_f64_u64 (
81*412f47f9SXin Li       veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
82*412f47f9SXin Li   cc = vreinterpretq_f64_u64 (
83*412f47f9SXin Li       veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
84*412f47f9SXin Li 
85*412f47f9SXin Li   return (float64x2x2_t){ ss, cc };
86*412f47f9SXin Li }
87