/* xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/v_math.h (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14) */
/*
 * Vector math abstractions.
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

8*412f47f9SXin Li #ifndef _V_MATH_H
9*412f47f9SXin Li #define _V_MATH_H
10*412f47f9SXin Li 
11*412f47f9SXin Li #ifndef WANT_VMATH
12*412f47f9SXin Li /* Enable the build of vector math code.  */
13*412f47f9SXin Li # define WANT_VMATH 1
14*412f47f9SXin Li #endif
15*412f47f9SXin Li 
16*412f47f9SXin Li #if WANT_VMATH
17*412f47f9SXin Li 
18*412f47f9SXin Li # if __aarch64__
19*412f47f9SXin Li #  define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
20*412f47f9SXin Li # else
21*412f47f9SXin Li #  error "Cannot build without AArch64"
22*412f47f9SXin Li # endif
23*412f47f9SXin Li 
24*412f47f9SXin Li # include <stdint.h>
25*412f47f9SXin Li # include "math_config.h"
26*412f47f9SXin Li # if __aarch64__
27*412f47f9SXin Li 
28*412f47f9SXin Li #  include <arm_neon.h>
29*412f47f9SXin Li 
/* Shorthand helpers for declaring constants: each expands to a brace
   initializer that repeats X in 2, 4 or 8 positions.  */
#  define V2(X) { X, X }
#  define V4(X) { X, X, X, X }
#  define V8(X) { X, X, X, X, X, X, X, X }

/* Return non-zero if any bit of the 64-bit vector X is set, i.e. if any of
   its four 16-bit lanes is non-zero.  The vector is reinterpreted as a
   single 64-bit lane so one scalar comparison covers all lanes.  */
static inline int
v_any_u16h (uint16x4_t x)
{
  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
}

/* Broadcast the scalar X to all four lanes of a float32x4_t.  */
static inline float32x4_t
v_f32 (float x)
{
  return (float32x4_t){ x, x, x, x };
}
/* Broadcast the scalar X to all four lanes of a uint32x4_t.  */
static inline uint32x4_t
v_u32 (uint32_t x)
{
  return (uint32x4_t){ x, x, x, x };
}
/* Broadcast the scalar X to all four lanes of an int32x4_t.  */
static inline int32x4_t
v_s32 (int32_t x)
{
  return (int32x4_t){ x, x, x, x };
}

/* True if any element of a vector compare result is non-zero.
   The four 32-bit lanes are viewed as two 64-bit lanes, which are added
   together; the sum is then tested against zero.  */
static inline int
v_any_u32 (uint32x4_t x)
{
  /* Assume elements in x are either 0 or -1u.  */
  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
/* Return non-zero if any bit of the 64-bit vector X is set, i.e. if either
   of its two 32-bit lanes is non-zero.  */
static inline int
v_any_u32h (uint32x2_t x)
{
  return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
}
/* Gather four floats from TAB: lane i of the result is tab[idx[i]].
   No bounds checking is done; every index must be valid for TAB.  */
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
  return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
}
/* Gather four 32-bit words from TAB: lane i of the result is tab[idx[i]].
   No bounds checking is done; every index must be valid for TAB.  */
static inline uint32x4_t
v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
  return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
}
/* Per-lane scalar fallback: for each lane i, the result is f (x[i]) when
   p[i] is non-zero and y[i] otherwise.  F is only called for lanes whose
   predicate is set.  */
static inline float32x4_t
v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
  return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
			p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
}
/* Two-argument per-lane scalar fallback: for each lane i, the result is
   f (x1[i], x2[i]) when p[i] is non-zero and y[i] otherwise.  F is only
   called for lanes whose predicate is set.  */
static inline float32x4_t
v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
	     float32x4_t y, uint32x4_t p)
{
  return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
			p[1] ? f (x1[1], x2[1]) : y[1],
			p[2] ? f (x1[2], x2[2]) : y[2],
			p[3] ? f (x1[3], x2[3]) : y[3] };
}
/* Clear the bits of X that are set in MASK (bitwise x & ~mask on the raw
   lane representation).  A lane whose mask is all-ones becomes +0.0f; a
   lane whose mask is zero is returned unchanged.  */
static inline float32x4_t
v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
{
  return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
}

/* Broadcast the scalar X to both lanes of a float64x2_t.  */
static inline float64x2_t
v_f64 (double x)
{
  return (float64x2_t){ x, x };
}
/* Broadcast the scalar X to both lanes of a uint64x2_t.  */
static inline uint64x2_t
v_u64 (uint64_t x)
{
  return (uint64x2_t){ x, x };
}
/* Broadcast the scalar X to both lanes of an int64x2_t.  */
static inline int64x2_t
v_s64 (int64_t x)
{
  return (int64x2_t){ x, x };
}

/* True if any element of a vector compare result is non-zero.
   The two 64-bit lanes are added together and the sum is tested against
   zero.  */
static inline int
v_any_u64 (uint64x2_t x)
{
  /* Assume elements in x are either 0 or -1u.  */
  return vpaddd_u64 (x) != 0;
}
/* True if every element of a vector compare result is set (all-ones).
   Each all-ones lane reads as -1 when viewed as signed, so the two lanes
   sum to -2 exactly when both are set.  */
static inline int
v_all_u64 (uint64x2_t x)
{
  /* Assume elements in x are either 0 or -1u.  */
  return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
}
/* Gather two doubles from TAB: lane i of the result is tab[idx[i]].
   No bounds checking is done; every index must be valid for TAB.  */
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
  return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
}
/* Gather two 64-bit words from TAB: lane i of the result is tab[idx[i]].
   No bounds checking is done; every index must be valid for TAB.  */
static inline uint64x2_t
v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
  return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
}

/* Per-lane scalar fallback: for each lane i, y[i] is replaced by f (x[i])
   when p[i] is non-zero and left unchanged otherwise; the updated Y is
   returned.  F is only called for lanes whose predicate is set.  */
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
  /* Lane 1 of the predicate and of the input is read into scalars before
     the first call to f -- presumably so they need not be extracted from
     vector registers after the call; TODO confirm.  The predicate stays an
     integer: it is only tested for non-zero, so converting it to double
     would add a conversion without changing the result.  */
  uint64_t p1 = p[1];
  double x1 = x[1];
  if (likely (p[0]))
    y[0] = f (x[0]);
  if (likely (p1))
    y[1] = f (x1);
  return y;
}

/* Two-argument per-lane scalar fallback: for each lane i, y[i] is replaced
   by f (x1[i], x2[i]) when p[i] is non-zero and left unchanged otherwise;
   the updated Y is returned.  F is only called for lanes whose predicate
   is set.  */
static inline float64x2_t
v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
	     float64x2_t y, uint64x2_t p)
{
  /* Lane 1 of the predicate and of both inputs is read into scalars before
     the first call to f -- presumably so they need not be extracted from
     vector registers after the call; TODO confirm.  The predicate stays an
     integer: it is only tested for non-zero, so converting it to double
     would add a conversion without changing the result.  */
  uint64_t p1 = p[1];
  double x1h = x1[1];
  double x2h = x2[1];
  if (likely (p[0]))
    y[0] = f (x1[0], x2[0]);
  if (likely (p1))
    y[1] = f (x1h, x2h);
  return y;
}
/* Clear the bits of X that are set in MASK (bitwise x & ~mask on the raw
   lane representation).  A lane whose mask is all-ones becomes +0.0; a
   lane whose mask is zero is returned unchanged.  */
static inline float64x2_t
v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
{
  return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
}

172*412f47f9SXin Li # endif
173*412f47f9SXin Li #endif
174*412f47f9SXin Li 
175*412f47f9SXin Li #endif
176