#pragma once
#include <cstdint>
#include <cmath> // for M_PI (pi_2 / vd_pi_2 constants below)
#include <c10/macros/Macros.h>
#include <ATen/cpu/vec/intrinsics.h>

#if defined(__clang__)
typedef __vector __bool char vbool8;
typedef __vector __bool short vbool16;
typedef __vector __bool int vbool32;
typedef __vector __bool long long vbool64;
using vint8    = __attribute__((vector_size(16))) signed char;
using vint16   = __attribute__((vector_size(16))) signed short;
using vint32   = __attribute__((vector_size(16))) signed int;
using vint64   = __attribute__((vector_size(16))) signed long long;
using vuint8   = __attribute__((vector_size(16))) unsigned char;
using vuint16  = __attribute__((vector_size(16))) unsigned short;
using vuint32  = __attribute__((vector_size(16))) unsigned int;
using vuint64  = __attribute__((vector_size(16))) unsigned long long;
using vfloat32 = __attribute__((vector_size(16))) float;
using vfloat64 = __attribute__((vector_size(16))) double;
#else
using vbool8   = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char;
using vbool16  = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short;
using vbool32  = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int;
using vbool64  = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) long long;
using vint8    = __attribute__((altivec(vector__))) signed char;
using vint16   = __attribute__((altivec(vector__))) signed short;
using vint32   = __attribute__((altivec(vector__))) signed int;
using vint64   = __attribute__((altivec(vector__))) signed long long;
using vuint8   = __attribute__((altivec(vector__))) unsigned char;
using vuint16  = __attribute__((altivec(vector__))) unsigned short;
using vuint32  = __attribute__((altivec(vector__))) unsigned int;
using vuint64  = __attribute__((altivec(vector__))) unsigned long long;
using vfloat32 = __attribute__((altivec(vector__))) float;
using vfloat64 = __attribute__((altivec(vector__))) double;
#endif
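
// Each alias above names a 16-byte VSX register type: vfloat32 carries four
// float lanes, vfloat64 two double lanes, and the vboolN types are the
// corresponding lane-wide comparison masks.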

#if !defined(vec_float)
C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) {
  vfloat32 vec_out;
  __asm__("xvcvsxwsp %x0,%x1" : "=wf"(vec_out) : "wa"(vec_in));
  return vec_out;
}
#endif

#if !defined(vec_signed)
C10_ALWAYS_INLINE vint32 vec_signed(const vfloat32& vec_in) {
  vint32 vec_out;
  __asm__("xvcvspsxws %x0,%x1" : "=wa"(vec_out) : "wf"(vec_in));
  return vec_out;
}

C10_ALWAYS_INLINE vint64 vec_signed(const vfloat64& vec_in) {
  vint64 vec_out;
  __asm__("xvcvdpsxds %x0,%x1" : "=wa"(vec_out) : "wd"(vec_in));
  return vec_out;
}
#endif
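
// Illustrative example: these fallbacks convert between int32 and float32
// lanes with the VSX convert instructions, e.g.
//   vint32 vi = vec_splats(3);    // {3, 3, 3, 3}
//   vfloat32 vf = vec_float(vi);  // {3.f, 3.f, 3.f, 3.f}
//   vint32 vb = vec_signed(vf);   // {3, 3, 3, 3}; conversion truncates toward zero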

#if !defined(vec_neg)
C10_ALWAYS_INLINE vfloat32 vec_neg(const vfloat32& vec_in) {
  vfloat32 vec_out;
  __asm__("xvnegsp %x0,%x1" : "=wf"(vec_out) : "wf"(vec_in));
  return vec_out;
}

C10_ALWAYS_INLINE vfloat64 vec_neg(const vfloat64& vec_in) {
  vfloat64 vec_out;
  __asm__("xvnegdp %x0,%x1" : "=wd"(vec_out) : "wd"(vec_in));
  return vec_out;
}

C10_ALWAYS_INLINE vint16 vec_neg(const vint16& vec_in) {
  vint16 vint0 = {0, 0, 0, 0, 0, 0, 0, 0};
  return vec_vsubuhm(vint0, vec_in);
}

C10_ALWAYS_INLINE vint32 vec_neg(const vint32& vec_in) {
  vint32 vint0 = {0, 0, 0, 0};
  return vec_vsubuwm(vint0, vec_in);
}

C10_ALWAYS_INLINE vint64 vec_neg(const vint64& vec_in) {
  return -vec_in;
}
#endif
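
// Note: the integer overloads negate by computing 0 - x with the modulo
// subtract intrinsics, so an INT_MIN lane wraps back to INT_MIN, matching
// scalar two's-complement negation.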

#if !defined(vec_sldw)
template <unsigned int C>
C10_ALWAYS_INLINE vfloat32
vec_sldw_aux(const vfloat32& vec_in0, const vfloat32& vec_in1) {
  vfloat32 vec_out;
  __asm("xxsldwi %x0, %x1, %x2, %3"
        : "=wa"(vec_out)
        : "wa"(vec_in0), "wa"(vec_in1), "I"(C));
  return vec_out;
}

#define vec_sldw(a, b, c) vec_sldw_aux<c>(a, b)
#endif
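
// Illustrative sketch (assuming big-endian word ordering within the
// register): xxsldwi concatenates its two inputs and extracts four
// consecutive 32-bit words starting at word offset c, so with
//   vfloat32 a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3};
//   vec_sldw(a, b, 1) == {a1, a2, a3, b0}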

#define vec_not(a) vec_nor(a, a)
#if defined(__clang__) && !defined(vec_splats)
C10_ALWAYS_INLINE vint64 vec_splats(const int64_t& a) {
  return vec_splats(a);
}
#endif
// Vectorized min/max which return `a` if either operand is NaN
template <class T>
C10_ALWAYS_INLINE T vec_min_nan(const T& a, const T& b) {
  return vec_min(a, b);
}
template <class T>
C10_ALWAYS_INLINE T vec_max_nan(const T& a, const T& b) {
  return vec_max(a, b);
}

// Specializations for float/double taken from Eigen
template <>
C10_ALWAYS_INLINE vfloat32 vec_min_nan<vfloat32>(const vfloat32& a, const vfloat32& b) {
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  vfloat32 ret;
  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}

template <>
C10_ALWAYS_INLINE vfloat32 vec_max_nan<vfloat32>(const vfloat32& a, const vfloat32& b) {
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  vfloat32 ret;
  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}

template <>
C10_ALWAYS_INLINE vfloat64 vec_min_nan<vfloat64>(const vfloat64& a, const vfloat64& b) {
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  vfloat64 ret;
  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}

template <>
C10_ALWAYS_INLINE vfloat64 vec_max_nan<vfloat64>(const vfloat64& a, const vfloat64& b) {
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  vfloat64 ret;
  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}
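
// Illustrative example: with a = {1.f, 4.f, NaN, 2.f} and
// b = {2.f, 3.f, 5.f, NaN}, vec_min_nan(a, b) == {1.f, 3.f, NaN, 2.f};
// whenever a lane comparison involves NaN it fails, and `a` is selected,
// matching std::min.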

// Vectorized min/max which return NaN if either operand is NaN
#define C10_VSX_VEC_NAN_PROPAG(name, type, btype, func)       \
  C10_ALWAYS_INLINE type name(const type& a, const type& b) { \
    type tmp = func(a, b);                                    \
    btype nan_a = vec_cmpne(a, a);                            \
    btype nan_b = vec_cmpne(b, b);                            \
    tmp = vec_sel(tmp, a, nan_a);                             \
    return vec_sel(tmp, b, nan_b);                            \
  }

C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat32, vbool32, vec_min)
C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat32, vbool32, vec_max)
C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat64, vbool64, vec_min)
C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max)

#undef C10_VSX_VEC_NAN_PROPAG
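
// Unlike vec_min_nan above, the *_nan2 variants propagate NaN from either
// operand: with the same a and b as in the previous example,
// vec_min_nan2(a, b) == {1.f, 3.f, NaN, NaN}.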

#define DEFINE_MEMBER_UNARY_OP(op, op_type, func)         \
  Vectorized<op_type> C10_ALWAYS_INLINE op() const {      \
    return Vectorized<op_type>{func(_vec0), func(_vec1)}; \
  }

#define DEFINE_MEMBER_OP(op, op_type, func)                                          \
  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) const { \
    return Vectorized<op_type>{                                                      \
        func(_vec0, other._vec0), func(_vec1, other._vec1)};                         \
  }

#define DEFINE_MEMBER_BITWISE_OP(op, op_type, func)                                  \
  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) const { \
    return Vectorized<op_type>{                                                      \
        func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)};                     \
  }

#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func)                       \
  Vectorized<op_type> C10_ALWAYS_INLINE op(                               \
      const Vectorized<op_type>& b, const Vectorized<op_type>& c) const { \
    return Vectorized<op_type>{                                           \
        func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)};    \
  }

#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op)                  \
  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& b) const { \
    Vectorized<op_type>::vec_internal_type ret_0;                                \
    Vectorized<op_type>::vec_internal_type ret_1;                                \
    for (int i = 0; i < Vectorized<op_type>::size() / 2; i++) {                  \
      ret_0[i] = _vec0[i] binary_op b._vec0[i];                                  \
      ret_1[i] = _vec1[i] binary_op b._vec1[i];                                  \
    }                                                                            \
    return Vectorized<op_type>{ret_0, ret_1};                                    \
  }

#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func)                                  \
  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) const { \
    using vvtype = Vectorized<op_type>::vec_internal_type;                           \
    const vvtype v_one = vec_splats(static_cast<op_type>(1.0));                      \
    vvtype ret0 = (vvtype)func(_vec0, other._vec0);                                  \
    vvtype ret1 = (vvtype)func(_vec1, other._vec1);                                  \
    return Vectorized<op_type>{vec_and(ret0, v_one), vec_and(ret1, v_one)};          \
  }
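
// Usage sketch (hypothetical instantiation): inside a Vectorized<T>
// specialization holding two native vectors _vec0/_vec1, these macros stamp
// out forwarding members, e.g.
//   DEFINE_MEMBER_OP(operator+, float, vec_add)
//   DEFINE_MEMBER_OP_AND_ONE(operator==, float, vec_cmpeq)
// The *_AND_ONE variant ANDs the all-ones compare mask with 1.0 splatted
// into every lane, so comparisons yield 0.0/1.0 like the scalar path.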

#define DEFINE_CLAMP_FUNCS(operand_type)                                        \
  template <>                                                                   \
  Vectorized<operand_type> C10_ALWAYS_INLINE clamp(                             \
      const Vectorized<operand_type>& a,                                        \
      const Vectorized<operand_type>& min,                                      \
      const Vectorized<operand_type>& max) {                                    \
    return Vectorized<operand_type>{                                            \
        vec_min_nan(vec_max_nan(a.vec0(), min.vec0()), max.vec0()),             \
        vec_min_nan(vec_max_nan(a.vec1(), min.vec1()), max.vec1())};            \
  }                                                                             \
  template <>                                                                   \
  Vectorized<operand_type> C10_ALWAYS_INLINE clamp_min(                         \
      const Vectorized<operand_type>& a, const Vectorized<operand_type>& min) { \
    return Vectorized<operand_type>{                                            \
        vec_max_nan(a.vec0(), min.vec0()),                                      \
        vec_max_nan(a.vec1(), min.vec1())};                                     \
  }                                                                             \
  template <>                                                                   \
  Vectorized<operand_type> C10_ALWAYS_INLINE clamp_max(                         \
      const Vectorized<operand_type>& a, const Vectorized<operand_type>& max) { \
    return Vectorized<operand_type>{                                            \
        vec_min_nan(a.vec0(), max.vec0()),                                      \
        vec_min_nan(a.vec1(), max.vec1())};                                     \
  }
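
// Sketch of the intended expansion (the actual instantiations live in the
// per-type VSX headers): DEFINE_CLAMP_FUNCS(float) specializes
// clamp/clamp_min/clamp_max for Vectorized<float>; since they are built on
// vec_min_nan/vec_max_nan, a NaN lane in `a` stays NaN after clamping.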

#define DEFINE_REINTERPRET_CAST_FUNCS(                                 \
    first_type, cast_type, cast_inner_vector_type)                     \
  template <>                                                          \
  C10_ALWAYS_INLINE Vectorized<cast_type> cast<cast_type, first_type>( \
      const Vectorized<first_type>& src) {                             \
    return Vectorized<cast_type>{(cast_inner_vector_type)src.vec0(),   \
                                 (cast_inner_vector_type)src.vec1()};  \
  }

#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type)      \
  DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64) \
  DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32)  \
  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64)  \
  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32)  \
  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16)
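
// For example (sketch), DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float) would
// specialize cast<double, float>, cast<int64_t, float>, and so on. The
// C-style vector casts reinterpret the 128-bit value bit-for-bit; no lane
// values are converted.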

// Used in blend emulation: maps a lane mask to one of nine copy/blend cases,
// so blend() can take a cheap path when a whole half comes from one operand.
constexpr int blendChoice(uint32_t mask, uint32_t half1 = 0xF, uint32_t half2 = 0xF0) {
  uint32_t none = 0;
  uint32_t both = half1 | half2;
  // clamp it between 0 and both
  mask = mask & both;
  // return (a._vec0, a._vec1)
  if (mask == none) return 0;
  // return (b._vec0, b._vec1)
  else if (mask == both)
    return 1;
  // return (b._vec0, a._vec1)
  else if (mask == half1)
    return 2;
  // return (a._vec0, b._vec1)
  else if (mask == half2)
    return 3;
  // return (*_vec0, a._vec1)
  else if (mask > 0 && mask < half1)
    return 4;
  // return (*_vec0, b._vec1)
  else if ((mask & half2) == half2)
    return 5;
  // return (a._vec0, *_vec1)
  else if ((mask & half1) == 0 && mask > half1)
    return 6;
  // return (b._vec0, *_vec1)
  else if ((mask & half1) == half1 && mask > half1)
    return 7;
  // return (*_vec0, *_vec1)
  return 8;
}
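
// Worked example for Vectorized<float> (two 4-lane halves, half1 = 0xF,
// half2 = 0xF0): blendChoice(0x0F) == 2, i.e. take b._vec0 and keep a._vec1;
// blendChoice(0xF1) == 5, i.e. take b._vec1 and blend within _vec0.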

// Same mapping for Vectorized<double>, whose halves hold two lanes each.
constexpr int blendChoiceDbl(uint32_t mask) {
  // clamp it between 0 and 0xF
  return blendChoice(mask, 0x3, 0xC);
}

constexpr vbool32 VsxMask1(uint32_t mask) {
  uint32_t g0 = (mask & 1) * 0xffffffff;
  uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
  uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
  uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
  return (vbool32){g0, g1, g2, g3};
}

constexpr vbool32 VsxMask2(uint32_t mask) {
  uint32_t mask2 = (mask & 0xFF) >> 4;
  return VsxMask1(mask2);
}
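
// Example: VsxMask1(0b0101) == {0xffffffff, 0, 0xffffffff, 0}, and VsxMask2
// reads the next four mask bits, so VsxMask2(0x50) == VsxMask1(0x5).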

constexpr vbool64 VsxDblMask1(uint32_t mask) {
  uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
  uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
  return (vbool64){g0, g1};
}

constexpr vbool64 VsxDblMask2(uint32_t mask) {
  uint32_t mask2 = (mask & 0xF) >> 2;
  return VsxDblMask1(mask2);
}

constexpr int maskForComplex(uint32_t mask) {
  mask = mask & 0xF;
  int complex_mask = 0;
  if (mask & 1) complex_mask |= 3;
  if (mask & 2) complex_mask |= (3 << 2);
  if (mask & 4) complex_mask |= (3 << 4);
  if (mask & 8) complex_mask |= (3 << 6);
  return complex_mask;
}

constexpr int maskForComplexDbl(uint32_t mask) {
  mask = mask & 0x3;
  int complex_mask = 0;
  if (mask & 1) complex_mask |= 3;
  if (mask & 2) complex_mask |= (3 << 2);
  return complex_mask;
}
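
// Each complex lane spans two scalar lanes (real, imag), so every mask bit is
// widened to a pair: maskForComplex(0b01) == 0b0011 and
// maskForComplex(0b10) == 0b1100.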

constexpr int blendChoiceComplex(uint32_t mask) {
  return blendChoice(maskForComplex(mask));
}

constexpr int blendChoiceComplexDbl(uint32_t mask) {
  return blendChoiceDbl(maskForComplexDbl(mask));
}

constexpr vbool32 VsxComplexMask1(uint32_t mask) {
  return VsxMask1(maskForComplex(mask));
}

constexpr vbool32 VsxComplexMask2(uint32_t mask) {
  uint32_t mask2 = (mask & 0xF) >> 2;
  return VsxMask1(maskForComplex(mask2));
}

constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { return VsxDblMask1(mask); }

constexpr vbool64 VsxComplexDblMask2(uint32_t mask) {
  uint32_t mask2 = (mask & 0xF) >> 2;
  return VsxDblMask1(mask2);
}

// Constants
namespace at {
namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

constexpr int offset0 = 0;
constexpr int offset16 = 16;

const vuint8 mask_zero_bits = vuint8{128, 128, 128, 128, 128, 128, 128, 128,
                                     128, 128, 128, 128, 96,  64,  32,  0};
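
// swap_mask below is a vec_perm byte-permute control that swaps each pair of
// adjacent 32-bit lanes, e.g. to exchange the real and imaginary parts of
// packed complex<float> values.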
const vuint8 swap_mask =
    vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};

const vint32 v0x7f = vec_splats(0x7f);
const vint32 vi_0 = vec_splats((int)0);
const vint32 vi_1 = vec_splats((int)1);
const vint32 vi_2 = vec_splats((int)2);
const vint32 vi_4 = vec_splats((int)4);
const vint32 vi_inv1 = vec_splats((int)~1);
const vuint32 vu_29 = vec_splats(29u);
const vuint32 vu_23 = vec_splats(23u);

const vbool32 inv_mant_mask = (vbool32)vec_splats((unsigned int)~0xff800000);
const vbool32 sign_mask = (vbool32)vec_splats((int)0x80000000);
const vbool32 real_mask = vbool32{0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0};
const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF};
const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000};
const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0};

const vbool64 vd_sign_mask  = vbool64{0x8000000000000000, 0x8000000000000000};
const vbool64 vd_imag_mask  = vbool64{0x0, 0xFFFFFFFFFFFFFFFF};
const vbool64 vd_real_mask  = vbool64{0xFFFFFFFFFFFFFFFF, 0x0};
const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000};
const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0};

const vfloat32 zero = vec_splats(0.f);
const vfloat32 half = vec_splats(0.5f);
const vfloat32 one = vec_splats(1.f);
const vfloat32 two = vec_splats(2.0f);
const vfloat32 _4div_pi = vec_splats(1.27323954473516f);
// IEEE-754 bit patterns reinterpreted (not value-converted) into float lanes
const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u);
const vfloat32 v_minus_inf = (vfloat32)vec_splats(0xff800000u);
const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff);
const vfloat32 log10e_inv = vec_splats(0.43429448190325176f);
const vfloat32 log2e_inv = vec_splats(1.4426950408889634f);
const vfloat32 log2eB_inv = vec_splats(1.442695036924675f);
const vfloat32 cephes_SQRTHF = vec_splats(0.707106781186547524f);
const vfloat32 coscof_p0 = vec_splats(2.443315711809948E-005f);
const vfloat32 coscof_p1 = vec_splats(-1.388731625493765E-003f);
const vfloat32 coscof_p2 = vec_splats(4.166664568298827E-002f);
const vfloat32 exp_hi = vec_splats(104.f);
const vfloat32 exp_lo = vec_splats(-104.f);
const vfloat32 exp_p0 = vec_splats(0.000198527617612853646278381f);
const vfloat32 exp_p1 = vec_splats(0.00139304355252534151077271f);
const vfloat32 exp_p2 = vec_splats(0.00833336077630519866943359f);
const vfloat32 exp_p3 = vec_splats(0.0416664853692054748535156f);
const vfloat32 exp_p4 = vec_splats(0.166666671633720397949219f);
const vfloat32 exp_p5 = vec_splats(0.5f);
const vfloat32 log_p0 = vec_splats(7.0376836292E-2f);
const vfloat32 log_p1 = vec_splats(-1.1514610310E-1f);
const vfloat32 log_p2 = vec_splats(1.1676998740E-1f);
const vfloat32 log_p3 = vec_splats(-1.2420140846E-1f);
const vfloat32 log_p4 = vec_splats(+1.4249322787E-1f);
const vfloat32 log_p5 = vec_splats(-1.6668057665E-1f);
const vfloat32 log_p6 = vec_splats(+2.0000714765E-1f);
const vfloat32 log_p7 = vec_splats(-2.4999993993E-1f);
const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f);
const vfloat32 log_q1 = vec_splats(-2.12194440e-4f);
const vfloat32 log_q2 = vec_splats(0.693359375f);
const vfloat32 max_logf = vec_splats(88.02969187150841f);
const vfloat32 max_numf = vec_splats(1.7014117331926442990585209174225846272e38f);
const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u);
const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x00800000u);
const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f);
const vfloat32 minus_cephes_dp2 = vec_splats(-2.4187564849853515625e-4f);
const vfloat32 minus_cephes_dp3 = vec_splats(-3.77489497744594108e-8f);
const vfloat32 negln2f_hi = vec_splats(-0.693145751953125f);
const vfloat32 negln2f_lo = vec_splats(-1.428606765330187045e-06f);
const vfloat32 p0 = vec_splats(2.03721912945E-4f);
const vfloat32 p1 = vec_splats(8.33028376239E-3f);
const vfloat32 p2 = vec_splats(1.66667160211E-1f);
const vfloat32 sincof_p0 = vec_splats(-1.9515295891E-4f);
const vfloat32 sincof_p1 = vec_splats(8.3321608736E-3f);
const vfloat32 sincof_p2 = vec_splats(-1.6666654611E-1f);
const vfloat32 tanh_0p625 = vec_splats(0.625f);
const vfloat32 tanh_half_max = vec_splats(44.014845935754205f);
const vfloat32 tanh_p0 = vec_splats(-5.70498872745E-3f);
const vfloat32 tanh_p1 = vec_splats(2.06390887954E-2f);
const vfloat32 tanh_p2 = vec_splats(-5.37397155531E-2f);
const vfloat32 tanh_p3 = vec_splats(1.33314422036E-1f);
const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f);
const vfloat32 vcheck = vec_splats((float)(1LL << 24));
const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f};
const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f};
const vfloat32 sqrt2_2 = vfloat32{0.70710676908493042f, 0.70710676908493042f,
                                  0.70710676908493042f, 0.70710676908493042f};
const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0};
const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f};
const vfloat64 vd_one = vec_splats(1.0);
const vfloat64 vd_zero = vec_splats(0.0);
const vfloat64 vd_log10e_inv = vec_splats(0.43429448190325176);
const vfloat64 vd_log2e_inv = vec_splats(1.4426950408889634);
const vfloat64 vd_imag_one = vfloat64{0.0, 1.0};
const vfloat64 vd_imag_half = vfloat64{0.0, 0.5};
const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757};
const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0};

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at