xref: /aosp_15_r20/external/pytorch/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #pragma once
2 
3 #include <ATen/cpu/vec/intrinsics.h>
4 #include <ATen/cpu/vec/vec_base.h>
5 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
6 #include <c10/util/irange.h>
7 
8 #include <sleef.h>
9 
10 namespace at {
11 namespace vec {
12 
13 inline namespace CPU_CAPABILITY {
14 
15 
16 template <>
17 class Vectorized<double> {
18  private:
19   union {
20     struct {
21       vfloat64 _vec0;
22       vfloat64 _vec1;
23     };
24     struct {
25       vbool64 _vecb0;
26       vbool64 _vecb1;
27     };
28 
29   } __attribute__((__may_alias__));
30 
31  public:
32   using value_type = double;
33   using vec_internal_type = vfloat64;
34   using vec_internal_mask_type = vbool64;
35   using size_type = int;
size()36   static constexpr size_type size() {
37     return 4;
38   }
Vectorized()39   Vectorized() {}
Vectorized(vfloat64 v)40   C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {}
Vectorized(vbool64 vmask)41   C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
Vectorized(vfloat64 v1,vfloat64 v2)42   C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {}
Vectorized(vbool64 v1,vbool64 v2)43   C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {}
Vectorized(double scalar)44   C10_ALWAYS_INLINE Vectorized(double scalar)
45       : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
Vectorized(double scalar1,double scalar2,double scalar3,double scalar4)46   C10_ALWAYS_INLINE Vectorized(
47       double scalar1,
48       double scalar2,
49       double scalar3,
50       double scalar4)
51       : _vec0{vfloat64{scalar1, scalar2}}, _vec1{vfloat64{scalar3, scalar4}} {}
vec0()52   C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
53     return _vec0;
54   }
vec1()55   C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
56     return _vec1;
57   }
58 
zero_mask()59   int zero_mask() const {
60     auto cmp = (*this == vd_zero);
61     return (cmp._vecb0[0] & 1) | (cmp._vecb0[1] & 2) | (cmp._vecb1[0] & 4) |
62         (cmp._vecb1[1] & 8);
63   }
64 
65   template <int64_t mask>
66   static std::enable_if_t<blendChoiceDbl(mask) == 0, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)67       blend(const Vectorized<double>& a, const Vectorized<double>& b) {
68       return a;
69   }
70 
71   template <int64_t mask>
72   static std::enable_if_t<blendChoiceDbl(mask) == 1, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)73       blend(const Vectorized<double>& a, const Vectorized<double>& b) {
74       return b;
75   }
76 
77   template <int64_t mask>
78   static std::enable_if_t<blendChoiceDbl(mask) == 2, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)79       blend(const Vectorized<double>& a, const Vectorized<double>& b) {
80       return { b._vec0, a._vec1 };
81   }
82 
83   template <int64_t mask>
84   static std::enable_if_t<blendChoiceDbl(mask) == 3, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)85       blend(const Vectorized<double>& a, const Vectorized<double>& b) {
86       return { a._vec0, b._vec1 };
87   }
88 
89 
90   template <int64_t mask>
91   static std::enable_if_t<blendChoiceDbl(mask) == 4, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)92       blend(const Vectorized<double>& a, const Vectorized<double>& b) {
93       const vbool64 mask_1st = VsxDblMask1(mask);
94       return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1 };
95   }
96 
97   template <int64_t mask>
98   static std::enable_if_t<blendChoiceDbl(mask) == 5, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)99       blend(const Vectorized<double>& a, const Vectorized<double>& b) {
100       const vbool64 mask_1st = VsxDblMask1(mask);
101       return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1 };
102   }
103 
104 
105   template <int64_t mask>
106   static std::enable_if_t<blendChoiceDbl(mask) == 6,
107       Vectorized<double>>
blend(const Vectorized<double> & a,const Vectorized<double> & b)108       C10_ALWAYS_INLINE blend(const Vectorized<double>& a, const Vectorized<double>& b) {
109       const vbool64 mask_2nd = VsxDblMask2(mask);
110       // generated masks
111       return { a._vec0,
112           (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
113   }
114 
115   template <int64_t mask>
116   static std::enable_if_t<blendChoiceDbl(mask) == 7,
117       Vectorized<double>>
blend(const Vectorized<double> & a,const Vectorized<double> & b)118       C10_ALWAYS_INLINE blend(const Vectorized<double>& a, const Vectorized<double>& b) {
119       const vbool64 mask_2nd = VsxDblMask2(mask);
120       // generated masks
121       return { b._vec0,
122           (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
123   }
124 
125   template <int64_t mask>
126   static std::enable_if_t<blendChoiceDbl(mask) == 8, Vectorized<double>>
blend(const Vectorized<double> & a,const Vectorized<double> & b)127       C10_ALWAYS_INLINE blend(const Vectorized<double>& a, const Vectorized<double>& b) {
128       const vbool64 mask_1st = VsxDblMask1(mask);
129       const vbool64 mask_2nd = VsxDblMask2(mask);
130       return {
131           (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st),
132           (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
133   }
134 
135 
blendv(const Vectorized<double> & a,const Vectorized<double> & b,const Vectorized<double> & mask)136   static Vectorized<double> C10_ALWAYS_INLINE blendv(
137       const Vectorized<double>& a,
138       const Vectorized<double>& b,
139       const Vectorized<double>& mask) {
140     // the mask used here returned by comparision of vec256
141 
142     return {
143         vec_sel(a._vec0, b._vec0, mask._vecb0),
144         vec_sel(a._vec1, b._vec1, mask._vecb1)};
145   }
146   template <typename step_t>
147   static Vectorized<double> arange(double base = 0., step_t step = static_cast<step_t>(1)) {
148     return Vectorized<double>(base, base + step, base + 2 * step, base + 3 * step);
149   }
150 
151   static Vectorized<double> C10_ALWAYS_INLINE
152   set(const Vectorized<double>& a, const Vectorized<double>& b, size_t count = size()) {
153     switch (count) {
154       case 0:
155         return a;
156       case 1:
157         return blend<1>(a, b);
158       case 2:
159         return blend<3>(a, b);
160       case 3:
161         return blend<7>(a, b);
162     }
163 
164     return b;
165   }
166   static Vectorized<value_type> C10_ALWAYS_INLINE
167   loadu(const void* ptr, int count = size()) {
168     if (count == size()) {
169       return {
170           vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
171           vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
172     }
173 
174     __at_align__ value_type tmp_values[size()] = {};
175     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
176 
177     return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
178   }
179   void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
180     if (count == size()) {
181       vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
182       vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
183     } else if (count > 0) {
184       __at_align__ value_type tmp_values[size()];
185       vec_vsx_st(_vec0, offset0, tmp_values);
186       vec_vsx_st(_vec1, offset16, tmp_values);
187       std::memcpy(
188           ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
189     }
190   }
191   const double& operator[](int idx) const = delete;
192   double& operator[](int idx) = delete;
map(double (* const f)(double))193   Vectorized<double> map(double (*const f)(double)) const {
194     Vectorized<double> ret;
195     for (const auto i : c10::irange(size()/2)) {
196         ret._vec0[i] = f(_vec0[i]);
197     }
198     for (const auto i : c10::irange(size()/2)) {
199         ret._vec1[i] = f(_vec1[i]);
200     }
201     return ret;
202   }
203 
mapbi(double (* const f)(double,double),const Vectorized<double> & other)204   Vectorized<double> mapbi(double (*const f)(double, double), const Vectorized<double>& other)
205       const {
206     Vectorized<double> ret;
207     for (const auto i : c10::irange(size()/2)) {
208         ret._vec0[i] = f(_vec0[i], other._vec0[i]);
209     }
210     for (const auto i : c10::irange(size()/2)) {
211         ret._vec1[i] = f(_vec1[i], other._vec1[i]);
212     }
213     return ret;
214   }
abs()215   Vectorized<double> C10_ALWAYS_INLINE abs() const {
216     return {vec_abs(_vec0), vec_abs(_vec1)};
217   }
218 
acos()219   Vectorized<double> C10_ALWAYS_INLINE acos() const {
220      return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)};
221   }
acosh()222   Vectorized<double> C10_ALWAYS_INLINE acosh() const {
223   return {Sleef_acoshd2_u10(_vec0), Sleef_acoshd2_u10(_vec1)};
224   }
asin()225   Vectorized<double> C10_ALWAYS_INLINE asin() const {
226      return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)};
227   }
atan()228   Vectorized<double> atan() const {
229      return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)};
230   }
atanh()231   Vectorized<double> atanh() const {
232      return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)};
233   }
atan2(const Vectorized<double> & b)234   Vectorized<double> atan2(const Vectorized<double>& b) const {
235      return {Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)};
236   }
copysign(const Vectorized<double> & sign)237   Vectorized<double> copysign(const Vectorized<double> &sign) const {
238     return {Sleef_copysignd2(_vec0, sign._vec0), Sleef_copysignd2(_vec1, sign._vec1)};
239   }
erf()240   Vectorized<double> erf() const {
241      return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)};
242   }
erfc()243   Vectorized<double> erfc() const {
244      return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)};
245   }
exp()246   Vectorized<double> C10_ALWAYS_INLINE exp() const {
247      return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)};
248   }
exp2()249   Vectorized<double> C10_ALWAYS_INLINE exp2() const {
250     return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)};
251   }
expm1()252   Vectorized<double> expm1() const {
253      return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)};
254   }
exp_u20()255   Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
256      return exp();
257   }
258 
lgamma()259   Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
260      return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
261   }
262 
erfinv()263   Vectorized<double> erfinv() const {
264     return map(calc_erfinv);
265   }
266 
angle()267   Vectorized<double> angle() const {
268     auto tmp = blendv(
269       Vectorized<double>(0), Vectorized<double>(c10::pi<double>), *this < Vectorized<double>(0));
270     return blendv(tmp, *this, isnan());
271   }
real()272   Vectorized<double> real() const {
273     return *this;
274   }
imag()275   Vectorized<double> imag() const {
276     return Vectorized<double>{0};
277   }
conj()278   Vectorized<double> conj() const {
279     return *this;
280   }
281 
log()282   Vectorized<double> C10_ALWAYS_INLINE log() const {
283      return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)};
284   }
log10()285   Vectorized<double> C10_ALWAYS_INLINE log10() const {
286      return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)};
287   }
log1p()288   Vectorized<double> C10_ALWAYS_INLINE log1p() const {
289      return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)};
290   }
log2()291   Vectorized<double> C10_ALWAYS_INLINE log2() const {
292      return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)};
293   }
ceil()294   Vectorized<double> C10_ALWAYS_INLINE ceil() const {
295     return {vec_ceil(_vec0), vec_ceil(_vec1)};
296   }
cos()297   Vectorized<double> C10_ALWAYS_INLINE cos() const {
298      return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)};
299   }
cosh()300   Vectorized<double> C10_ALWAYS_INLINE cosh() const {
301      return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)};
302   }
floor()303   Vectorized<double> C10_ALWAYS_INLINE floor() const {
304     return {vec_floor(_vec0), vec_floor(_vec1)};
305   }
neg()306   Vectorized<double> C10_ALWAYS_INLINE neg() const {
307     return {vec_neg(_vec0), vec_neg(_vec1)};
308   }
round()309   Vectorized<double> C10_ALWAYS_INLINE round() const {
310     return {vec_rint(_vec0), vec_rint(_vec1)};
311   }
sin()312   Vectorized<double> C10_ALWAYS_INLINE sin() const {
313      return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)};
314   }
sinh()315   Vectorized<double> C10_ALWAYS_INLINE sinh() const {
316      return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)};
317   }
tan()318   Vectorized<double> C10_ALWAYS_INLINE tan() const {
319      return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)};
320   }
tanh()321   Vectorized<double> C10_ALWAYS_INLINE tanh() const {
322      return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)};
323   }
trunc()324   Vectorized<double> C10_ALWAYS_INLINE trunc() const {
325     return {vec_trunc(_vec0), vec_trunc(_vec1)};
326   }
327 
frac()328   Vectorized<double> C10_ALWAYS_INLINE frac() const {
329     return *this - trunc();
330   }
331 
sqrt()332   Vectorized<double> C10_ALWAYS_INLINE sqrt() const {
333     return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
334   }
reciprocal()335   Vectorized<double> C10_ALWAYS_INLINE reciprocal() const {
336     return {
337         vec_div(vd_one, _vec0), // vec_re(_vec0) is estimated one.
338         vec_div(vd_one, _vec1)};
339   }
rsqrt()340   Vectorized<double> C10_ALWAYS_INLINE rsqrt() const {
341     return sqrt().reciprocal();
342   }
343 
pow(const Vectorized<double> & b)344   Vectorized<double> C10_ALWAYS_INLINE pow(const Vectorized<double>& b) const {
345      return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)};
346   }
fmod(const Vectorized<double> & b)347   Vectorized<double> C10_ALWAYS_INLINE fmod(const Vectorized<double>& b) const {
348      return {Sleef_fmodd2(_vec0, b._vec0),Sleef_fmodd2(_vec1, b._vec1)};
349   }
350 
hypot(const Vectorized<double> & b)351   Vectorized<double> hypot(const Vectorized<double>& b) const {
352      return {Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)};
353   }
354 
nextafter(const Vectorized<double> & b)355   Vectorized<double> nextafter(const Vectorized<double>& b) const {
356      return {Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)};
357   }
358 
igamma(const Vectorized<double> & x)359   Vectorized<double> igamma(const Vectorized<double>& x) const {
360     return mapbi(calc_igamma, x);
361   }
362 
igammac(const Vectorized<double> & x)363   Vectorized<double> igammac(const Vectorized<double>& x) const {
364     return mapbi(calc_igammac, x);
365   }
366 
367 
i0()368   Vectorized<double> i0() const {
369     return map(calc_i0);
370   }
371 
i0e()372   Vectorized<double> i0e() const {
373     return map(calc_i0e);
374   }
375 
digamma()376   Vectorized<double> digamma() const {
377     return map(calc_digamma);
378   }
379 
_nor()380   Vectorized<double> _nor() const {
381     return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)};
382   }
383 
isnan()384   Vectorized<double> isnan() const {
385     auto x = *this;
386     auto ret = (x == x);
387     return ret._nor();
388   }
has_inf_nan()389   bool has_inf_nan() const {
390     for (const auto i : c10::irange(size()/2)) {
391       if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
392         return true;
393       }
394     }
395     for (const auto i : c10::irange(size()/2)) {
396       if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
397         return true;
398       }
399     }
400     return false;
401   }
402 
403   DEFINE_MEMBER_OP(operator==, double, vec_cmpeq)
404   DEFINE_MEMBER_OP(operator!=, double, vec_cmpne)
405   DEFINE_MEMBER_OP(operator<, double, vec_cmplt)
406   DEFINE_MEMBER_OP(operator<=, double, vec_cmple)
407   DEFINE_MEMBER_OP(operator>, double, vec_cmpgt)
408   DEFINE_MEMBER_OP(operator>=, double, vec_cmpge)
409   DEFINE_MEMBER_OP_AND_ONE(eq, double, vec_cmpeq)
410   DEFINE_MEMBER_OP_AND_ONE(ne, double, vec_cmpne)
411   DEFINE_MEMBER_OP_AND_ONE(lt, double, vec_cmplt)
412   DEFINE_MEMBER_OP_AND_ONE(le, double, vec_cmple)
413   DEFINE_MEMBER_OP_AND_ONE(gt, double, vec_cmpgt)
414   DEFINE_MEMBER_OP_AND_ONE(ge, double, vec_cmpge)
415   DEFINE_MEMBER_OP(operator+, double, vec_add)
416   DEFINE_MEMBER_OP(operator-, double, vec_sub)
417   DEFINE_MEMBER_OP(operator*, double, vec_mul)
418   DEFINE_MEMBER_OP(operator/, double, vec_div)
419   DEFINE_MEMBER_OP(maximum, double, vec_max_nan2)
420   DEFINE_MEMBER_OP(minimum, double, vec_min_nan2)
421   DEFINE_MEMBER_OP(operator&, double, vec_and)
422   DEFINE_MEMBER_OP(operator|, double, vec_or)
423   DEFINE_MEMBER_OP(operator^, double, vec_xor)
424   DEFINE_MEMBER_TERNARY_OP(madd, double, vec_madd)
425 };
426 template <>
maximum(const Vectorized<double> & a,const Vectorized<double> & b)427 Vectorized<double> inline maximum(
428     const Vectorized<double>& a,
429     const Vectorized<double>& b) {
430   return a.maximum(b);
431 }
432 
433 template <>
minimum(const Vectorized<double> & a,const Vectorized<double> & b)434 Vectorized<double> inline minimum(
435     const Vectorized<double>& a,
436     const Vectorized<double>& b) {
437   return a.minimum(b);
438 }
439 
440 template <>
441 Vectorized<double> C10_ALWAYS_INLINE operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
442   return Vectorized<double>{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())};
443 }
444 
445 template <>
446 Vectorized<double> C10_ALWAYS_INLINE operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
447   return Vectorized<double>{vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())};
448 }
449 
450 template <>
451 Vectorized<double> C10_ALWAYS_INLINE operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
452   return Vectorized<double>{vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())};
453 }
454 
455 template <>
456 Vectorized<double> C10_ALWAYS_INLINE operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
457   return Vectorized<double>{vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())};
458 }
459 
460 template <>
461 Vectorized<double> C10_ALWAYS_INLINE operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
462   return Vectorized<double>{vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())};
463 }
464 
465 template <>
466 Vectorized<double> C10_ALWAYS_INLINE operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
467   return Vectorized<double>{vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())};
468 }
469 
470 template <>
471 Vectorized<double> C10_ALWAYS_INLINE operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
472   return Vectorized<double>{vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())};
473 }
474 
475 } // namespace
476 } // namespace vec
477 } // namespace at
478