1 #pragma once
2
3 #include <ATen/cpu/vec/intrinsics.h>
4 #include <ATen/cpu/vec/vec_base.h>
5 #include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
6 #include <c10/util/irange.h>
7
8 #include <sleef.h>
9
10 namespace at {
11 namespace vec {
12
13 inline namespace CPU_CAPABILITY {
14
15
16 template <>
17 class Vectorized<double> {
18 private:
19 union {
20 struct {
21 vfloat64 _vec0;
22 vfloat64 _vec1;
23 };
24 struct {
25 vbool64 _vecb0;
26 vbool64 _vecb1;
27 };
28
29 } __attribute__((__may_alias__));
30
31 public:
32 using value_type = double;
33 using vec_internal_type = vfloat64;
34 using vec_internal_mask_type = vbool64;
35 using size_type = int;
size()36 static constexpr size_type size() {
37 return 4;
38 }
Vectorized()39 Vectorized() {}
Vectorized(vfloat64 v)40 C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {}
Vectorized(vbool64 vmask)41 C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
Vectorized(vfloat64 v1,vfloat64 v2)42 C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {}
Vectorized(vbool64 v1,vbool64 v2)43 C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {}
Vectorized(double scalar)44 C10_ALWAYS_INLINE Vectorized(double scalar)
45 : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
Vectorized(double scalar1,double scalar2,double scalar3,double scalar4)46 C10_ALWAYS_INLINE Vectorized(
47 double scalar1,
48 double scalar2,
49 double scalar3,
50 double scalar4)
51 : _vec0{vfloat64{scalar1, scalar2}}, _vec1{vfloat64{scalar3, scalar4}} {}
vec0()52 C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
53 return _vec0;
54 }
vec1()55 C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
56 return _vec1;
57 }
58
zero_mask()59 int zero_mask() const {
60 auto cmp = (*this == vd_zero);
61 return (cmp._vecb0[0] & 1) | (cmp._vecb0[1] & 2) | (cmp._vecb1[0] & 4) |
62 (cmp._vecb1[1] & 8);
63 }
64
65 template <int64_t mask>
66 static std::enable_if_t<blendChoiceDbl(mask) == 0, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)67 blend(const Vectorized<double>& a, const Vectorized<double>& b) {
68 return a;
69 }
70
71 template <int64_t mask>
72 static std::enable_if_t<blendChoiceDbl(mask) == 1, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)73 blend(const Vectorized<double>& a, const Vectorized<double>& b) {
74 return b;
75 }
76
77 template <int64_t mask>
78 static std::enable_if_t<blendChoiceDbl(mask) == 2, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)79 blend(const Vectorized<double>& a, const Vectorized<double>& b) {
80 return { b._vec0, a._vec1 };
81 }
82
83 template <int64_t mask>
84 static std::enable_if_t<blendChoiceDbl(mask) == 3, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)85 blend(const Vectorized<double>& a, const Vectorized<double>& b) {
86 return { a._vec0, b._vec1 };
87 }
88
89
90 template <int64_t mask>
91 static std::enable_if_t<blendChoiceDbl(mask) == 4, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)92 blend(const Vectorized<double>& a, const Vectorized<double>& b) {
93 const vbool64 mask_1st = VsxDblMask1(mask);
94 return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1 };
95 }
96
97 template <int64_t mask>
98 static std::enable_if_t<blendChoiceDbl(mask) == 5, Vectorized<double>> C10_ALWAYS_INLINE
blend(const Vectorized<double> & a,const Vectorized<double> & b)99 blend(const Vectorized<double>& a, const Vectorized<double>& b) {
100 const vbool64 mask_1st = VsxDblMask1(mask);
101 return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1 };
102 }
103
104
105 template <int64_t mask>
106 static std::enable_if_t<blendChoiceDbl(mask) == 6,
107 Vectorized<double>>
blend(const Vectorized<double> & a,const Vectorized<double> & b)108 C10_ALWAYS_INLINE blend(const Vectorized<double>& a, const Vectorized<double>& b) {
109 const vbool64 mask_2nd = VsxDblMask2(mask);
110 // generated masks
111 return { a._vec0,
112 (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
113 }
114
115 template <int64_t mask>
116 static std::enable_if_t<blendChoiceDbl(mask) == 7,
117 Vectorized<double>>
blend(const Vectorized<double> & a,const Vectorized<double> & b)118 C10_ALWAYS_INLINE blend(const Vectorized<double>& a, const Vectorized<double>& b) {
119 const vbool64 mask_2nd = VsxDblMask2(mask);
120 // generated masks
121 return { b._vec0,
122 (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
123 }
124
125 template <int64_t mask>
126 static std::enable_if_t<blendChoiceDbl(mask) == 8, Vectorized<double>>
blend(const Vectorized<double> & a,const Vectorized<double> & b)127 C10_ALWAYS_INLINE blend(const Vectorized<double>& a, const Vectorized<double>& b) {
128 const vbool64 mask_1st = VsxDblMask1(mask);
129 const vbool64 mask_2nd = VsxDblMask2(mask);
130 return {
131 (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st),
132 (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
133 }
134
135
blendv(const Vectorized<double> & a,const Vectorized<double> & b,const Vectorized<double> & mask)136 static Vectorized<double> C10_ALWAYS_INLINE blendv(
137 const Vectorized<double>& a,
138 const Vectorized<double>& b,
139 const Vectorized<double>& mask) {
140 // the mask used here returned by comparision of vec256
141
142 return {
143 vec_sel(a._vec0, b._vec0, mask._vecb0),
144 vec_sel(a._vec1, b._vec1, mask._vecb1)};
145 }
146 template <typename step_t>
147 static Vectorized<double> arange(double base = 0., step_t step = static_cast<step_t>(1)) {
148 return Vectorized<double>(base, base + step, base + 2 * step, base + 3 * step);
149 }
150
151 static Vectorized<double> C10_ALWAYS_INLINE
152 set(const Vectorized<double>& a, const Vectorized<double>& b, size_t count = size()) {
153 switch (count) {
154 case 0:
155 return a;
156 case 1:
157 return blend<1>(a, b);
158 case 2:
159 return blend<3>(a, b);
160 case 3:
161 return blend<7>(a, b);
162 }
163
164 return b;
165 }
166 static Vectorized<value_type> C10_ALWAYS_INLINE
167 loadu(const void* ptr, int count = size()) {
168 if (count == size()) {
169 return {
170 vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
171 vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
172 }
173
174 __at_align__ value_type tmp_values[size()] = {};
175 std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
176
177 return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
178 }
179 void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
180 if (count == size()) {
181 vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
182 vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
183 } else if (count > 0) {
184 __at_align__ value_type tmp_values[size()];
185 vec_vsx_st(_vec0, offset0, tmp_values);
186 vec_vsx_st(_vec1, offset16, tmp_values);
187 std::memcpy(
188 ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
189 }
190 }
191 const double& operator[](int idx) const = delete;
192 double& operator[](int idx) = delete;
map(double (* const f)(double))193 Vectorized<double> map(double (*const f)(double)) const {
194 Vectorized<double> ret;
195 for (const auto i : c10::irange(size()/2)) {
196 ret._vec0[i] = f(_vec0[i]);
197 }
198 for (const auto i : c10::irange(size()/2)) {
199 ret._vec1[i] = f(_vec1[i]);
200 }
201 return ret;
202 }
203
mapbi(double (* const f)(double,double),const Vectorized<double> & other)204 Vectorized<double> mapbi(double (*const f)(double, double), const Vectorized<double>& other)
205 const {
206 Vectorized<double> ret;
207 for (const auto i : c10::irange(size()/2)) {
208 ret._vec0[i] = f(_vec0[i], other._vec0[i]);
209 }
210 for (const auto i : c10::irange(size()/2)) {
211 ret._vec1[i] = f(_vec1[i], other._vec1[i]);
212 }
213 return ret;
214 }
abs()215 Vectorized<double> C10_ALWAYS_INLINE abs() const {
216 return {vec_abs(_vec0), vec_abs(_vec1)};
217 }
218
acos()219 Vectorized<double> C10_ALWAYS_INLINE acos() const {
220 return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)};
221 }
acosh()222 Vectorized<double> C10_ALWAYS_INLINE acosh() const {
223 return {Sleef_acoshd2_u10(_vec0), Sleef_acoshd2_u10(_vec1)};
224 }
asin()225 Vectorized<double> C10_ALWAYS_INLINE asin() const {
226 return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)};
227 }
atan()228 Vectorized<double> atan() const {
229 return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)};
230 }
atanh()231 Vectorized<double> atanh() const {
232 return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)};
233 }
atan2(const Vectorized<double> & b)234 Vectorized<double> atan2(const Vectorized<double>& b) const {
235 return {Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)};
236 }
copysign(const Vectorized<double> & sign)237 Vectorized<double> copysign(const Vectorized<double> &sign) const {
238 return {Sleef_copysignd2(_vec0, sign._vec0), Sleef_copysignd2(_vec1, sign._vec1)};
239 }
erf()240 Vectorized<double> erf() const {
241 return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)};
242 }
erfc()243 Vectorized<double> erfc() const {
244 return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)};
245 }
exp()246 Vectorized<double> C10_ALWAYS_INLINE exp() const {
247 return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)};
248 }
exp2()249 Vectorized<double> C10_ALWAYS_INLINE exp2() const {
250 return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)};
251 }
expm1()252 Vectorized<double> expm1() const {
253 return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)};
254 }
exp_u20()255 Vectorized<double> C10_ALWAYS_INLINE exp_u20() const {
256 return exp();
257 }
258
lgamma()259 Vectorized<double> lgamma() const __ubsan_ignore_undefined__ {
260 return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
261 }
262
erfinv()263 Vectorized<double> erfinv() const {
264 return map(calc_erfinv);
265 }
266
angle()267 Vectorized<double> angle() const {
268 auto tmp = blendv(
269 Vectorized<double>(0), Vectorized<double>(c10::pi<double>), *this < Vectorized<double>(0));
270 return blendv(tmp, *this, isnan());
271 }
real()272 Vectorized<double> real() const {
273 return *this;
274 }
imag()275 Vectorized<double> imag() const {
276 return Vectorized<double>{0};
277 }
conj()278 Vectorized<double> conj() const {
279 return *this;
280 }
281
log()282 Vectorized<double> C10_ALWAYS_INLINE log() const {
283 return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)};
284 }
log10()285 Vectorized<double> C10_ALWAYS_INLINE log10() const {
286 return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)};
287 }
log1p()288 Vectorized<double> C10_ALWAYS_INLINE log1p() const {
289 return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)};
290 }
log2()291 Vectorized<double> C10_ALWAYS_INLINE log2() const {
292 return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)};
293 }
ceil()294 Vectorized<double> C10_ALWAYS_INLINE ceil() const {
295 return {vec_ceil(_vec0), vec_ceil(_vec1)};
296 }
cos()297 Vectorized<double> C10_ALWAYS_INLINE cos() const {
298 return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)};
299 }
cosh()300 Vectorized<double> C10_ALWAYS_INLINE cosh() const {
301 return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)};
302 }
floor()303 Vectorized<double> C10_ALWAYS_INLINE floor() const {
304 return {vec_floor(_vec0), vec_floor(_vec1)};
305 }
neg()306 Vectorized<double> C10_ALWAYS_INLINE neg() const {
307 return {vec_neg(_vec0), vec_neg(_vec1)};
308 }
round()309 Vectorized<double> C10_ALWAYS_INLINE round() const {
310 return {vec_rint(_vec0), vec_rint(_vec1)};
311 }
sin()312 Vectorized<double> C10_ALWAYS_INLINE sin() const {
313 return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)};
314 }
sinh()315 Vectorized<double> C10_ALWAYS_INLINE sinh() const {
316 return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)};
317 }
tan()318 Vectorized<double> C10_ALWAYS_INLINE tan() const {
319 return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)};
320 }
tanh()321 Vectorized<double> C10_ALWAYS_INLINE tanh() const {
322 return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)};
323 }
trunc()324 Vectorized<double> C10_ALWAYS_INLINE trunc() const {
325 return {vec_trunc(_vec0), vec_trunc(_vec1)};
326 }
327
frac()328 Vectorized<double> C10_ALWAYS_INLINE frac() const {
329 return *this - trunc();
330 }
331
sqrt()332 Vectorized<double> C10_ALWAYS_INLINE sqrt() const {
333 return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
334 }
reciprocal()335 Vectorized<double> C10_ALWAYS_INLINE reciprocal() const {
336 return {
337 vec_div(vd_one, _vec0), // vec_re(_vec0) is estimated one.
338 vec_div(vd_one, _vec1)};
339 }
rsqrt()340 Vectorized<double> C10_ALWAYS_INLINE rsqrt() const {
341 return sqrt().reciprocal();
342 }
343
pow(const Vectorized<double> & b)344 Vectorized<double> C10_ALWAYS_INLINE pow(const Vectorized<double>& b) const {
345 return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)};
346 }
fmod(const Vectorized<double> & b)347 Vectorized<double> C10_ALWAYS_INLINE fmod(const Vectorized<double>& b) const {
348 return {Sleef_fmodd2(_vec0, b._vec0),Sleef_fmodd2(_vec1, b._vec1)};
349 }
350
hypot(const Vectorized<double> & b)351 Vectorized<double> hypot(const Vectorized<double>& b) const {
352 return {Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)};
353 }
354
nextafter(const Vectorized<double> & b)355 Vectorized<double> nextafter(const Vectorized<double>& b) const {
356 return {Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)};
357 }
358
igamma(const Vectorized<double> & x)359 Vectorized<double> igamma(const Vectorized<double>& x) const {
360 return mapbi(calc_igamma, x);
361 }
362
igammac(const Vectorized<double> & x)363 Vectorized<double> igammac(const Vectorized<double>& x) const {
364 return mapbi(calc_igammac, x);
365 }
366
367
i0()368 Vectorized<double> i0() const {
369 return map(calc_i0);
370 }
371
i0e()372 Vectorized<double> i0e() const {
373 return map(calc_i0e);
374 }
375
digamma()376 Vectorized<double> digamma() const {
377 return map(calc_digamma);
378 }
379
_nor()380 Vectorized<double> _nor() const {
381 return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)};
382 }
383
isnan()384 Vectorized<double> isnan() const {
385 auto x = *this;
386 auto ret = (x == x);
387 return ret._nor();
388 }
has_inf_nan()389 bool has_inf_nan() const {
390 for (const auto i : c10::irange(size()/2)) {
391 if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
392 return true;
393 }
394 }
395 for (const auto i : c10::irange(size()/2)) {
396 if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
397 return true;
398 }
399 }
400 return false;
401 }
402
403 DEFINE_MEMBER_OP(operator==, double, vec_cmpeq)
404 DEFINE_MEMBER_OP(operator!=, double, vec_cmpne)
405 DEFINE_MEMBER_OP(operator<, double, vec_cmplt)
406 DEFINE_MEMBER_OP(operator<=, double, vec_cmple)
407 DEFINE_MEMBER_OP(operator>, double, vec_cmpgt)
408 DEFINE_MEMBER_OP(operator>=, double, vec_cmpge)
409 DEFINE_MEMBER_OP_AND_ONE(eq, double, vec_cmpeq)
410 DEFINE_MEMBER_OP_AND_ONE(ne, double, vec_cmpne)
411 DEFINE_MEMBER_OP_AND_ONE(lt, double, vec_cmplt)
412 DEFINE_MEMBER_OP_AND_ONE(le, double, vec_cmple)
413 DEFINE_MEMBER_OP_AND_ONE(gt, double, vec_cmpgt)
414 DEFINE_MEMBER_OP_AND_ONE(ge, double, vec_cmpge)
415 DEFINE_MEMBER_OP(operator+, double, vec_add)
416 DEFINE_MEMBER_OP(operator-, double, vec_sub)
417 DEFINE_MEMBER_OP(operator*, double, vec_mul)
418 DEFINE_MEMBER_OP(operator/, double, vec_div)
419 DEFINE_MEMBER_OP(maximum, double, vec_max_nan2)
420 DEFINE_MEMBER_OP(minimum, double, vec_min_nan2)
421 DEFINE_MEMBER_OP(operator&, double, vec_and)
422 DEFINE_MEMBER_OP(operator|, double, vec_or)
423 DEFINE_MEMBER_OP(operator^, double, vec_xor)
424 DEFINE_MEMBER_TERNARY_OP(madd, double, vec_madd)
425 };
426 template <>
maximum(const Vectorized<double> & a,const Vectorized<double> & b)427 Vectorized<double> inline maximum(
428 const Vectorized<double>& a,
429 const Vectorized<double>& b) {
430 return a.maximum(b);
431 }
432
433 template <>
minimum(const Vectorized<double> & a,const Vectorized<double> & b)434 Vectorized<double> inline minimum(
435 const Vectorized<double>& a,
436 const Vectorized<double>& b) {
437 return a.minimum(b);
438 }
439
440 template <>
441 Vectorized<double> C10_ALWAYS_INLINE operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
442 return Vectorized<double>{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())};
443 }
444
445 template <>
446 Vectorized<double> C10_ALWAYS_INLINE operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
447 return Vectorized<double>{vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())};
448 }
449
450 template <>
451 Vectorized<double> C10_ALWAYS_INLINE operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
452 return Vectorized<double>{vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())};
453 }
454
455 template <>
456 Vectorized<double> C10_ALWAYS_INLINE operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
457 return Vectorized<double>{vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())};
458 }
459
460 template <>
461 Vectorized<double> C10_ALWAYS_INLINE operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
462 return Vectorized<double>{vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())};
463 }
464
465 template <>
466 Vectorized<double> C10_ALWAYS_INLINE operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
467 return Vectorized<double>{vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())};
468 }
469
470 template <>
471 Vectorized<double> C10_ALWAYS_INLINE operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
472 return Vectorized<double>{vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())};
473 }
474
475 } // namespace
476 } // namespace vec
477 } // namespace at
478