1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2008-2016 Konstantinos Margaritis <[email protected]> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11 #define EIGEN_PACKET_MATH_ALTIVEC_H 12 13 namespace Eigen { 14 15 namespace internal { 16 17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19 #endif 20 21 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 22 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 23 #endif 24 25 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 26 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 27 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 28 #endif 29 30 typedef __vector float Packet4f; 31 typedef __vector int Packet4i; 32 typedef __vector unsigned int Packet4ui; 33 typedef __vector __bool int Packet4bi; 34 typedef __vector short int Packet8s; 35 typedef __vector unsigned short int Packet8us; 36 typedef __vector signed char Packet16c; 37 typedef __vector unsigned char Packet16uc; 38 typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; 39 40 // We don't want to write the same code all the time, but we need to reuse the constants 41 // and it doesn't really work to declare them global, so we define macros instead 42 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 43 Packet4f p4f_##NAME = {X, X, X, X} 44 45 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 46 Packet4i p4i_##NAME = vec_splat_s32(X) 47 48 #define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ 49 Packet4ui p4ui_##NAME = {X, X, X, X} 50 51 #define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ 52 Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X} 53 54 #define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ 55 Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X} 56 57 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 58 Packet4f p4f_##NAME = pset1<Packet4f>(X) 59 60 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 61 Packet4i p4i_##NAME = pset1<Packet4i>(X) 62 63 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ 64 Packet2d p2d_##NAME = pset1<Packet2d>(X) 65 66 #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ 67 Packet2l p2l_##NAME = pset1<Packet2l>(X) 68 69 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 70 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X)) 71 72 #define DST_CHAN 1 73 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 74 #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type 75 76 // These constants are endian-agnostic 77 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} 78 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} 79 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} 80 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} 81 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} 82 static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); 83 static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); 84 static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} 85 static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); 86 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} 87 #ifndef __VSX__ 88 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} 89 #endif 90 91 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; 92 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; 93 static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; 94 static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; 95 96 static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 97 8, 9, 10, 11, 12, 13, 14, 15}; 98 static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 99 8, 9, 10, 11, 12, 13, 14, 15}; 100 101 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; 102 static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 }; 103 static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; 104 105 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; 106 static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 }; 107 static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 }; 108 static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 }; 109 static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 }; 110 111 static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 }; 112 113 // Handle endianness properly while loading constants 114 // Define global static constants: 115 #ifdef _BIG_ENDIAN 116 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 117 #ifdef __VSX__ 118 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 119 #endif 120 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 121 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 122 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 123 #else 124 static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 125 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 126 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 127 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 128 static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 129 #endif // _BIG_ENDIAN 130 131 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; 132 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; 133 static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; 134 static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; 135 136 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; 137 138 #ifdef _BIG_ENDIAN 139 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 140 #else 141 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 142 #endif // _BIG_ENDIAN 143 144 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC 145 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); 146 #else 147 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); 148 #endif 149 150 template <> 151 struct packet_traits<float> : default_packet_traits { 152 typedef Packet4f type; 153 typedef Packet4f half; 154 enum { 155 Vectorizable = 1, 156 AlignedOnScalar = 1, 157 size = 4, 158 HasHalfPacket = 1, 159 160 HasAdd = 1, 161 HasSub = 1, 162 HasMul = 1, 163 HasDiv = 1, 164 HasMin = 1, 165 HasMax = 1, 166 HasAbs = 1, 167 HasSin = EIGEN_FAST_MATH, 168 HasCos = EIGEN_FAST_MATH, 169 HasLog = 1, 170 HasExp = 1, 171 #ifdef __VSX__ 172 HasSqrt = 1, 173 #if !EIGEN_COMP_CLANG 174 HasRsqrt = 1, 175 #else 176 HasRsqrt = 0, 177 #endif 178 #else 179 HasSqrt = 0, 180 HasRsqrt = 0, 181 HasTanh = EIGEN_FAST_MATH, 182 HasErf = EIGEN_FAST_MATH, 183 #endif 184 HasRound = 1, 185 HasFloor = 1, 186 HasCeil = 1, 187 HasRint = 1, 188 HasNegate = 1, 189 HasBlend = 1 190 }; 191 }; 192 template <> 193 struct packet_traits<bfloat16> : default_packet_traits { 194 typedef Packet8bf type; 195 typedef Packet8bf half; 196 enum { 197 Vectorizable = 1, 198 AlignedOnScalar = 1, 199 size = 8, 200 HasHalfPacket = 0, 201 202 HasAdd = 1, 203 HasSub = 1, 204 HasMul = 1, 205 HasDiv = 1, 206 HasMin = 1, 207 HasMax = 1, 208 HasAbs = 1, 209 HasSin = EIGEN_FAST_MATH, 210 HasCos = EIGEN_FAST_MATH, 211 HasLog = 1, 212 HasExp = 1, 213 #ifdef __VSX__ 214 HasSqrt = 1, 215 #if !EIGEN_COMP_CLANG 216 HasRsqrt = 1, 217 #else 218 HasRsqrt = 0, 219 #endif 220 #else 221 HasSqrt = 0, 222 HasRsqrt = 0, 223 HasTanh = EIGEN_FAST_MATH, 224 HasErf = EIGEN_FAST_MATH, 225 #endif 226 HasRound = 1, 227 HasFloor = 1, 228 HasCeil = 1, 229 HasRint = 1, 230 HasNegate = 1, 231 HasBlend = 1 232 }; 233 }; 234 235 template <> 236 struct packet_traits<int> : default_packet_traits { 237 typedef Packet4i type; 238 typedef Packet4i half; 239 enum { 240 Vectorizable = 1, 241 AlignedOnScalar = 1, 242 size = 4, 243 HasHalfPacket = 0, 244 245 HasAdd = 1, 246 HasSub = 1, 247 HasShift = 1, 248 HasMul = 1, 249 HasDiv = 0, 250 HasBlend = 1 251 }; 252 }; 253 254 template <> 255 struct packet_traits<short int> : default_packet_traits { 256 typedef Packet8s type; 257 typedef Packet8s half; 258 enum { 259 Vectorizable = 1, 260 AlignedOnScalar = 1, 261 size = 8, 262 HasHalfPacket = 0, 263 264 HasAdd = 1, 265 HasSub = 1, 266 HasMul = 1, 267 HasDiv = 0, 268 HasBlend = 1 269 }; 270 }; 271 272 template <> 273 struct packet_traits<unsigned short int> : default_packet_traits { 274 typedef Packet8us type; 275 typedef Packet8us half; 276 enum { 277 Vectorizable = 1, 278 AlignedOnScalar = 1, 279 size = 8, 280 HasHalfPacket = 0, 281 282 HasAdd = 1, 283 HasSub = 1, 284 HasMul = 1, 285 HasDiv = 0, 286 HasBlend = 1 287 }; 288 }; 289 290 template <> 291 struct packet_traits<signed char> : default_packet_traits { 292 typedef Packet16c type; 293 typedef Packet16c half; 294 enum { 295 Vectorizable = 1, 296 AlignedOnScalar = 1, 297 size = 16, 298 HasHalfPacket = 0, 299 300 HasAdd = 1, 301 HasSub = 1, 302 HasMul = 1, 303 HasDiv = 0, 304 HasBlend = 1 305 }; 306 }; 307 308 template <> 309 struct packet_traits<unsigned char> : default_packet_traits { 310 typedef Packet16uc type; 311 typedef Packet16uc half; 312 enum { 313 Vectorizable = 1, 314 AlignedOnScalar = 1, 315 size = 16, 316 HasHalfPacket = 0, 317 318 HasAdd = 1, 319 HasSub = 1, 320 HasMul = 1, 321 HasDiv = 0, 322 HasBlend = 1 323 }; 324 }; 325 326 template<> struct unpacket_traits<Packet4f> 327 { 328 typedef float type; 329 typedef Packet4f half; 330 typedef Packet4i integer_packet; 331 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 332 }; 333 template<> struct unpacket_traits<Packet4i> 334 { 335 typedef int type; 336 typedef Packet4i half; 337 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 338 }; 339 template<> struct unpacket_traits<Packet8s> 340 { 341 typedef short int type; 342 typedef Packet8s half; 343 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 344 }; 345 template<> struct unpacket_traits<Packet8us> 346 { 347 typedef unsigned short int type; 348 typedef Packet8us half; 349 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 350 }; 351 352 template<> struct unpacket_traits<Packet16c> 353 { 354 typedef signed char type; 355 typedef Packet16c half; 356 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 357 }; 358 template<> struct unpacket_traits<Packet16uc> 359 { 360 typedef unsigned char type; 361 typedef Packet16uc half; 362 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 363 }; 364 365 template<> struct unpacket_traits<Packet8bf> 366 { 367 typedef bfloat16 type; 368 typedef Packet8bf half; 369 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 370 }; 371 inline std::ostream & operator <<(std::ostream & s, const Packet16c & v) 372 { 373 union { 374 Packet16c v; 375 signed char n[16]; 376 } vt; 377 vt.v = v; 378 for (int i=0; i< 16; i++) 379 s << vt.n[i] << ", "; 380 return s; 381 } 382 383 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) 384 { 385 union { 386 Packet16uc v; 387 unsigned char n[16]; 388 } vt; 389 vt.v = v; 390 for (int i=0; i< 16; i++) 391 s << vt.n[i] << ", "; 392 return s; 393 } 394 395 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 396 { 397 union { 398 Packet4f v; 399 float n[4]; 400 } vt; 401 vt.v = v; 402 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 403 return s; 404 } 405 406 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 407 { 408 union { 409 Packet4i v; 410 int n[4]; 411 } vt; 412 vt.v = v; 413 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 414 return s; 415 } 416 417 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 418 { 419 union { 420 Packet4ui v; 421 unsigned int n[4]; 422 } vt; 423 vt.v = v; 424 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 425 return s; 426 } 427 428 template <typename Packet> 429 EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) 430 { 431 // some versions of GCC throw "unused-but-set-parameter". 432 // ignoring these warnings for now. 433 EIGEN_UNUSED_VARIABLE(from); 434 EIGEN_DEBUG_ALIGNED_LOAD 435 #ifdef __VSX__ 436 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); 437 #else 438 return vec_ld(0, from); 439 #endif 440 } 441 442 // Need to define them first or we get specialization after instantiation errors 443 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) 444 { 445 return pload_common<Packet4f>(from); 446 } 447 448 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) 449 { 450 return pload_common<Packet4i>(from); 451 } 452 453 template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) 454 { 455 return pload_common<Packet8s>(from); 456 } 457 458 template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) 459 { 460 return pload_common<Packet8us>(from); 461 } 462 463 template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) 464 { 465 return pload_common<Packet16c>(from); 466 } 467 468 template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) 469 { 470 return pload_common<Packet16uc>(from); 471 } 472 473 template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) 474 { 475 return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 476 } 477 478 template <typename Packet> 479 EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ 480 // some versions of GCC throw "unused-but-set-parameter" (float *to). 481 // ignoring these warnings for now. 482 EIGEN_UNUSED_VARIABLE(to); 483 EIGEN_DEBUG_ALIGNED_STORE 484 #ifdef __VSX__ 485 vec_xst(from, 0, to); 486 #else 487 vec_st(from, 0, to); 488 #endif 489 } 490 491 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) 492 { 493 pstore_common<Packet4f>(to, from); 494 } 495 496 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) 497 { 498 pstore_common<Packet4i>(to, from); 499 } 500 501 template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) 502 { 503 pstore_common<Packet8s>(to, from); 504 } 505 506 template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) 507 { 508 pstore_common<Packet8us>(to, from); 509 } 510 511 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) 512 { 513 pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from); 514 } 515 516 template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) 517 { 518 pstore_common<Packet16c>(to, from); 519 } 520 521 template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) 522 { 523 pstore_common<Packet16uc>(to, from); 524 } 525 526 template<typename Packet> 527 EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) 528 { 529 Packet v = {from, from, from, from}; 530 return v; 531 } 532 533 template<typename Packet> 534 EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from) 535 { 536 Packet v = {from, from, from, from, from, from, from, from}; 537 return v; 538 } 539 540 template<typename Packet> 541 EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from) 542 { 543 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; 544 return v; 545 } 546 547 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 548 return pset1_size4<Packet4f>(from); 549 } 550 551 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 552 return pset1_size4<Packet4i>(from); 553 } 554 555 template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) { 556 return pset1_size8<Packet8s>(from); 557 } 558 559 template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) { 560 return pset1_size8<Packet8us>(from); 561 } 562 563 template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) { 564 return pset1_size16<Packet16c>(from); 565 } 566 567 template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) { 568 return pset1_size16<Packet16uc>(from); 569 } 570 571 template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { 572 return reinterpret_cast<Packet4f>(pset1<Packet4i>(from)); 573 } 574 575 template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) { 576 return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from)); 577 } 578 579 template<typename Packet> EIGEN_STRONG_INLINE void 580 pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, 581 Packet& a0, Packet& a1, Packet& a2, Packet& a3) 582 { 583 a3 = pload<Packet>(a); 584 a0 = vec_splat(a3, 0); 585 a1 = vec_splat(a3, 1); 586 a2 = vec_splat(a3, 2); 587 a3 = vec_splat(a3, 3); 588 } 589 590 template<> EIGEN_STRONG_INLINE void 591 pbroadcast4<Packet4f>(const float *a, 592 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) 593 { 594 pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3); 595 } 596 template<> EIGEN_STRONG_INLINE void 597 pbroadcast4<Packet4i>(const int *a, 598 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) 599 { 600 pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3); 601 } 602 603 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) 604 { 605 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; 606 a[0] = from[0*stride]; 607 a[1] = from[1*stride]; 608 a[2] = from[2*stride]; 609 a[3] = from[3*stride]; 610 return pload<Packet>(a); 611 } 612 613 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) 614 { 615 return pgather_common<Packet4f>(from, stride); 616 } 617 618 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) 619 { 620 return pgather_common<Packet4i>(from, stride); 621 } 622 623 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) 624 { 625 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; 626 a[0] = from[0*stride]; 627 a[1] = from[1*stride]; 628 a[2] = from[2*stride]; 629 a[3] = from[3*stride]; 630 a[4] = from[4*stride]; 631 a[5] = from[5*stride]; 632 a[6] = from[6*stride]; 633 a[7] = from[7*stride]; 634 return pload<Packet>(a); 635 } 636 637 template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) 638 { 639 return pgather_size8<Packet8s>(from, stride); 640 } 641 642 template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride) 643 { 644 return pgather_size8<Packet8us>(from, stride); 645 } 646 647 template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) 648 { 649 return pgather_size8<Packet8bf>(from, stride); 650 } 651 652 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) 653 { 654 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; 655 a[0] = from[0*stride]; 656 a[1] = from[1*stride]; 657 a[2] = from[2*stride]; 658 a[3] = from[3*stride]; 659 a[4] = from[4*stride]; 660 a[5] = from[5*stride]; 661 a[6] = from[6*stride]; 662 a[7] = from[7*stride]; 663 a[8] = from[8*stride]; 664 a[9] = from[9*stride]; 665 a[10] = from[10*stride]; 666 a[11] = from[11*stride]; 667 a[12] = from[12*stride]; 668 a[13] = from[13*stride]; 669 a[14] = from[14*stride]; 670 a[15] = from[15*stride]; 671 return pload<Packet>(a); 672 } 673 674 675 template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) 676 { 677 return pgather_size16<Packet16c>(from, stride); 678 } 679 680 template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride) 681 { 682 return pgather_size16<Packet16uc>(from, stride); 683 } 684 685 template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) 686 { 687 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; 688 pstore<__UNPACK_TYPE__(Packet)>(a, from); 689 to[0*stride] = a[0]; 690 to[1*stride] = a[1]; 691 to[2*stride] = a[2]; 692 to[3*stride] = a[3]; 693 } 694 695 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) 696 { 697 pscatter_size4<Packet4f>(to, from, stride); 698 } 699 700 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) 701 { 702 pscatter_size4<Packet4i>(to, from, stride); 703 } 704 705 template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) 706 { 707 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; 708 pstore<__UNPACK_TYPE__(Packet)>(a, from); 709 to[0*stride] = a[0]; 710 to[1*stride] = a[1]; 711 to[2*stride] = a[2]; 712 to[3*stride] = a[3]; 713 to[4*stride] = a[4]; 714 to[5*stride] = a[5]; 715 to[6*stride] = a[6]; 716 to[7*stride] = a[7]; 717 } 718 719 720 template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride) 721 { 722 pscatter_size8<Packet8s>(to, from, stride); 723 } 724 725 template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride) 726 { 727 pscatter_size8<Packet8us>(to, from, stride); 728 } 729 730 template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) 731 { 732 pscatter_size8<Packet8bf>(to, from, stride); 733 } 734 735 template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) 736 { 737 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; 738 pstore<__UNPACK_TYPE__(Packet)>(a, from); 739 to[0*stride] = a[0]; 740 to[1*stride] = a[1]; 741 to[2*stride] = a[2]; 742 to[3*stride] = a[3]; 743 to[4*stride] = a[4]; 744 to[5*stride] = a[5]; 745 to[6*stride] = a[6]; 746 to[7*stride] = a[7]; 747 to[8*stride] = a[8]; 748 to[9*stride] = a[9]; 749 to[10*stride] = a[10]; 750 to[11*stride] = a[11]; 751 to[12*stride] = a[12]; 752 to[13*stride] = a[13]; 753 to[14*stride] = a[14]; 754 to[15*stride] = a[15]; 755 } 756 757 template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride) 758 { 759 pscatter_size16<Packet16c>(to, from, stride); 760 } 761 762 template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride) 763 { 764 pscatter_size16<Packet16uc>(to, from, stride); 765 } 766 767 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } 768 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; } 769 template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; } 770 template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; } 771 template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; } 772 template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; } 773 774 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; } 775 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; } 776 template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; } 777 template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; } 778 template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; } 779 template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; } 780 template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; } 781 782 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; } 783 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; } 784 template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; } 785 template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; } 786 template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; } 787 template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; } 788 789 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } 790 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } 791 792 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 793 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 794 795 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } 796 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; } 797 template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); } 798 template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); } 799 template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); } 800 template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); } 801 802 803 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 804 { 805 #ifndef __VSX__ // VSX actually provides a div instruction 806 Packet4f t, y_0, y_1; 807 808 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 809 y_0 = vec_re(b); 810 811 // Do one Newton-Raphson iteration to get the needed accuracy 812 t = vec_nmsub(y_0, b, p4f_ONE); 813 y_1 = vec_madd(y_0, t, y_0); 814 815 return vec_madd(a, y_1, p4f_MZERO); 816 #else 817 return vec_div(a, b); 818 #endif 819 } 820 821 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 822 { eigen_assert(false && "packet integer division are not supported by AltiVec"); 823 return pset1<Packet4i>(0); 824 } 825 826 // for some weird raisons, it has to be overloaded for packet of integers 827 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } 828 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } 829 template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); } 830 template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); } 831 832 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) 833 { 834 #ifdef __VSX__ 835 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN 836 Packet4f ret; 837 __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 838 return ret; 839 #else 840 return vec_min(a, b); 841 #endif 842 } 843 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 844 template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); } 845 template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); } 846 template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); } 847 template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); } 848 849 850 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) 851 { 852 #ifdef __VSX__ 853 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN 854 Packet4f ret; 855 __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 856 return ret; 857 #else 858 return vec_max(a, b); 859 #endif 860 } 861 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 862 template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); } 863 template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); } 864 template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); } 865 template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } 866 867 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); } 868 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); } 869 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); } 870 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { 871 Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b)); 872 return vec_nor(c,c); 873 } 874 875 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); } 876 template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); } 877 template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); } 878 template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); } 879 template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); } 880 template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); } 881 template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); } 882 template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); } 883 template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); } 884 template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); } 885 template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); } 886 template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); } 887 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); } 888 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); } 889 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); } 890 891 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 892 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 893 template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); } 894 template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); } 895 template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 896 return pand<Packet8us>(a, b); 897 } 898 899 900 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 901 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 902 template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); } 903 template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); } 904 template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 905 return por<Packet8us>(a, b); 906 } 907 908 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 909 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 910 template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 911 return pxor<Packet8us>(a, b); 912 } 913 914 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); } 915 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); } 916 917 template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { 918 return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask)); 919 } 920 921 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) 922 { 923 Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); 924 Packet4f res; 925 926 #ifdef __VSX__ 927 __asm__("xvrspiz %x0, %x1\n\t" 928 : "=&wa" (res) 929 : "wa" (t)); 930 #else 931 __asm__("vrfiz %0, %1\n\t" 932 : "=v" (res) 933 : "v" (t)); 934 #endif 935 936 return res; 937 } 938 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); } 939 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); } 940 template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) 941 { 942 Packet4f res; 943 944 __asm__("xvrspic %x0, %x1\n\t" 945 : "=&wa" (res) 946 : "wa" (a)); 947 948 return res; 949 } 950 951 template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) 952 { 953 EIGEN_DEBUG_ALIGNED_LOAD 954 #ifdef _BIG_ENDIAN 955 Packet16uc MSQ, LSQ; 956 Packet16uc mask; 957 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 958 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 959 mask = vec_lvsl(0, from); // create the permute mask 960 //TODO: Add static_cast here 961 return (Packet) vec_perm(MSQ, LSQ, mask); // align the data 962 #else 963 EIGEN_DEBUG_UNALIGNED_LOAD 964 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); 965 #endif 966 } 967 968 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 969 { 970 return ploadu_common<Packet4f>(from); 971 } 972 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 973 { 974 return ploadu_common<Packet4i>(from); 975 } 976 template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) 977 { 978 return ploadu_common<Packet8s>(from); 979 } 980 template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) 981 { 982 return ploadu_common<Packet8us>(from); 983 } 984 template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) 985 { 986 return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 987 } 988 template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) 989 { 990 return ploadu_common<Packet16c>(from); 991 } 992 template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) 993 { 994 return ploadu_common<Packet16uc>(from); 995 } 996 997 template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) 998 { 999 Packet p; 1000 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from); 1001 else p = ploadu<Packet>(from); 1002 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 1003 } 1004 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 1005 { 1006 return ploaddup_common<Packet4f>(from); 1007 } 1008 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 1009 { 1010 return ploaddup_common<Packet4i>(from); 1011 } 1012 1013 template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) 1014 { 1015 Packet8s p; 1016 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from); 1017 else p = ploadu<Packet8s>(from); 1018 return vec_perm(p, p, p16uc_DUPLICATE16_HI); 1019 } 1020 1021 template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) 1022 { 1023 Packet8us p; 1024 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from); 1025 else p = ploadu<Packet8us>(from); 1026 return vec_perm(p, p, p16uc_DUPLICATE16_HI); 1027 } 1028 1029 template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) 1030 { 1031 Packet8s p; 1032 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from); 1033 else p = ploadu<Packet8s>(from); 1034 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); 1035 } 1036 1037 template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) 1038 { 1039 Packet8us p; 1040 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from); 1041 else p = ploadu<Packet8us>(from); 1042 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); 1043 } 1044 1045 template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) 1046 { 1047 return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 1048 } 1049 1050 template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) 1051 { 1052 Packet16c p; 1053 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from); 1054 else p = ploadu<Packet16c>(from); 1055 return vec_perm(p, p, p16uc_DUPLICATE8_HI); 1056 } 1057 1058 template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) 1059 { 1060 Packet16uc p; 1061 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from); 1062 else p = ploadu<Packet16uc>(from); 1063 return vec_perm(p, p, p16uc_DUPLICATE8_HI); 1064 } 1065 1066 template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) 1067 { 1068 EIGEN_DEBUG_UNALIGNED_STORE 1069 #ifdef _BIG_ENDIAN 1070 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 1071 // Warning: not thread safe! 1072 Packet16uc MSQ, LSQ, edges; 1073 Packet16uc edgeAlign, align; 1074 1075 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 1076 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 1077 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 1078 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 1079 align = vec_lvsr( 0, to ); // permute map to misalign data 1080 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 1081 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 1082 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 1083 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second 1084 #else 1085 vec_xst(from, 0, to); 1086 #endif 1087 } 1088 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 1089 { 1090 pstoreu_common<Packet4f>(to, from); 1091 } 1092 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 1093 { 1094 pstoreu_common<Packet4i>(to, from); 1095 } 1096 template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) 1097 { 1098 pstoreu_common<Packet8s>(to, from); 1099 } 1100 template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) 1101 { 1102 pstoreu_common<Packet8us>(to, from); 1103 } 1104 template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) 1105 { 1106 pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from); 1107 } 1108 template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) 1109 { 1110 pstoreu_common<Packet16c>(to, from); 1111 } 1112 template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) 1113 { 1114 pstoreu_common<Packet16uc>(to, from); 1115 } 1116 1117 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } 1118 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } 1119 1120 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } 1121 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } 1122 1123 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) { 1124 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x; 1125 vec_ste(a, 0, &x); 1126 return x; 1127 } 1128 1129 template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) { 1130 return pfirst_common<Packet8s>(a); 1131 } 1132 1133 template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) { 1134 return pfirst_common<Packet8us>(a); 1135 } 1136 1137 template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) 1138 { 1139 return pfirst_common<Packet16c>(a); 1140 } 1141 1142 template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) 1143 { 1144 return pfirst_common<Packet16uc>(a); 1145 } 1146 1147 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) 1148 { 1149 return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 1150 } 1151 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) 1152 { 1153 return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 1154 } 1155 template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) 1156 { 1157 return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16)); 1158 } 1159 template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) 1160 { 1161 return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16)); 1162 } 1163 template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) 1164 { 1165 return vec_perm(a, a, p16uc_REVERSE8); 1166 } 1167 template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) 1168 { 1169 return vec_perm(a, a, p16uc_REVERSE8); 1170 } 1171 template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) 1172 { 1173 return preverse<Packet8us>(a); 1174 } 1175 1176 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 1177 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 1178 template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); } 1179 template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } 1180 template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); } 1181 template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } 1182 template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { 1183 _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF); 1184 return pand<Packet8us>(p8us_abs_mask, a); 1185 } 1186 1187 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) 1188 { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); } 1189 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) 1190 { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); } 1191 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) 1192 { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); } 1193 template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) 1194 { 1195 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1196 Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask); 1197 return reinterpret_cast<Packet4f>(r); 1198 } 1199 1200 template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) 1201 { 1202 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1203 Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask); 1204 return reinterpret_cast<Packet4f>(r); 1205 } 1206 1207 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) 1208 { 1209 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1210 return vec_sr(a, p4ui_mask); 1211 } 1212 1213 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) 1214 { 1215 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1216 return vec_sl(a, p4ui_mask); 1217 } 1218 1219 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) 1220 { 1221 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); 1222 return vec_sl(a, p8us_mask); 1223 } 1224 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) 1225 { 1226 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); 1227 return vec_sr(a, p8us_mask); 1228 } 1229 1230 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){ 1231 return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val)); 1232 } 1233 1234 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){ 1235 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); 1236 return pand<Packet4f>( 1237 reinterpret_cast<Packet4f>(bf.m_val), 1238 reinterpret_cast<Packet4f>(p4ui_high_mask) 1239 ); 1240 } 1241 1242 // Simple interleaving of bool masks, prevents true values from being 1243 // converted to NaNs. 1244 EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) { 1245 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); 1246 Packet4f bf_odd, bf_even; 1247 bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd); 1248 bf_even = plogical_shift_right<16>(even); 1249 return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd)); 1250 } 1251 1252 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ 1253 Packet4ui input = reinterpret_cast<Packet4ui>(p4f); 1254 Packet4ui lsb = plogical_shift_right<16>(input); 1255 lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE)); 1256 1257 _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu); 1258 Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS); 1259 input = padd<Packet4ui>(input, rounding_bias); 1260 1261 //Test NaN and Subnormal - Begin 1262 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000); 1263 Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f)); 1264 1265 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF); 1266 Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f)); 1267 1268 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000); 1269 Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); 1270 Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO)); 1271 1272 Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO)); 1273 Packet4ui nan_selector = pandnot<Packet4ui>( 1274 reinterpret_cast<Packet4ui>(is_max_exp), 1275 reinterpret_cast<Packet4ui>(is_mant_zero) 1276 ); 1277 1278 Packet4ui subnormal_selector = pandnot<Packet4ui>( 1279 reinterpret_cast<Packet4ui>(is_zero_exp), 1280 reinterpret_cast<Packet4ui>(is_mant_zero) 1281 ); 1282 1283 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); 1284 input = vec_sel(input, p4ui_nan, nan_selector); 1285 input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector); 1286 //Test NaN and Subnormal - End 1287 1288 input = plogical_shift_right<16>(input); 1289 return reinterpret_cast<Packet8us>(input); 1290 } 1291 1292 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){ 1293 Packet4f bf_odd, bf_even; 1294 bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val); 1295 bf_odd = plogical_shift_left<16>(bf_odd); 1296 bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val); 1297 return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd)); 1298 } 1299 #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \ 1300 Packet4f a_even = Bf16ToF32Even(A);\ 1301 Packet4f a_odd = Bf16ToF32Odd(A);\ 1302 Packet4f op_even = OP(a_even);\ 1303 Packet4f op_odd = OP(a_odd);\ 1304 return F32ToBf16(op_even, op_odd);\ 1305 1306 #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \ 1307 Packet4f a_even = Bf16ToF32Even(A);\ 1308 Packet4f a_odd = Bf16ToF32Odd(A);\ 1309 Packet4f b_even = Bf16ToF32Even(B);\ 1310 Packet4f b_odd = Bf16ToF32Odd(B);\ 1311 Packet4f op_even = OP(a_even, b_even);\ 1312 Packet4f op_odd = OP(a_odd, b_odd);\ 1313 return F32ToBf16(op_even, op_odd);\ 1314 1315 #define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \ 1316 Packet4f a_even = Bf16ToF32Even(A);\ 1317 Packet4f a_odd = Bf16ToF32Odd(A);\ 1318 Packet4f b_even = Bf16ToF32Even(B);\ 1319 Packet4f b_odd = Bf16ToF32Odd(B);\ 1320 Packet4f op_even = OP(a_even, b_even);\ 1321 Packet4f op_odd = OP(a_odd, b_odd);\ 1322 return F32ToBf16Bool(op_even, op_odd);\ 1323 1324 template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1325 BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b); 1326 } 1327 1328 template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1329 BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b); 1330 } 1331 1332 template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1333 BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b); 1334 } 1335 1336 template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) { 1337 BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a); 1338 } 1339 1340 template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1341 BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b); 1342 } 1343 1344 template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){ 1345 BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); 1346 } 1347 template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){ 1348 BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a); 1349 } 1350 template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){ 1351 BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); 1352 } 1353 1354 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) { 1355 return pldexp_generic(a,exponent); 1356 } 1357 template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){ 1358 BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent); 1359 } 1360 1361 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) { 1362 return pfrexp_generic(a,exponent); 1363 } 1364 template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){ 1365 Packet4f a_even = Bf16ToF32Even(a); 1366 Packet4f a_odd = Bf16ToF32Odd(a); 1367 Packet4f e_even; 1368 Packet4f e_odd; 1369 Packet4f op_even = pfrexp<Packet4f>(a_even, e_even); 1370 Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd); 1371 e = F32ToBf16(e_even, e_odd); 1372 return F32ToBf16(op_even, op_odd); 1373 } 1374 1375 template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){ 1376 BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a); 1377 } 1378 template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){ 1379 BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a); 1380 } 1381 template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){ 1382 BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a); 1383 } 1384 template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){ 1385 BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a); 1386 } 1387 template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){ 1388 BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a); 1389 } 1390 template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){ 1391 BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a); 1392 } 1393 template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){ 1394 BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a); 1395 } 1396 template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { 1397 Packet4f a_even = Bf16ToF32Even(a); 1398 Packet4f a_odd = Bf16ToF32Odd(a); 1399 Packet4f b_even = Bf16ToF32Even(b); 1400 Packet4f b_odd = Bf16ToF32Odd(b); 1401 Packet4f c_even = Bf16ToF32Even(c); 1402 Packet4f c_odd = Bf16ToF32Odd(c); 1403 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even); 1404 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd); 1405 return F32ToBf16(pmadd_even, pmadd_odd); 1406 } 1407 1408 template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1409 BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b); 1410 } 1411 1412 template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1413 BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b); 1414 } 1415 1416 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { 1417 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b); 1418 } 1419 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) { 1420 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b); 1421 } 1422 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) { 1423 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b); 1424 } 1425 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) { 1426 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b); 1427 } 1428 1429 template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) { 1430 return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a))); 1431 } 1432 1433 template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) 1434 { 1435 return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 1436 } 1437 1438 template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) { 1439 bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3), 1440 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) }; 1441 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown)); 1442 } 1443 1444 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 1445 { 1446 Packet4f b, sum; 1447 b = vec_sld(a, a, 8); 1448 sum = a + b; 1449 b = vec_sld(sum, sum, 4); 1450 sum += b; 1451 return pfirst(sum); 1452 } 1453 1454 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 1455 { 1456 Packet4i sum; 1457 sum = vec_sums(a, p4i_ZERO); 1458 #ifdef _BIG_ENDIAN 1459 sum = vec_sld(sum, p4i_ZERO, 12); 1460 #else 1461 sum = vec_sld(p4i_ZERO, sum, 4); 1462 #endif 1463 return pfirst(sum); 1464 } 1465 1466 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) 1467 { 1468 float redux_even = predux<Packet4f>(Bf16ToF32Even(a)); 1469 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a)); 1470 float f32_result = redux_even + redux_odd; 1471 return bfloat16(f32_result); 1472 } 1473 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) 1474 { 1475 union{ 1476 Packet v; 1477 __UNPACK_TYPE__(Packet) n[8]; 1478 } vt; 1479 vt.v = a; 1480 1481 EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; 1482 EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; 1483 Packet4i first_half = pload<Packet4i>(first_loader); 1484 Packet4i second_half = pload<Packet4i>(second_loader); 1485 1486 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half)); 1487 } 1488 1489 template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) 1490 { 1491 return predux_size8<Packet8s>(a); 1492 } 1493 1494 template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) 1495 { 1496 return predux_size8<Packet8us>(a); 1497 } 1498 1499 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) 1500 { 1501 union{ 1502 Packet v; 1503 __UNPACK_TYPE__(Packet) n[16]; 1504 } vt; 1505 vt.v = a; 1506 1507 EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; 1508 EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; 1509 EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; 1510 EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; 1511 1512 Packet4i first_quarter = pload<Packet4i>(first_loader); 1513 Packet4i second_quarter = pload<Packet4i>(second_loader); 1514 Packet4i third_quarter = pload<Packet4i>(third_loader); 1515 Packet4i fourth_quarter = pload<Packet4i>(fourth_loader); 1516 1517 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) 1518 + predux(third_quarter) + predux(fourth_quarter)); 1519 } 1520 1521 template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) 1522 { 1523 return predux_size16<Packet16c>(a); 1524 } 1525 1526 template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) 1527 { 1528 return predux_size16<Packet16uc>(a); 1529 } 1530 1531 // Other reduction functions: 1532 // mul 1533 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 1534 { 1535 Packet4f prod; 1536 prod = pmul(a, vec_sld(a, a, 8)); 1537 return pfirst(pmul(prod, vec_sld(prod, prod, 4))); 1538 } 1539 1540 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 1541 { 1542 EIGEN_ALIGN16 int aux[4]; 1543 pstore(aux, a); 1544 return aux[0] * aux[1] * aux[2] * aux[3]; 1545 } 1546 1547 template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) 1548 { 1549 Packet8s pair, quad, octo; 1550 1551 pair = vec_mul(a, vec_sld(a, a, 8)); 1552 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1553 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1554 1555 return pfirst(octo); 1556 } 1557 1558 template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) 1559 { 1560 Packet8us pair, quad, octo; 1561 1562 pair = vec_mul(a, vec_sld(a, a, 8)); 1563 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1564 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1565 1566 return pfirst(octo); 1567 } 1568 1569 template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) 1570 { 1571 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a)); 1572 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a)); 1573 float f32_result = redux_even * redux_odd; 1574 return bfloat16(f32_result); 1575 } 1576 1577 1578 template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) 1579 { 1580 Packet16c pair, quad, octo, result; 1581 1582 pair = vec_mul(a, vec_sld(a, a, 8)); 1583 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1584 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1585 result = vec_mul(octo, vec_sld(octo, octo, 1)); 1586 1587 return pfirst(result); 1588 } 1589 1590 template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) 1591 { 1592 Packet16uc pair, quad, octo, result; 1593 1594 pair = vec_mul(a, vec_sld(a, a, 8)); 1595 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1596 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1597 result = vec_mul(octo, vec_sld(octo, octo, 1)); 1598 1599 return pfirst(result); 1600 } 1601 1602 // min 1603 template<typename Packet> EIGEN_STRONG_INLINE 1604 __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) 1605 { 1606 Packet b, res; 1607 b = vec_min(a, vec_sld(a, a, 8)); 1608 res = vec_min(b, vec_sld(b, b, 4)); 1609 return pfirst(res); 1610 } 1611 1612 1613 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 1614 { 1615 return predux_min4<Packet4f>(a); 1616 } 1617 1618 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 1619 { 1620 return predux_min4<Packet4i>(a); 1621 } 1622 1623 template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) 1624 { 1625 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a)); 1626 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a)); 1627 float f32_result = (std::min)(redux_even, redux_odd); 1628 return bfloat16(f32_result); 1629 } 1630 1631 template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) 1632 { 1633 Packet8s pair, quad, octo; 1634 1635 //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } 1636 pair = vec_min(a, vec_sld(a, a, 8)); 1637 1638 //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } 1639 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1640 1641 //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } 1642 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1643 return pfirst(octo); 1644 } 1645 1646 template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) 1647 { 1648 Packet8us pair, quad, octo; 1649 1650 //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } 1651 pair = vec_min(a, vec_sld(a, a, 8)); 1652 1653 //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } 1654 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1655 1656 //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } 1657 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1658 return pfirst(octo); 1659 } 1660 1661 template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) 1662 { 1663 Packet16c pair, quad, octo, result; 1664 1665 pair = vec_min(a, vec_sld(a, a, 8)); 1666 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1667 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1668 result = vec_min(octo, vec_sld(octo, octo, 1)); 1669 1670 return pfirst(result); 1671 } 1672 1673 template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) 1674 { 1675 Packet16uc pair, quad, octo, result; 1676 1677 pair = vec_min(a, vec_sld(a, a, 8)); 1678 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1679 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1680 result = vec_min(octo, vec_sld(octo, octo, 1)); 1681 1682 return pfirst(result); 1683 } 1684 // max 1685 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) 1686 { 1687 Packet b, res; 1688 b = vec_max(a, vec_sld(a, a, 8)); 1689 res = vec_max(b, vec_sld(b, b, 4)); 1690 return pfirst(res); 1691 } 1692 1693 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 1694 { 1695 return predux_max4<Packet4f>(a); 1696 } 1697 1698 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 1699 { 1700 return predux_max4<Packet4i>(a); 1701 } 1702 1703 template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) 1704 { 1705 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a)); 1706 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a)); 1707 float f32_result = (std::max)(redux_even, redux_odd); 1708 return bfloat16(f32_result); 1709 } 1710 1711 template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) 1712 { 1713 Packet8s pair, quad, octo; 1714 1715 //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } 1716 pair = vec_max(a, vec_sld(a, a, 8)); 1717 1718 //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } 1719 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1720 1721 //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } 1722 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1723 return pfirst(octo); 1724 } 1725 1726 template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) 1727 { 1728 Packet8us pair, quad, octo; 1729 1730 //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } 1731 pair = vec_max(a, vec_sld(a, a, 8)); 1732 1733 //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } 1734 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1735 1736 //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } 1737 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1738 return pfirst(octo); 1739 } 1740 1741 template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) 1742 { 1743 Packet16c pair, quad, octo, result; 1744 1745 pair = vec_max(a, vec_sld(a, a, 8)); 1746 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1747 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1748 result = vec_max(octo, vec_sld(octo, octo, 1)); 1749 1750 return pfirst(result); 1751 } 1752 1753 template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) 1754 { 1755 Packet16uc pair, quad, octo, result; 1756 1757 pair = vec_max(a, vec_sld(a, a, 8)); 1758 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1759 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1760 result = vec_max(octo, vec_sld(octo, octo, 1)); 1761 1762 return pfirst(result); 1763 } 1764 1765 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) 1766 { 1767 return vec_any_ne(x, pzero(x)); 1768 } 1769 1770 template <typename T> EIGEN_DEVICE_FUNC inline void 1771 ptranpose_common(PacketBlock<T,4>& kernel){ 1772 T t0, t1, t2, t3; 1773 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1774 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1775 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1776 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1777 kernel.packet[0] = vec_mergeh(t0, t2); 1778 kernel.packet[1] = vec_mergel(t0, t2); 1779 kernel.packet[2] = vec_mergeh(t1, t3); 1780 kernel.packet[3] = vec_mergel(t1, t3); 1781 } 1782 1783 EIGEN_DEVICE_FUNC inline void 1784 ptranspose(PacketBlock<Packet4f,4>& kernel) { 1785 ptranpose_common<Packet4f>(kernel); 1786 } 1787 1788 EIGEN_DEVICE_FUNC inline void 1789 ptranspose(PacketBlock<Packet4i,4>& kernel) { 1790 ptranpose_common<Packet4i>(kernel); 1791 } 1792 1793 EIGEN_DEVICE_FUNC inline void 1794 ptranspose(PacketBlock<Packet8s,4>& kernel) { 1795 Packet8s t0, t1, t2, t3; 1796 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1797 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1798 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1799 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1800 kernel.packet[0] = vec_mergeh(t0, t2); 1801 kernel.packet[1] = vec_mergel(t0, t2); 1802 kernel.packet[2] = vec_mergeh(t1, t3); 1803 kernel.packet[3] = vec_mergel(t1, t3); 1804 } 1805 1806 EIGEN_DEVICE_FUNC inline void 1807 ptranspose(PacketBlock<Packet8us,4>& kernel) { 1808 Packet8us t0, t1, t2, t3; 1809 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1810 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1811 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1812 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1813 kernel.packet[0] = vec_mergeh(t0, t2); 1814 kernel.packet[1] = vec_mergel(t0, t2); 1815 kernel.packet[2] = vec_mergeh(t1, t3); 1816 kernel.packet[3] = vec_mergel(t1, t3); 1817 } 1818 1819 1820 EIGEN_DEVICE_FUNC inline void 1821 ptranspose(PacketBlock<Packet8bf,4>& kernel) { 1822 Packet8us t0, t1, t2, t3; 1823 1824 t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val); 1825 t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val); 1826 t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val); 1827 t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val); 1828 kernel.packet[0] = vec_mergeh(t0, t2); 1829 kernel.packet[1] = vec_mergel(t0, t2); 1830 kernel.packet[2] = vec_mergeh(t1, t3); 1831 kernel.packet[3] = vec_mergel(t1, t3); 1832 } 1833 1834 EIGEN_DEVICE_FUNC inline void 1835 ptranspose(PacketBlock<Packet16c,4>& kernel) { 1836 Packet16c t0, t1, t2, t3; 1837 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1838 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1839 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1840 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1841 kernel.packet[0] = vec_mergeh(t0, t2); 1842 kernel.packet[1] = vec_mergel(t0, t2); 1843 kernel.packet[2] = vec_mergeh(t1, t3); 1844 kernel.packet[3] = vec_mergel(t1, t3); 1845 } 1846 1847 1848 EIGEN_DEVICE_FUNC inline void 1849 ptranspose(PacketBlock<Packet16uc,4>& kernel) { 1850 Packet16uc t0, t1, t2, t3; 1851 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1852 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1853 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1854 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1855 kernel.packet[0] = vec_mergeh(t0, t2); 1856 kernel.packet[1] = vec_mergel(t0, t2); 1857 kernel.packet[2] = vec_mergeh(t1, t3); 1858 kernel.packet[3] = vec_mergel(t1, t3); 1859 } 1860 1861 EIGEN_DEVICE_FUNC inline void 1862 ptranspose(PacketBlock<Packet8s,8>& kernel) { 1863 Packet8s v[8], sum[8]; 1864 1865 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); 1866 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); 1867 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); 1868 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); 1869 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); 1870 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); 1871 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); 1872 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); 1873 sum[0] = vec_mergeh(v[0], v[4]); 1874 sum[1] = vec_mergel(v[0], v[4]); 1875 sum[2] = vec_mergeh(v[1], v[5]); 1876 sum[3] = vec_mergel(v[1], v[5]); 1877 sum[4] = vec_mergeh(v[2], v[6]); 1878 sum[5] = vec_mergel(v[2], v[6]); 1879 sum[6] = vec_mergeh(v[3], v[7]); 1880 sum[7] = vec_mergel(v[3], v[7]); 1881 1882 kernel.packet[0] = vec_mergeh(sum[0], sum[4]); 1883 kernel.packet[1] = vec_mergel(sum[0], sum[4]); 1884 kernel.packet[2] = vec_mergeh(sum[1], sum[5]); 1885 kernel.packet[3] = vec_mergel(sum[1], sum[5]); 1886 kernel.packet[4] = vec_mergeh(sum[2], sum[6]); 1887 kernel.packet[5] = vec_mergel(sum[2], sum[6]); 1888 kernel.packet[6] = vec_mergeh(sum[3], sum[7]); 1889 kernel.packet[7] = vec_mergel(sum[3], sum[7]); 1890 } 1891 1892 EIGEN_DEVICE_FUNC inline void 1893 ptranspose(PacketBlock<Packet8us,8>& kernel) { 1894 Packet8us v[8], sum[8]; 1895 1896 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); 1897 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); 1898 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); 1899 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); 1900 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); 1901 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); 1902 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); 1903 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); 1904 sum[0] = vec_mergeh(v[0], v[4]); 1905 sum[1] = vec_mergel(v[0], v[4]); 1906 sum[2] = vec_mergeh(v[1], v[5]); 1907 sum[3] = vec_mergel(v[1], v[5]); 1908 sum[4] = vec_mergeh(v[2], v[6]); 1909 sum[5] = vec_mergel(v[2], v[6]); 1910 sum[6] = vec_mergeh(v[3], v[7]); 1911 sum[7] = vec_mergel(v[3], v[7]); 1912 1913 kernel.packet[0] = vec_mergeh(sum[0], sum[4]); 1914 kernel.packet[1] = vec_mergel(sum[0], sum[4]); 1915 kernel.packet[2] = vec_mergeh(sum[1], sum[5]); 1916 kernel.packet[3] = vec_mergel(sum[1], sum[5]); 1917 kernel.packet[4] = vec_mergeh(sum[2], sum[6]); 1918 kernel.packet[5] = vec_mergel(sum[2], sum[6]); 1919 kernel.packet[6] = vec_mergeh(sum[3], sum[7]); 1920 kernel.packet[7] = vec_mergel(sum[3], sum[7]); 1921 } 1922 1923 EIGEN_DEVICE_FUNC inline void 1924 ptranspose(PacketBlock<Packet8bf,8>& kernel) { 1925 Packet8bf v[8], sum[8]; 1926 1927 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val); 1928 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val); 1929 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val); 1930 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val); 1931 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val); 1932 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val); 1933 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val); 1934 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val); 1935 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val); 1936 sum[1] = vec_mergel(v[0].m_val, v[4].m_val); 1937 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val); 1938 sum[3] = vec_mergel(v[1].m_val, v[5].m_val); 1939 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val); 1940 sum[5] = vec_mergel(v[2].m_val, v[6].m_val); 1941 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val); 1942 sum[7] = vec_mergel(v[3].m_val, v[7].m_val); 1943 1944 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val); 1945 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val); 1946 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val); 1947 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val); 1948 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val); 1949 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val); 1950 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val); 1951 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val); 1952 } 1953 1954 EIGEN_DEVICE_FUNC inline void 1955 ptranspose(PacketBlock<Packet16c,16>& kernel) { 1956 Packet16c step1[16], step2[16], step3[16]; 1957 1958 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); 1959 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); 1960 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); 1961 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); 1962 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); 1963 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); 1964 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); 1965 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); 1966 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); 1967 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); 1968 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); 1969 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); 1970 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); 1971 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); 1972 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); 1973 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); 1974 1975 step2[0] = vec_mergeh(step1[0], step1[8]); 1976 step2[1] = vec_mergel(step1[0], step1[8]); 1977 step2[2] = vec_mergeh(step1[1], step1[9]); 1978 step2[3] = vec_mergel(step1[1], step1[9]); 1979 step2[4] = vec_mergeh(step1[2], step1[10]); 1980 step2[5] = vec_mergel(step1[2], step1[10]); 1981 step2[6] = vec_mergeh(step1[3], step1[11]); 1982 step2[7] = vec_mergel(step1[3], step1[11]); 1983 step2[8] = vec_mergeh(step1[4], step1[12]); 1984 step2[9] = vec_mergel(step1[4], step1[12]); 1985 step2[10] = vec_mergeh(step1[5], step1[13]); 1986 step2[11] = vec_mergel(step1[5], step1[13]); 1987 step2[12] = vec_mergeh(step1[6], step1[14]); 1988 step2[13] = vec_mergel(step1[6], step1[14]); 1989 step2[14] = vec_mergeh(step1[7], step1[15]); 1990 step2[15] = vec_mergel(step1[7], step1[15]); 1991 1992 step3[0] = vec_mergeh(step2[0], step2[8]); 1993 step3[1] = vec_mergel(step2[0], step2[8]); 1994 step3[2] = vec_mergeh(step2[1], step2[9]); 1995 step3[3] = vec_mergel(step2[1], step2[9]); 1996 step3[4] = vec_mergeh(step2[2], step2[10]); 1997 step3[5] = vec_mergel(step2[2], step2[10]); 1998 step3[6] = vec_mergeh(step2[3], step2[11]); 1999 step3[7] = vec_mergel(step2[3], step2[11]); 2000 step3[8] = vec_mergeh(step2[4], step2[12]); 2001 step3[9] = vec_mergel(step2[4], step2[12]); 2002 step3[10] = vec_mergeh(step2[5], step2[13]); 2003 step3[11] = vec_mergel(step2[5], step2[13]); 2004 step3[12] = vec_mergeh(step2[6], step2[14]); 2005 step3[13] = vec_mergel(step2[6], step2[14]); 2006 step3[14] = vec_mergeh(step2[7], step2[15]); 2007 step3[15] = vec_mergel(step2[7], step2[15]); 2008 2009 kernel.packet[0] = vec_mergeh(step3[0], step3[8]); 2010 kernel.packet[1] = vec_mergel(step3[0], step3[8]); 2011 kernel.packet[2] = vec_mergeh(step3[1], step3[9]); 2012 kernel.packet[3] = vec_mergel(step3[1], step3[9]); 2013 kernel.packet[4] = vec_mergeh(step3[2], step3[10]); 2014 kernel.packet[5] = vec_mergel(step3[2], step3[10]); 2015 kernel.packet[6] = vec_mergeh(step3[3], step3[11]); 2016 kernel.packet[7] = vec_mergel(step3[3], step3[11]); 2017 kernel.packet[8] = vec_mergeh(step3[4], step3[12]); 2018 kernel.packet[9] = vec_mergel(step3[4], step3[12]); 2019 kernel.packet[10] = vec_mergeh(step3[5], step3[13]); 2020 kernel.packet[11] = vec_mergel(step3[5], step3[13]); 2021 kernel.packet[12] = vec_mergeh(step3[6], step3[14]); 2022 kernel.packet[13] = vec_mergel(step3[6], step3[14]); 2023 kernel.packet[14] = vec_mergeh(step3[7], step3[15]); 2024 kernel.packet[15] = vec_mergel(step3[7], step3[15]); 2025 } 2026 2027 EIGEN_DEVICE_FUNC inline void 2028 ptranspose(PacketBlock<Packet16uc,16>& kernel) { 2029 Packet16uc step1[16], step2[16], step3[16]; 2030 2031 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); 2032 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); 2033 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); 2034 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); 2035 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); 2036 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); 2037 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); 2038 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); 2039 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); 2040 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); 2041 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); 2042 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); 2043 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); 2044 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); 2045 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); 2046 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); 2047 2048 step2[0] = vec_mergeh(step1[0], step1[8]); 2049 step2[1] = vec_mergel(step1[0], step1[8]); 2050 step2[2] = vec_mergeh(step1[1], step1[9]); 2051 step2[3] = vec_mergel(step1[1], step1[9]); 2052 step2[4] = vec_mergeh(step1[2], step1[10]); 2053 step2[5] = vec_mergel(step1[2], step1[10]); 2054 step2[6] = vec_mergeh(step1[3], step1[11]); 2055 step2[7] = vec_mergel(step1[3], step1[11]); 2056 step2[8] = vec_mergeh(step1[4], step1[12]); 2057 step2[9] = vec_mergel(step1[4], step1[12]); 2058 step2[10] = vec_mergeh(step1[5], step1[13]); 2059 step2[11] = vec_mergel(step1[5], step1[13]); 2060 step2[12] = vec_mergeh(step1[6], step1[14]); 2061 step2[13] = vec_mergel(step1[6], step1[14]); 2062 step2[14] = vec_mergeh(step1[7], step1[15]); 2063 step2[15] = vec_mergel(step1[7], step1[15]); 2064 2065 step3[0] = vec_mergeh(step2[0], step2[8]); 2066 step3[1] = vec_mergel(step2[0], step2[8]); 2067 step3[2] = vec_mergeh(step2[1], step2[9]); 2068 step3[3] = vec_mergel(step2[1], step2[9]); 2069 step3[4] = vec_mergeh(step2[2], step2[10]); 2070 step3[5] = vec_mergel(step2[2], step2[10]); 2071 step3[6] = vec_mergeh(step2[3], step2[11]); 2072 step3[7] = vec_mergel(step2[3], step2[11]); 2073 step3[8] = vec_mergeh(step2[4], step2[12]); 2074 step3[9] = vec_mergel(step2[4], step2[12]); 2075 step3[10] = vec_mergeh(step2[5], step2[13]); 2076 step3[11] = vec_mergel(step2[5], step2[13]); 2077 step3[12] = vec_mergeh(step2[6], step2[14]); 2078 step3[13] = vec_mergel(step2[6], step2[14]); 2079 step3[14] = vec_mergeh(step2[7], step2[15]); 2080 step3[15] = vec_mergel(step2[7], step2[15]); 2081 2082 kernel.packet[0] = vec_mergeh(step3[0], step3[8]); 2083 kernel.packet[1] = vec_mergel(step3[0], step3[8]); 2084 kernel.packet[2] = vec_mergeh(step3[1], step3[9]); 2085 kernel.packet[3] = vec_mergel(step3[1], step3[9]); 2086 kernel.packet[4] = vec_mergeh(step3[2], step3[10]); 2087 kernel.packet[5] = vec_mergel(step3[2], step3[10]); 2088 kernel.packet[6] = vec_mergeh(step3[3], step3[11]); 2089 kernel.packet[7] = vec_mergel(step3[3], step3[11]); 2090 kernel.packet[8] = vec_mergeh(step3[4], step3[12]); 2091 kernel.packet[9] = vec_mergel(step3[4], step3[12]); 2092 kernel.packet[10] = vec_mergeh(step3[5], step3[13]); 2093 kernel.packet[11] = vec_mergel(step3[5], step3[13]); 2094 kernel.packet[12] = vec_mergeh(step3[6], step3[14]); 2095 kernel.packet[13] = vec_mergel(step3[6], step3[14]); 2096 kernel.packet[14] = vec_mergeh(step3[7], step3[15]); 2097 kernel.packet[15] = vec_mergel(step3[7], step3[15]); 2098 } 2099 2100 template<typename Packet> EIGEN_STRONG_INLINE 2101 Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { 2102 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 2103 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 2104 return vec_sel(elsePacket, thenPacket, mask); 2105 } 2106 2107 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { 2108 return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket); 2109 } 2110 2111 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { 2112 return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket); 2113 } 2114 2115 template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { 2116 Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2117 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; 2118 Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE)); 2119 Packet8s result = vec_sel(elsePacket, thenPacket, mask); 2120 return result; 2121 } 2122 2123 template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) { 2124 Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2125 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; 2126 Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE)); 2127 return vec_sel(elsePacket, thenPacket, mask); 2128 } 2129 2130 template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) { 2131 return pblend<Packet8us>(ifPacket, thenPacket, elsePacket); 2132 } 2133 2134 template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) { 2135 Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2136 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], 2137 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], 2138 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; 2139 2140 Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE)); 2141 return vec_sel(elsePacket, thenPacket, mask); 2142 } 2143 2144 template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) { 2145 Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2146 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], 2147 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], 2148 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; 2149 2150 Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE)); 2151 return vec_sel(elsePacket, thenPacket, mask); 2152 } 2153 2154 template <> 2155 struct type_casting_traits<float, int> { 2156 enum { 2157 VectorizedCast = 1, 2158 SrcCoeffRatio = 1, 2159 TgtCoeffRatio = 1 2160 }; 2161 }; 2162 2163 template <> 2164 struct type_casting_traits<int, float> { 2165 enum { 2166 VectorizedCast = 1, 2167 SrcCoeffRatio = 1, 2168 TgtCoeffRatio = 1 2169 }; 2170 }; 2171 2172 template <> 2173 struct type_casting_traits<bfloat16, unsigned short int> { 2174 enum { 2175 VectorizedCast = 1, 2176 SrcCoeffRatio = 1, 2177 TgtCoeffRatio = 1 2178 }; 2179 }; 2180 2181 template <> 2182 struct type_casting_traits<unsigned short int, bfloat16> { 2183 enum { 2184 VectorizedCast = 1, 2185 SrcCoeffRatio = 1, 2186 TgtCoeffRatio = 1 2187 }; 2188 }; 2189 2190 template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) { 2191 return vec_cts(a,0); 2192 } 2193 2194 template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) { 2195 return vec_ctu(a,0); 2196 } 2197 2198 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { 2199 return vec_ctf(a,0); 2200 } 2201 2202 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) { 2203 return vec_ctf(a,0); 2204 } 2205 2206 template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) { 2207 Packet4f float_even = Bf16ToF32Even(a); 2208 Packet4f float_odd = Bf16ToF32Odd(a); 2209 Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even); 2210 Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd); 2211 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); 2212 Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask); 2213 Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask); 2214 2215 //Check values that are bigger than USHRT_MAX (0xFFFF) 2216 Packet4bi overflow_selector; 2217 if(vec_any_gt(int_even, p4ui_low_mask)){ 2218 overflow_selector = vec_cmpgt(int_even, p4ui_low_mask); 2219 low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector); 2220 } 2221 if(vec_any_gt(int_odd, p4ui_low_mask)){ 2222 overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask); 2223 low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector); 2224 } 2225 2226 low_odd = plogical_shift_left<16>(low_odd); 2227 2228 Packet4ui int_final = por<Packet4ui>(low_even, low_odd); 2229 return reinterpret_cast<Packet8us>(int_final); 2230 } 2231 2232 template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) { 2233 //short -> int -> float -> bfloat16 2234 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); 2235 Packet4ui int_cast = reinterpret_cast<Packet4ui>(a); 2236 Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask); 2237 Packet4ui int_odd = plogical_shift_right<16>(int_cast); 2238 Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even); 2239 Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd); 2240 return F32ToBf16(float_even, float_odd); 2241 } 2242 2243 2244 template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) { 2245 return reinterpret_cast<Packet4i>(a); 2246 } 2247 2248 template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) { 2249 return reinterpret_cast<Packet4f>(a); 2250 } 2251 2252 2253 2254 //---------- double ---------- 2255 #ifdef __VSX__ 2256 typedef __vector double Packet2d; 2257 typedef __vector unsigned long long Packet2ul; 2258 typedef __vector long long Packet2l; 2259 #if EIGEN_COMP_CLANG 2260 typedef Packet2ul Packet2bl; 2261 #else 2262 typedef __vector __bool long Packet2bl; 2263 #endif 2264 2265 static Packet2l p2l_ONE = { 1, 1 }; 2266 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); 2267 static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; 2268 static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; 2269 static Packet2d p2d_ONE = { 1.0, 1.0 }; 2270 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); 2271 static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull), 2272 numext::bit_cast<double>(0x8000000000000000ull) }; 2273 2274 #ifdef _BIG_ENDIAN 2275 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); 2276 #else 2277 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); 2278 #endif 2279 2280 template<int index> Packet2d vec_splat_dbl(Packet2d& a) 2281 { 2282 return vec_splat(a, index); 2283 } 2284 2285 template<> struct packet_traits<double> : default_packet_traits 2286 { 2287 typedef Packet2d type; 2288 typedef Packet2d half; 2289 enum { 2290 Vectorizable = 1, 2291 AlignedOnScalar = 1, 2292 size=2, 2293 HasHalfPacket = 1, 2294 2295 HasAdd = 1, 2296 HasSub = 1, 2297 HasMul = 1, 2298 HasDiv = 1, 2299 HasMin = 1, 2300 HasMax = 1, 2301 HasAbs = 1, 2302 HasSin = 0, 2303 HasCos = 0, 2304 HasLog = 0, 2305 HasExp = 1, 2306 HasSqrt = 1, 2307 HasRsqrt = 1, 2308 HasRound = 1, 2309 HasFloor = 1, 2310 HasCeil = 1, 2311 HasRint = 1, 2312 HasNegate = 1, 2313 HasBlend = 1 2314 }; 2315 }; 2316 2317 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; 2318 2319 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) 2320 { 2321 union { 2322 Packet2l v; 2323 int64_t n[2]; 2324 } vt; 2325 vt.v = v; 2326 s << vt.n[0] << ", " << vt.n[1]; 2327 return s; 2328 } 2329 2330 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) 2331 { 2332 union { 2333 Packet2d v; 2334 double n[2]; 2335 } vt; 2336 vt.v = v; 2337 s << vt.n[0] << ", " << vt.n[1]; 2338 return s; 2339 } 2340 2341 // Need to define them first or we get specialization after instantiation errors 2342 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) 2343 { 2344 EIGEN_DEBUG_ALIGNED_LOAD 2345 return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang 2346 } 2347 2348 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) 2349 { 2350 EIGEN_DEBUG_ALIGNED_STORE 2351 vec_xst(from, 0, to); 2352 } 2353 2354 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { 2355 Packet2d v = {from, from}; 2356 return v; 2357 } 2358 2359 template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) { 2360 Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)}; 2361 return reinterpret_cast<Packet2d>(v); 2362 } 2363 2364 template<> EIGEN_STRONG_INLINE void 2365 pbroadcast4<Packet2d>(const double *a, 2366 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) 2367 { 2368 //This way is faster than vec_splat (at least for doubles in Power 9) 2369 a0 = pset1<Packet2d>(a[0]); 2370 a1 = pset1<Packet2d>(a[1]); 2371 a2 = pset1<Packet2d>(a[2]); 2372 a3 = pset1<Packet2d>(a[3]); 2373 } 2374 2375 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) 2376 { 2377 EIGEN_ALIGN16 double af[2]; 2378 af[0] = from[0*stride]; 2379 af[1] = from[1*stride]; 2380 return pload<Packet2d>(af); 2381 } 2382 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) 2383 { 2384 EIGEN_ALIGN16 double af[2]; 2385 pstore<double>(af, from); 2386 to[0*stride] = af[0]; 2387 to[1*stride] = af[1]; 2388 } 2389 2390 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } 2391 2392 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; } 2393 2394 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; } 2395 2396 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } 2397 2398 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } 2399 2400 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } 2401 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } 2402 2403 // for some weird raisons, it has to be overloaded for packet of integers 2404 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } 2405 2406 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) 2407 { 2408 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN 2409 Packet2d ret; 2410 __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 2411 return ret; 2412 } 2413 2414 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) 2415 { 2416 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN 2417 Packet2d ret; 2418 __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 2419 return ret; 2420 } 2421 2422 template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); } 2423 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); } 2424 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); } 2425 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { 2426 Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b)); 2427 return vec_nor(c,c); 2428 } 2429 2430 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } 2431 2432 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } 2433 2434 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } 2435 2436 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } 2437 2438 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) 2439 { 2440 Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a); 2441 Packet2d res; 2442 2443 __asm__("xvrdpiz %x0, %x1\n\t" 2444 : "=&wa" (res) 2445 : "wa" (t)); 2446 2447 return res; 2448 } 2449 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } 2450 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } 2451 template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) 2452 { 2453 Packet2d res; 2454 2455 __asm__("xvrdpic %x0, %x1\n\t" 2456 : "=&wa" (res) 2457 : "wa" (a)); 2458 2459 return res; 2460 } 2461 2462 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) 2463 { 2464 EIGEN_DEBUG_UNALIGNED_LOAD 2465 return vec_xl(0, const_cast<double*>(from)); 2466 } 2467 2468 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) 2469 { 2470 Packet2d p; 2471 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); 2472 else p = ploadu<Packet2d>(from); 2473 return vec_splat_dbl<0>(p); 2474 } 2475 2476 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) 2477 { 2478 EIGEN_DEBUG_UNALIGNED_STORE 2479 vec_xst(from, 0, to); 2480 } 2481 2482 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } 2483 2484 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; } 2485 2486 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) 2487 { 2488 return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); 2489 } 2490 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } 2491 2492 // VSX support varies between different compilers and even different 2493 // versions of the same compiler. For gcc version >= 4.9.3, we can use 2494 // vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use 2495 // a slow version that works with older compilers. 2496 // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles 2497 // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 2498 template<> 2499 inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) { 2500 #if EIGEN_GNUC_AT_LEAST(5, 4) || \ 2501 (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) 2502 return vec_cts(x, 0); // TODO: check clang version. 2503 #else 2504 double tmp[2]; 2505 memcpy(tmp, &x, sizeof(tmp)); 2506 Packet2l l = { static_cast<long long>(tmp[0]), 2507 static_cast<long long>(tmp[1]) }; 2508 return l; 2509 #endif 2510 } 2511 2512 template<> 2513 inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) { 2514 unsigned long long tmp[2]; 2515 memcpy(tmp, &x, sizeof(tmp)); 2516 Packet2d d = { static_cast<double>(tmp[0]), 2517 static_cast<double>(tmp[1]) }; 2518 return d; 2519 } 2520 2521 2522 // Packet2l shifts. 2523 // For POWER8 we simply use vec_sr/l. 2524 // 2525 // Things are more complicated for POWER7. There is actually a 2526 // vec_xxsxdi intrinsic but it is not supported by some gcc versions. 2527 // So we need to shift by N % 32 and rearrage bytes. 2528 #ifdef __POWER8_VECTOR__ 2529 2530 template<int N> 2531 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { 2532 const Packet2ul shift = { N, N }; 2533 return vec_sl(a, shift); 2534 } 2535 2536 template<int N> 2537 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { 2538 const Packet2ul shift = { N, N }; 2539 return vec_sr(a, shift); 2540 } 2541 2542 #else 2543 2544 // Shifts [A, B, C, D] to [B, 0, D, 0]. 2545 // Used to implement left shifts for Packet2l. 2546 EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) { 2547 static const Packet16uc perm = { 2548 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 2549 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; 2550 #ifdef _BIG_ENDIAN 2551 return vec_perm(p4i_ZERO, a, perm); 2552 #else 2553 return vec_perm(a, p4i_ZERO, perm); 2554 #endif 2555 } 2556 2557 // Shifts [A, B, C, D] to [0, A, 0, C]. 2558 // Used to implement right shifts for Packet2l. 2559 EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) { 2560 static const Packet16uc perm = { 2561 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 2562 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b }; 2563 #ifdef _BIG_ENDIAN 2564 return vec_perm(p4i_ZERO, a, perm); 2565 #else 2566 return vec_perm(a, p4i_ZERO, perm); 2567 #endif 2568 } 2569 2570 template<int N, typename EnableIf = void> 2571 struct plogical_shift_left_impl; 2572 2573 template<int N> 2574 struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> { 2575 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { 2576 static const unsigned n = static_cast<unsigned>(N); 2577 const Packet4ui shift = {n, n, n, n}; 2578 const Packet4i ai = reinterpret_cast<Packet4i>(a); 2579 static const unsigned m = static_cast<unsigned>(32 - N); 2580 const Packet4ui shift_right = {m, m, m, m}; 2581 const Packet4i out_hi = vec_sl(ai, shift); 2582 const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right)); 2583 return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo)); 2584 } 2585 }; 2586 2587 template<int N> 2588 struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> { 2589 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { 2590 static const unsigned m = static_cast<unsigned>(N - 32); 2591 const Packet4ui shift = {m, m, m, m}; 2592 const Packet4i ai = reinterpret_cast<Packet4i>(a); 2593 return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift))); 2594 } 2595 }; 2596 2597 template<int N> 2598 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { 2599 return plogical_shift_left_impl<N>::run(a); 2600 } 2601 2602 template<int N, typename EnableIf = void> 2603 struct plogical_shift_right_impl; 2604 2605 template<int N> 2606 struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> { 2607 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { 2608 static const unsigned n = static_cast<unsigned>(N); 2609 const Packet4ui shift = {n, n, n, n}; 2610 const Packet4i ai = reinterpret_cast<Packet4i>(a); 2611 static const unsigned m = static_cast<unsigned>(32 - N); 2612 const Packet4ui shift_left = {m, m, m, m}; 2613 const Packet4i out_lo = vec_sr(ai, shift); 2614 const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left)); 2615 return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo)); 2616 } 2617 }; 2618 2619 template<int N> 2620 struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> { 2621 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { 2622 static const unsigned m = static_cast<unsigned>(N - 32); 2623 const Packet4ui shift = {m, m, m, m}; 2624 const Packet4i ai = reinterpret_cast<Packet4i>(a); 2625 return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift))); 2626 } 2627 }; 2628 2629 template<int N> 2630 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { 2631 return plogical_shift_right_impl<N>::run(a); 2632 } 2633 #endif 2634 2635 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) { 2636 // Clamp exponent to [-2099, 2099] 2637 const Packet2d max_exponent = pset1<Packet2d>(2099.0); 2638 const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); 2639 2640 // Split 2^e into four factors and multiply: 2641 const Packet2l bias = { 1023, 1023 }; 2642 Packet2l b = plogical_shift_right<2>(e); // floor(e/4) 2643 Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); 2644 Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) 2645 b = psub(psub(psub(e, b), b), b); // e - 3b 2646 c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b) 2647 out = pmul(out, c); // a * 2^e 2648 return out; 2649 } 2650 2651 2652 // Extract exponent without existence of Packet2l. 2653 template<> 2654 EIGEN_STRONG_INLINE 2655 Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) { 2656 return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a)))); 2657 } 2658 2659 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) { 2660 return pfrexp_generic(a, exponent); 2661 } 2662 2663 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) 2664 { 2665 Packet2d b, sum; 2666 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8)); 2667 sum = a + b; 2668 return pfirst<Packet2d>(sum); 2669 } 2670 2671 // Other reduction functions: 2672 // mul 2673 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) 2674 { 2675 return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 2676 } 2677 2678 // min 2679 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) 2680 { 2681 return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 2682 } 2683 2684 // max 2685 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) 2686 { 2687 return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 2688 } 2689 2690 EIGEN_DEVICE_FUNC inline void 2691 ptranspose(PacketBlock<Packet2d,2>& kernel) { 2692 Packet2d t0, t1; 2693 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); 2694 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); 2695 kernel.packet[0] = t0; 2696 kernel.packet[1] = t1; 2697 } 2698 2699 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { 2700 Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; 2701 Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) ); 2702 return vec_sel(elsePacket, thenPacket, mask); 2703 } 2704 2705 2706 #endif // __VSX__ 2707 } // end namespace internal 2708 2709 } // end namespace Eigen 2710 2711 #endif // EIGEN_PACKET_MATH_ALTIVEC_H 2712