1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <[email protected]> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_PACKET_MATH_GPU_H 11 #define EIGEN_PACKET_MATH_GPU_H 12 13 namespace Eigen { 14 15 namespace internal { 16 17 // Read-only data cached load available. 18 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) 19 #define EIGEN_GPU_HAS_LDG 1 20 #endif 21 22 // FP16 math available. 23 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) 24 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1 25 #endif 26 27 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 28 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 29 #endif 30 31 // Make sure this is only available when targeting a GPU: we don't want to 32 // introduce conflicts between these packet_traits definitions and the ones 33 // we'll use on the host side (SSE, AVX, ...) 34 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) 35 36 template<> struct is_arithmetic<float4> { enum { value = true }; }; 37 template<> struct is_arithmetic<double2> { enum { value = true }; }; 38 39 template<> struct packet_traits<float> : default_packet_traits 40 { 41 typedef float4 type; 42 typedef float4 half; 43 enum { 44 Vectorizable = 1, 45 AlignedOnScalar = 1, 46 size=4, 47 HasHalfPacket = 0, 48 49 HasDiv = 1, 50 HasSin = 0, 51 HasCos = 0, 52 HasLog = 1, 53 HasExp = 1, 54 HasSqrt = 1, 55 HasRsqrt = 1, 56 HasLGamma = 1, 57 HasDiGamma = 1, 58 HasZeta = 1, 59 HasPolygamma = 1, 60 HasErf = 1, 61 HasErfc = 1, 62 HasNdtri = 1, 63 HasBessel = 1, 64 HasIGamma = 1, 65 HasIGammaDerA = 1, 66 HasGammaSampleDerAlpha = 1, 67 HasIGammac = 1, 68 HasBetaInc = 1, 69 70 HasBlend = 0, 71 HasFloor = 1, 72 }; 73 }; 74 75 template<> struct packet_traits<double> : default_packet_traits 76 { 77 typedef double2 type; 78 typedef double2 half; 79 enum { 80 Vectorizable = 1, 81 AlignedOnScalar = 1, 82 size=2, 83 HasHalfPacket = 0, 84 85 HasDiv = 1, 86 HasLog = 1, 87 HasExp = 1, 88 HasSqrt = 1, 89 HasRsqrt = 1, 90 HasLGamma = 1, 91 HasDiGamma = 1, 92 HasZeta = 1, 93 HasPolygamma = 1, 94 HasErf = 1, 95 HasErfc = 1, 96 HasNdtri = 1, 97 HasBessel = 1, 98 HasIGamma = 1, 99 HasIGammaDerA = 1, 100 HasGammaSampleDerAlpha = 1, 101 HasIGammac = 1, 102 HasBetaInc = 1, 103 104 HasBlend = 0, 105 HasFloor = 1, 106 }; 107 }; 108 109 110 template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; }; 111 template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; }; 112 113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { 114 return make_float4(from, from, from, from); 115 } 116 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) { 117 return make_double2(from, from); 118 } 119 120 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, 121 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation 122 // of the functions, while the latter can only deal with one of them. 123 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 124 namespace { 125 126 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, 127 const float& b) { 128 return __int_as_float(__float_as_int(a) & __float_as_int(b)); 129 } 130 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, 131 const double& b) { 132 return __longlong_as_double(__double_as_longlong(a) & 133 __double_as_longlong(b)); 134 } 135 136 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, 137 const float& b) { 138 return __int_as_float(__float_as_int(a) | __float_as_int(b)); 139 } 140 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, 141 const double& b) { 142 return __longlong_as_double(__double_as_longlong(a) | 143 __double_as_longlong(b)); 144 } 145 146 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, 147 const float& b) { 148 return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); 149 } 150 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, 151 const double& b) { 152 return __longlong_as_double(__double_as_longlong(a) ^ 153 __double_as_longlong(b)); 154 } 155 156 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, 157 const float& b) { 158 return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); 159 } 160 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, 161 const double& b) { 162 return __longlong_as_double(__double_as_longlong(a) & 163 ~__double_as_longlong(b)); 164 } 165 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, 166 const float& b) { 167 return __int_as_float(a == b ? 0xffffffffu : 0u); 168 } 169 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, 170 const double& b) { 171 return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); 172 } 173 174 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, 175 const float& b) { 176 return __int_as_float(a < b ? 0xffffffffu : 0u); 177 } 178 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, 179 const double& b) { 180 return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); 181 } 182 183 } // namespace 184 185 template <> 186 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, 187 const float4& b) { 188 return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), 189 bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); 190 } 191 template <> 192 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, 193 const double2& b) { 194 return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); 195 } 196 197 template <> 198 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, 199 const float4& b) { 200 return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), 201 bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); 202 } 203 template <> 204 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, 205 const double2& b) { 206 return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); 207 } 208 209 template <> 210 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, 211 const float4& b) { 212 return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), 213 bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); 214 } 215 template <> 216 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, 217 const double2& b) { 218 return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y)); 219 } 220 221 template <> 222 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, 223 const float4& b) { 224 return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), 225 bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); 226 } 227 template <> 228 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 229 pandnot<double2>(const double2& a, const double2& b) { 230 return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); 231 } 232 233 template <> 234 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, 235 const float4& b) { 236 return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), 237 eq_mask(a.w, b.w)); 238 } 239 template <> 240 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, 241 const float4& b) { 242 return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), 243 lt_mask(a.w, b.w)); 244 } 245 template <> 246 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 247 pcmp_eq<double2>(const double2& a, const double2& b) { 248 return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); 249 } 250 template <> 251 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 252 pcmp_lt<double2>(const double2& a, const double2& b) { 253 return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y)); 254 } 255 #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 256 257 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) { 258 return make_float4(a, a+1, a+2, a+3); 259 } 260 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) { 261 return make_double2(a, a+1); 262 } 263 264 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) { 265 return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); 266 } 267 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) { 268 return make_double2(a.x+b.x, a.y+b.y); 269 } 270 271 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) { 272 return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); 273 } 274 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) { 275 return make_double2(a.x-b.x, a.y-b.y); 276 } 277 278 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { 279 return make_float4(-a.x, -a.y, -a.z, -a.w); 280 } 281 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { 282 return make_double2(-a.x, -a.y); 283 } 284 285 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } 286 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } 287 288 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) { 289 return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); 290 } 291 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) { 292 return make_double2(a.x*b.x, a.y*b.y); 293 } 294 295 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) { 296 return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); 297 } 298 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) { 299 return make_double2(a.x/b.x, a.y/b.y); 300 } 301 302 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) { 303 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); 304 } 305 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) { 306 return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); 307 } 308 309 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) { 310 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); 311 } 312 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) { 313 return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); 314 } 315 316 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) { 317 return *reinterpret_cast<const float4*>(from); 318 } 319 320 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) { 321 return *reinterpret_cast<const double2*>(from); 322 } 323 324 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) { 325 return make_float4(from[0], from[1], from[2], from[3]); 326 } 327 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) { 328 return make_double2(from[0], from[1]); 329 } 330 331 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) { 332 return make_float4(from[0], from[0], from[1], from[1]); 333 } 334 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) { 335 return make_double2(from[0], from[0]); 336 } 337 338 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) { 339 *reinterpret_cast<float4*>(to) = from; 340 } 341 342 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) { 343 *reinterpret_cast<double2*>(to) = from; 344 } 345 346 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) { 347 to[0] = from.x; 348 to[1] = from.y; 349 to[2] = from.z; 350 to[3] = from.w; 351 } 352 353 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) { 354 to[0] = from.x; 355 to[1] = from.y; 356 } 357 358 template<> 359 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { 360 #if defined(EIGEN_GPU_HAS_LDG) 361 return __ldg((const float4*)from); 362 #else 363 return make_float4(from[0], from[1], from[2], from[3]); 364 #endif 365 } 366 template<> 367 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { 368 #if defined(EIGEN_GPU_HAS_LDG) 369 return __ldg((const double2*)from); 370 #else 371 return make_double2(from[0], from[1]); 372 #endif 373 } 374 375 template<> 376 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { 377 #if defined(EIGEN_GPU_HAS_LDG) 378 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); 379 #else 380 return make_float4(from[0], from[1], from[2], from[3]); 381 #endif 382 } 383 template<> 384 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { 385 #if defined(EIGEN_GPU_HAS_LDG) 386 return make_double2(__ldg(from+0), __ldg(from+1)); 387 #else 388 return make_double2(from[0], from[1]); 389 #endif 390 } 391 392 template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) { 393 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); 394 } 395 396 template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) { 397 return make_double2(from[0*stride], from[1*stride]); 398 } 399 400 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) { 401 to[stride*0] = from.x; 402 to[stride*1] = from.y; 403 to[stride*2] = from.z; 404 to[stride*3] = from.w; 405 } 406 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) { 407 to[stride*0] = from.x; 408 to[stride*1] = from.y; 409 } 410 411 template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) { 412 return a.x; 413 } 414 template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) { 415 return a.x; 416 } 417 418 template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) { 419 return a.x + a.y + a.z + a.w; 420 } 421 template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) { 422 return a.x + a.y; 423 } 424 425 template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) { 426 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); 427 } 428 template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) { 429 return fmax(a.x, a.y); 430 } 431 432 template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) { 433 return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); 434 } 435 template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) { 436 return fmin(a.x, a.y); 437 } 438 439 template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) { 440 return a.x * a.y * a.z * a.w; 441 } 442 template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) { 443 return a.x * a.y; 444 } 445 446 template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { 447 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); 448 } 449 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { 450 return make_double2(fabs(a.x), fabs(a.y)); 451 } 452 453 template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) { 454 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); 455 } 456 template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) { 457 return make_double2(floor(a.x), floor(a.y)); 458 } 459 460 EIGEN_DEVICE_FUNC inline void 461 ptranspose(PacketBlock<float4,4>& kernel) { 462 float tmp = kernel.packet[0].y; 463 kernel.packet[0].y = kernel.packet[1].x; 464 kernel.packet[1].x = tmp; 465 466 tmp = kernel.packet[0].z; 467 kernel.packet[0].z = kernel.packet[2].x; 468 kernel.packet[2].x = tmp; 469 470 tmp = kernel.packet[0].w; 471 kernel.packet[0].w = kernel.packet[3].x; 472 kernel.packet[3].x = tmp; 473 474 tmp = kernel.packet[1].z; 475 kernel.packet[1].z = kernel.packet[2].y; 476 kernel.packet[2].y = tmp; 477 478 tmp = kernel.packet[1].w; 479 kernel.packet[1].w = kernel.packet[3].y; 480 kernel.packet[3].y = tmp; 481 482 tmp = kernel.packet[2].w; 483 kernel.packet[2].w = kernel.packet[3].z; 484 kernel.packet[3].z = tmp; 485 } 486 487 EIGEN_DEVICE_FUNC inline void 488 ptranspose(PacketBlock<double2,2>& kernel) { 489 double tmp = kernel.packet[0].y; 490 kernel.packet[0].y = kernel.packet[1].x; 491 kernel.packet[1].x = tmp; 492 } 493 494 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) 495 496 // Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning 497 // its corresponding packet_traits<Eigen::half> must be visible on host. 498 #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) 499 500 typedef ulonglong2 Packet4h2; 501 template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; 502 template<> struct is_arithmetic<Packet4h2> { enum { value = true }; }; 503 504 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; 505 template<> struct is_arithmetic<half2> { enum { value = true }; }; 506 507 template<> struct packet_traits<Eigen::half> : default_packet_traits 508 { 509 typedef Packet4h2 type; 510 typedef Packet4h2 half; 511 enum { 512 Vectorizable = 1, 513 AlignedOnScalar = 1, 514 size=8, 515 HasHalfPacket = 0, 516 HasAdd = 1, 517 HasSub = 1, 518 HasMul = 1, 519 HasDiv = 1, 520 HasSqrt = 1, 521 HasRsqrt = 1, 522 HasExp = 1, 523 HasExpm1 = 1, 524 HasLog = 1, 525 HasLog1p = 1 526 }; 527 }; 528 529 namespace { 530 // This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. 531 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { 532 #if defined(EIGEN_GPU_COMPILE_PHASE) 533 return __halves2half2(a, b); 534 #else 535 // Round-about way since __halves2half2 is a __device__ function. 536 return __floats2half2_rn(__half2float(a), __half2float(b)); 537 #endif 538 } 539 540 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { 541 #if defined(EIGEN_GPU_COMPILE_PHASE) 542 return __low2half(a); 543 #else 544 return __float2half(__low2float(a)); 545 #endif 546 } 547 548 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { 549 #if defined(EIGEN_GPU_COMPILE_PHASE) 550 return __high2half(a); 551 #else 552 return __float2half(__high2float(a)); 553 #endif 554 } 555 } // namespace 556 557 template<> 558 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { 559 #if defined(EIGEN_GPU_COMPILE_PHASE) 560 return __half2half2(from); 561 #else 562 const float f = __half2float(from); 563 return __floats2half2_rn(f, f); 564 #endif 565 } 566 567 template <> 568 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 569 pset1<Packet4h2>(const Eigen::half& from) { 570 Packet4h2 r; 571 half2* p_alias = reinterpret_cast<half2*>(&r); 572 p_alias[0] = pset1<half2>(from); 573 p_alias[1] = pset1<half2>(from); 574 p_alias[2] = pset1<half2>(from); 575 p_alias[3] = pset1<half2>(from); 576 return r; 577 } 578 579 // We now need this visible on both host and device. 580 // #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 581 namespace { 582 583 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { 584 return *reinterpret_cast<const half2*>(from); 585 } 586 587 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { 588 return combine_half(from[0], from[1]); 589 } 590 591 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { 592 return combine_half(from[0], from[0]); 593 } 594 595 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, 596 const half2& from) { 597 *reinterpret_cast<half2*>(to) = from; 598 } 599 600 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, 601 const half2& from) { 602 to[0] = get_half2_low(from); 603 to[1] = get_half2_high(from); 604 } 605 606 607 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( 608 const Eigen::half* from) { 609 #if defined(EIGEN_GPU_HAS_LDG) 610 // Input is guaranteed to be properly aligned. 611 return __ldg(reinterpret_cast<const half2*>(from)); 612 #else 613 return combine_half(*(from+0), *(from+1)); 614 #endif 615 } 616 617 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( 618 const Eigen::half* from) { 619 #if defined(EIGEN_GPU_HAS_LDG) 620 return __halves2half2(__ldg(from+0), __ldg(from+1)); 621 #else 622 return combine_half(*(from+0), *(from+1)); 623 #endif 624 } 625 626 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, 627 Index stride) { 628 return combine_half(from[0*stride], from[1*stride]); 629 } 630 631 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( 632 Eigen::half* to, const half2& from, Index stride) { 633 to[stride*0] = get_half2_low(from); 634 to[stride*1] = get_half2_high(from); 635 } 636 637 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { 638 return get_half2_low(a); 639 } 640 641 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { 642 half a1 = get_half2_low(a); 643 half a2 = get_half2_high(a); 644 half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); 645 half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); 646 return combine_half(result1, result2); 647 } 648 649 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { 650 half true_half = half_impl::raw_uint16_to_half(0xffffu); 651 return pset1<half2>(true_half); 652 } 653 654 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { 655 half false_half = half_impl::raw_uint16_to_half(0x0000u); 656 return pset1<half2>(false_half); 657 } 658 659 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 660 ptranspose(PacketBlock<half2,2>& kernel) { 661 __half a1 = get_half2_low(kernel.packet[0]); 662 __half a2 = get_half2_high(kernel.packet[0]); 663 __half b1 = get_half2_low(kernel.packet[1]); 664 __half b2 = get_half2_high(kernel.packet[1]); 665 kernel.packet[0] = combine_half(a1, b1); 666 kernel.packet[1] = combine_half(a2, b2); 667 } 668 669 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { 670 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 671 return __halves2half2(a, __hadd(a, __float2half(1.0f))); 672 #else 673 float f = __half2float(a) + 1.0f; 674 return combine_half(a, __float2half(f)); 675 #endif 676 } 677 678 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, 679 const half2& a, 680 const half2& b) { 681 half mask_low = get_half2_low(mask); 682 half mask_high = get_half2_high(mask); 683 half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); 684 half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); 685 return combine_half(result_low, result_high); 686 } 687 688 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, 689 const half2& b) { 690 half true_half = half_impl::raw_uint16_to_half(0xffffu); 691 half false_half = half_impl::raw_uint16_to_half(0x0000u); 692 half a1 = get_half2_low(a); 693 half a2 = get_half2_high(a); 694 half b1 = get_half2_low(b); 695 half b2 = get_half2_high(b); 696 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; 697 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; 698 return combine_half(eq1, eq2); 699 } 700 701 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, 702 const half2& b) { 703 half true_half = half_impl::raw_uint16_to_half(0xffffu); 704 half false_half = half_impl::raw_uint16_to_half(0x0000u); 705 half a1 = get_half2_low(a); 706 half a2 = get_half2_high(a); 707 half b1 = get_half2_low(b); 708 half b2 = get_half2_high(b); 709 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; 710 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; 711 return combine_half(eq1, eq2); 712 } 713 714 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, 715 const half2& b) { 716 half a1 = get_half2_low(a); 717 half a2 = get_half2_high(a); 718 half b1 = get_half2_low(b); 719 half b2 = get_half2_high(b); 720 half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); 721 half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); 722 return combine_half(result1, result2); 723 } 724 725 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, 726 const half2& b) { 727 half a1 = get_half2_low(a); 728 half a2 = get_half2_high(a); 729 half b1 = get_half2_low(b); 730 half b2 = get_half2_high(b); 731 half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); 732 half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); 733 return combine_half(result1, result2); 734 } 735 736 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, 737 const half2& b) { 738 half a1 = get_half2_low(a); 739 half a2 = get_half2_high(a); 740 half b1 = get_half2_low(b); 741 half b2 = get_half2_high(b); 742 half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); 743 half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); 744 return combine_half(result1, result2); 745 } 746 747 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, 748 const half2& b) { 749 half a1 = get_half2_low(a); 750 half a2 = get_half2_high(a); 751 half b1 = get_half2_low(b); 752 half b2 = get_half2_high(b); 753 half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); 754 half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); 755 return combine_half(result1, result2); 756 } 757 758 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, 759 const half2& b) { 760 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 761 return __hadd2(a, b); 762 #else 763 float a1 = __low2float(a); 764 float a2 = __high2float(a); 765 float b1 = __low2float(b); 766 float b2 = __high2float(b); 767 float r1 = a1 + b1; 768 float r2 = a2 + b2; 769 return __floats2half2_rn(r1, r2); 770 #endif 771 } 772 773 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, 774 const half2& b) { 775 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 776 return __hsub2(a, b); 777 #else 778 float a1 = __low2float(a); 779 float a2 = __high2float(a); 780 float b1 = __low2float(b); 781 float b2 = __high2float(b); 782 float r1 = a1 - b1; 783 float r2 = a2 - b2; 784 return __floats2half2_rn(r1, r2); 785 #endif 786 } 787 788 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { 789 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 790 return __hneg2(a); 791 #else 792 float a1 = __low2float(a); 793 float a2 = __high2float(a); 794 return __floats2half2_rn(-a1, -a2); 795 #endif 796 } 797 798 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } 799 800 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, 801 const half2& b) { 802 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 803 return __hmul2(a, b); 804 #else 805 float a1 = __low2float(a); 806 float a2 = __high2float(a); 807 float b1 = __low2float(b); 808 float b2 = __high2float(b); 809 float r1 = a1 * b1; 810 float r2 = a2 * b2; 811 return __floats2half2_rn(r1, r2); 812 #endif 813 } 814 815 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, 816 const half2& b, 817 const half2& c) { 818 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 819 return __hfma2(a, b, c); 820 #else 821 float a1 = __low2float(a); 822 float a2 = __high2float(a); 823 float b1 = __low2float(b); 824 float b2 = __high2float(b); 825 float c1 = __low2float(c); 826 float c2 = __high2float(c); 827 float r1 = a1 * b1 + c1; 828 float r2 = a2 * b2 + c2; 829 return __floats2half2_rn(r1, r2); 830 #endif 831 } 832 833 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, 834 const half2& b) { 835 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 836 return __h2div(a, b); 837 #else 838 float a1 = __low2float(a); 839 float a2 = __high2float(a); 840 float b1 = __low2float(b); 841 float b2 = __high2float(b); 842 float r1 = a1 / b1; 843 float r2 = a2 / b2; 844 return __floats2half2_rn(r1, r2); 845 #endif 846 } 847 848 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, 849 const half2& b) { 850 float a1 = __low2float(a); 851 float a2 = __high2float(a); 852 float b1 = __low2float(b); 853 float b2 = __high2float(b); 854 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); 855 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); 856 return combine_half(r1, r2); 857 } 858 859 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, 860 const half2& b) { 861 float a1 = __low2float(a); 862 float a2 = __high2float(a); 863 float b1 = __low2float(b); 864 float b2 = __high2float(b); 865 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); 866 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); 867 return combine_half(r1, r2); 868 } 869 870 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { 871 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 872 return __hadd(__low2half(a), __high2half(a)); 873 #else 874 float a1 = __low2float(a); 875 float a2 = __high2float(a); 876 return Eigen::half(__float2half(a1 + a2)); 877 #endif 878 } 879 880 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { 881 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 882 __half first = __low2half(a); 883 __half second = __high2half(a); 884 return __hgt(first, second) ? first : second; 885 #else 886 float a1 = __low2float(a); 887 float a2 = __high2float(a); 888 return a1 > a2 ? get_half2_low(a) : get_half2_high(a); 889 #endif 890 } 891 892 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { 893 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 894 __half first = __low2half(a); 895 __half second = __high2half(a); 896 return __hlt(first, second) ? first : second; 897 #else 898 float a1 = __low2float(a); 899 float a2 = __high2float(a); 900 return a1 < a2 ? get_half2_low(a) : get_half2_high(a); 901 #endif 902 } 903 904 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { 905 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 906 return __hmul(__low2half(a), __high2half(a)); 907 #else 908 float a1 = __low2float(a); 909 float a2 = __high2float(a); 910 return Eigen::half(__float2half(a1 * a2)); 911 #endif 912 } 913 914 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { 915 float a1 = __low2float(a); 916 float a2 = __high2float(a); 917 float r1 = log1pf(a1); 918 float r2 = log1pf(a2); 919 return __floats2half2_rn(r1, r2); 920 } 921 922 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { 923 float a1 = __low2float(a); 924 float a2 = __high2float(a); 925 float r1 = expm1f(a1); 926 float r2 = expm1f(a2); 927 return __floats2half2_rn(r1, r2); 928 } 929 930 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \ 931 defined(EIGEN_HIP_DEVICE_COMPILE) 932 933 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 934 half2 plog(const half2& a) { 935 return h2log(a); 936 } 937 938 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 939 half2 pexp(const half2& a) { 940 return h2exp(a); 941 } 942 943 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 944 half2 psqrt(const half2& a) { 945 return h2sqrt(a); 946 } 947 948 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 949 half2 prsqrt(const half2& a) { 950 return h2rsqrt(a); 951 } 952 953 #else 954 955 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { 956 float a1 = __low2float(a); 957 float a2 = __high2float(a); 958 float r1 = logf(a1); 959 float r2 = logf(a2); 960 return __floats2half2_rn(r1, r2); 961 } 962 963 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { 964 float a1 = __low2float(a); 965 float a2 = __high2float(a); 966 float r1 = expf(a1); 967 float r2 = expf(a2); 968 return __floats2half2_rn(r1, r2); 969 } 970 971 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { 972 float a1 = __low2float(a); 973 float a2 = __high2float(a); 974 float r1 = sqrtf(a1); 975 float r2 = sqrtf(a2); 976 return __floats2half2_rn(r1, r2); 977 } 978 979 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { 980 float a1 = __low2float(a); 981 float a2 = __high2float(a); 982 float r1 = rsqrtf(a1); 983 float r2 = rsqrtf(a2); 984 return __floats2half2_rn(r1, r2); 985 } 986 #endif 987 } // namespace 988 989 template <> 990 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 991 pload<Packet4h2>(const Eigen::half* from) { 992 return *reinterpret_cast<const Packet4h2*>(from); 993 } 994 995 // unaligned load; 996 template <> 997 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 998 ploadu<Packet4h2>(const Eigen::half* from) { 999 Packet4h2 r; 1000 half2* p_alias = reinterpret_cast<half2*>(&r); 1001 p_alias[0] = ploadu(from + 0); 1002 p_alias[1] = ploadu(from + 2); 1003 p_alias[2] = ploadu(from + 4); 1004 p_alias[3] = ploadu(from + 6); 1005 return r; 1006 } 1007 1008 template <> 1009 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1010 ploaddup<Packet4h2>(const Eigen::half* from) { 1011 Packet4h2 r; 1012 half2* p_alias = reinterpret_cast<half2*>(&r); 1013 p_alias[0] = ploaddup(from + 0); 1014 p_alias[1] = ploaddup(from + 1); 1015 p_alias[2] = ploaddup(from + 2); 1016 p_alias[3] = ploaddup(from + 3); 1017 return r; 1018 } 1019 1020 template <> 1021 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>( 1022 Eigen::half* to, const Packet4h2& from) { 1023 *reinterpret_cast<Packet4h2*>(to) = from; 1024 } 1025 1026 template <> 1027 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>( 1028 Eigen::half* to, const Packet4h2& from) { 1029 const half2* from_alias = reinterpret_cast<const half2*>(&from); 1030 pstoreu(to + 0,from_alias[0]); 1031 pstoreu(to + 2,from_alias[1]); 1032 pstoreu(to + 4,from_alias[2]); 1033 pstoreu(to + 6,from_alias[3]); 1034 } 1035 1036 template <> 1037 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 1038 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) { 1039 #if defined(EIGEN_GPU_HAS_LDG) 1040 Packet4h2 r; 1041 r = __ldg(reinterpret_cast<const Packet4h2*>(from)); 1042 return r; 1043 #else 1044 Packet4h2 r; 1045 half2* r_alias = reinterpret_cast<half2*>(&r); 1046 r_alias[0] = ploadt_ro_aligned(from + 0); 1047 r_alias[1] = ploadt_ro_aligned(from + 2); 1048 r_alias[2] = ploadt_ro_aligned(from + 4); 1049 r_alias[3] = ploadt_ro_aligned(from + 6); 1050 return r; 1051 #endif 1052 } 1053 1054 template <> 1055 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 1056 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) { 1057 Packet4h2 r; 1058 half2* r_alias = reinterpret_cast<half2*>(&r); 1059 r_alias[0] = ploadt_ro_unaligned(from + 0); 1060 r_alias[1] = ploadt_ro_unaligned(from + 2); 1061 r_alias[2] = ploadt_ro_unaligned(from + 4); 1062 r_alias[3] = ploadt_ro_unaligned(from + 6); 1063 return r; 1064 } 1065 1066 template <> 1067 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1068 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) { 1069 Packet4h2 r; 1070 half2* p_alias = reinterpret_cast<half2*>(&r); 1071 p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); 1072 p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); 1073 p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); 1074 p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); 1075 return r; 1076 } 1077 1078 template <> 1079 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>( 1080 Eigen::half* to, const Packet4h2& from, Index stride) { 1081 const half2* from_alias = reinterpret_cast<const half2*>(&from); 1082 pscatter(to + stride * 0, from_alias[0], stride); 1083 pscatter(to + stride * 2, from_alias[1], stride); 1084 pscatter(to + stride * 4, from_alias[2], stride); 1085 pscatter(to + stride * 6, from_alias[3], stride); 1086 } 1087 1088 template <> 1089 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>( 1090 const Packet4h2& a) { 1091 return pfirst(*(reinterpret_cast<const half2*>(&a))); 1092 } 1093 1094 template <> 1095 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>( 1096 const Packet4h2& a) { 1097 Packet4h2 r; 1098 half2* p_alias = reinterpret_cast<half2*>(&r); 1099 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1100 p_alias[0] = pabs(a_alias[0]); 1101 p_alias[1] = pabs(a_alias[1]); 1102 p_alias[2] = pabs(a_alias[2]); 1103 p_alias[3] = pabs(a_alias[3]); 1104 return r; 1105 } 1106 1107 template <> 1108 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>( 1109 const Packet4h2& /*a*/) { 1110 half true_half = half_impl::raw_uint16_to_half(0xffffu); 1111 return pset1<Packet4h2>(true_half); 1112 } 1113 1114 template <> 1115 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) { 1116 half false_half = half_impl::raw_uint16_to_half(0x0000u); 1117 return pset1<Packet4h2>(false_half); 1118 } 1119 1120 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double( 1121 double* d_row0, double* d_row1, double* d_row2, double* d_row3, 1122 double* d_row4, double* d_row5, double* d_row6, double* d_row7) { 1123 double d_tmp; 1124 d_tmp = d_row0[1]; 1125 d_row0[1] = d_row4[0]; 1126 d_row4[0] = d_tmp; 1127 1128 d_tmp = d_row1[1]; 1129 d_row1[1] = d_row5[0]; 1130 d_row5[0] = d_tmp; 1131 1132 d_tmp = d_row2[1]; 1133 d_row2[1] = d_row6[0]; 1134 d_row6[0] = d_tmp; 1135 1136 d_tmp = d_row3[1]; 1137 d_row3[1] = d_row7[0]; 1138 d_row7[0] = d_tmp; 1139 } 1140 1141 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( 1142 half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) { 1143 half2 f_tmp; 1144 f_tmp = f_row0[1]; 1145 f_row0[1] = f_row2[0]; 1146 f_row2[0] = f_tmp; 1147 1148 f_tmp = f_row1[1]; 1149 f_row1[1] = f_row3[0]; 1150 f_row3[0] = f_tmp; 1151 } 1152 1153 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 1154 ptranspose_half(half2& f0, half2& f1) { 1155 __half a1 = get_half2_low(f0); 1156 __half a2 = get_half2_high(f0); 1157 __half b1 = get_half2_low(f1); 1158 __half b2 = get_half2_high(f1); 1159 f0 = combine_half(a1, b1); 1160 f1 = combine_half(a2, b2); 1161 } 1162 1163 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 1164 ptranspose(PacketBlock<Packet4h2,8>& kernel) { 1165 double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]); 1166 double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]); 1167 double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]); 1168 double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]); 1169 double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]); 1170 double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]); 1171 double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]); 1172 double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]); 1173 ptranspose_double(d_row0, d_row1, d_row2, d_row3, 1174 d_row4, d_row5, d_row6, d_row7); 1175 1176 1177 half2* f_row0 = reinterpret_cast<half2*>(d_row0); 1178 half2* f_row1 = reinterpret_cast<half2*>(d_row1); 1179 half2* f_row2 = reinterpret_cast<half2*>(d_row2); 1180 half2* f_row3 = reinterpret_cast<half2*>(d_row3); 1181 ptranspose_half2(f_row0, f_row1, f_row2, f_row3); 1182 ptranspose_half(f_row0[0], f_row1[0]); 1183 ptranspose_half(f_row0[1], f_row1[1]); 1184 ptranspose_half(f_row2[0], f_row3[0]); 1185 ptranspose_half(f_row2[1], f_row3[1]); 1186 1187 f_row0 = reinterpret_cast<half2*>(d_row0 + 1); 1188 f_row1 = reinterpret_cast<half2*>(d_row1 + 1); 1189 f_row2 = reinterpret_cast<half2*>(d_row2 + 1); 1190 f_row3 = reinterpret_cast<half2*>(d_row3 + 1); 1191 ptranspose_half2(f_row0, f_row1, f_row2, f_row3); 1192 ptranspose_half(f_row0[0], f_row1[0]); 1193 ptranspose_half(f_row0[1], f_row1[1]); 1194 ptranspose_half(f_row2[0], f_row3[0]); 1195 ptranspose_half(f_row2[1], f_row3[1]); 1196 1197 f_row0 = reinterpret_cast<half2*>(d_row4); 1198 f_row1 = reinterpret_cast<half2*>(d_row5); 1199 f_row2 = reinterpret_cast<half2*>(d_row6); 1200 f_row3 = reinterpret_cast<half2*>(d_row7); 1201 ptranspose_half2(f_row0, f_row1, f_row2, f_row3); 1202 ptranspose_half(f_row0[0], f_row1[0]); 1203 ptranspose_half(f_row0[1], f_row1[1]); 1204 ptranspose_half(f_row2[0], f_row3[0]); 1205 ptranspose_half(f_row2[1], f_row3[1]); 1206 1207 f_row0 = reinterpret_cast<half2*>(d_row4 + 1); 1208 f_row1 = reinterpret_cast<half2*>(d_row5 + 1); 1209 f_row2 = reinterpret_cast<half2*>(d_row6 + 1); 1210 f_row3 = reinterpret_cast<half2*>(d_row7 + 1); 1211 ptranspose_half2(f_row0, f_row1, f_row2, f_row3); 1212 ptranspose_half(f_row0[0], f_row1[0]); 1213 ptranspose_half(f_row0[1], f_row1[1]); 1214 ptranspose_half(f_row2[0], f_row3[0]); 1215 ptranspose_half(f_row2[1], f_row3[1]); 1216 1217 } 1218 1219 template <> 1220 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1221 plset<Packet4h2>(const Eigen::half& a) { 1222 #if defined(EIGEN_HIP_DEVICE_COMPILE) 1223 1224 Packet4h2 r; 1225 half2* p_alias = reinterpret_cast<half2*>(&r); 1226 p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f))); 1227 p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), 1228 __hadd(a, __float2half(3.0f))); 1229 p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), 1230 __hadd(a, __float2half(5.0f))); 1231 p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), 1232 __hadd(a, __float2half(7.0f))); 1233 return r; 1234 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 1235 Packet4h2 r; 1236 half2* r_alias = reinterpret_cast<half2*>(&r); 1237 1238 half2 b = pset1<half2>(a); 1239 half2 c; 1240 half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f)); 1241 half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f)); 1242 1243 c = __hadd2(b, half_offset0); 1244 r_alias[0] = plset(__low2half(c)); 1245 r_alias[1] = plset(__high2half(c)); 1246 1247 c = __hadd2(b, half_offset1); 1248 r_alias[2] = plset(__low2half(c)); 1249 r_alias[3] = plset(__high2half(c)); 1250 1251 return r; 1252 1253 #else 1254 float f = __half2float(a); 1255 Packet4h2 r; 1256 half2* p_alias = reinterpret_cast<half2*>(&r); 1257 p_alias[0] = combine_half(a, __float2half(f + 1.0f)); 1258 p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); 1259 p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); 1260 p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); 1261 return r; 1262 #endif 1263 } 1264 1265 template <> 1266 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1267 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a, 1268 const Packet4h2& b) { 1269 Packet4h2 r; 1270 half2* r_alias = reinterpret_cast<half2*>(&r); 1271 const half2* mask_alias = reinterpret_cast<const half2*>(&mask); 1272 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1273 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1274 r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]); 1275 r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]); 1276 r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]); 1277 r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]); 1278 return r; 1279 } 1280 1281 template <> 1282 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1283 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) { 1284 Packet4h2 r; 1285 half2* r_alias = reinterpret_cast<half2*>(&r); 1286 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1287 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1288 r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]); 1289 r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]); 1290 r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]); 1291 r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]); 1292 return r; 1293 } 1294 1295 template <> 1296 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>( 1297 const Packet4h2& a, const Packet4h2& b) { 1298 Packet4h2 r; 1299 half2* r_alias = reinterpret_cast<half2*>(&r); 1300 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1301 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1302 r_alias[0] = pand(a_alias[0], b_alias[0]); 1303 r_alias[1] = pand(a_alias[1], b_alias[1]); 1304 r_alias[2] = pand(a_alias[2], b_alias[2]); 1305 r_alias[3] = pand(a_alias[3], b_alias[3]); 1306 return r; 1307 } 1308 1309 template <> 1310 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>( 1311 const Packet4h2& a, const Packet4h2& b) { 1312 Packet4h2 r; 1313 half2* r_alias = reinterpret_cast<half2*>(&r); 1314 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1315 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1316 r_alias[0] = por(a_alias[0], b_alias[0]); 1317 r_alias[1] = por(a_alias[1], b_alias[1]); 1318 r_alias[2] = por(a_alias[2], b_alias[2]); 1319 r_alias[3] = por(a_alias[3], b_alias[3]); 1320 return r; 1321 } 1322 1323 template <> 1324 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>( 1325 const Packet4h2& a, const Packet4h2& b) { 1326 Packet4h2 r; 1327 half2* r_alias = reinterpret_cast<half2*>(&r); 1328 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1329 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1330 r_alias[0] = pxor(a_alias[0], b_alias[0]); 1331 r_alias[1] = pxor(a_alias[1], b_alias[1]); 1332 r_alias[2] = pxor(a_alias[2], b_alias[2]); 1333 r_alias[3] = pxor(a_alias[3], b_alias[3]); 1334 return r; 1335 } 1336 1337 template <> 1338 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1339 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) { 1340 Packet4h2 r; 1341 half2* r_alias = reinterpret_cast<half2*>(&r); 1342 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1343 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1344 r_alias[0] = pandnot(a_alias[0], b_alias[0]); 1345 r_alias[1] = pandnot(a_alias[1], b_alias[1]); 1346 r_alias[2] = pandnot(a_alias[2], b_alias[2]); 1347 r_alias[3] = pandnot(a_alias[3], b_alias[3]); 1348 return r; 1349 } 1350 1351 template <> 1352 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>( 1353 const Packet4h2& a, const Packet4h2& b) { 1354 Packet4h2 r; 1355 half2* r_alias = reinterpret_cast<half2*>(&r); 1356 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1357 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1358 r_alias[0] = padd(a_alias[0], b_alias[0]); 1359 r_alias[1] = padd(a_alias[1], b_alias[1]); 1360 r_alias[2] = padd(a_alias[2], b_alias[2]); 1361 r_alias[3] = padd(a_alias[3], b_alias[3]); 1362 return r; 1363 } 1364 1365 template <> 1366 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>( 1367 const Packet4h2& a, const Packet4h2& b) { 1368 Packet4h2 r; 1369 half2* r_alias = reinterpret_cast<half2*>(&r); 1370 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1371 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1372 r_alias[0] = psub(a_alias[0], b_alias[0]); 1373 r_alias[1] = psub(a_alias[1], b_alias[1]); 1374 r_alias[2] = psub(a_alias[2], b_alias[2]); 1375 r_alias[3] = psub(a_alias[3], b_alias[3]); 1376 return r; 1377 } 1378 1379 template <> 1380 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) { 1381 Packet4h2 r; 1382 half2* r_alias = reinterpret_cast<half2*>(&r); 1383 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1384 r_alias[0] = pnegate(a_alias[0]); 1385 r_alias[1] = pnegate(a_alias[1]); 1386 r_alias[2] = pnegate(a_alias[2]); 1387 r_alias[3] = pnegate(a_alias[3]); 1388 return r; 1389 } 1390 1391 template <> 1392 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) { 1393 return a; 1394 } 1395 1396 template <> 1397 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>( 1398 const Packet4h2& a, const Packet4h2& b) { 1399 Packet4h2 r; 1400 half2* r_alias = reinterpret_cast<half2*>(&r); 1401 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1402 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1403 r_alias[0] = pmul(a_alias[0], b_alias[0]); 1404 r_alias[1] = pmul(a_alias[1], b_alias[1]); 1405 r_alias[2] = pmul(a_alias[2], b_alias[2]); 1406 r_alias[3] = pmul(a_alias[3], b_alias[3]); 1407 return r; 1408 } 1409 1410 template <> 1411 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>( 1412 const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) { 1413 Packet4h2 r; 1414 half2* r_alias = reinterpret_cast<half2*>(&r); 1415 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1416 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1417 const half2* c_alias = reinterpret_cast<const half2*>(&c); 1418 r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]); 1419 r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]); 1420 r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]); 1421 r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]); 1422 return r; 1423 } 1424 1425 template <> 1426 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>( 1427 const Packet4h2& a, const Packet4h2& b) { 1428 Packet4h2 r; 1429 half2* r_alias = reinterpret_cast<half2*>(&r); 1430 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1431 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1432 r_alias[0] = pdiv(a_alias[0], b_alias[0]); 1433 r_alias[1] = pdiv(a_alias[1], b_alias[1]); 1434 r_alias[2] = pdiv(a_alias[2], b_alias[2]); 1435 r_alias[3] = pdiv(a_alias[3], b_alias[3]); 1436 return r; 1437 } 1438 1439 template <> 1440 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>( 1441 const Packet4h2& a, const Packet4h2& b) { 1442 Packet4h2 r; 1443 half2* r_alias = reinterpret_cast<half2*>(&r); 1444 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1445 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1446 r_alias[0] = pmin(a_alias[0], b_alias[0]); 1447 r_alias[1] = pmin(a_alias[1], b_alias[1]); 1448 r_alias[2] = pmin(a_alias[2], b_alias[2]); 1449 r_alias[3] = pmin(a_alias[3], b_alias[3]); 1450 return r; 1451 } 1452 1453 template <> 1454 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>( 1455 const Packet4h2& a, const Packet4h2& b) { 1456 Packet4h2 r; 1457 half2* r_alias = reinterpret_cast<half2*>(&r); 1458 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1459 const half2* b_alias = reinterpret_cast<const half2*>(&b); 1460 r_alias[0] = pmax(a_alias[0], b_alias[0]); 1461 r_alias[1] = pmax(a_alias[1], b_alias[1]); 1462 r_alias[2] = pmax(a_alias[2], b_alias[2]); 1463 r_alias[3] = pmax(a_alias[3], b_alias[3]); 1464 return r; 1465 } 1466 1467 template <> 1468 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>( 1469 const Packet4h2& a) { 1470 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1471 1472 return predux(a_alias[0]) + predux(a_alias[1]) + 1473 predux(a_alias[2]) + predux(a_alias[3]); 1474 } 1475 1476 template <> 1477 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>( 1478 const Packet4h2& a) { 1479 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1480 half2 m0 = combine_half(predux_max(a_alias[0]), 1481 predux_max(a_alias[1])); 1482 half2 m1 = combine_half(predux_max(a_alias[2]), 1483 predux_max(a_alias[3])); 1484 __half first = predux_max(m0); 1485 __half second = predux_max(m1); 1486 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 1487 return (__hgt(first, second) ? first : second); 1488 #else 1489 float ffirst = __half2float(first); 1490 float fsecond = __half2float(second); 1491 return (ffirst > fsecond)? first: second; 1492 #endif 1493 } 1494 1495 template <> 1496 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>( 1497 const Packet4h2& a) { 1498 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1499 half2 m0 = combine_half(predux_min(a_alias[0]), 1500 predux_min(a_alias[1])); 1501 half2 m1 = combine_half(predux_min(a_alias[2]), 1502 predux_min(a_alias[3])); 1503 __half first = predux_min(m0); 1504 __half second = predux_min(m1); 1505 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 1506 return (__hlt(first, second) ? first : second); 1507 #else 1508 float ffirst = __half2float(first); 1509 float fsecond = __half2float(second); 1510 return (ffirst < fsecond)? first: second; 1511 #endif 1512 } 1513 1514 // likely overflow/underflow 1515 template <> 1516 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>( 1517 const Packet4h2& a) { 1518 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1519 return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), 1520 pmul(a_alias[2], a_alias[3]))); 1521 } 1522 1523 template <> 1524 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1525 plog1p<Packet4h2>(const Packet4h2& a) { 1526 Packet4h2 r; 1527 half2* r_alias = reinterpret_cast<half2*>(&r); 1528 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1529 r_alias[0] = plog1p(a_alias[0]); 1530 r_alias[1] = plog1p(a_alias[1]); 1531 r_alias[2] = plog1p(a_alias[2]); 1532 r_alias[3] = plog1p(a_alias[3]); 1533 return r; 1534 } 1535 1536 template <> 1537 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1538 pexpm1<Packet4h2>(const Packet4h2& a) { 1539 Packet4h2 r; 1540 half2* r_alias = reinterpret_cast<half2*>(&r); 1541 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1542 r_alias[0] = pexpm1(a_alias[0]); 1543 r_alias[1] = pexpm1(a_alias[1]); 1544 r_alias[2] = pexpm1(a_alias[2]); 1545 r_alias[3] = pexpm1(a_alias[3]); 1546 return r; 1547 } 1548 1549 template <> 1550 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) { 1551 Packet4h2 r; 1552 half2* r_alias = reinterpret_cast<half2*>(&r); 1553 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1554 r_alias[0] = plog(a_alias[0]); 1555 r_alias[1] = plog(a_alias[1]); 1556 r_alias[2] = plog(a_alias[2]); 1557 r_alias[3] = plog(a_alias[3]); 1558 return r; 1559 } 1560 1561 template <> 1562 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) { 1563 Packet4h2 r; 1564 half2* r_alias = reinterpret_cast<half2*>(&r); 1565 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1566 r_alias[0] = pexp(a_alias[0]); 1567 r_alias[1] = pexp(a_alias[1]); 1568 r_alias[2] = pexp(a_alias[2]); 1569 r_alias[3] = pexp(a_alias[3]); 1570 return r; 1571 } 1572 1573 template <> 1574 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) { 1575 Packet4h2 r; 1576 half2* r_alias = reinterpret_cast<half2*>(&r); 1577 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1578 r_alias[0] = psqrt(a_alias[0]); 1579 r_alias[1] = psqrt(a_alias[1]); 1580 r_alias[2] = psqrt(a_alias[2]); 1581 r_alias[3] = psqrt(a_alias[3]); 1582 return r; 1583 } 1584 1585 template <> 1586 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 1587 prsqrt<Packet4h2>(const Packet4h2& a) { 1588 Packet4h2 r; 1589 half2* r_alias = reinterpret_cast<half2*>(&r); 1590 const half2* a_alias = reinterpret_cast<const half2*>(&a); 1591 r_alias[0] = prsqrt(a_alias[0]); 1592 r_alias[1] = prsqrt(a_alias[1]); 1593 r_alias[2] = prsqrt(a_alias[2]); 1594 r_alias[3] = prsqrt(a_alias[3]); 1595 return r; 1596 } 1597 1598 // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for 1599 // the implementation of GPU half reduction. 1600 template<> 1601 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, 1602 const half2& b) { 1603 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 1604 return __hadd2(a, b); 1605 #else 1606 float a1 = __low2float(a); 1607 float a2 = __high2float(a); 1608 float b1 = __low2float(b); 1609 float b2 = __high2float(b); 1610 float r1 = a1 + b1; 1611 float r2 = a2 + b2; 1612 return __floats2half2_rn(r1, r2); 1613 #endif 1614 } 1615 1616 template<> 1617 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, 1618 const half2& b) { 1619 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 1620 return __hmul2(a, b); 1621 #else 1622 float a1 = __low2float(a); 1623 float a2 = __high2float(a); 1624 float b1 = __low2float(b); 1625 float b2 = __high2float(b); 1626 float r1 = a1 * b1; 1627 float r2 = a2 * b2; 1628 return __floats2half2_rn(r1, r2); 1629 #endif 1630 } 1631 1632 template<> 1633 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, 1634 const half2& b) { 1635 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 1636 return __h2div(a, b); 1637 #else 1638 float a1 = __low2float(a); 1639 float a2 = __high2float(a); 1640 float b1 = __low2float(b); 1641 float b2 = __high2float(b); 1642 float r1 = a1 / b1; 1643 float r2 = a2 / b2; 1644 return __floats2half2_rn(r1, r2); 1645 #endif 1646 } 1647 1648 template<> 1649 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, 1650 const half2& b) { 1651 float a1 = __low2float(a); 1652 float a2 = __high2float(a); 1653 float b1 = __low2float(b); 1654 float b2 = __high2float(b); 1655 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); 1656 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); 1657 return combine_half(r1, r2); 1658 } 1659 1660 template<> 1661 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, 1662 const half2& b) { 1663 float a1 = __low2float(a); 1664 float a2 = __high2float(a); 1665 float b1 = __low2float(b); 1666 float b2 = __high2float(b); 1667 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); 1668 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); 1669 return combine_half(r1, r2); 1670 } 1671 1672 // #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 1673 1674 #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) 1675 1676 #undef EIGEN_GPU_HAS_LDG 1677 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC 1678 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC 1679 1680 } // end namespace internal 1681 1682 } // end namespace Eigen 1683 1684 1685 #endif // EIGEN_PACKET_MATH_GPU_H 1686