// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the like), but fast on GPUs.


#ifndef EIGEN_HALF_H
#define EIGEN_HALF_H

#include <sstream>

#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
// When compiling with GPU support, the "__half_raw" base class as well as
// some other routines are defined in the GPU compiler header files
// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr.
// As a consequence, we get compile failures when compiling Eigen with
// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
// Eigen with GPU support.
#pragma push_macro("EIGEN_CONSTEXPR")
#undef EIGEN_CONSTEXPR
#define EIGEN_CONSTEXPR
#endif

#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)   \
  template <>                                               \
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED        \
  PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) {     \
    return float2half(METHOD<PACKET_F>(half2float(_x)));    \
  }
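// A minimal sketch of how F16_PACKET_FUNCTION is meant to be instantiated
// (the packet type names below are placeholders, not necessarily the ones a
// given backend defines): for a backend with a float packet PacketNf, a half
// packet PacketNh, and float2half/half2float conversions, the line
//
//   F16_PACKET_FUNCTION(PacketNf, PacketNh, psin)
//
// expands to a psin<PacketNh> specialization that converts the half packet to
// float, applies psin<PacketNf>, and converts the result back to half.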

namespace Eigen {

struct half;

namespace half_impl {

// We want to use the __half_raw struct from the HIP header file only during the device compile phase.
// This is required because of a quirk in the way TensorFlow GPU builds are done.
// When compiling TensorFlow source code with GPU support, files that
//  * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc
//  * do not contain GPU kernels (i.e. *.cc files) are compiled via gcc (typically)
//
// TensorFlow uses the Eigen::half type as its FP16 type, and there are functions that
//  * are defined in a file that gets compiled via hipcc AND
//  * have Eigen::half as a pass-by-value argument AND
//  * are called in a file that gets compiled via gcc
//
// In the scenario described above, the caller and callee will see different versions
// of the Eigen::half base class __half_raw, and they will be compiled by different compilers.
//
// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in
// the callee getting corrupted values for the Eigen::half argument.
//
// Making the host-side compile phase of hipcc use the same Eigen::half impl as the gcc compile resolves
// this error, hence the following convoluted #if condition.
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF).
  // The element type for shared memory cannot have non-trivial constructors,
  // hence the following special casing (which skips the zero-initialization).
  // Note that this check gets done even in the host compilation phase, and
  // hence the need for this.
  EIGEN_DEVICE_FUNC __half_raw() {}
#else
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
#endif
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {
  }
  __fp16 x;
#else
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
  numext::uint16_t x;
#endif
};

#elif defined(EIGEN_HAS_HIP_FP16)
  // Nothing to do here
  // HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
  #if EIGEN_CUDA_SDK_VER < 90000
    // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
    typedef __half __half_raw;
  #endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(SYCL_DEVICE_ONLY)
typedef cl::sycl::half __half_raw;
#endif

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);

struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {}
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}

#if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
 #elif defined(EIGEN_HAS_CUDA_FP16)
  #if EIGEN_CUDA_SDK_VER >= 90000
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
  #endif
 #endif
#endif
};

} // namespace half_impl
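// A brief usage sketch (illustrative only) of the class defined below.
// Eigen::half converts implicitly to float and can be mixed with ordinary
// arithmetic:
//
//   Eigen::half a(1.5f);
//   Eigen::half b(2.5f);
//   Eigen::half c = a + b;   // computed via the operator overloads below
//   float f = c;             // implicit, lossless widening to float
//
// As noted at the top of this file, on most CPUs these operations go through
// fp32 conversions, so fp16 is mainly a storage/bandwidth optimization there.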
// Class definition.
struct half : public half_impl::half_base {

  // Writing this out as separate #if-else blocks to make the code easier to follow.
  // The same applies to most #if-else blocks in this file.
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
  // Use the same base class for the following two scenarios
  // * when compiling without GPU support enabled
  // * during host compile phase when compiling with GPU support enabled
  typedef half_impl::__half_raw __half_raw;
#elif defined(EIGEN_HAS_HIP_FP16)
  // Nothing to do here
  // HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
  // #if defined(EIGEN_HAS_CUDA_FP16) is needed.
  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
  typedef half_impl::__half_raw __half_raw;
  #endif
#endif

  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}

  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}

#if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #elif defined(EIGEN_HAS_CUDA_FP16)
  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
  #endif
 #endif
#endif

  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
  template<class T>
  explicit EIGEN_DEVICE_FUNC half(T val)
      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
  explicit EIGEN_DEVICE_FUNC half(float f)
      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}

  // Following the convention of numpy, converting between complex and
  // float will lead to loss of the imaginary part.
  template<typename RealScalar>
  explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
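  // For example (illustrative only): Eigen::half(std::complex<float>(0.5f, 2.0f))
  // compares equal to Eigen::half(0.5f); the imaginary part is simply dropped.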
  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
    return half_impl::half_to_float(*this);
  }

#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
  EIGEN_DEVICE_FUNC operator __half() const {
    ::__half_raw hr;
    hr.x = x;
    return __half(hr);
  }
#endif
};

} // end namespace Eigen

namespace std {
template<>
struct numeric_limits<Eigen::half> {
  static const bool is_specialized = true;
  static const bool is_signed = true;
  static const bool is_integer = false;
  static const bool is_exact = false;
  static const bool has_infinity = true;
  static const bool has_quiet_NaN = true;
  static const bool has_signaling_NaN = true;
  static const float_denorm_style has_denorm = denorm_present;
  static const bool has_denorm_loss = false;
  static const std::float_round_style round_style = std::round_to_nearest;
  static const bool is_iec559 = false;
  static const bool is_bounded = false;
  static const bool is_modulo = false;
  static const int digits = 11;
  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int radix = 2;
  static const int min_exponent = -13;
  static const int min_exponent10 = -4;
  static const int max_exponent = 16;
  static const int max_exponent10 = 4;
  static const bool traps = true;
  static const bool tinyness_before = false;

  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
  static Eigen::half round_error() { return Eigen::half(0.5); }
  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};

// If std::numeric_limits<T> is specialized, we should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>.
// https://stackoverflow.com/a/16519653/
template<>
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std

namespace Eigen {

namespace half_impl {

#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
     EIGEN_CUDA_ARCH >= 530) ||                                  \
    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU halfs are rather different from native CPU halfs.
// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
#define EIGEN_HAS_NATIVE_FP16
#endif

// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increase), but you do save the
// conversion steps back and forth.

#if defined(EIGEN_HAS_NATIVE_FP16)
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  return __hadd(::__half(a), ::__half(b));
#else
  return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
  return __hmul(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
  return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  return __hdiv(a, b);
#else
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
  return __hneg(a);
}
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
  a = a + b;
  return a;
}
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
  a = a * b;
  return a;
}
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
  a = a - b;
  return a;
}
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
  a = a / b;
  return a;
}
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
  return __heq(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
  return __hne(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
  return __hlt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
  return __hle(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
  return __hgt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
  return __hge(a, b);
}
#endif

#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(vaddh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
  return half(vmulh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
  return half(vsubh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  return half(vdivh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
  return half(vnegh_f16(a.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  a = half(vaddh_f16(a.x, b.x));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  a = half(vmulh_f16(a.x, b.x));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  a = half(vsubh_f16(a.x, b.x));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  a = half(vdivh_f16(a.x, b.x));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
  return vceqh_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
  return !vceqh_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return vclth_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
  return vcleh_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
  return vcgth_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
  return vcgeh_f16(a.x, b.x);
}
// We need to distinguish 'clang as the CUDA compiler' from 'clang as the host compiler,
// invoked by NVCC' (e.g. on MacOS). The former needs to see both host and device implementations
// of the functions, while the latter can only deal with one of them.
#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats

#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
#define EIGEN_DEVICE_FUNC __host__
#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
#endif
#endif

// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(float(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
  return half(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
  return half(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  return half(float(a) / float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
  half result;
  result.x = a.x ^ 0x8000;
  return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  a = half(float(a) + float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  a = half(float(a) * float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  a = half(float(a) - float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  a = half(float(a) / float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
  return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
  return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return float(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
  return float(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
  return float(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
  return float(a) >= float(b);
}

#if defined(__clang__) && defined(__CUDA__)
#pragma pop_macro("EIGEN_DEVICE_FUNC")
#endif
#endif // Emulate support for half floats

// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
  return half(static_cast<float>(a) / static_cast<float>(b));
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) {
  a += half(1);
  return a;
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) {
  a -= half(1);
  return a;
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) {
  half original_value = a;
  ++a;
  return original_value;
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) {
  half original_value = a;
  --a;
  return original_value;
}

// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
  // We cannot simply do a "return __half_raw(x)" here, because __half_raw is a union type
  // in the hip_fp16 header file, and that will trigger a compile error.
  // On the other hand, having anything but a return statement also triggers a compile error,
  // because this is a constexpr function.
  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
  // of this catch-22 by having separate bodies for GPU / non GPU.
#if defined(EIGEN_HAS_GPU_FP16)
  __half_raw h;
  h.x = x;
  return h;
#else
  return __half_raw(x);
#endif
}
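// A few well-known IEEE binary16 encodings, as a reading aid for the
// raw_uint16_to_half() calls used elsewhere in this file (values follow from
// the standard binary16 layout: 1 sign bit, 5 exponent bits, 10 mantissa bits):
//   0x3c00 -> +1.0
//   0x7c00 -> +infinity            0x7e00 -> quiet NaN
//   0x7bff -> 65504 (max finite)   0xfbff -> -65504 (lowest)
//   0x0400 -> 2^-14 (min normal)   0x0001 -> 2^-24 (denorm_min)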
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
  // HIP/CUDA/Default have a member 'x' of type uint16_t.
  // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
  // For SYCL, cl::sycl::half is _Float16, so cast directly.
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  return numext::bit_cast<numext::uint16_t>(h.x);
#elif defined(SYCL_DEVICE_ONLY)
  return numext::bit_cast<numext::uint16_t>(h);
#else
  return h.x;
#endif
}

union float32_bits {
  unsigned int u;
  float f;
};

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;

#elif defined(EIGEN_HAS_FP16_C)
  __half_raw h;
  h.x = _cvtss_sh(ff, 0);
  return h;

#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  __half_raw h;
  h.x = static_cast<__fp16>(ff);
  return h;

#else
  float32_bits f; f.f = ff;

  const float32_bits f32infty = { 255 << 23 };
  const float32_bits f16max = { (127 + 16) << 23 };
  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
  __half_raw o;
  o.x = static_cast<numext::uint16_t>(0x0u);

  unsigned int sign = f.u & sign_mask;
  f.u ^= sign;

  // NOTE all the integer compares in this function can be safely
  // compiled into signed compares since all operands are below
  // 0x80000000. Important if you want fast straight SSE2 code
  // (since there's no unsigned PCMPGTD).

  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
  } else {  // (De)normalized number or zero
    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
      // use a magic value to align our 10 mantissa bits at the bottom of
      // the float. as long as FP addition is round-to-nearest-even this
      // just works.
      f.f += denorm_magic.f;

      // and one integer subtract of the bias later, we have our final float!
      o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
    } else {
      unsigned int mant_odd = (f.u >> 13) & 1;  // resulting mantissa is odd

      // update exponent, rounding bias part 1
      // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
      // without arithmetic overflow.
      f.u += 0xc8000fffU;
      // rounding bias part 2
      f.u += mant_odd;
      // take the bits!
      o.x = static_cast<numext::uint16_t>(f.u >> 13);
    }
  }

  o.x |= static_cast<numext::uint16_t>(sign >> 16);
  return o;
#endif
}
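// A small worked example of the round-to-nearest-even behavior implemented
// above (values follow from the standard binary16 format, where the spacing
// between representable values near 1.0 is 2^-10):
//   float_to_half_rtne(1.0f).x           == 0x3c00  // exactly representable
//   float_to_half_rtne(1.00048828125f).x == 0x3c00  // tie at 1 + 2^-11 rounds to the even mantissa
//   float_to_half_rtne(1.00146484375f).x == 0x3c02  // tie at 1 + 3*2^-11 rounds up to the even mantissa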
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  return static_cast<float>(h.x);
#else
  const float32_bits magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
  float32_bits o;

  o.u = (h.x & 0x7fff) << 13;            // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u;  // just the exponent
  o.u += (127 - 15) << 23;               // exponent adjust

  // handle exponent special cases
  if (exp == shifted_exp) {    // Inf/NaN?
    o.u += (128 - 16) << 23;   // extra exp adjust
  } else if (exp == 0) {       // Zero/Denormal?
    o.u += 1 << 23;            // extra exp adjust
    o.f -= magic.f;            // renormalize
  }

  o.u |= (h.x & 0x8000) << 16;  // sign bit
  return o.f;
#endif
}

// --- standard functions ---

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
#else
  return (a.x & 0x7fff) == 0x7c00;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hisnan(a);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
#else
  return (a.x & 0x7fff) > 0x7c00;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  return half(vabsh_f16(a.x));
#else
  half result;
  result.x = a.x & 0x7FFF;
  return result;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp(a));
#else
  return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
  return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return half(::hlog(a));
#else
  return half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
  return half(numext::log1p(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
  return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hsqrt(a));
#else
  return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
  return half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
  return half(::sinf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
  return half(::cosf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
  return half(::tanf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
  return half(::asinf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
  return half(::acosf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hfloor(a));
#else
  return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
    defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hceil(a));
#else
  return half(::ceilf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
  return half(::rintf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) {
  return half(::roundf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
  return half(::fmodf(float(a), float(b)));
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hlt(b, a) ? b : a;
#else
  const float f1 = static_cast<float>(a);
  const float f2 = static_cast<float>(b);
  return f2 < f1 ? b : a;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hlt(a, b) ? b : a;
#else
  const float f1 = static_cast<float>(a);
  const float f2 = static_cast<float>(b);
  return f1 < f2 ? b : a;
#endif
}

#ifndef EIGEN_NO_IO
EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
  os << static_cast<float>(v);
  return os;
}
#endif

} // end namespace half_impl

// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;

namespace internal {

template<>
struct random_default_impl<half, false, false>
{
  static inline half run(const half& x, const half& y)
  {
    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
  }
  static inline half run()
  {
    return run(half(-1.f), half(1.f));
  }
};

template<> struct is_arithmetic<half> { enum { value = true }; };

} // end namespace internal

template<> struct NumTraits<Eigen::half>
    : GenericNumTraits<Eigen::half>
{
  enum {
    IsSigned = true,
    IsInteger = false,
    IsComplex = false,
    RequireInitialization = false
  };

  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
    return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f);
  }
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
    return half_impl::raw_uint16_to_half(0x7bff);
  }
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
    return half_impl::raw_uint16_to_half(0xfbff);
  }
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
    return half_impl::raw_uint16_to_half(0x7c00);
  }
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
    return half_impl::raw_uint16_to_half(0x7e00);
  }
};

} // end namespace Eigen

#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
#pragma pop_macro("EIGEN_CONSTEXPR")
#endif

namespace Eigen {
namespace numext {

#if defined(EIGEN_GPU_COMPILE_PHASE)

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool (isnan)(const Eigen::half& h) {
  return (half_impl::isnan)(h);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool (isinf)(const Eigen::half& h) {
  return (half_impl::isinf)(h);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool (isfinite)(const Eigen::half& h) {
  return (half_impl::isfinite)(h);
}

#endif

template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast<Eigen::half, uint16_t>(const uint16_t& src) {
  return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src));
}

template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(const Eigen::half& src) {
  return Eigen::half_impl::raw_half_as_uint16(src);
}

} // namespace numext
} // namespace Eigen
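// For reference, the bit_cast specializations above allow lossless
// round-tripping of the raw bits (illustrative only):
//   Eigen::half h = Eigen::numext::bit_cast<Eigen::half>(Eigen::numext::uint16_t(0x3c00));   // +1.0
//   Eigen::numext::uint16_t bits = Eigen::numext::bit_cast<Eigen::numext::uint16_t>(h);      // 0x3c00 again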
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
//    with native support for __half and __nv_bfloat16
//
// Note that the following are __device__-only functions.
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
    || defined(EIGEN_HIPCC)

#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) {
  const __half h = var;
  return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
  const __half h = var;
  return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
  const __half h = var;
  return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) {
  const __half h = var;
  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
}

#else // HIP or CUDA SDK < 9.0

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
}

#endif // HIP vs CUDA
#endif // __shfl*
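// A hedged sketch of how the wrappers above are typically used: a warp-wide
// sum of one Eigen::half per lane via a shuffle-down tree reduction (assumes
// the CUDA 9+ *_sync path; "warp_sum_half" is a hypothetical helper, not part
// of Eigen):
//
//   __device__ Eigen::half warp_sum_half(Eigen::half x) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//       x += __shfl_down_sync(0xffffffffu, x, offset);
//     }
//     return x;  // lane 0 ends up with the sum over the full warp
//   }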
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
    || defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
}
#endif // __ldg

#if EIGEN_HAS_STD_HASH
namespace std {
template <>
struct hash<Eigen::half> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
  }
};
} // end namespace std
#endif

#endif // EIGEN_HALF_H