1 #pragma once 2 3 #include <assert.h> 4 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) 5 #include <cuda_runtime.h> 6 #endif 7 8 namespace at::cuda::detail { 9 10 // A utility class to implement integer division by multiplication, given a fixed 11 // divisor. 12 // 13 // WARNING: The fast divider algorithm is only implemented for unsigned int; 14 // otherwise we default to plain integer division. For unsigned int, 15 // we further assume that the dividend is at most INT32_MAX. Thus, 16 // IntDivider must NOT be used for general integer division. 17 // 18 // This reduced range is enough for our purpose, and it allows us to 19 // slightly simplify the computation. 20 // 21 // (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.) 22 // 23 // For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N 24 // <= m < 2^(N+1)) and shift s such that: 25 // 26 // \floor(n / d) = \floor((m * n) / 2^(N+s)). 27 // 28 // Given such m and s, the integer division can be then implemented as: 29 // 30 // let m' = m - 2^N // 0 <= m' < 2^N 31 // 32 // fast_integer_division(n): 33 // // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned 34 // // integer. Then take the higher N bits. 35 // t = (m' * n) >> N 36 // 37 // // Here we use the fact that n is less than 2^(N-1): otherwise the value 38 // // of (t + n) may not fit in an N-bit integer. 39 // return (t + n) >> s 40 // 41 // Finding such a magic number is surprisingly easy: 42 // 43 // s = \ceil(\log_2 d) 44 // m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. 45 // 46 // See also: 47 // - Division by Invariant Integers Using Multiplication, 48 // Torbjörn Granlund and Peter L. Montgomery, 1994. 49 // 50 // - http://www.hackersdelight.org/magic.htm 51 // 52 // - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html 53 54 // Result of div/mod operation stored together. 55 template <typename Value> 56 struct DivMod { 57 Value div, mod; 58 DivModat::cuda::detail::DivMod59 C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { } 60 }; 61 62 // Base case: we only have an implementation for uint32_t for now. For 63 // everything else, we use plain division. 64 template <typename Value> 65 struct IntDivider { 66 IntDivider() = default; IntDividerat::cuda::detail::IntDivider67 IntDivider(Value d) : divisor(d) { } 68 divat::cuda::detail::IntDivider69 C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; } modat::cuda::detail::IntDivider70 C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; } divmodat::cuda::detail::IntDivider71 C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const { 72 return DivMod<Value>(n / divisor, n % divisor); 73 } 74 75 Value divisor; 76 }; 77 78 // Implement fast integer division. 79 template <> 80 struct IntDivider<unsigned int> { 81 static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); 82 83 IntDivider() = default; 84 IntDividerat::cuda::detail::IntDivider85 IntDivider(unsigned int d) : divisor(d) { 86 assert(divisor >= 1 && divisor <= INT32_MAX); 87 88 // TODO: gcc/clang has __builtin_clz() but it's not portable. 89 for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; 90 91 uint64_t one = 1; 92 uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; 93 m1 = magic; 94 assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. 95 } 96 divat::cuda::detail::IntDivider97 C10_HOST_DEVICE inline unsigned int div(unsigned int n) const { 98 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) 99 // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and 100 // 'm1'. 101 unsigned int t = __umulhi(n, m1); 102 return (t + n) >> shift; 103 #else 104 // Using uint64_t so that the addition does not overflow. 105 uint64_t t = ((uint64_t) n * m1) >> 32; 106 return (t + n) >> shift; 107 #endif 108 } 109 modat::cuda::detail::IntDivider110 C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const { 111 return n - div(n) * divisor; 112 } 113 divmodat::cuda::detail::IntDivider114 C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const { 115 unsigned int q = div(n); 116 return DivMod<unsigned int>(q, n - q * divisor); 117 } 118 119 unsigned int divisor; // d above. 120 unsigned int m1; // Magic number: m' above. 121 unsigned int shift; // Shift amounts. 122 }; 123 124 } // namespace at::cuda::detail 125