xref: /aosp_15_r20/external/pytorch/aten/src/ATen/cuda/detail/IntegerDivider.cuh (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #pragma once
2 
3 #include <assert.h>
4 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
5 #include <cuda_runtime.h>
6 #endif
7 
8 namespace at::cuda::detail {
9 
10 // A utility class to implement integer division by multiplication, given a fixed
11 // divisor.
12 //
13 // WARNING: The fast divider algorithm is only implemented for unsigned int;
14 //          otherwise we default to plain integer division.  For unsigned int,
15 //          we further assume that the dividend is at most INT32_MAX.  Thus,
16 //          IntDivider must NOT be used for general integer division.
17 //
18 //          This reduced range is enough for our purpose, and it allows us to
19 //          slightly simplify the computation.
20 //
21 // (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.)
22 //
23 // For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N
24 // <= m < 2^(N+1)) and shift s such that:
25 //
26 //    \floor(n / d) = \floor((m * n) / 2^(N+s)).
27 //
28 // Given such m and s, the integer division can be then implemented as:
29 //
30 //    let m' = m - 2^N  // 0 <= m' < 2^N
31 //
32 //    fast_integer_division(n):
33 //      // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
34 //      // integer.  Then take the higher N bits.
35 //      t = (m' * n) >> N
36 //
37 //      // Here we use the fact that n is less than 2^(N-1): otherwise the value
38 //      // of (t + n) may not fit in an N-bit integer.
39 //      return (t + n) >> s
40 //
41 // Finding such a magic number is surprisingly easy:
42 //
43 //    s  = \ceil(\log_2 d)
44 //    m' = \floor(2^N * (2^s - d) / d) + 1  // Need 2N-bit integer arithmetic.
45 //
46 // See also:
47 //    - Division by Invariant Integers Using Multiplication,
48 //      Torbjörn Granlund and Peter L. Montgomery, 1994.
49 //
50 //    - http://www.hackersdelight.org/magic.htm
51 //
52 //    - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
53 
54 // Result of div/mod operation stored together.
55 template <typename Value>
56 struct DivMod {
57   Value div, mod;
58 
DivModat::cuda::detail::DivMod59   C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { }
60 };
61 
62 // Base case: we only have an implementation for uint32_t for now.  For
63 // everything else, we use plain division.
64 template <typename Value>
65 struct IntDivider {
66   IntDivider() = default;
IntDividerat::cuda::detail::IntDivider67   IntDivider(Value d) : divisor(d) { }
68 
divat::cuda::detail::IntDivider69   C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
modat::cuda::detail::IntDivider70   C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; }
divmodat::cuda::detail::IntDivider71   C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
72     return DivMod<Value>(n / divisor, n % divisor);
73   }
74 
75   Value divisor;
76 };
77 
78 // Implement fast integer division.
79 template <>
80 struct IntDivider<unsigned int> {
81   static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
82 
83   IntDivider() = default;
84 
IntDividerat::cuda::detail::IntDivider85   IntDivider(unsigned int d) : divisor(d) {
86     assert(divisor >= 1 && divisor <= INT32_MAX);
87 
88     // TODO: gcc/clang has __builtin_clz() but it's not portable.
89     for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;
90 
91     uint64_t one = 1;
92     uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
93     m1 = magic;
94     assert(m1 > 0 && m1 == magic);  // m1 must fit in 32 bits.
95   }
96 
divat::cuda::detail::IntDivider97   C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
98 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
99     // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
100     // 'm1'.
101     unsigned int t = __umulhi(n, m1);
102     return (t + n) >> shift;
103 #else
104     // Using uint64_t so that the addition does not overflow.
105     uint64_t t = ((uint64_t) n * m1) >> 32;
106     return (t + n) >> shift;
107 #endif
108   }
109 
modat::cuda::detail::IntDivider110   C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
111     return n - div(n) * divisor;
112   }
113 
divmodat::cuda::detail::IntDivider114   C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
115     unsigned int q = div(n);
116     return DivMod<unsigned int>(q, n - q * divisor);
117   }
118 
119   unsigned int divisor;  // d above.
120   unsigned int m1;  // Magic number: m' above.
121   unsigned int shift;  // Shift amounts.
122 };
123 
124 }  // namespace at::cuda::detail
125