// aten/src/ATen/native/cuda/SummaryOps.cu
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/native/Resize.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAApplyUtils.cuh>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/bincount_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/histc_native.h>
#include <ATen/ops/zeros.h>
#endif

namespace at {
namespace cuda {
#define RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD 8
#define FOR_KERNEL_LOOP(i, lim)                                      \
  for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < lim; \
       i += gridDim.x * blockDim.x)

/*
  Memory types used for the two histogram implementations.
  See `CUDA_tensor_histogram` below.
 */
enum class CUDAHistogramMemoryType { SHARED, GLOBAL };
namespace {
template <typename input_t, typename IndexType>
__device__ static IndexType getBin(
    input_t bVal,
    at::acc_type<input_t, /*is_cuda=*/true> minvalue,
    at::acc_type<input_t, /*is_cuda=*/true> maxvalue,
    int64_t nbins) {
  IndexType bin = (int)(((bVal - minvalue)) * nbins / (maxvalue - minvalue));
  // (only applicable for histc)
  // While each bin is inclusive at the lower end and exclusive at the higher,
  // i.e. [start, end), the last bin is inclusive at both ends, i.e. [start, end],
  // so that maxvalue (if present in the input) is counted. Therefore, when
  // bin == nbins, adjust bin down to the last bin.
  if (bin == nbins)
    bin -= 1;
  return bin;
}
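
// Worked example (illustrative values, not tied to any call site below):
// with minvalue = 0, maxvalue = 10 and nbins = 5, each bin spans 2 units,
// so bVal = 7 gives (7 - 0) * 5 / 10 = 3.5, truncated to bin 3, i.e. [6, 8);
// the boundary value bVal = 10 first yields bin = 5 == nbins and is then
// clamped to the last bin, 4, so [8, 10] is closed at both ends.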
}

/*
  Kernel for computing the histogram of the input.
 */
template <
    typename output_t,
    typename input_t,
    typename IndexType,
    int ADims,
    int PDims,
    int BDims,
    CUDAHistogramMemoryType MemoryType,
    typename Op>
C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize())
__global__ void kernelHistogram1D(
    detail::TensorInfo<output_t, IndexType> a, /* output */
    detail::TensorInfo<output_t, IndexType> p, /* partial output */
    detail::TensorInfo<const input_t, IndexType> b, /* input */
    int64_t nbins,
    at::acc_type<input_t, /*is_cuda=*/true> minvalue,
    at::acc_type<input_t, /*is_cuda=*/true> maxvalue,
    IndexType totalElements,
    Op getOp) {
  extern __shared__ unsigned char my_smem[];
  output_t* smem = nullptr;

  if (MemoryType == CUDAHistogramMemoryType::SHARED) {
    ////////////////////////// Shared memory //////////////////////////
    // atomically add to block specific shared memory
    // then atomically add to the global output tensor
    smem = reinterpret_cast<output_t*>(my_smem);
    for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) {
      smem[i] = 0;
    }
    __syncthreads();
    FOR_KERNEL_LOOP(linearIndex, totalElements) {
      // Convert `linearIndex` into an offset of `b`
      const IndexType bOffset =
          detail::IndexToOffset<const input_t, IndexType, BDims>::get(linearIndex, b);
      const auto bVal = b.data[bOffset];
      if (bVal >= minvalue && bVal <= maxvalue) {
        // Use value at `b` as an offset of `smem`
        const IndexType bin =
            getBin<input_t, IndexType>(bVal, minvalue, maxvalue, nbins);
        gpuAtomicAddNoReturn(&smem[bin], getOp(linearIndex));
      }
    }
    __syncthreads();
    // NOTE: atomically update output bin count.
    //   An atomic update is important since __syncthreads() only synchronizes
    //   threads within a given block, not across blocks.
    for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) {
      const IndexType aOffset =
          detail::IndexToOffset<output_t, IndexType, ADims>::get(i, a);
      gpuAtomicAddNoReturn(&a.data[aOffset], smem[i]);
    }

  } else {
    ////////////////////////// Global memory //////////////////////////
    // atomically add to the output tensor
    // compute histogram for the block
    FOR_KERNEL_LOOP(linearIndex, totalElements) {
      // Convert `linearIndex` into an offset of `b`
      const IndexType bOffset =
          detail::IndexToOffset<const input_t, IndexType, BDims>::get(linearIndex, b);
      const auto bVal = b.data[bOffset];
      if (bVal >= minvalue && bVal <= maxvalue) {
        // Use value at `b` as an offset of `a`
        const IndexType bin =
            getBin<input_t, IndexType>(bVal, minvalue, maxvalue, nbins);
        const IndexType aOffset =
            detail::IndexToOffset<output_t, IndexType, ADims>::get(bin, a);
        gpuAtomicAddNoReturn(&a.data[aOffset], getOp(linearIndex));
      }
    }
  }
}
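
// Launch sketch (mirrors HANDLE_CASE below; not an additional entry point):
// the SHARED variant is launched with enough dynamic shared memory to hold a
// per-block copy of the histogram, e.g.
//   kernelHistogram1D<output_t, input_t, IndexType, 1, 2, -1,
//                     CUDAHistogramMemoryType::SHARED>
//       <<<grid, block, nbins * sizeof(output_t) + 8, getCurrentCUDAStream()>>>(
//           aInfo, pInfo, bInfo, nbins, minvalue, maxvalue, totalElements, op);
// The GLOBAL variant is launched with 0 bytes of dynamic shared memory and
// updates the output tensor directly.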

#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP, SHARED_MEM)                 \
  kernelHistogram1D<                                                     \
      output_t,                                                          \
      input_t,                                                           \
      IndexType,                                                         \
      1,                                                                 \
      2,                                                                 \
      -1,                                                                \
      MEMORY_TYPE><<<grid, block, SHARED_MEM, getCurrentCUDAStream()>>>( \
      aInfo,                                                             \
      pInfo,                                                             \
      bInfo,                                                             \
      nbins,                                                             \
      minvalue,                                                          \
      maxvalue,                                                          \
      totalElements,                                                     \
      WEIGHTS_OP);                                                       \
  C10_CUDA_KERNEL_LAUNCH_CHECK();

#define HANDLE_SWITCH_CASE(mType, getOp)                                   \
  switch (mType) {                                                         \
    case CUDAHistogramMemoryType::SHARED:                                  \
      HANDLE_CASE(CUDAHistogramMemoryType::SHARED, getOp, sharedMem);      \
      break;                                                               \
    default:                                                               \
      HANDLE_CASE(CUDAHistogramMemoryType::GLOBAL, getOp, 0);              \
  }
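
// For reference: HANDLE_SWITCH_CASE(memType, getOp) launches the SHARED
// instantiation with `sharedMem` bytes of dynamic shared memory when
// memType == CUDAHistogramMemoryType::SHARED; otherwise it takes the default
// case and launches the GLOBAL instantiation with no dynamic shared memory.
// Both paths are followed by C10_CUDA_KERNEL_LAUNCH_CHECK().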

/*
  Calculate the frequency of the input values.

  `a` contains the final output or the histogram.
  Input `b` is assumed to be a 1-D non-negative int array.
  `c` optionally contains the weight vector.
  See `help torch.bincount` for details on the math.

  Two implementations, chosen based on input size and memory usage:
    case: enough shared mem
        SHARED: each block atomically adds to its own **shared** histogram
        copy, then atomically updates the global output tensor.
    case: not enough shared mem
        GLOBAL: all threads atomically update a single **global** histogram
        copy.
 */
template <typename output_t, typename input_t, bool HasWeights>
bool CUDA_tensor_histogram(
    at::Tensor a, /* output */
    at::Tensor b, /* input */
    at::Tensor c, /* weights(optional) */
    int64_t nbins,
    at::acc_type<input_t, /*is_cuda=*/true> minvalue,
    at::acc_type<input_t, /*is_cuda=*/true> maxvalue,
    TensorArgType aType = TensorArgType::ReadWrite,
    TensorArgType bType = TensorArgType::ReadOnly,
    TensorArgType cType = TensorArgType::ReadOnly) {
  checkBackend("CUDA_tensor_histogram", {a, b}, Backend::CUDA);
  if (HasWeights) {
    checkBackend("CUDA_tensor_histogram", {c}, Backend::CUDA);
  }
  auto totalElements = b.numel();

  if (totalElements == 0) {
    return false;
  }

  const dim3 block = getApplyBlock();
  dim3 grid;
  auto curDevice = current_device();
  if (curDevice == -1 || !getApplyGrid(totalElements, grid, curDevice)) {
    return false;
  }

  CUDAHistogramMemoryType memType = CUDAHistogramMemoryType::GLOBAL;
  auto maxSharedMem = getCurrentDeviceProperties()->sharedMemPerBlock;
  auto sharedMem = nbins * sizeof(output_t) + 8; // 8 guard bytes
  // determine memory type to use in the kernel
  if (sharedMem < maxSharedMem) {
    // Solve equations:
    // (1) #(smem atomicAdd per SM) = totalElements / min(grid.x, #SM)
    // (2) #(gmem atomicAdd) = grid.x * nbins
    // (3) RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD = #(gmem atomicAdd) / #(smem atomicAdd per SM)
    unsigned optimalGrid = ceil_div<size_t>(RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD * totalElements,
                                            nbins * getCurrentDeviceProperties()->multiProcessorCount);
    if (optimalGrid < (unsigned)getCurrentDeviceProperties()->multiProcessorCount) {
      optimalGrid = 1 + (unsigned)std::sqrt(RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD * totalElements / nbins);
    }
    auto optimalSteps = ceil_div<size_t>(totalElements, optimalGrid * block.x);
    optimalGrid = ceil_div<size_t>(totalElements, optimalSteps * block.x);
    grid.x = std::min(grid.x, optimalGrid);
    memType = CUDAHistogramMemoryType::SHARED;
  }
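
  // Worked example of the heuristic above (illustrative numbers only): with
  // totalElements = 1,000,000, nbins = 256, 80 SMs and a 512-thread apply
  // block, optimalGrid = ceil(8 * 1e6 / (256 * 80)) = 391 >= 80, so the sqrt
  // fallback is skipped; optimalSteps = ceil(1e6 / (391 * 512)) = 5, the
  // re-rounded optimalGrid = ceil(1e6 / (5 * 512)) = 391, and grid.x is then
  // capped at 391.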

  using IndexType = int64_t;
  auto aInfo = detail::getTensorInfo<output_t, IndexType>(a);
  auto bInfo = detail::getTensorInfo<const input_t, IndexType>(b);
  detail::TensorInfo<output_t, IndexType> pInfo(nullptr, 0, {}, {});

  if (HasWeights) {
    auto cInfo = detail::getTensorInfo<output_t, IndexType>(c);
    const auto getWeightsOp = [cInfo] __device__(IndexType cIndex) {
      const IndexType cOffset =
          detail::IndexToOffset<output_t, IndexType, 1>::get(cIndex, cInfo);
      return cInfo.data[cOffset];
    };
    HANDLE_SWITCH_CASE(memType, getWeightsOp)
  } else {
    static const auto getDummyOp = [] __device__(IndexType) { return 1L; };
    HANDLE_SWITCH_CASE(memType, getDummyOp)
  }
  return true;
}
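
// Typical call sites appear below: histc uses the unweighted form, e.g.
//   cuda::CUDA_tensor_histogram<input_t, input_t, false>(
//       output, self, Tensor(), nbins, minvalue, maxvalue);
// while bincount with a weights tensor instantiates HasWeights = true and
// accumulates weights instead of counts.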

#undef HANDLE_CASE
#undef HANDLE_SWITCH_CASE
#undef FOR_KERNEL_LOOP
#undef RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD
} // namespace cuda

namespace {
///////////////// bincount /////////////////
template <typename input_t, typename weights_t>
Tensor _bincount_cuda_template(
    const Tensor& self,
    const Tensor& weights,
    int64_t minlength) {
  if (minlength < 0) {
    AT_ERROR("minlength should be >= 0");
  }
  if (self.dim() == 1 && self.numel() == 0) {
    return at::zeros(
        {minlength},
        kLong,
        std::nullopt /* layout */,
        kCUDA,
        std::nullopt /* pin_memory */);
  }
  if (self.dim() != 1 ||
      (!std::is_same<input_t, uint8_t>::value &&
       *self.min().cpu().const_data_ptr<input_t>() < 0)) {
    AT_ERROR("bincount only supports 1-d non-negative integral inputs.");
  }

  bool has_weights = weights.defined();
  if (has_weights && (weights.dim() != 1 || weights.size(0) != self.size(0))) {
    AT_ERROR("weights should be 1-d and have the same length as input");
  }

  const int64_t nbins =
      std::max(self.max().item<input_t>() + (int64_t)1, minlength);

  // we are using acc_type for the bounds, in particular int64_t for integers
  // in order to avoid overflows (e.g. using 256 bins for dtype uint8)
  using bounds_t = at::acc_type<input_t, /*is_cuda=*/true>;
  const bounds_t minvalue = 0;
  const bounds_t maxvalue = nbins;
  // alloc output counter on GPU
  Tensor output;
  if (has_weights) {
    output = at::zeros(
        {nbins},
        optTypeMetaToScalarType(weights.options().dtype_opt()),
        weights.options().layout_opt(),
        weights.options().device_opt(),
        weights.options().pinned_memory_opt());
    cuda::CUDA_tensor_histogram<weights_t, input_t, true>(
        output, self, weights, nbins, minvalue, maxvalue);
  } else {
    output = at::zeros(
        {nbins},
        kLong,
        std::nullopt /* layout */,
        DeviceType::CUDA,
        std::nullopt /* pin_memory */);
    cuda::CUDA_tensor_histogram<int64_t, input_t, false>(
        output, self, weights, nbins, minvalue, maxvalue);
  }
  return output;
}
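
// Example with illustrative values: for input [1, 2, 2, 5] and minlength = 0,
// nbins = max(5 + 1, 0) = 6 and the unweighted result is [0, 1, 2, 0, 0, 1];
// with weights [0.5, 1.0, 1.0, 2.0] the weighted result is
// [0.0, 0.5, 2.0, 0.0, 0.0, 2.0].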

///////////////// histc /////////////////
template <typename input_t>
Tensor _histc_cuda_template(
    const Tensor& self,
    int64_t nbins,
    at::acc_type<input_t, /*is_cuda=*/true> min,
    at::acc_type<input_t, /*is_cuda=*/true> max) {
  if (nbins <= 0) {
    AT_ERROR("bins must be > 0");
  }
  Tensor output = at::zeros(
      {nbins},
      self.scalar_type(),
      std::nullopt /* layout */,
      DeviceType::CUDA,
      std::nullopt /* pin_memory */);
  input_t minvalue = min;
  input_t maxvalue = max;
  if (min == max && self.numel() > 0) {
    minvalue = *self.min().cpu().const_data_ptr<input_t>();
    maxvalue = *self.max().cpu().const_data_ptr<input_t>();
  }
  if (minvalue == maxvalue) {
    minvalue = minvalue - 1;
    maxvalue = maxvalue + 1;
  }

#if !defined(USE_ROCM)
  TORCH_CHECK(
      !(at::_isinf(minvalue) || at::_isinf(maxvalue) ||
        at::_isnan(minvalue) || at::_isnan(maxvalue)),
      "range of [",
      minvalue,
      ", ",
      maxvalue,
      "] is not finite");
#else
  TORCH_CHECK(
      !(std::isinf(minvalue) || std::isinf(maxvalue) || std::isnan(minvalue) ||
        std::isnan(maxvalue)),
      "range of [",
      minvalue,
      ", ",
      maxvalue,
      "] is not finite");
#endif
  TORCH_CHECK(minvalue < maxvalue, "max must be larger than min");

  cuda::CUDA_tensor_histogram<input_t, input_t, false>(
      output, self, Tensor(), nbins, minvalue, maxvalue);
  return output;
}
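
// Example with illustrative values: histc of [1., 2., 1.] with nbins = 4,
// min = 0 and max = 3 uses bins of width 0.75, i.e. [0, 0.75), [0.75, 1.5),
// [1.5, 2.25), [2.25, 3], giving [0, 2, 1, 0]. When min == max, the range is
// taken from the data (and widened by 1 on each side if the data is constant).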
} // namespace

namespace native {
Tensor _bincount_cuda(
    const Tensor& self, const std::optional<Tensor>& weights_opt,
    int64_t minlength) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> weights_maybe_owned = at::borrow_from_optional_tensor(weights_opt);
  const Tensor& weights = *weights_maybe_owned;

  if (weights_opt.has_value()) {
    // See Note [Writing Nondeterministic Operations]
    // Nondeterministic if weights are given, because of floating point
    // atomicAdd usage
    globalContext().alertNotDeterministic("_bincount_cuda");
  }
  return AT_DISPATCH_INTEGRAL_TYPES(self.scalar_type(), "bincount_cuda", [&] {
    const auto scalar = weights.scalar_type();
    if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
      return _bincount_cuda_template<scalar_t, float>(self, weights, minlength);
    return _bincount_cuda_template<scalar_t, double>(
        self, weights.to(kDouble), minlength);
  });
}

Tensor _histc_cuda(
    const Tensor& self,
    int64_t nbins,
    const Scalar& min,
    const Scalar& max) {
  if (self.scalar_type() == ScalarType::Half) {
    AT_ERROR("HalfTensor is not supported");
  }
  // See Note [Writing Nondeterministic Operations]
  // Nondeterministic because of atomicAdd usage
  globalContext().alertNotDeterministic("_histc_cuda");
  return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "histc", [&] {
    using bounds_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
    return _histc_cuda_template<scalar_t>(
        self, nbins, min.to<bounds_t>(), max.to<bounds_t>());
  });
}

Tensor& _histc_out_cuda(const Tensor& self, int64_t bins, const Scalar& min, const Scalar& max, Tensor& result) {
  auto ret = _histc_cuda(self, bins, min, max);
  resize_output(result, ret.sizes());
  result.copy_(ret);
  return result;
}
} // namespace native
} // namespace at