// aten/src/ATen/cuda/ApplyGridUtils.cuh
#pragma once
#include <ATen/cuda/CUDAContext.h>

#include <cuda_runtime.h>

namespace at::cuda {

/**
   Computes ceil(a / b)
*/
template <typename T>
__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) {
  return (a + b - 1) / b;
}
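
// For example, ATenCeilDiv(10, 4) == 3 and ATenCeilDiv(8, 4) == 2: integer
// division rounded up. getApplyGrid below uses it to turn an element count into
// a block count.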

namespace {

// Threads per block for our apply kernel
// FIXME: use occupancy calculator instead
constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512;
constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4;

// Computes a one-dimensional grid that covers `totalElements`, assuming each
// thread handles `step` elements and each block has `max_threads_per_block`
// threads. The block count is clamped to the device's maximum grid x-dimension.
// Returns false if no valid device index is given.
template <int step = 1>
inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
  if (curDevice == -1) return false;
  uint64_t numel_per_thread = static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(step);
  uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread);
  uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0];
  if (numBlocks > maxGridX)
    numBlocks = maxGridX;
  grid = dim3(numBlocks);
  return true;
}
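
// For example, with the default 512 threads per block and step = 1, an input of
// totalElements = 1,000,000 gives numBlocks = ceil(1,000,000 / 512) = 1954; only
// inputs large enough to exceed maxGridSize[0] get clamped.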

constexpr int getApplyBlocksPerSM() {
  return AT_APPLY_BLOCKS_PER_SM;
}

constexpr int getApplyBlockSize() {
  return AT_APPLY_THREADS_PER_BLOCK;
}

inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
  return dim3(max_threads_per_block);
}
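
// A minimal usage sketch for these helpers, assuming a hypothetical element-wise
// kernel `myKernel` over `N` floats in `data` (the kernel, its arguments, and the
// stream/device handling below are illustrative, not prescribed by this header):
//
//   const dim3 block = getApplyBlock();
//   dim3 grid;
//   if (getApplyGrid(N, grid, at::cuda::current_device())) {
//     myKernel<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(data, N);
//     C10_CUDA_KERNEL_LAUNCH_CHECK();
//   }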

} // anonymous namespace
} // namespace at::cuda