1 #include <ATen/cuda/CUDAContext.h> 2 3 #include <cuda_runtime.h> 4 5 namespace at::cuda { 6 7 /** 8 Computes ceil(a / b) 9 */ 10 template <typename T> ATenCeilDiv(T a,T b)11__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { 12 return (a + b - 1) / b; 13 } 14 15 namespace { 16 17 // Threads per block for our apply kernel 18 // FIXME: use occupancy calculator instead 19 constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512; 20 constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4; 21 22 template <int step = 1> getApplyGrid(uint64_t totalElements,dim3 & grid,c10::DeviceIndex curDevice,int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK)23inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { 24 if (curDevice == -1) return false; 25 uint64_t numel_per_thread = static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(step); 26 uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread); 27 uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0]; 28 if (numBlocks > maxGridX) 29 numBlocks = maxGridX; 30 grid = dim3(numBlocks); 31 return true; 32 } 33 getApplyBlocksPerSM()34constexpr int getApplyBlocksPerSM() { 35 return AT_APPLY_BLOCKS_PER_SM; 36 } 37 getApplyBlockSize()38constexpr int getApplyBlockSize() { 39 return AT_APPLY_THREADS_PER_BLOCK; 40 } 41 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK)42inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { 43 return dim3(max_threads_per_block); 44 } 45 46 } // anonymous namespace 47 } // namespace at::cuda 48