/aosp_15_r20/external/clang/test/CodeGenCUDA/ |
H A D | launch-bounds.cu | 32 template <int max_threads_per_block> 34 __launch_bounds__(max_threads_per_block) in __launch_bounds__() argument 42 template <int max_threads_per_block, int min_blocks_per_mp> 44 __launch_bounds__(max_threads_per_block, min_blocks_per_mp) in __launch_bounds__() argument 54 template <int max_threads_per_block, int min_blocks_per_mp> 56 __launch_bounds__(max_threads_per_block + constint, 57 min_blocks_per_mp + max_threads_per_block)
|
/aosp_15_r20/external/pytorch/aten/src/ATen/cuda/ |
H A D | CUDAApplyUtils.cuh | 356 int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK, 359 C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) in C10_LAUNCH_BOUNDS_2() argument 378 int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK, 403 const dim3 block = getApplyBlock(max_threads_per_block); in CUDA_tensor_apply2() 408 if (!getApplyGrid<step>(totalElements, grid, curDevice, max_threads_per_block)) { in CUDA_tensor_apply2() 443 max_threads_per_block, \ in CUDA_tensor_apply2() 526 int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK, 534 max_threads_per_block, min_blocks_per_sm>(a, b, op, aType, bType); in CUDA_tensor_apply2()
|
H A D | ApplyGridUtils.cuh | 23 …nt64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_T… in getApplyGrid() argument 25 …uint64_t numel_per_thread = static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(s… in getApplyGrid() 42 inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { in getApplyBlock() argument 43 return dim3(max_threads_per_block); in getApplyBlock()
|
/aosp_15_r20/external/pytorch/c10/macros/ |
H A D | Macros.h | 314 #define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ argument 315 __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) 316 #define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ argument 318 (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ 319 (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
|
/aosp_15_r20/external/pytorch/aten/src/ATen/cuda/detail/ |
H A D | KernelUtils.h | 26 inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) { 31 auto block_num = (N - 1) / max_threads_per_block + 1;
|
/aosp_15_r20/external/mesa3d/src/gallium/frontends/rusticl/api/ |
H A D | kernel.rs | 107 CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(dev)), in query() 191 if threads > dev.max_threads_per_block() { in query() 208 let threads = kernel.max_threads_per_block(dev); in query() 613 if threads != 0 && threads > k.max_threads_per_block(q.device) { in enqueue_ndrange_kernel()
|
H A D | device.rs | 192 CL_DEVICE_MAX_WORK_GROUP_SIZE => cl_prop::<usize>(dev.max_threads_per_block()), in query()
|
/aosp_15_r20/external/mesa3d/src/gallium/drivers/softpipe/ |
H A D | sp_screen.c | 538 uint64_t *max_threads_per_block = ret; in softpipe_get_compute_param() local 539 *max_threads_per_block = 1024; in softpipe_get_compute_param()
|
/aosp_15_r20/external/tensorflow/tensorflow/core/profiler/backends/gpu/ |
H A D | cupti_collector.cc | 409 auto max_threads_per_block = in GetDeviceCapabilities() local 428 max_threads_per_block && max_threads_per_sm && regs_per_block && in GetDeviceCapabilities() 434 device_properties_.maxThreadsPerBlock = *max_threads_per_block; in GetDeviceCapabilities()
|
/aosp_15_r20/external/mesa3d/src/gallium/frontends/clover/api/ |
H A D | kernel.cpp | 164 buf.as_scalar<size_t>() = dev.max_threads_per_block(); in clGetKernelWorkGroupInfo() 303 q.device().max_threads_per_block()) in validate_block_size()
|
H A D | device.cpp | 142 buf.as_scalar<size_t>() = dev.max_threads_per_block(); in clGetDeviceInfo()
|
/aosp_15_r20/external/mesa3d/src/gallium/frontends/clover/core/ |
H A D | device.cpp | 75 dev.max_threads_per_block() < 1 || in get_highest_supported_version() 315 device::max_threads_per_block() const { in max_threads_per_block() function in device
|
H A D | device.hpp | 71 size_t max_threads_per_block() const;
|
H A D | kernel.cpp | 149 q.device().max_threads_per_block(), q.device().max_block_size(), in optimal_block_size()
|
/aosp_15_r20/external/mesa3d/src/gallium/drivers/llvmpipe/ |
H A D | lp_screen.c | 515 uint64_t *max_threads_per_block = ret; in llvmpipe_get_compute_param() local 516 *max_threads_per_block = 1024; in llvmpipe_get_compute_param()
|
/aosp_15_r20/external/mesa3d/src/mesa/state_tracker/ |
H A D | st_extensions.c | 1690 uint64_t max_local_size, max_threads_per_block; in st_init_extensions() local 1698 &max_threads_per_block); in st_init_extensions() 1703 consts->MaxComputeWorkGroupInvocations = max_threads_per_block; in st_init_extensions() 1713 max_threads_per_block >= 1024 && in st_init_extensions()
|
/aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/ |
H A D | si_get.c | 1251 uint64_t *max_threads_per_block = ret; in si_get_compute_param() local 1252 *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); in si_get_compute_param()
|
/aosp_15_r20/external/mesa3d/src/gallium/drivers/virgl/ |
H A D | virgl_screen.c | 645 uint64_t *max_threads_per_block = ret; in virgl_get_compute_param() local 646 *max_threads_per_block = vscreen->caps.caps.v2.max_compute_work_group_invocations; in virgl_get_compute_param()
|
/aosp_15_r20/external/mesa3d/src/gallium/drivers/r600/ |
H A D | r600_pipe_common.c | 941 uint64_t *max_threads_per_block = ret; in r600_get_compute_param() local 942 *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type); in r600_get_compute_param()
|
/aosp_15_r20/external/mesa3d/src/gallium/frontends/lavapipe/ |
H A D | lvp_device.c | 776 uint64_t max_threads_per_block, max_local_size; in lvp_get_properties() local 784 &max_threads_per_block); in lvp_get_properties() 858 .maxComputeWorkGroupInvocations = max_threads_per_block, in lvp_get_properties()
|
/aosp_15_r20/external/mesa3d/src/gallium/frontends/rusticl/core/ |
H A D | kernel.rs | 1206 let mut threads = self.max_threads_per_block(d); in suggest_local_size() 1703 pub fn max_threads_per_block(&self, dev: &Device) -> usize { in max_threads_per_block() method
|
H A D | device.rs | 953 pub fn max_threads_per_block(&self) -> usize { in max_threads_per_block() method
|
/aosp_15_r20/external/mesa3d/src/amd/vulkan/ |
H A D | radv_device.c | 1192 uint32_t max_threads_per_block = 2048; in radv_CreateDevice() local 1193 device->scratch_waves = MAX2(32 * pdev->info.num_cu, max_threads_per_block / 64); in radv_CreateDevice()
|