Home
last modified time | relevance | path

Searched refs:max_threads_per_block (Results 1 – 23 of 23) sorted by relevance

/aosp_15_r20/external/clang/test/CodeGenCUDA/
H A Dlaunch-bounds.cu32 template <int max_threads_per_block>
34 __launch_bounds__(max_threads_per_block) in __launch_bounds__() argument
42 template <int max_threads_per_block, int min_blocks_per_mp>
44 __launch_bounds__(max_threads_per_block, min_blocks_per_mp) in __launch_bounds__() argument
54 template <int max_threads_per_block, int min_blocks_per_mp>
56 __launch_bounds__(max_threads_per_block + constint,
57 min_blocks_per_mp + max_threads_per_block)
/aosp_15_r20/external/pytorch/aten/src/ATen/cuda/
H A DCUDAApplyUtils.cuh356 int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
359 C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) in C10_LAUNCH_BOUNDS_2() argument
378 int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
403 const dim3 block = getApplyBlock(max_threads_per_block); in CUDA_tensor_apply2()
408 if (!getApplyGrid<step>(totalElements, grid, curDevice, max_threads_per_block)) { in CUDA_tensor_apply2()
443 max_threads_per_block, \ in CUDA_tensor_apply2()
526 int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
534 max_threads_per_block, min_blocks_per_sm>(a, b, op, aType, bType); in CUDA_tensor_apply2()
H A DApplyGridUtils.cuh23 …nt64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_T… in getApplyGrid() argument
25 …uint64_t numel_per_thread = static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(s… in getApplyGrid()
42 inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { in getApplyBlock() argument
43 return dim3(max_threads_per_block); in getApplyBlock()
/aosp_15_r20/external/pytorch/c10/macros/
H A DMacros.h314 #define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ argument
315 __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))))
316 #define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ argument
318 (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \
319 (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
/aosp_15_r20/external/pytorch/aten/src/ATen/cuda/detail/
H A DKernelUtils.h26 inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
31 auto block_num = (N - 1) / max_threads_per_block + 1;
/aosp_15_r20/external/mesa3d/src/gallium/frontends/rusticl/api/
H A Dkernel.rs107 CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(dev)), in query()
191 if threads > dev.max_threads_per_block() { in query()
208 let threads = kernel.max_threads_per_block(dev); in query()
613 if threads != 0 && threads > k.max_threads_per_block(q.device) { in enqueue_ndrange_kernel()
H A Ddevice.rs192 CL_DEVICE_MAX_WORK_GROUP_SIZE => cl_prop::<usize>(dev.max_threads_per_block()), in query()
/aosp_15_r20/external/mesa3d/src/gallium/drivers/softpipe/
H A Dsp_screen.c538 uint64_t *max_threads_per_block = ret; in softpipe_get_compute_param() local
539 *max_threads_per_block = 1024; in softpipe_get_compute_param()
/aosp_15_r20/external/tensorflow/tensorflow/core/profiler/backends/gpu/
H A Dcupti_collector.cc409 auto max_threads_per_block = in GetDeviceCapabilities() local
428 max_threads_per_block && max_threads_per_sm && regs_per_block && in GetDeviceCapabilities()
434 device_properties_.maxThreadsPerBlock = *max_threads_per_block; in GetDeviceCapabilities()
/aosp_15_r20/external/mesa3d/src/gallium/frontends/clover/api/
H A Dkernel.cpp164 buf.as_scalar<size_t>() = dev.max_threads_per_block(); in clGetKernelWorkGroupInfo()
303 q.device().max_threads_per_block()) in validate_block_size()
H A Ddevice.cpp142 buf.as_scalar<size_t>() = dev.max_threads_per_block(); in clGetDeviceInfo()
/aosp_15_r20/external/mesa3d/src/gallium/frontends/clover/core/
H A Ddevice.cpp75 dev.max_threads_per_block() < 1 || in get_highest_supported_version()
315 device::max_threads_per_block() const { in max_threads_per_block() function in device
H A Ddevice.hpp71 size_t max_threads_per_block() const;
H A Dkernel.cpp149 q.device().max_threads_per_block(), q.device().max_block_size(), in optimal_block_size()
/aosp_15_r20/external/mesa3d/src/gallium/drivers/llvmpipe/
H A Dlp_screen.c515 uint64_t *max_threads_per_block = ret; in llvmpipe_get_compute_param() local
516 *max_threads_per_block = 1024; in llvmpipe_get_compute_param()
/aosp_15_r20/external/mesa3d/src/mesa/state_tracker/
H A Dst_extensions.c1690 uint64_t max_local_size, max_threads_per_block; in st_init_extensions() local
1698 &max_threads_per_block); in st_init_extensions()
1703 consts->MaxComputeWorkGroupInvocations = max_threads_per_block; in st_init_extensions()
1713 max_threads_per_block >= 1024 && in st_init_extensions()
/aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/
H A Dsi_get.c1251 uint64_t *max_threads_per_block = ret; in si_get_compute_param() local
1252 *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); in si_get_compute_param()
/aosp_15_r20/external/mesa3d/src/gallium/drivers/virgl/
H A Dvirgl_screen.c645 uint64_t *max_threads_per_block = ret; in virgl_get_compute_param() local
646 *max_threads_per_block = vscreen->caps.caps.v2.max_compute_work_group_invocations; in virgl_get_compute_param()
/aosp_15_r20/external/mesa3d/src/gallium/drivers/r600/
H A Dr600_pipe_common.c941 uint64_t *max_threads_per_block = ret; in r600_get_compute_param() local
942 *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type); in r600_get_compute_param()
/aosp_15_r20/external/mesa3d/src/gallium/frontends/lavapipe/
H A Dlvp_device.c776 uint64_t max_threads_per_block, max_local_size; in lvp_get_properties() local
784 &max_threads_per_block); in lvp_get_properties()
858 .maxComputeWorkGroupInvocations = max_threads_per_block, in lvp_get_properties()
/aosp_15_r20/external/mesa3d/src/gallium/frontends/rusticl/core/
H A Dkernel.rs1206 let mut threads = self.max_threads_per_block(d); in suggest_local_size()
1703 pub fn max_threads_per_block(&self, dev: &Device) -> usize { in max_threads_per_block() method
H A Ddevice.rs953 pub fn max_threads_per_block(&self) -> usize { in max_threads_per_block() method
/aosp_15_r20/external/mesa3d/src/amd/vulkan/
H A Dradv_device.c1192 uint32_t max_threads_per_block = 2048; in radv_CreateDevice() local
1193 device->scratch_waves = MAX2(32 * pdev->info.num_cu, max_threads_per_block / 64); in radv_CreateDevice()