xref: /aosp_15_r20/external/pytorch/aten/src/ATen/cuda/Sleep.cu (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #include <ATen/cuda/CUDAContextLight.h>
2 #include <ATen/cuda/Sleep.h>
3 
4 #include <c10/cuda/CUDAException.h>
5 #include <c10/cuda/CUDAStream.h>
6 
7 namespace at::cuda {
8 namespace {
spin_kernel(int64_t cycles)9 __global__ void spin_kernel(int64_t cycles) {
10   // Few AMD specific GPUs have different clock intrinsic
11 #if defined(__GFX11__) && defined(USE_ROCM) && !defined(__CUDA_ARCH__)
12   int64_t start_clock = wall_clock64();
13 #else
14   // see concurrentKernels CUDA sampl
15   int64_t start_clock = clock64();
16 #endif
17   int64_t clock_offset = 0;
18   while (clock_offset < cycles)
19   {
20 #if defined(__GFX11__) && defined(USE_ROCM) && !defined(__CUDA_ARCH__)
21     clock_offset = wall_clock64() - start_clock;
22 #else
23     clock_offset = clock64() - start_clock;
24 #endif
25   }
26 }
27 }
28 
sleep(int64_t cycles)29 void sleep(int64_t cycles) {
30   dim3 grid(1);
31   dim3 block(1);
32   spin_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(cycles);
33   C10_CUDA_KERNEL_LAUNCH_CHECK();
34 }
35 
36 #ifdef USE_ROCM
flush_icache_kernel()37 __global__ void flush_icache_kernel()
38 {
39     asm __volatile__("s_icache_inv \n\t"
40                      "s_nop 0 \n\t"
41                      "s_nop 0 \n\t"
42                      "s_nop 0 \n\t"
43                      "s_nop 0 \n\t"
44                      "s_nop 0 \n\t"
45                      "s_nop 0 \n\t"
46                      "s_nop 0 \n\t"
47                      "s_nop 0 \n\t"
48                      "s_nop 0 \n\t"
49                      "s_nop 0 \n\t"
50                      "s_nop 0 \n\t"
51                      "s_nop 0 \n\t"
52                      "s_nop 0 \n\t"
53                      "s_nop 0 \n\t"
54                      "s_nop 0 \n\t"
55                      "s_nop 0 \n\t" ::
56                          :);
57 }
58 #endif
59 
flush_icache()60 void flush_icache() {
61 #ifdef USE_ROCM
62   dim3 grid(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * 60);
63   dim3 block(64);
64   flush_icache_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>();
65   C10_CUDA_KERNEL_LAUNCH_CHECK();
66 #endif
67 }
68 
69 }  // namespace at::cuda
70