1 #include <ATen/cuda/CUDAContextLight.h> 2 #include <ATen/cuda/Sleep.h> 3 4 #include <c10/cuda/CUDAException.h> 5 #include <c10/cuda/CUDAStream.h> 6 7 namespace at::cuda { 8 namespace { spin_kernel(int64_t cycles)9__global__ void spin_kernel(int64_t cycles) { 10 // Few AMD specific GPUs have different clock intrinsic 11 #if defined(__GFX11__) && defined(USE_ROCM) && !defined(__CUDA_ARCH__) 12 int64_t start_clock = wall_clock64(); 13 #else 14 // see concurrentKernels CUDA sampl 15 int64_t start_clock = clock64(); 16 #endif 17 int64_t clock_offset = 0; 18 while (clock_offset < cycles) 19 { 20 #if defined(__GFX11__) && defined(USE_ROCM) && !defined(__CUDA_ARCH__) 21 clock_offset = wall_clock64() - start_clock; 22 #else 23 clock_offset = clock64() - start_clock; 24 #endif 25 } 26 } 27 } 28 sleep(int64_t cycles)29void sleep(int64_t cycles) { 30 dim3 grid(1); 31 dim3 block(1); 32 spin_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(cycles); 33 C10_CUDA_KERNEL_LAUNCH_CHECK(); 34 } 35 36 #ifdef USE_ROCM flush_icache_kernel()37__global__ void flush_icache_kernel() 38 { 39 asm __volatile__("s_icache_inv \n\t" 40 "s_nop 0 \n\t" 41 "s_nop 0 \n\t" 42 "s_nop 0 \n\t" 43 "s_nop 0 \n\t" 44 "s_nop 0 \n\t" 45 "s_nop 0 \n\t" 46 "s_nop 0 \n\t" 47 "s_nop 0 \n\t" 48 "s_nop 0 \n\t" 49 "s_nop 0 \n\t" 50 "s_nop 0 \n\t" 51 "s_nop 0 \n\t" 52 "s_nop 0 \n\t" 53 "s_nop 0 \n\t" 54 "s_nop 0 \n\t" 55 "s_nop 0 \n\t" :: 56 :); 57 } 58 #endif 59 flush_icache()60void flush_icache() { 61 #ifdef USE_ROCM 62 dim3 grid(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * 60); 63 dim3 block(64); 64 flush_icache_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(); 65 C10_CUDA_KERNEL_LAUNCH_CHECK(); 66 #endif 67 } 68 69 } // namespace at::cuda 70