#include #include #include namespace c10::cuda { namespace { // returns -1 on failure int32_t driver_version() { int driver_version = -1; C10_CUDA_IGNORE_ERROR(cudaDriverGetVersion(&driver_version)); return driver_version; } int device_count_impl(bool fail_if_no_driver) { int count = 0; auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDeviceCount(&count)); if (err == cudaSuccess) { return count; } // Clear out the error state, so we don't spuriously trigger someone else. // (This shouldn't really matter, since we won't be running very much CUDA // code in this regime.) cudaError_t last_err C10_UNUSED = cudaGetLastError(); switch (err) { case cudaErrorNoDevice: // Zero devices is ok here count = 0; break; case cudaErrorInsufficientDriver: { auto version = driver_version(); if (version <= 0) { if (!fail_if_no_driver) { // No CUDA driver means no devices count = 0; break; } TORCH_CHECK( false, "Found no NVIDIA driver on your system. Please check that you " "have an NVIDIA GPU and installed a driver from " "http://www.nvidia.com/Download/index.aspx"); } else { TORCH_CHECK( false, "The NVIDIA driver on your system is too old (found version ", version, "). Please update your GPU driver by downloading and installing " "a new version from the URL: " "http://www.nvidia.com/Download/index.aspx Alternatively, go to: " "https://pytorch.org to install a PyTorch version that has been " "compiled with your version of the CUDA driver."); } } break; case cudaErrorInitializationError: TORCH_CHECK( false, "CUDA driver initialization failed, you might not " "have a CUDA gpu."); break; case cudaErrorUnknown: TORCH_CHECK( false, "CUDA unknown error - this may be due to an " "incorrectly set up environment, e.g. changing env " "variable CUDA_VISIBLE_DEVICES after program start. " "Setting the available devices to be zero."); break; #if C10_ASAN_ENABLED case cudaErrorMemoryAllocation: // In ASAN mode, we know that a cudaErrorMemoryAllocation error will // pop up if compiled with NVCC (clang-cuda is fine) TORCH_CHECK( false, "Got 'out of memory' error while trying to initialize CUDA. " "CUDA with nvcc does not work well with ASAN and it's probably " "the reason. We will simply shut down CUDA support. If you " "would like to use GPUs, turn off ASAN."); break; #endif // C10_ASAN_ENABLED default: TORCH_CHECK( false, "Unexpected error from cudaGetDeviceCount(). Did you run " "some cuda functions before calling NumCudaDevices() " "that might have already set an error? Error ", err, ": ", cudaGetErrorString(err)); } return count; } } // namespace DeviceIndex device_count() noexcept { // initialize number of devices only once static int count = []() { try { auto result = device_count_impl(/*fail_if_no_driver=*/false); TORCH_INTERNAL_ASSERT( result <= std::numeric_limits::max(), "Too many CUDA devices, DeviceIndex overflowed"); return result; } catch (const c10::Error& ex) { // We don't want to fail, but still log the warning // msg() returns the message without the stack trace TORCH_WARN("CUDA initialization: ", ex.msg()); return 0; } }(); return static_cast(count); } DeviceIndex device_count_ensure_non_zero() { // Call the implementation every time to throw the exception int count = device_count_impl(/*fail_if_no_driver=*/true); // Zero gpus doesn't produce a warning in `device_count` but we fail here TORCH_CHECK(count, "No CUDA GPUs are available"); TORCH_INTERNAL_ASSERT( count <= std::numeric_limits::max(), "Too many CUDA devices, DeviceIndex overflowed"); return static_cast(count); } DeviceIndex current_device() { DeviceIndex cur_device = -1; C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device)); return cur_device; } void set_device(DeviceIndex device) { C10_CUDA_CHECK(c10::cuda::SetDevice(device)); } void device_synchronize() { const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { (*interp)->trace_gpu_device_synchronization(c10::kCUDA); } C10_CUDA_CHECK(cudaDeviceSynchronize()); } // this function has to be called from callers performing cuda synchronizing // operations, to raise proper error or warning void warn_or_error_on_sync() { if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) { TORCH_CHECK(false, "called a synchronizing CUDA operation"); } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) { TORCH_WARN("called a synchronizing CUDA operation"); } } std::optional getDeviceIndexWithPrimaryContext() { // check current device first auto current_device_index = current_device(); if (current_device_index >= 0) { if (hasPrimaryContext(current_device_index)) { return current_device_index; } } for (const auto device_index : c10::irange(at::cuda::device_count())) { if (device_index == current_device_index) continue; if (hasPrimaryContext(device_index)) { return device_index; } } return std::nullopt; } namespace _internal { bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) { TORCH_CHECK(false, "Should never been called"); } bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext; // Private api to be called from CUDAHooks.cpp C10_CUDA_API void setHasPrimaryContext(bool (*func)(DeviceIndex)) { hasPrimaryContext = func ? func : dummyHasPrimaryContext; } } // namespace _internal bool hasPrimaryContext(DeviceIndex device_index) { return _internal::hasPrimaryContext(device_index); } // Wrappers for raw CUDA device management functions cudaError_t GetDeviceCount(int* dev_count) { return cudaGetDeviceCount(dev_count); } // This is a codepath for CUDA 12 that comes with a critical change in behavior // of `cudaSetDevice`. Unlike to previous CUDA versions that allocate context // lazily CUDA 12.x eagerly allocates primary context the moment `cudaSetDevice` // is called. This can lead to dramatic consequences and pollute the device // memory in distributed runs. To avoid unnecessary context creation a new // function called `MaybeSetDevice` was introduced. This function is to be // called in device guard destructor and at the exit of torch.cuda.device // context manager. The behavior of `MaybeSetDevice` is quite simple, it calls // to `cudaSetDevice` if context already exist or if context was not allocated // on targeted device it simply saves the device index. This way we can keep // PyTorch backward compatible for applications like this: // // ``` // import torch // x = torch.empty(1, device=“cuda:1”) # no CUDA context on cuda:0 after this // call y = torch.empty(1, device=“cuda”) # CUDA context is created on cuda:0 // ``` #if CUDA_VERSION >= 12000 thread_local DeviceIndex targetDeviceIndex = -1; cudaError_t GetDevice(DeviceIndex* device) { if (targetDeviceIndex >= 0) { *device = targetDeviceIndex; return cudaSuccess; } int tmp_device = -1; auto err = cudaGetDevice(&tmp_device); if (err == cudaSuccess) { TORCH_INTERNAL_ASSERT( tmp_device >= 0 && tmp_device <= std::numeric_limits::max(), "cudaGetDevice returns invalid device ", tmp_device); *device = static_cast(tmp_device); } return err; } cudaError_t SetDevice(DeviceIndex device) { TORCH_CHECK(device >= 0, "device id must be positive!", device); targetDeviceIndex = -1; int cur_device = -1; C10_CUDA_CHECK(cudaGetDevice(&cur_device)); if (device == cur_device) { return cudaSuccess; } return cudaSetDevice(device); } cudaError_t MaybeSetDevice(DeviceIndex device) { if (hasPrimaryContext(device)) { return c10::cuda::SetDevice(device); } targetDeviceIndex = device; return cudaSuccess; } // This function always initializes the CUDA context // on to_device DeviceIndex ExchangeDevice(DeviceIndex to_device) { auto cur_device = targetDeviceIndex; targetDeviceIndex = -1; if (cur_device < 0) { int tmp_device = -1; C10_CUDA_CHECK(cudaGetDevice(&tmp_device)); cur_device = static_cast(tmp_device); if (to_device == cur_device) { return cur_device; } } C10_CUDA_CHECK(cudaSetDevice(to_device)); return cur_device; } // This function does not initialize the CUDA context // on to_device if it does not already exist DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) { int tmp_cur_device = -1; C10_CUDA_CHECK(cudaGetDevice(&tmp_cur_device)); TORCH_INTERNAL_ASSERT( tmp_cur_device >= 0 && tmp_cur_device <= std::numeric_limits::max(), "cudaGetDevice returns invalid device ", tmp_cur_device); auto cur_device = static_cast(tmp_cur_device); if (to_device == tmp_cur_device) { return cur_device; } if (hasPrimaryContext(to_device)) { C10_CUDA_CHECK(cudaSetDevice(to_device)); } else { targetDeviceIndex = to_device; } return cur_device; } void SetTargetDevice() { if (targetDeviceIndex >= 0) { C10_CUDA_CHECK(c10::cuda::SetDevice(targetDeviceIndex)); } } #else cudaError_t GetDevice(DeviceIndex* device) { int tmp_device = -1; auto err = cudaGetDevice(&tmp_device); if (err == cudaSuccess) { TORCH_INTERNAL_ASSERT( tmp_device >= 0 && tmp_device <= std::numeric_limits::max(), "cudaGetDevice returns invalid device ", tmp_device); *device = static_cast(tmp_device); } return err; } cudaError_t SetDevice(DeviceIndex device) { TORCH_CHECK(device >= 0, "device id must be positive!", device); int cur_device = -1; C10_CUDA_CHECK(cudaGetDevice(&cur_device)); if (device == cur_device) { return cudaSuccess; } return cudaSetDevice(device); } cudaError_t MaybeSetDevice(DeviceIndex device) { return c10::cuda::SetDevice(device); } DeviceIndex ExchangeDevice(DeviceIndex to_device) { DeviceIndex cur_device = -1; C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device)); if (to_device == cur_device) { return cur_device; } C10_CUDA_CHECK(cudaSetDevice(to_device)); return cur_device; } DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) { return c10::cuda::ExchangeDevice(to_device); } void SetTargetDevice() { // no-op on CUDA version < 12.x } #endif } // namespace c10::cuda