xref: /aosp_15_r20/external/pytorch/aten/src/ATen/cuda/detail/CUDAHooks.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #include <ATen/cuda/detail/CUDAHooks.h>
2 
3 #include <ATen/cuda/CUDAGeneratorImpl.h>
4 #include <ATen/Context.h>
5 #include <ATen/DeviceGuard.h>
6 #include <ATen/DynamicLibrary.h>
7 #include <ATen/core/Vitals.h>
8 #include <ATen/cuda/CUDAConfig.h>
9 #include <ATen/cuda/CUDADevice.h>
10 #include <ATen/cuda/Exceptions.h>
11 #include <ATen/cuda/PeerToPeerAccess.h>
12 #include <ATen/cuda/PinnedMemoryAllocator.h>
13 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
14 #include <ATen/detail/CUDAHooksInterface.h>
15 #include <ATen/native/cuda/CuFFTPlanCache.h>
16 #include <c10/util/Exception.h>
17 #include <c10/cuda/CUDACachingAllocator.h>
18 #include <c10/cuda/CUDAFunctions.h>
19 #include <c10/util/irange.h>
20 
21 #if AT_CUDNN_ENABLED()
22 #include <ATen/cudnn/cudnn-wrapper.h>
23 #endif
24 
25 #if AT_MAGMA_ENABLED()
26 #include <magma_v2.h>
27 #endif
28 
29 #if defined(USE_ROCM)
30 #include <miopen/version.h>
31 #endif
32 
33 #ifndef USE_ROCM
34 #include <ATen/cuda/detail/LazyNVRTC.h>
35 #endif
36 
37 #include <cuda.h>
38 
39 #include <sstream>
40 #include <cstddef>
41 #include <functional>
42 #include <memory>
43 
44 namespace c10::cuda::_internal {
45 void setHasPrimaryContext(bool (*func)(DeviceIndex));
46 }
47 
48 namespace at::cuda::detail {
49 
50 const at::cuda::NVRTC& nvrtc();
51 DeviceIndex current_device();
52 
53 static void (*magma_init_fn)() = nullptr;
54 
set_magma_init_fn(void (* fn)())55 void set_magma_init_fn(void (*fn)()) {
56   magma_init_fn = fn;
57 }
58 
59 namespace {
_hasPrimaryContext(DeviceIndex device_index)60 bool _hasPrimaryContext(DeviceIndex device_index) {
61   TORCH_CHECK(device_index >= 0 && device_index < at::cuda::device_count(),
62               "hasPrimaryContext expects a valid device index, but got device_index=", device_index);
63   unsigned int ctx_flags;
64   // In standalone tests of cuDevicePrimaryCtxGetState, I've seen the "active" argument end up with weird
65   // (garbage-looking nonzero) values when the context is not active, unless I initialize it to zero.
66   int ctx_is_active = 0;
67   AT_CUDA_DRIVER_CHECK(nvrtc().cuDevicePrimaryCtxGetState(device_index, &ctx_flags, &ctx_is_active));
68   return ctx_is_active == 1;
69 }
70 
71 // Register hasPrimaryContext back to c10::cuda
72 struct _Initializer {
_Initializerat::cuda::detail::__anon747d83640111::_Initializer73   _Initializer() {
74       c10::cuda::_internal::setHasPrimaryContext(_hasPrimaryContext);
75   }
~_Initializerat::cuda::detail::__anon747d83640111::_Initializer76   ~_Initializer() {
77       c10::cuda::_internal::setHasPrimaryContext(nullptr);
78   }
79 } initializer;
80 } // anonymous namespace
81 
// Sets the CUDA_MODULE_LOADING environment variable to `def_value`,
// unless the user already set it (an existing value is never overwritten).
void maybe_set_cuda_module_loading(const std::string &def_value) {
  if (std::getenv("CUDA_MODULE_LOADING") != nullptr) {
    return; // respect the user's explicit setting
  }
#ifdef _WIN32
  const auto env_var = "CUDA_MODULE_LOADING=" + def_value;
  _putenv(env_var.c_str());
#else
  setenv("CUDA_MODULE_LOADING", def_value.c_str(), 1);
#endif
}
95 
96 // NB: deleter is dynamic, because we need it to live in a separate
97 // compilation unit (alt is to have another method in hooks, but
98 // let's not if we don't need to!)
initCUDA() const99 void CUDAHooks::initCUDA() const {
100   C10_LOG_API_USAGE_ONCE("aten.init.cuda");
101   // Force the update to enable unit testing. This code get executed before unit tests
102   // have a chance to enable vitals.
103   at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true);
104 
105   maybe_set_cuda_module_loading("LAZY");
106   const auto num_devices = c10::cuda::device_count_ensure_non_zero();
107   c10::cuda::CUDACachingAllocator::init(num_devices);
108   at::cuda::detail::init_p2p_access_cache(num_devices);
109 
110 #if AT_MAGMA_ENABLED()
111   TORCH_INTERNAL_ASSERT(magma_init_fn != nullptr, "Cannot initialize magma, init routine not set");
112   magma_init_fn();
113 #endif
114 }
115 
getDefaultCUDAGenerator(DeviceIndex device_index) const116 const Generator& CUDAHooks::getDefaultCUDAGenerator(DeviceIndex device_index) const {
117   return at::cuda::detail::getDefaultCUDAGenerator(device_index);
118 }
119 
getDeviceFromPtr(void * data) const120 Device CUDAHooks::getDeviceFromPtr(void* data) const {
121   return at::cuda::getDeviceFromPtr(data);
122 }
123 
isPinnedPtr(const void * data) const124 bool CUDAHooks::isPinnedPtr(const void* data) const {
125   // First check if driver is broken/missing, in which case PyTorch CPU
126   // functionalities should still work, we should report `false` here.
127   if (!at::cuda::is_available()) {
128     return false;
129   }
130   // cudaPointerGetAttributes grabs context on the current device, so we set
131   // device to one that already has context, if exists.
132   at::OptionalDeviceGuard device_guard;
133   auto primary_ctx_device_index = getDeviceIndexWithPrimaryContext();
134   if (primary_ctx_device_index.has_value()) {
135     device_guard.reset_device(at::Device(at::DeviceType::CUDA, *primary_ctx_device_index));
136   }
137   cudaPointerAttributes attr;
138   // We do not believe that CUDA needs mutable access to the data
139   // here.
140   cudaError_t err = cudaPointerGetAttributes(&attr, data);
141 #if !defined(USE_ROCM)
142   if (err == cudaErrorInvalidValue) {
143     (void)cudaGetLastError(); // clear CUDA error
144     return false;
145   }
146   AT_CUDA_CHECK(err);
147 #else
148   // HIP throws hipErrorUnknown here
149   if (err != cudaSuccess) {
150     (void)cudaGetLastError(); // clear HIP error
151     return false;
152   }
153 #endif
154   return attr.type == cudaMemoryTypeHost;
155 }
156 
hasCUDA() const157 bool CUDAHooks::hasCUDA() const {
158   return at::cuda::is_available();
159 }
160 
hasMAGMA() const161 bool CUDAHooks::hasMAGMA() const {
162 #if AT_MAGMA_ENABLED()
163   return true;
164 #else
165   return false;
166 #endif
167 }
168 
hasCuDNN() const169 bool CUDAHooks::hasCuDNN() const {
170   return AT_CUDNN_ENABLED();
171 }
172 
hasCuSOLVER() const173 bool CUDAHooks::hasCuSOLVER() const {
174 #if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)
175   return true;
176 #elif AT_ROCM_ENABLED()
177   return true;
178 #else
179   return false;
180 #endif
181 }
182 
hasCuBLASLt() const183 bool CUDAHooks::hasCuBLASLt() const {
184 #if defined(CUDART_VERSION)
185   return true;
186 #elif AT_ROCM_ENABLED()
187   return true;
188 #else
189   return false;
190 #endif
191 }
192 
hasROCM() const193 bool CUDAHooks::hasROCM() const {
194   // Currently, this is same as `compiledWithMIOpen`.
195   // But in future if there are ROCm builds without MIOpen,
196   // then `hasROCM` should return true while `compiledWithMIOpen`
197   // should return false
198   return AT_ROCM_ENABLED();
199 }
200 
201 #if defined(USE_DIRECT_NVRTC)
load_nvrtc()202 static std::pair<std::unique_ptr<at::DynamicLibrary>, at::cuda::NVRTC*> load_nvrtc() {
203   return std::make_pair(nullptr, at::cuda::load_nvrtc());
204 }
205 #elif !defined(USE_ROCM)
load_nvrtc()206 static std::pair<std::unique_ptr<at::DynamicLibrary>, at::cuda::NVRTC*> load_nvrtc() {
207   return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC);
208 }
209 #else
load_nvrtc()210 static std::pair<std::unique_ptr<at::DynamicLibrary>, at::cuda::NVRTC*> load_nvrtc() {
211 #if defined(_WIN32)
212   std::string libcaffe2_nvrtc = "caffe2_nvrtc.dll";
213 #elif defined(__APPLE__)
214   std::string libcaffe2_nvrtc = "libcaffe2_nvrtc.dylib";
215 #else
216   std::string libcaffe2_nvrtc = "libcaffe2_nvrtc.so";
217 #endif
218   std::unique_ptr<at::DynamicLibrary> libnvrtc_stub(
219       new at::DynamicLibrary(libcaffe2_nvrtc.c_str()));
220   auto fn = (at::cuda::NVRTC * (*)()) libnvrtc_stub->sym("load_nvrtc");
221   return std::make_pair(std::move(libnvrtc_stub), fn());
222 }
223 #endif
224 
nvrtc()225 const at::cuda::NVRTC& nvrtc() {
226   // must hold onto DynamicLibrary otherwise it will unload
227   static auto handle = load_nvrtc();
228   return *handle.second;
229 }
230 
nvrtc() const231 const at::cuda::NVRTC& CUDAHooks::nvrtc() const {
232   return at::cuda::detail::nvrtc();
233 }
234 
current_device()235 DeviceIndex current_device() {
236   c10::DeviceIndex device = 0;
237   cudaError_t err = c10::cuda::GetDevice(&device);
238   if (err == cudaSuccess) {
239     return device;
240   }
241   return -1;
242 }
243 
current_device() const244 DeviceIndex CUDAHooks::current_device() const {
245   return at::cuda::detail::current_device();
246 }
247 
hasPrimaryContext(DeviceIndex device_index) const248 bool CUDAHooks::hasPrimaryContext(DeviceIndex device_index) const {
249   return _hasPrimaryContext(device_index);
250 }
251 
getPinnedMemoryAllocator() const252 Allocator* CUDAHooks::getPinnedMemoryAllocator() const {
253   return at::cuda::getPinnedMemoryAllocator();
254 }
255 
getCUDADeviceAllocator() const256 Allocator* CUDAHooks::getCUDADeviceAllocator() const {
257   return at::cuda::getCUDADeviceAllocator();
258 }
259 
compiledWithCuDNN() const260 bool CUDAHooks::compiledWithCuDNN() const {
261   return AT_CUDNN_ENABLED();
262 }
263 
compiledWithMIOpen() const264 bool CUDAHooks::compiledWithMIOpen() const {
265   return AT_ROCM_ENABLED();
266 }
267 
supportsDilatedConvolutionWithCuDNN() const268 bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
269 #if AT_CUDNN_ENABLED()
270   // NOTE: extra parenthesis around numbers disable clang warnings about
271   // dead code
272   return true;
273 #else
274   return false;
275 #endif
276 }
277 
supportsDepthwiseConvolutionWithCuDNN() const278 bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
279 #if AT_CUDNN_ENABLED()
280   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
281   // Check for Volta cores
282   if (prop->major >= 7) {
283     return true;
284   } else {
285     return false;
286   }
287 #else
288   return false;
289 #endif
290 }
291 
supportsBFloat16ConvolutionWithCuDNNv8() const292 bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
293 #if AT_CUDNN_ENABLED()
294   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
295   // Check for Volta cores
296   if (prop->major >= 8) {
297     return true;
298   } else {
299     return false;
300   }
301 #else
302   return false;
303 #endif
304 }
305 
versionCuDNN() const306 long CUDAHooks::versionCuDNN() const {
307 #if AT_CUDNN_ENABLED()
308   return CUDNN_VERSION;
309 #else
310   AT_ERROR("Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
311 #endif
312 }
313 
versionCUDART() const314 long CUDAHooks::versionCUDART() const {
315 #ifdef CUDART_VERSION
316   return CUDART_VERSION;
317 #else
318   TORCH_CHECK(
319     false,
320     "Cannot query CUDART version because CUDART is not available");
321 #endif
322 }
323 
hasCUDART() const324 bool CUDAHooks::hasCUDART() const {
325 #ifdef CUDART_VERSION
326   return true;
327 #else
328   return false;
329 #endif
330 }
331 
showConfig() const332 std::string CUDAHooks::showConfig() const {
333   std::ostringstream oss;
334 
335   int runtimeVersion;
336   cudaRuntimeGetVersion(&runtimeVersion);
337 
338   auto printCudaStyleVersion = [&](int v) {
339 #ifdef USE_ROCM
340     // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
341     if(v < 500) {
342       // If major=xx, minor=yy then format -> xxyy
343       oss << (v / 100) << "." << (v % 10);
344     }
345     else {
346       // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
347       oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
348     }
349 #else
350     oss << (v / 1000) << "." << (v / 10 % 100);
351     if (v % 10 != 0) {
352       oss << "." << (v % 10);
353     }
354 #endif
355   };
356 
357 #if !defined(USE_ROCM)
358   oss << "  - CUDA Runtime ";
359 #else
360   oss << "  - HIP Runtime ";
361 #endif
362   printCudaStyleVersion(runtimeVersion);
363   oss << "\n";
364 
365   // TODO: Make HIPIFY understand CUDART_VERSION macro
366 #if !defined(USE_ROCM)
367   if (runtimeVersion != CUDART_VERSION) {
368     oss << "  - Built with CUDA Runtime ";
369     printCudaStyleVersion(CUDART_VERSION);
370     oss << "\n";
371   }
372   oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
373 #endif
374 
375 #if !defined(USE_ROCM)
376 #if AT_CUDNN_ENABLED()
377 
378 
379   auto printCudnnStyleVersion = [&](int v) {
380     oss << (v / 1000) << "." << (v / 100 % 10);
381     if (v % 100 != 0) {
382       oss << "." << (v % 100);
383     }
384   };
385 
386   size_t cudnnVersion = cudnnGetVersion();
387   oss << "  - CuDNN ";
388   printCudnnStyleVersion(cudnnVersion);
389   size_t cudnnCudartVersion = cudnnGetCudartVersion();
390   if (cudnnCudartVersion != CUDART_VERSION) {
391     oss << "  (built against CUDA ";
392     printCudaStyleVersion(cudnnCudartVersion);
393     oss << ")";
394   }
395   oss << "\n";
396   if (cudnnVersion != CUDNN_VERSION) {
397     oss << "    - Built with CuDNN ";
398     printCudnnStyleVersion(CUDNN_VERSION);
399     oss << "\n";
400   }
401 #endif
402 #else
403   // TODO: Check if miopen has the functions above and unify
404   oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
405 #endif
406 
407 #if AT_MAGMA_ENABLED()
408   oss << "  - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
409 #endif
410 
411   return oss.str();
412 }
413 
batchnormMinEpsilonCuDNN() const414 double CUDAHooks::batchnormMinEpsilonCuDNN() const {
415 #if AT_CUDNN_ENABLED()
416   return CUDNN_BN_MIN_EPSILON;
417 #else
418   AT_ERROR(
419       "Cannot query CUDNN_BN_MIN_EPSILON if ATen_cuda is not built with CuDNN");
420 #endif
421 }
422 
cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const423 int64_t CUDAHooks::cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const {
424   return at::native::detail::cufft_get_plan_cache_max_size_impl(device_index);
425 }
426 
cuFFTSetPlanCacheMaxSize(DeviceIndex device_index,int64_t max_size) const427 void CUDAHooks::cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const {
428   at::native::detail::cufft_set_plan_cache_max_size_impl(device_index, max_size);
429 }
430 
cuFFTGetPlanCacheSize(DeviceIndex device_index) const431 int64_t CUDAHooks::cuFFTGetPlanCacheSize(DeviceIndex device_index) const {
432   return at::native::detail::cufft_get_plan_cache_size_impl(device_index);
433 }
434 
cuFFTClearPlanCache(DeviceIndex device_index) const435 void CUDAHooks::cuFFTClearPlanCache(DeviceIndex device_index) const {
436   at::native::detail::cufft_clear_plan_cache_impl(device_index);
437 }
438 
getNumGPUs() const439 int CUDAHooks::getNumGPUs() const {
440   return at::cuda::device_count();
441 }
442 
#ifdef USE_ROCM
// Returns true if the device's gcnArchName contains any of the substrings in
// `archs` (e.g. "gfx90a"). ROCm-only.
bool CUDAHooks::isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const {
  hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device_index);
  const std::string device_arch = prop->gcnArchName;
  // Iterate by const reference: the original copied every pattern string.
  for (const std::string& arch : archs) {
    if (device_arch.find(arch) != std::string::npos) {
      return true;
    }
  }
  return false;
}
#endif
456 
deviceSynchronize(DeviceIndex device_index) const457 void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
458   at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
459   c10::cuda::device_synchronize();
460 }
461 
462 // Sigh, the registry doesn't support namespaces :(
463 using at::CUDAHooksRegistry;
464 using at::RegistererCUDAHooksRegistry;
465 
466 REGISTER_CUDA_HOOKS(CUDAHooks);
467 
468 } // namespace at::cuda::detail
469