#include <ATen/cuda/detail/CUDAHooks.h>

#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/Context.h>
#include <ATen/DeviceGuard.h>
#include <ATen/DynamicLibrary.h>
#include <ATen/core/Vitals.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/cuda/CUDADevice.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/PeerToPeerAccess.h>
#include <ATen/cuda/PinnedMemoryAllocator.h>
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/native/cuda/CuFFTPlanCache.h>
#include <c10/util/Exception.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/irange.h>

#if AT_CUDNN_ENABLED()
#include <ATen/cudnn/cudnn-wrapper.h>
#endif

#if AT_MAGMA_ENABLED()
#include <magma_v2.h>
#endif

#if defined(USE_ROCM)
#include <miopen/version.h>
#endif

#ifndef USE_ROCM
#include <ATen/cuda/detail/LazyNVRTC.h>
#endif

#include <cuda.h>

#include <sstream>
#include <cstddef>
#include <functional>
#include <memory>
namespace c10::cuda::_internal {
void setHasPrimaryContext(bool (*func)(DeviceIndex));
}

namespace at::cuda::detail {

const at::cuda::NVRTC& nvrtc();
DeviceIndex current_device();

static void (*magma_init_fn)() = nullptr;

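// Example (hypothetical, for illustration only): a MAGMA-enabled library would
// register its init routine at load time, before CUDAHooks::initCUDA() runs:
//
//   at::cuda::detail::set_magma_init_fn([]() { magma_init(); });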
void set_magma_init_fn(void (*fn)()) {
  magma_init_fn = fn;
}

namespace {
bool _hasPrimaryContext(DeviceIndex device_index) {
  TORCH_CHECK(device_index >= 0 && device_index < at::cuda::device_count(),
              "hasPrimaryContext expects a valid device index, but got device_index=", device_index);
  unsigned int ctx_flags;
  // In standalone tests of cuDevicePrimaryCtxGetState, I've seen the "active" argument end up with weird
  // (garbage-looking nonzero) values when the context is not active, unless I initialize it to zero.
  int ctx_is_active = 0;
  AT_CUDA_DRIVER_CHECK(nvrtc().cuDevicePrimaryCtxGetState(device_index, &ctx_flags, &ctx_is_active));
  return ctx_is_active == 1;
}

// Register hasPrimaryContext back to c10::cuda
struct _Initializer {
  _Initializer() {
      c10::cuda::_internal::setHasPrimaryContext(_hasPrimaryContext);
  }
  ~_Initializer() {
      c10::cuda::_internal::setHasPrimaryContext(nullptr);
  }
} initializer;
} // anonymous namespace

// Sets the CUDA_MODULE_LOADING environment variable
// if it's not set by the user.
void maybe_set_cuda_module_loading(const std::string &def_value) {
  auto value = std::getenv("CUDA_MODULE_LOADING");
  if (!value) {
#ifdef _WIN32
    auto env_var = "CUDA_MODULE_LOADING=" + def_value;
    _putenv(env_var.c_str());
#else
    setenv("CUDA_MODULE_LOADING", def_value.c_str(), 1);
#endif
  }
}

// NB: deleter is dynamic, because we need it to live in a separate
// compilation unit (alt is to have another method in hooks, but
// let's not if we don't need to!)
void CUDAHooks::initCUDA() const {
  C10_LOG_API_USAGE_ONCE("aten.init.cuda");
  // Force the update to enable unit testing. This code gets executed before
  // unit tests have a chance to enable vitals.
  at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true);

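  // CUDA_MODULE_LOADING=LAZY asks the CUDA runtime (11.7+) to defer loading
  // device modules until first use, which reduces startup time and initial
  // device memory footprint. Users who set the variable themselves keep their
  // own value (see maybe_set_cuda_module_loading above).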
  maybe_set_cuda_module_loading("LAZY");
  const auto num_devices = c10::cuda::device_count_ensure_non_zero();
  c10::cuda::CUDACachingAllocator::init(num_devices);
  at::cuda::detail::init_p2p_access_cache(num_devices);

#if AT_MAGMA_ENABLED()
  TORCH_INTERNAL_ASSERT(magma_init_fn != nullptr, "Cannot initialize magma, init routine not set");
  magma_init_fn();
#endif
}
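
// Note: initCUDA() is not called directly by user code; it runs once when CUDA
// state is first needed (e.g. via at::globalContext().lazyInitCUDA()). The call
// site named here is illustrative, not a guarantee of the API surface.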

const Generator& CUDAHooks::getDefaultCUDAGenerator(DeviceIndex device_index) const {
  return at::cuda::detail::getDefaultCUDAGenerator(device_index);
}

Device CUDAHooks::getDeviceFromPtr(void* data) const {
  return at::cuda::getDeviceFromPtr(data);
}

bool CUDAHooks::isPinnedPtr(const void* data) const {
  // First check if the driver is broken/missing; in that case PyTorch CPU
  // functionality should still work, so we report `false` here.
  if (!at::cuda::is_available()) {
    return false;
  }
  // cudaPointerGetAttributes grabs a context on the current device, so we set
  // the device to one that already has a context, if one exists.
  at::OptionalDeviceGuard device_guard;
  auto primary_ctx_device_index = getDeviceIndexWithPrimaryContext();
  if (primary_ctx_device_index.has_value()) {
    device_guard.reset_device(at::Device(at::DeviceType::CUDA, *primary_ctx_device_index));
  }
  cudaPointerAttributes attr;
  // We do not believe that CUDA needs mutable access to the data
  // here.
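  // For a plain (unregistered) host pointer, cudaPointerGetAttributes either
  // fails with cudaErrorInvalidValue (older toolkits) or, on CUDA 11+, succeeds
  // with attr.type == cudaMemoryTypeUnregistered; both paths below end up
  // reporting "not pinned".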
  cudaError_t err = cudaPointerGetAttributes(&attr, data);
#if !defined(USE_ROCM)
  if (err == cudaErrorInvalidValue) {
    (void)cudaGetLastError(); // clear CUDA error
    return false;
  }
  AT_CUDA_CHECK(err);
#else
  // HIP throws hipErrorUnknown here
  if (err != cudaSuccess) {
    (void)cudaGetLastError(); // clear HIP error
    return false;
  }
#endif
  return attr.type == cudaMemoryTypeHost;
}

bool CUDAHooks::hasCUDA() const {
  return at::cuda::is_available();
}

bool CUDAHooks::hasMAGMA() const {
#if AT_MAGMA_ENABLED()
  return true;
#else
  return false;
#endif
}

bool CUDAHooks::hasCuDNN() const {
  return AT_CUDNN_ENABLED();
}

bool CUDAHooks::hasCuSOLVER() const {
#if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)
  return true;
#elif AT_ROCM_ENABLED()
  return true;
#else
  return false;
#endif
}

bool CUDAHooks::hasCuBLASLt() const {
#if defined(CUDART_VERSION)
  return true;
#elif AT_ROCM_ENABLED()
  return true;
#else
  return false;
#endif
}

bool CUDAHooks::hasROCM() const {
  // Currently this is the same as `compiledWithMIOpen`, but if there are ever
  // ROCm builds without MIOpen, `hasROCM` should return true while
  // `compiledWithMIOpen` returns false.
  return AT_ROCM_ENABLED();
}

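// The NVRTC/driver API shim is obtained in one of three ways, selected at
// build time: direct linking (USE_DIRECT_NVRTC), lazily-resolved stubs for
// CUDA builds (LazyNVRTC), or dlopen'ing the libcaffe2_nvrtc stub library for
// ROCm builds.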
#if defined(USE_DIRECT_NVRTC)
static std::pair<std::unique_ptr<at::DynamicLibrary>, at::cuda::NVRTC*> load_nvrtc() {
  return std::make_pair(nullptr, at::cuda::load_nvrtc());
}
#elif !defined(USE_ROCM)
static std::pair<std::unique_ptr<at::DynamicLibrary>, at::cuda::NVRTC*> load_nvrtc() {
  return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC);
}
#else
static std::pair<std::unique_ptr<at::DynamicLibrary>, at::cuda::NVRTC*> load_nvrtc() {
#if defined(_WIN32)
  std::string libcaffe2_nvrtc = "caffe2_nvrtc.dll";
#elif defined(__APPLE__)
  std::string libcaffe2_nvrtc = "libcaffe2_nvrtc.dylib";
#else
  std::string libcaffe2_nvrtc = "libcaffe2_nvrtc.so";
#endif
  std::unique_ptr<at::DynamicLibrary> libnvrtc_stub(
      new at::DynamicLibrary(libcaffe2_nvrtc.c_str()));
  auto fn = (at::cuda::NVRTC * (*)()) libnvrtc_stub->sym("load_nvrtc");
  return std::make_pair(std::move(libnvrtc_stub), fn());
}
#endif

const at::cuda::NVRTC& nvrtc() {
  // must hold onto DynamicLibrary otherwise it will unload
  static auto handle = load_nvrtc();
  return *handle.second;
}

const at::cuda::NVRTC& CUDAHooks::nvrtc() const {
  return at::cuda::detail::nvrtc();
}

DeviceIndex current_device() {
  c10::DeviceIndex device = 0;
  cudaError_t err = c10::cuda::GetDevice(&device);
  if (err == cudaSuccess) {
    return device;
  }
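  // Swallow the error and report "no current device"; callers treat -1 as unset.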
  return -1;
}

DeviceIndex CUDAHooks::current_device() const {
  return at::cuda::detail::current_device();
}

bool CUDAHooks::hasPrimaryContext(DeviceIndex device_index) const {
  return _hasPrimaryContext(device_index);
}

Allocator* CUDAHooks::getPinnedMemoryAllocator() const {
  return at::cuda::getPinnedMemoryAllocator();
}

Allocator* CUDAHooks::getCUDADeviceAllocator() const {
  return at::cuda::getCUDADeviceAllocator();
}

bool CUDAHooks::compiledWithCuDNN() const {
  return AT_CUDNN_ENABLED();
}

bool CUDAHooks::compiledWithMIOpen() const {
  return AT_ROCM_ENABLED();
}

bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED()
  return true;
#else
  return false;
#endif
}

bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED()
  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  // Check for Volta cores
  return prop->major >= 7;
#else
  return false;
#endif
}

bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#if AT_CUDNN_ENABLED()
  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  // Check for Ampere (sm_80) or newer cores
  return prop->major >= 8;
#else
  return false;
#endif
}

long CUDAHooks::versionCuDNN() const {
#if AT_CUDNN_ENABLED()
  return CUDNN_VERSION;
#else
  AT_ERROR("Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
#endif
}

long CUDAHooks::versionCUDART() const {
#ifdef CUDART_VERSION
  return CUDART_VERSION;
#else
  TORCH_CHECK(
      false,
      "Cannot query CUDART version because CUDART is not available");
#endif
}

bool CUDAHooks::hasCUDART() const {
#ifdef CUDART_VERSION
  return true;
#else
  return false;
#endif
}

std::string CUDAHooks::showConfig() const {
  std::ostringstream oss;

  // Initialize to 0 so we print something sane even if the query fails
  // (the return status of cudaRuntimeGetVersion is ignored here).
  int runtimeVersion = 0;
  cudaRuntimeGetVersion(&runtimeVersion);

  auto printCudaStyleVersion = [&](int v) {
#ifdef USE_ROCM
    // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
    if (v < 500) {
      // If major=xx, minor=yy then format -> xxyy
      oss << (v / 100) << "." << (v % 100);
    } else {
      // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
      oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
    }
#else
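    // CUDA encodes the runtime version as 1000*major + 10*minor (+ patch in the
    // last digit), e.g. 11080 -> "11.8".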
    oss << (v / 1000) << "." << (v / 10 % 100);
    if (v % 10 != 0) {
      oss << "." << (v % 10);
    }
#endif
  };

#if !defined(USE_ROCM)
  oss << " - CUDA Runtime ";
#else
  oss << " - HIP Runtime ";
#endif
  printCudaStyleVersion(runtimeVersion);
  oss << "\n";

  // TODO: Make HIPIFY understand CUDART_VERSION macro
#if !defined(USE_ROCM)
  if (runtimeVersion != CUDART_VERSION) {
    oss << " - Built with CUDA Runtime ";
    printCudaStyleVersion(CUDART_VERSION);
    oss << "\n";
  }
  oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
#endif

#if !defined(USE_ROCM)
#if AT_CUDNN_ENABLED()

  auto printCudnnStyleVersion = [&](int v) {
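    // Assumes the cuDNN 8.x encoding: 1000*major + 100*minor + patch,
    // e.g. 8902 -> "8.9.2". (Newer cuDNN releases use a different encoding.)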
    oss << (v / 1000) << "." << (v / 100 % 10);
    if (v % 100 != 0) {
      oss << "." << (v % 100);
    }
  };

  size_t cudnnVersion = cudnnGetVersion();
  oss << " - CuDNN ";
  printCudnnStyleVersion(cudnnVersion);
  size_t cudnnCudartVersion = cudnnGetCudartVersion();
  if (cudnnCudartVersion != CUDART_VERSION) {
    oss << " (built against CUDA ";
    printCudaStyleVersion(cudnnCudartVersion);
    oss << ")";
  }
  oss << "\n";
  if (cudnnVersion != CUDNN_VERSION) {
    oss << " - Built with CuDNN ";
    printCudnnStyleVersion(CUDNN_VERSION);
    oss << "\n";
  }
#endif
#else
  // TODO: Check if miopen has the functions above and unify
  oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
#endif

#if AT_MAGMA_ENABLED()
  oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
#endif

  return oss.str();
}
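
// Illustrative output from showConfig() (exact values depend on the build):
//   - CUDA Runtime 12.1
//   - NVCC architecture flags: -gencode;arch=compute_80,code=sm_80
//   - CuDNN 8.9.2
//   - Magma 2.6.1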

double CUDAHooks::batchnormMinEpsilonCuDNN() const {
#if AT_CUDNN_ENABLED()
  return CUDNN_BN_MIN_EPSILON;
#else
  AT_ERROR(
      "Cannot query CUDNN_BN_MIN_EPSILON if ATen_cuda is not built with CuDNN");
#endif
}

int64_t CUDAHooks::cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const {
  return at::native::detail::cufft_get_plan_cache_max_size_impl(device_index);
}

void CUDAHooks::cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const {
  at::native::detail::cufft_set_plan_cache_max_size_impl(device_index, max_size);
}

int64_t CUDAHooks::cuFFTGetPlanCacheSize(DeviceIndex device_index) const {
  return at::native::detail::cufft_get_plan_cache_size_impl(device_index);
}

void CUDAHooks::cuFFTClearPlanCache(DeviceIndex device_index) const {
  at::native::detail::cufft_clear_plan_cache_impl(device_index);
}

int CUDAHooks::getNumGPUs() const {
  return at::cuda::device_count();
}

#ifdef USE_ROCM
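// Example (illustrative; `hooks` stands for the registered CUDAHooks instance):
//
//   bool is_mi_series = hooks.isGPUArch(0, {"gfx90a", "gfx942"});
//
// The check is a substring search against the device's gcnArchName.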
bool CUDAHooks::isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const {
  hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device_index);
  std::string device_arch = prop->gcnArchName;
  for (const std::string& arch : archs) {
    size_t substring = device_arch.find(arch);
    if (substring != std::string::npos) {
      return true;
    }
  }
  return false;
}
#endif

void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
  c10::cuda::device_synchronize();
}

// Sigh, the registry doesn't support namespaces :(
using at::CUDAHooksRegistry;
using at::RegistererCUDAHooksRegistry;

REGISTER_CUDA_HOOKS(CUDAHooks);

} // namespace at::cuda::detail