1 #include <ATen/ATen.h>
2 #include <ATen/core/TensorBody.h>
3 #include <ATen/cuda/CUDAConfig.h>
4 #include <ATen/native/ConvUtils.h>
5 #include <c10/core/Device.h>
6 #include <c10/core/TensorImpl.h>
7 #include <c10/util/UniqueVoidPtr.h>
8 #include <fmt/core.h>
9 #include <pybind11/pytypes.h>
10 #include <torch/csrc/utils/python_arg_parser.h>
11 #include <unordered_set>
12 
13 #if AT_CUDNN_ENABLED()
14 
15 #endif
16 #include <ATen/cuda/CUDAContext.h>
17 #include <ATen/cuda/CUDAGeneratorImpl.h>
18 #include <ATen/cuda/CachingHostAllocator.h>
19 #include <ATen/cuda/Sleep.h>
20 #include <ATen/cuda/detail/CUDAHooks.h>
21 #include <ATen/cuda/jiterator.h>
22 #include <ATen/cuda/tunable/Tunable.h>
23 #include <c10/core/StorageImpl.h>
24 #include <c10/cuda/CUDAAllocatorConfig.h>
25 #include <c10/cuda/CUDACachingAllocator.h>
26 #include <c10/cuda/CUDAFunctions.h>
27 #include <ATen/cuda/CUDAGraphsUtils.cuh>
28 
29 #ifdef USE_NCCL
30 #include <torch/csrc/cuda/python_nccl.h>
31 #endif
32 #include <c10/util/CallOnce.h>
33 #include <c10/util/irange.h>
34 
35 #include <torch/csrc/CudaIPCTypes.h>
36 #include <torch/csrc/Generator.h>
37 #include <torch/csrc/cuda/CUDAPluggableAllocator.h>
38 #include <torch/csrc/cuda/GdsFile.h>
39 #include <torch/csrc/cuda/THCP.h>
40 #include <torch/csrc/cuda/memory_snapshot.h>
41 #include <torch/csrc/cuda/python_comm.h>
42 #include <torch/csrc/profiler/python/combined_traceback.h>
43 #include <torch/csrc/python_headers.h>
44 #include <torch/csrc/utils/device_lazy_init.h>
45 #include <torch/csrc/utils/pybind.h>
46 #include <torch/csrc/utils/pycfunction_helpers.h>
47 #include <torch/csrc/utils/python_numbers.h>
48 #include <torch/csrc/utils/python_strings.h>
49 #include <array>
50 #include <chrono>
51 #include <iostream>
52 #include <sstream>
53 #include <thread>
54 #include <unordered_map>
55 #ifndef WIN32
56 #include <pthread.h>
57 #endif
58 
59 using namespace torch;
60 
61 static bool in_bad_fork = false; // True for children forked after cuda init
62 
63 #ifndef WIN32
64 // Called in the forked child if cuda has already been initialized
65 static void forked_child() {
66   in_bad_fork = true;
67   torch::utils::set_requires_device_init(at::kCUDA, true);
68 }
69 #endif
70 
71 // Should be called before the first cuda call.
72 // Note: This is distinct from initExtension because a stub cuda implementation
73 // has some working functions (e.g. device_count) but cannot fully initialize.
74 static void poison_fork() {
75 #ifndef WIN32
76   static c10::once_flag flag;
77   c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
78 #endif
79 }
80 
81 ////////////////////////////////////////////////////////////////////////////////
82 // CUDA management methods
83 ////////////////////////////////////////////////////////////////////////////////
84 
85 PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
86   HANDLE_TH_ERRORS
87   TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
88   auto device = THPUtils_unpackLong(arg);
89 
90   torch::utils::device_lazy_init(at::kCUDA);
91   c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));
92 
93   Py_RETURN_NONE;
94   END_HANDLE_TH_ERRORS
95 }
96 
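// Make the given index the current CUDA device and return the index that was
// current before. A negative index is a no-op that returns -1 without
// initializing CUDA.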
97 PyObject* THCPModule_exchangeDevice(PyObject* self, PyObject* arg) {
98   HANDLE_TH_ERRORS
99   TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice");
100   auto device_index = THPUtils_unpackDeviceIndex(arg);
101   if (device_index < 0) {
102     return THPUtils_packInt32(-1);
103   }
104 
105   torch::utils::device_lazy_init(at::kCUDA);
106   auto current_device = c10::cuda::ExchangeDevice(device_index);
107 
108   return THPUtils_packDeviceIndex(current_device);
109   END_HANDLE_TH_ERRORS
110 }
111 
112 PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
113   HANDLE_TH_ERRORS
114   TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice");
115   auto device_index = THPUtils_unpackDeviceIndex(arg);
116   if (device_index < 0) {
117     return THPUtils_packInt32(-1);
118   }
119 
120   torch::utils::device_lazy_init(at::kCUDA);
121   auto current_device = c10::cuda::MaybeExchangeDevice(device_index);
122 
123   return THPUtils_packDeviceIndex(current_device);
124   END_HANDLE_TH_ERRORS
125 }
126 
127 PyObject* THCPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
128   HANDLE_TH_ERRORS
129   torch::utils::device_lazy_init(at::kCUDA);
130   // NOLINTNEXTLINE(bugprone-signed-char-misuse)
131   auto device = static_cast<int32_t>(c10::cuda::current_device());
132   return THPUtils_packInt32(device);
133   END_HANDLE_TH_ERRORS
134 }
135 
136 PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {
137   HANDLE_TH_ERRORS
138   PyObject* arg1 = nullptr;
139   PyObject* arg2 = nullptr;
140   if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
141     THPUtils_invalidArguments(
142         args,
143         nullptr,
144         "can_device_peer_access",
145         1,
146         "(int device, int peer_device);");
147     return nullptr;
148   }
149   TORCH_CHECK(
150       THPUtils_checkLong(arg1), "invalid argument to canDeviceAccessPeer");
151   TORCH_CHECK(
152       THPUtils_checkLong(arg2), "invalid argument to canDeviceAccessPeer");
153   int64_t device = THPUtils_unpackLong(arg1);
154   int64_t peer_device = THPUtils_unpackLong(arg2);
155 
156   torch::utils::device_lazy_init(at::kCUDA);
157   auto can_access = at::cuda::canDeviceAccessPeer(device, peer_device);
158   return PyBool_FromLong(can_access);
159   END_HANDLE_TH_ERRORS
160 }
161 
162 PyObject* THCPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
163   HANDLE_TH_ERRORS
164   poison_fork();
165   return THPUtils_packUInt64(at::cuda::device_count());
166   END_HANDLE_TH_ERRORS
167 }
168 
169 PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) {
170   HANDLE_TH_ERRORS
171   poison_fork();
172 #ifdef CUDA_ARCH_FLAGS
173   static const char* flags = C10_STRINGIZE(CUDA_ARCH_FLAGS);
174   return THPUtils_packString(flags);
175 #else
176   Py_RETURN_NONE;
177 #endif
178   END_HANDLE_TH_ERRORS
179 }
180 
181 static PyObject* THCPModule_isInBadFork(PyObject* self, PyObject* noargs) {
182   HANDLE_TH_ERRORS
183   return PyBool_FromLong(in_bad_fork);
184   END_HANDLE_TH_ERRORS
185 }
186 
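// Return the current stream on `device_index` as a (stream_id, device_index,
// device_type) tuple, i.e. the same triple that CUDAStream::unpack3 consumes.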
187 PyObject* THCPModule_getCurrentStream_wrap(
188     PyObject* /* unused */,
189     PyObject* device_index) {
190   HANDLE_TH_ERRORS
191   TORCH_CHECK(
192       THPUtils_checkLong(device_index), "invalid argument to getCurrentStream");
193   auto c10_device_index = THPUtils_unpackDeviceIndex(device_index);
194   auto stream = at::cuda::getCurrentCUDAStream(c10_device_index);
195   PyObject* output_tuple = PyTuple_New(3);
196   PyTuple_SetItem(
197       output_tuple, 0, THPUtils_packInt64(static_cast<int64_t>(stream.id())));
198   PyTuple_SetItem(
199       output_tuple, 1, THPUtils_packDeviceIndex(stream.device_index()));
200   PyTuple_SetItem(
201       output_tuple,
202       2,
203       THPUtils_packInt64(static_cast<int64_t>(stream.device_type())));
204   return output_tuple;
205   END_HANDLE_TH_ERRORS
206 }
207 
208 PyObject* THCPModule_getCurrentStream_raw(
209     PyObject* /* unused */,
210     PyObject* device_index) {
211   HANDLE_TH_ERRORS
212   TORCH_CHECK(
213       THPUtils_checkLong(device_index), "invalid argument to getCurrentStream");
214   auto c10_device_index = THPUtils_unpackDeviceIndex(device_index);
215   return PyLong_FromVoidPtr(
216       at::cuda::getCurrentCUDAStream(c10_device_index).stream());
217   END_HANDLE_TH_ERRORS
218 }
219 
220 PyObject* THCPModule_getDefaultStream_wrap(
221     PyObject* /* unused */,
222     PyObject* device_index) {
223   HANDLE_TH_ERRORS
224   TORCH_CHECK(
225       THPUtils_checkLong(device_index), "invalid argument to getDefaultStream");
226   auto c10_device_index = THPUtils_unpackDeviceIndex(device_index);
227   auto stream = at::cuda::getDefaultCUDAStream(c10_device_index);
228   PyObject* output_tuple = PyTuple_New(3);
229   PyTuple_SetItem(
230       output_tuple, 0, THPUtils_packInt64(static_cast<int64_t>(stream.id())));
231   PyTuple_SetItem(
232       output_tuple, 1, THPUtils_packDeviceIndex(stream.device_index()));
233   PyTuple_SetItem(
234       output_tuple,
235       2,
236       THPUtils_packInt64(static_cast<int64_t>(stream.device_type())));
237   return output_tuple;
238   END_HANDLE_TH_ERRORS
239 }
240 
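// Rebuild a CUDAStream from its (stream_id, device_index, device_type) triple
// and make it the current stream, switching the active device first if needed.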
241 PyObject* THCPModule_setStream_wrap(
242     PyObject* self,
243     PyObject* args,
244     PyObject* kwargs) {
245   HANDLE_TH_ERRORS
246   int64_t stream_id = 0;
247   int64_t device_index = 0;
248   int64_t device_type = 0;
249 
250   // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
251   constexpr const char* kwlist[] = {
252       "stream_id", "device_index", "device_type", nullptr};
253   if (!PyArg_ParseTupleAndKeywords(
254           args,
255           kwargs,
256           "|LLL",
257           // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
258           const_cast<char**>(kwlist),
259           &stream_id,
260           &device_index,
261           &device_type)) {
262   }
263 
264   auto stream = at::cuda::CUDAStream::unpack3(
265       stream_id,
266       static_cast<c10::DeviceIndex>(device_index),
267       static_cast<c10::DeviceType>(device_type));
268 
269   auto device = c10::cuda::current_device();
270   if (device != stream.device_index()) {
271     c10::cuda::set_device(stream.device_index());
272   }
273   at::cuda::setCurrentCUDAStream(stream);
274   Py_RETURN_NONE;
275   END_HANDLE_TH_ERRORS
276 }
277 
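// Version of the CUDA (or ROCm) toolkit this extension was compiled against,
// as an integer; note this is a compile-time value, not the installed
// driver/runtime version.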
278 PyObject* THCPModule_getCompiledVersion(PyObject* self, PyObject* noargs) {
279 #if defined(USE_ROCM)
280   return THPUtils_packInt64((int64_t)ROCM_VERSION);
281 #else
282   return THPUtils_packInt64((int64_t)CUDA_VERSION);
283 #endif
284 }
285 
286 PyObject* THCPModule_cudaHostAllocator(PyObject* _unused, PyObject* noargs) {
287   HANDLE_TH_ERRORS
288   c10::Allocator* allocator = at::cuda::getCachingHostAllocator();
289   return PyLong_FromVoidPtr(allocator);
290   END_HANDLE_TH_ERRORS
291 }
292 
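// Allocate `size` bytes from the CUDA caching allocator on the given stream
// and return the raw pointer as a Python int; the GIL is released while the
// allocation runs.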
293 PyObject* THCPModule_cudaCachingAllocator_raw_alloc(
294     PyObject* _unused,
295     PyObject* args) {
296   HANDLE_TH_ERRORS
297   PyObject* size_o = nullptr;
298   PyObject* stream_o = nullptr;
299   if (!PyArg_ParseTuple(args, "OO", &size_o, &stream_o)) {
300     THPUtils_invalidArguments(
301         args,
302         nullptr,
303         "caching_allocator_alloc",
304         1,
305         "(ssize_t size, intptr_t stream);");
306     return nullptr;
307   }
308   auto size = PyLong_AsSsize_t(size_o);
309   cudaStream_t stream = static_cast<cudaStream_t>(PyLong_AsVoidPtr(stream_o));
310   void* mem = nullptr;
311   {
312     pybind11::gil_scoped_release no_gil;
313     mem = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(size, stream);
314   }
315   return PyLong_FromVoidPtr(mem);
316   END_HANDLE_TH_ERRORS
317 }
318 
319 // Unpack a PyObject to at::Scalar, throw an exception if it fails
320 at::Scalar as_scalar(PyObject* arg) {
321   // Zero-dim tensors are converted to Scalars as-is. Note this doesn't
322   // currently handle most NumPy scalar types except np.float64.
323   if (THPVariable_Check(arg)) {
324     return THPVariable_Unpack(arg).item();
325   }
326 
327   if (THPUtils_checkLong(arg)) {
328     return at::Scalar(static_cast<int64_t>(THPUtils_unpackLong(arg)));
329   }
330 
331   if (PyBool_Check(arg)) {
332     return at::Scalar(THPUtils_unpackBool(arg));
333   }
334 
335   if (PyComplex_Check(arg)) {
336     return at::Scalar(THPUtils_unpackComplexDouble(arg));
337   }
338   return at::Scalar(THPUtils_unpackDouble(arg));
339 }
340 
341 // Entrypoint for the callable created by torch.cuda.jiterator
342 // See jiterator.py for more details
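// Rough sketch of how this is reached from Python (simplified; see
// torch/cuda/jiterator.py for the real call sequence):
//   fn = torch.cuda.jiterator._create_jit_fn(
//       "template <typename T> T my_op(T a, T b) { return a + b; }")
//   out = fn(x, y)  # forwards the code string, tensors and kwargs here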
343 PyObject* THCPModule_cudaJiteratorCompileAndLaunchKernel(
344     PyObject* _unused,
345     PyObject* args) {
346   HANDLE_TH_ERRORS
347 
348   PyObject* code_string_o = nullptr;
349   PyObject* kernel_name_o = nullptr;
350   PyObject* return_by_ref_o = nullptr;
351   PyObject* num_outputs_o = nullptr;
352   PyObject* tensors_o = nullptr;
353   PyObject* kwargs_o = nullptr;
354   if (!PyArg_ParseTuple(
355           args,
356           "OOOOO|O",
357           &code_string_o,
358           &kernel_name_o,
359           &return_by_ref_o,
360           &num_outputs_o,
361           &tensors_o,
362           &kwargs_o)) {
363     return nullptr;
364   }
365 
366   const std::string code_string = THPUtils_unpackString(code_string_o);
367   const std::string kernel_name = THPUtils_unpackString(kernel_name_o);
368   const bool return_by_ref = THPUtils_unpackBool(return_by_ref_o);
369   const int num_outputs = static_cast<int>(THPUtils_unpackLong(num_outputs_o));
370 
371   TORCH_CHECK(
372       PyTuple_Check(tensors_o),
373       "tensors argument is expected to "
374       "be a tuple, but got ",
375       THPUtils_typename(tensors_o));
376   Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors_o);
377 
378   c10::SmallVector<at::Tensor> tensors;
379   for (const auto i : c10::irange(num_tensors)) {
380     PyObject* _tensor = PyTuple_GET_ITEM(tensors_o, i);
381     TORCH_CHECK(
382         THPVariable_Check(_tensor),
383         i,
384         " of input tensors tuple is not a Tensor");
385 
386     tensors.emplace_back(THPVariable_Unpack(_tensor));
387   }
388 
389   c10::SmallVector<at::Scalar> extra_args;
390   PyObject* key = nullptr;
391   PyObject* value = nullptr;
392   Py_ssize_t pos = 0;
393   while (PyDict_Next(kwargs_o, &pos, &key, &value)) {
394     extra_args.emplace_back(as_scalar(value));
395   }
396 
397   c10::SmallVector<at::Tensor> outputs = at::cuda::CompileAndLaunchKernel(
398       code_string,
399       kernel_name,
400       num_outputs,
401       tensors,
402       extra_args,
403       return_by_ref);
404 
405   if (num_outputs == 1) {
406     return THPVariable_Wrap(outputs[0]);
407   } else {
408     PyObject* output_tuple = PyTuple_New(num_outputs);
409     for (int i = 0; i < num_outputs; ++i) {
410       PyTuple_SetItem(output_tuple, i, THPVariable_Wrap(outputs[i]));
411     }
412     return output_tuple;
413   }
414 
415   END_HANDLE_TH_ERRORS
416 }
417 
418 PyObject* THCPModule_cudaCachingAllocator_raw_delete(
419     PyObject* _unused,
420     PyObject* obj) {
421   HANDLE_TH_ERRORS
422   void* mem_ptr = PyLong_AsVoidPtr(obj);
423   {
424     pybind11::gil_scoped_release no_gil;
425     c10::cuda::CUDACachingAllocator::raw_delete(mem_ptr);
426   }
427   Py_RETURN_NONE;
428   END_HANDLE_TH_ERRORS
429 }
430 
431 PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
432     PyObject* _unused,
433     PyObject* env) {
434   HANDLE_TH_ERRORS
435   c10::cuda::CUDACachingAllocator::setAllocatorSettings(
436       THPUtils_unpackString(env));
437   Py_RETURN_NONE;
438   END_HANDLE_TH_ERRORS
439 }
440 
441 PyObject* THCPModule_getAllocatorBackend(PyObject* _unused, PyObject* noargs) {
442   HANDLE_TH_ERRORS
443   return THPUtils_packString(c10::cuda::CUDACachingAllocator::name());
444   END_HANDLE_TH_ERRORS
445 }
446 
447 PyObject* THCPModule_cudaSynchronize(PyObject* _unused, PyObject* noargs) {
448   HANDLE_TH_ERRORS {
449     pybind11::gil_scoped_release no_gil;
450     c10::cuda::device_synchronize();
451   }
452   Py_RETURN_NONE;
453   END_HANDLE_TH_ERRORS
454 }
455 
456 PyObject* THCPModule_cudaIPCCollect(PyObject* _unused, PyObject* noargs) {
457   HANDLE_TH_ERRORS
458   torch::CudaIPCCollect();
459   Py_RETURN_NONE;
460   END_HANDLE_TH_ERRORS
461 }
462 
463 PyObject* THCPModule_cudaSleep(PyObject* _unused, PyObject* cycles) {
464   HANDLE_TH_ERRORS
465   TORCH_CHECK(
466       THPUtils_checkLong(cycles), "torch.cuda._sleep(): expected 'int'");
467   int64_t unpacked_cycles = THPUtils_unpackLong(cycles);
468   {
469     pybind11::gil_scoped_release no_gil;
470     at::cuda::sleep(unpacked_cycles);
471   }
472   Py_RETURN_NONE;
473   END_HANDLE_TH_ERRORS
474 }
475 
476 // We need to ensure that a thread NEVER loses the GIL while it holds the
477 // CUDA mutex. Otherwise another thread might be scheduled and try to e.g.
478 // allocate a new tensor, which would cause a deadlock. It's enough to have a
479 // single global, because it can only be set once (cudaMutex is not recursive)
480 // by the thread that owns the mutex (obviously there can be only one such
481 // thread).
482 static PyGILState_STATE cudaMutexGILState;
483 
484 PyObject* THCPModule_cudaLockMutex(PyObject* module, PyObject* noargs) {
485   auto mutex = c10::cuda::getFreeMutex();
486   // This has to be a busy loop because we **absolutely need to** hold the GIL
487   // or it's a recipe for a deadlock otherwise (if we let other Python threads
488   // run while we have the cudaMutex, but not the GIL, they might try to e.g.
489   // free a CUDA tensor and acquire the cudaMutex without giving up the GIL,
490   // because it happens deep within THC).
491   while (true) {
492     if (mutex->try_lock())
493       break;
494     {
495       pybind11::gil_scoped_release no_gil;
496       std::this_thread::sleep_for(std::chrono::microseconds(10));
497     }
498   }
499 
500   cudaMutexGILState = PyGILState_Ensure();
501   Py_RETURN_NONE;
502 }
503 
504 PyObject* THCPModule_cudaUnlockMutex(PyObject* module, PyObject* noargs) {
505   auto mutex = c10::cuda::getFreeMutex();
506   PyGILState_Release(cudaMutexGILState);
507   mutex->unlock();
508   Py_RETURN_NONE;
509 }
510 
511 PyObject* THCPModule_hasPrimaryContext(PyObject* _unused, PyObject* arg) {
512   HANDLE_TH_ERRORS
513   TORCH_CHECK(
514       THPUtils_checkLong(arg), "invalid argument to has_primary_context");
515   auto device_index = THPUtils_unpackDeviceIndex(arg);
516   if (c10::cuda::hasPrimaryContext(device_index)) {
517     Py_RETURN_TRUE;
518   } else {
519     Py_RETURN_FALSE;
520   }
521   END_HANDLE_TH_ERRORS
522 }
523 
524 PyObject* THCPModule_setMemoryFraction(PyObject* _unused, PyObject* args) {
525   HANDLE_TH_ERRORS
526   PyObject* fraction_o = nullptr;
527   PyObject* device_o = nullptr;
528   if (!PyArg_ParseTuple(args, "OO", &fraction_o, &device_o)) {
529     THPUtils_invalidArguments(
530         args,
531         nullptr,
532         "set_memory_fraction",
533         1,
534         "(double fraction, int device);");
535     return nullptr;
536   }
537   double fraction = PyFloat_AsDouble(fraction_o);
538   auto device_index = THPUtils_unpackDeviceIndex(device_o);
539 
540   c10::cuda::CUDACachingAllocator::setMemoryFraction(fraction, device_index);
541   END_HANDLE_TH_ERRORS
542   Py_RETURN_NONE;
543 }
544 
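// Release cached pinned (page-locked) host memory held by the caching host
// allocator; the device-memory counterpart is THCPModule_emptyCache below.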
545 PyObject* THCPModule_hostEmptyCache(PyObject* _unused, PyObject* noargs) {
546   HANDLE_TH_ERRORS {
547     pybind11::gil_scoped_release no_gil;
548     at::cuda::CachingHostAllocator_emptyCache();
549   }
550   END_HANDLE_TH_ERRORS
551   Py_RETURN_NONE;
552 }
553 
554 PyObject* THCPModule_emptyCache(PyObject* _unused, PyObject* noargs) {
555   HANDLE_TH_ERRORS {
556     pybind11::gil_scoped_release no_gil;
557     c10::cuda::CUDACachingAllocator::emptyCache();
558   }
559   END_HANDLE_TH_ERRORS
560   Py_RETURN_NONE;
561 }
562 
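// Convert the caching allocator's DeviceStats for one device into nested
// dicts keyed by stat name and pool type ("all", "small_pool", "large_pool").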
563 PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) {
564   HANDLE_TH_ERRORS
565   TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to memory_allocated");
566   const auto device_index = THPUtils_unpackDeviceIndex(arg);
567 
568   using c10::CachingDeviceAllocator::DeviceStats;
569   using c10::CachingDeviceAllocator::Stat;
570   using c10::CachingDeviceAllocator::StatArray;
571   using c10::CachingDeviceAllocator::StatType;
572 
573   const auto statToDict = [](const Stat& stat) {
574     py::dict dict;
575 
576     dict["current"] = stat.current;
577     dict["peak"] = stat.peak;
578     dict["allocated"] = stat.allocated;
579     dict["freed"] = stat.freed;
580     return dict;
581   };
582 
583   const auto statArrayToDict = [=](const StatArray& statArray) {
584     const std::array<const char*, static_cast<size_t>(StatType::NUM_TYPES)>
585         statTypeNames = {"all", "small_pool", "large_pool"};
586     py::dict dict;
587     for (const auto i : c10::irange(statTypeNames.size())) {
588       dict[statTypeNames[i]] = statToDict(statArray[i]);
589     }
590     return dict;
591   };
592 
593   const DeviceStats stats =
594       c10::cuda::CUDACachingAllocator::getDeviceStats(device_index);
595 
596   py::dict result;
597   result["num_alloc_retries"] = stats.num_alloc_retries;
598   result["num_ooms"] = stats.num_ooms;
599   result["max_split_size"] = stats.max_split_size;
600   result["num_sync_all_streams"] = stats.num_sync_all_streams;
601   result["num_device_alloc"] = stats.num_device_alloc;
602   result["num_device_free"] = stats.num_device_free;
603   result["allocation"] = statArrayToDict(stats.allocation);
604   result["segment"] = statArrayToDict(stats.segment);
605   result["active"] = statArrayToDict(stats.active);
606   result["inactive_split"] = statArrayToDict(stats.inactive_split);
607   result["allocated_bytes"] = statArrayToDict(stats.allocated_bytes);
608   result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes);
609   result["active_bytes"] = statArrayToDict(stats.active_bytes);
610   result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes);
611   result["requested_bytes"] = statArrayToDict(stats.requested_bytes);
612   result["oversize_allocations"] = statToDict(stats.oversize_allocations);
613   result["oversize_segments"] = statToDict(stats.oversize_segments);
614 
615   return result.release().ptr();
616   END_HANDLE_TH_ERRORS
617 }
618 
619 PyObject* THCPModule_resetAccumulatedMemoryStats(
620     PyObject* _unused,
621     PyObject* arg) {
622   HANDLE_TH_ERRORS
623   TORCH_CHECK(
624       THPUtils_checkLong(arg),
625       "invalid argument to reset_accumulated_memory_stats");
626   const auto device_index = THPUtils_unpackDeviceIndex(arg);
627   c10::cuda::CUDACachingAllocator::resetAccumulatedStats(device_index);
628   END_HANDLE_TH_ERRORS
629   Py_RETURN_NONE;
630 }
631 
632 PyObject* THCPModule_resetPeakMemoryStats(PyObject* _unused, PyObject* arg) {
633   HANDLE_TH_ERRORS
634   TORCH_CHECK(
635       THPUtils_checkLong(arg), "invalid argument to reset_peak_memory_stats");
636   const auto device_index = THPUtils_unpackDeviceIndex(arg);
637   c10::cuda::CUDACachingAllocator::resetPeakStats(device_index);
638   END_HANDLE_TH_ERRORS
639   Py_RETURN_NONE;
640 }
641 
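// Downcast a GatheredContext to the CapturedTraceback recorded when memory
// history tracking is enabled, erroring on any other context type.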
642 CapturedTraceback* getFromContext(
643     const std::shared_ptr<c10::GatheredContext>& x) {
644   if (CapturedTraceback* sc = dynamic_cast<CapturedTraceback*>(x.get())) {
645     return sc;
646   }
647   TORCH_CHECK(
648       false,
649       "attempting to gather stack context from the wrong StackContext type.");
650 }
651 
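// Build the memory snapshot dict handed back to Python: per-segment block
// lists, per-device allocator trace entries, allocator settings, external
// annotations, and symbolized allocation stack frames.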
652 PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
653   HANDLE_TH_ERRORS
654 
655   using c10::cuda::CUDACachingAllocator::BlockInfo;
656   using c10::cuda::CUDACachingAllocator::SegmentInfo;
657 
658   py::str device_s = "device";
659   py::str address_s = "address";
660   py::str total_size_s = "total_size";
661   py::str allocated_size_s = "allocated_size";
662   py::str active_size_s = "active_size";
663   py::str requested_size_s = "requested_size";
664   py::str stream_s = "stream";
665   py::str segment_type_s = "segment_type";
666   py::str segment_pool_id = "segment_pool_id";
667   py::str large_s = "large";
668   py::str small_s = "small";
669   py::str size_s = "size";
670   py::str state_s = "state";
671   py::str active_allocated_s = "active_allocated";
672   py::str active_pending_free_s = "active_pending_free";
673   py::str inactive_s = "inactive";
674   py::str addr_s = "addr";
675   py::str cpp_frames_s = "cpp_frames";
676   py::str blocks_s = "blocks";
677   py::str is_expandable_s = "is_expandable";
678   py::str frames_s = "frames";
679   py::str time_us_s = "time_us";
680 
681   py::list empty_frames;
682   std::vector<CapturedTraceback*> to_gather_frames;
683   std::vector<py::dict> to_gather_dest;
684 
685   auto add_frame_key = [&](const py::dict& d,
686                            const std::shared_ptr<c10::GatheredContext>& ctx) {
687     if (ctx) {
688       auto sc = getFromContext(ctx);
689       to_gather_frames.emplace_back(sc);
690       to_gather_dest.emplace_back(d);
691     } else {
692       d[frames_s] = empty_frames;
693     }
694   };
695 
696   const auto segmentInfoToDict = [&](const SegmentInfo& segmentInfo) {
697     py::dict segmentDict;
698     segmentDict[device_s] = segmentInfo.device;
699     segmentDict[address_s] = segmentInfo.address;
700     segmentDict[total_size_s] = segmentInfo.total_size;
701     segmentDict[allocated_size_s] = segmentInfo.allocated_size;
702     segmentDict[active_size_s] = segmentInfo.active_size;
703     segmentDict[requested_size_s] = segmentInfo.requested_size;
704     // we want the python objects to pickle easily so use an int to
705     // represent the stream rather than a torch.cuda.stream object
706     segmentDict[stream_s] = int64_t(segmentInfo.stream);
707     segmentDict[segment_type_s] = (segmentInfo.is_large ? large_s : small_s);
708     segmentDict[segment_pool_id] = segmentInfo.owner_private_pool_id;
709     segmentDict[is_expandable_s] = segmentInfo.is_expandable;
710     add_frame_key(segmentDict, segmentInfo.context_when_allocated);
711 
712     auto address = segmentInfo.address;
713     py::list blocks;
714     for (const auto& blockInfo : segmentInfo.blocks) {
715       py::dict blockDict;
716       blockDict[address_s] = address;
717       blockDict[size_s] = blockInfo.size;
718       blockDict[requested_size_s] = blockInfo.requested_size;
719       blockDict[state_s] =
720           (blockInfo.allocated
721                ? active_allocated_s
722                : (blockInfo.active ? active_pending_free_s : inactive_s));
723       add_frame_key(blockDict, blockInfo.context_when_allocated);
724       blocks.append(blockDict);
725       address += blockInfo.size;
726     }
727     segmentDict[blocks_s] = blocks;
728 
729     return segmentDict;
730   };
731 
732   auto snapshot = c10::cuda::CUDACachingAllocator::snapshot();
733 
734   py::list segments;
735 
736   for (const auto& segmentInfo : snapshot.segments) {
737     segments.append(segmentInfoToDict(segmentInfo));
738   }
739 
740   py::list traces;
741   py::str action_s = "action";
742   py::str alloc_s = "alloc";
743   py::str free_requested_s = "free_requested";
744   py::str free_completed_s = "free_completed";
745   py::str segment_alloc_s = "segment_alloc";
746   py::str segment_free_s = "segment_free";
747   py::str segment_map_s = "segment_map";
748   py::str segment_unmap_s = "segment_unmap";
749 
750   py::str snapshot_s = "snapshot";
751   py::str oom_s = "oom";
752   py::str device_free_s = "device_free";
753 
754   using namespace c10::cuda::CUDACachingAllocator;
755 
756   auto action_to_str = [&](TraceEntry::Action action) {
757     switch (action) {
758       case TraceEntry::ALLOC:
759         return alloc_s;
760       case TraceEntry::FREE_REQUESTED:
761         return free_requested_s;
762       case TraceEntry::FREE_COMPLETED:
763         return free_completed_s;
764       case TraceEntry::SEGMENT_ALLOC:
765         return segment_alloc_s;
766       case TraceEntry::SEGMENT_FREE:
767         return segment_free_s;
768       case TraceEntry::OOM:
769         return oom_s;
770       case TraceEntry::SNAPSHOT:
771         return snapshot_s;
772       case TraceEntry::SEGMENT_UNMAP:
773         return segment_unmap_s;
774       case TraceEntry::SEGMENT_MAP:
775         return segment_map_s;
776     }
777     throw std::runtime_error("unreachable");
778   };
779 
780   for (const auto& traceInfo : snapshot.device_traces) {
781     py::list trace;
782     for (const auto& te : traceInfo) {
783       py::dict trace_entry;
784       if (te.context_) {
785         // without further compression frames can get really large on dump
786         auto sc = getFromContext(te.context_);
787         to_gather_frames.emplace_back(sc);
788         to_gather_dest.emplace_back(trace_entry);
789       }
790       trace_entry[action_s] = action_to_str(te.action_);
791       trace_entry[TraceEntry::OOM == te.action_ ? device_free_s : addr_s] =
792           te.addr_;
793       trace_entry[size_s] = te.size_;
794       trace_entry[stream_s] = int64_t(te.stream_);
795       trace_entry[time_us_s] = te.time_.t_;
796       trace.append(trace_entry);
797     }
798     traces.append(trace);
799   }
800 
801   py::list external_annotations;
802   for (const auto& ae : snapshot.external_annotations) {
803     py::dict annotation_entry;
804     for (const auto& md : ae.metadata_) {
805       annotation_entry[(py::str)md.first] = md.second;
806     }
807     annotation_entry[device_s] = ae.device_;
808     annotation_entry[time_us_s] = ae.time_.t_;
809     external_annotations.append(annotation_entry);
810   }
811 
812   py::dict allocator_settings;
813   py::str last_allocator_settings_s = "PYTORCH_CUDA_ALLOC_CONF";
814   py::str max_split_size_s = "max_split_size";
815   py::str garbage_collection_threshold_s = "garbage_collection_threshold";
816   py::str expandable_segments_s = "expandable_segments";
817   py::str pinned_num_register_threads_s = "pinned_num_register_threads";
818   py::str release_lock_on_malloc_s = "release_lock_on_cudamalloc";
819   py::str pinned_use_host_register_s = "pinned_use_cuda_host_register";
820   py::str roundup_power2_divisions_s = "roundup_power2_divisions";
821 
822   allocator_settings[last_allocator_settings_s] =
823       snapshot.config_metadata.last_allocator_settings;
824   allocator_settings[max_split_size_s] =
825       int64_t(snapshot.config_metadata.max_split_size);
826   allocator_settings[garbage_collection_threshold_s] =
827       snapshot.config_metadata.garbage_collection_threshold;
828   allocator_settings[expandable_segments_s] =
829       snapshot.config_metadata.expandable_segments;
830   allocator_settings[pinned_num_register_threads_s] =
831       int64_t(snapshot.config_metadata.pinned_num_register_threads);
832   allocator_settings[release_lock_on_malloc_s] =
833       snapshot.config_metadata.release_lock_on_malloc;
834   allocator_settings[pinned_use_host_register_s] =
835       snapshot.config_metadata.pinned_use_host_register;
836   unsigned int roundup_key = 1;
837   py::dict roundup_settings;
838   for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) {
839     py::str roundup_key_s = std::to_string(roundup_key);
840     roundup_settings[roundup_key_s] = int64_t(v);
841     roundup_key *= 2;
842   }
843   allocator_settings[roundup_power2_divisions_s] = roundup_settings;
844 
845   py::dict result;
846   result["segments"] = segments;
847   result["device_traces"] = traces;
848   result["allocator_settings"] = allocator_settings;
849   result["external_annotations"] = external_annotations;
850 
851   auto frames = py_symbolize(to_gather_frames);
852   for (auto i : c10::irange(frames.size())) {
853     to_gather_dest.at(i)[frames_s] = frames.at(i);
854   }
855 
856   return result.release().ptr();
857   END_HANDLE_TH_ERRORS
858 }
859 
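// Wrap a Python callable so the caching allocator can invoke it when an
// allocation fails with an out-of-memory error, passing (device, alloc,
// device_allocated, device_free); the GIL is re-acquired before the callback.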
860 PyObject* THCPModule_attachOutOfMemoryObserver(
861     PyObject* _unused,
862     PyObject* observer) {
863   HANDLE_TH_ERRORS
864   Py_XINCREF(observer);
865   auto obs = [observer](
866                  int64_t device,
867                  int64_t alloc,
868                  int64_t device_allocated,
869                  int64_t device_free) {
870     py::gil_scoped_acquire g;
871     PyObject* result = PyObject_CallFunction(
872         observer, "LLLL", device, alloc, device_allocated, device_free);
873     if (!result) {
874       throw py::error_already_set();
875     }
876     Py_XDECREF(result);
877   };
878   at::globalContext().lazyInitCUDA();
879   c10::cuda::CUDACachingAllocator::attachOutOfMemoryObserver(std::move(obs));
880   Py_RETURN_NONE;
881   END_HANDLE_TH_ERRORS
882 }
883 
884 PyObject* THCPModule_cudaSetSyncDebugMode(PyObject* _unused, PyObject* arg) {
885   HANDLE_TH_ERRORS
886   TORCH_WARN_ONCE(
887       "Synchronization debug mode is a prototype feature and does not yet detect all "
888       "synchronizing operations");
889   TORCH_CHECK(
890       THPUtils_checkLong(arg), "invalid argument to set_sync_debug_mode");
891   int64_t debug_mode = THPUtils_unpackLong(arg);
892   TORCH_CHECK(
893       debug_mode >= 0 && debug_mode <= 2,
894       "invalid value of debug_mode, expected one of 0,1,2");
895   c10::cuda::SyncDebugMode l = c10::cuda::SyncDebugMode::L_DISABLED;
896   switch (debug_mode) {
897     case 0:
898       l = c10::cuda::SyncDebugMode::L_DISABLED;
899       break;
900     case 1:
901       l = c10::cuda::SyncDebugMode::L_WARN;
902       break;
903     case 2:
904       l = c10::cuda::SyncDebugMode::L_ERROR;
905       break;
906     default:
907       break; // can't happen
908   }
909   c10::cuda::warning_state().set_sync_debug_mode(l);
910   Py_RETURN_NONE;
911   END_HANDLE_TH_ERRORS
912 }
913 
914 PyObject* THCPModule_cudaGetSyncDebugMode(PyObject* self, PyObject* noargs) {
915   HANDLE_TH_ERRORS
916   auto debug_mode = c10::cuda::warning_state().get_sync_debug_mode();
917   switch (debug_mode) {
918     case c10::cuda::SyncDebugMode::L_DISABLED:
919       return THPUtils_packInt32(0);
920     case c10::cuda::SyncDebugMode::L_WARN:
921       return THPUtils_packInt32(1);
922     case c10::cuda::SyncDebugMode::L_ERROR:
923       return THPUtils_packInt32(2);
924     default:
925       return THPUtils_packInt32(-1); // can't happen
926   }
927   END_HANDLE_TH_ERRORS
928 }
929 
930 std::string uuid_to_string(const char* uuid_bytes) {
931   // UUIDs are a 128-bit label. CUDA and HIP store this as char[16].
932   // For string representation, the code here expands this to
933   // 8-4-4-4-12 hex format, so each byte becomes 2 hex characters.
934   return fmt::format(
935       "{:02x}{:02x}{:02x}{:02x}-"
936       "{:02x}{:02x}-"
937       "{:02x}{:02x}-"
938       "{:02x}{:02x}-"
939       "{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
940       (uint8_t)uuid_bytes[0],
941       (uint8_t)uuid_bytes[1],
942       (uint8_t)uuid_bytes[2],
943       (uint8_t)uuid_bytes[3],
944       (uint8_t)uuid_bytes[4],
945       (uint8_t)uuid_bytes[5],
946       (uint8_t)uuid_bytes[6],
947       (uint8_t)uuid_bytes[7],
948       (uint8_t)uuid_bytes[8],
949       (uint8_t)uuid_bytes[9],
950       (uint8_t)uuid_bytes[10],
951       (uint8_t)uuid_bytes[11],
952       (uint8_t)uuid_bytes[12],
953       (uint8_t)uuid_bytes[13],
954       (uint8_t)uuid_bytes[14],
955       (uint8_t)uuid_bytes[15]);
956 }
957 
958 ////////////////////////////////////////////////////////////////////////////////
959 // Cuda module initialization
960 ////////////////////////////////////////////////////////////////////////////////
961 
962 static void registerCudaDeviceProperties(PyObject* module) {
963   // Add the _CudaDeviceProperties class to torch._C
964   auto m = py::handle(module).cast<py::module>();
965   // Skip the uuid binding until the internal build uses a ROCm version with the uuid attr
966 #ifndef FBCODE_CAFFE2
967   // CUuuid is defined in either cuda.h or driver_types.h
968   // hipified to hipUUID which is defined in hip_runtime_api.h
969   py::class_<CUuuid>(m, "_CUuuid")
970       .def_property_readonly(
971           "bytes",
972           [](const CUuuid& uuid) {
973             return std::vector<uint8_t>(uuid.bytes, uuid.bytes + 16);
974           })
975       .def("__str__", [](const CUuuid& uuid) {
976         return uuid_to_string(uuid.bytes);
977       });
978 #endif
979   py::class_<cudaDeviceProp>(m, "_CudaDeviceProperties")
980       .def_readonly("name", &cudaDeviceProp::name)
981       .def_readonly("major", &cudaDeviceProp::major)
982       .def_readonly("minor", &cudaDeviceProp::minor)
983       .def_readonly("is_multi_gpu_board", &cudaDeviceProp::isMultiGpuBoard)
984       .def_readonly("is_integrated", &cudaDeviceProp::integrated)
985       .def_readonly(
986           "multi_processor_count", &cudaDeviceProp::multiProcessorCount)
987       .def_readonly("total_memory", &cudaDeviceProp::totalGlobalMem)
988       .def_readonly(
989           "max_threads_per_multi_processor",
990           &cudaDeviceProp::maxThreadsPerMultiProcessor)
991       .def_readonly("warp_size", &cudaDeviceProp::warpSize)
992 #if !USE_ROCM
993       // NVIDIA-only property
994       .def_readonly(
995           "regs_per_multiprocessor", &cudaDeviceProp::regsPerMultiprocessor)
996 #endif // USE_ROCM
997       // HIP-only property; reuse name attribute for CUDA builds
998       .def_readonly(
999           "gcnArchName",
1000 #if USE_ROCM
1001           &cudaDeviceProp::gcnArchName
1002 #else
1003           &cudaDeviceProp::name
1004 #endif // USE_ROCM
1005           )
1006 #ifndef FBCODE_CAFFE2
1007       .def_readonly("uuid", &cudaDeviceProp::uuid)
1008 #endif
1009       .def_readonly("L2_cache_size", &cudaDeviceProp::l2CacheSize)
1010       .def("__repr__", [](const cudaDeviceProp& prop) {
1011         std::ostringstream stream;
1012         stream << "_CudaDeviceProperties(name='" << prop.name
1013                << "', major=" << prop.major << ", minor=" << prop.minor
1014 #if USE_ROCM
1015                << ", gcnArchName='" << prop.gcnArchName << "'"
1016 #endif // USE_ROCM
1017                << ", total_memory=" << prop.totalGlobalMem / (1024ull * 1024)
1018                << "MB, multi_processor_count=" << prop.multiProcessorCount
1019 #ifndef FBCODE_CAFFE2
1020                << ", uuid=" << uuid_to_string(prop.uuid.bytes)
1021 #endif
1022                << ", L2_cache_size=" << prop.l2CacheSize / (1024ull * 1024)
1023                << "MB)";
1024         return stream.str();
1025       });
1026 
1027   m.def(
1028       "_cuda_record_memory_history_legacy",
1029       static_cast<void (*)(bool, bool, int64_t, bool, bool)>(
1030           torch::cuda::_record_memory_history));
1031 
1032   m.def(
1033       "_cuda_record_memory_history",
1034       static_cast<void (*)(
1035           std::optional<std::string>,
1036           std::optional<std::string>,
1037           const std::string&,
1038           size_t)>(torch::cuda::_record_memory_history));
1039 
1040   m.def("_cuda_isHistoryEnabled", []() {
1041     return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
1042   });
1043 
1044   m.def("_cuda_get_conv_benchmark_empty_cache", []() {
1045     return at::native::_cudnn_get_conv_benchmark_empty_cache();
1046   });
1047 
1048   m.def("_cudnn_set_conv_benchmark_empty_cache", [](bool enable) {
1049     return at::native::_cudnn_set_conv_benchmark_empty_cache(enable);
1050   });
1051 }
1052 
1053 // We choose to ignore certain blocks that are currently allocated
1054 // when we set the pool to its checkpoint. For those blocks, we need
1055 // to swap out the deleter function of their corresponding blocks
1056 // so that a deallocation is not triggered when they die.
1057 void removeStorageDeleterFns(
1058     const std::vector<c10::StorageImpl*>& stale_live_storages,
1059     std::unordered_set<void*> definitely_stale_pointers) {
1060   for (c10::StorageImpl* stale_storage : stale_live_storages) {
1061     auto ptr = stale_storage->data_ptr().get();
1062     auto allocated_pointer = definitely_stale_pointers.find(ptr);
1063     TORCH_CHECK(allocated_pointer != definitely_stale_pointers.end());
1064     auto t = c10::cuda::CUDACachingAllocator::get();
1065     bool succeeded = stale_storage->mutable_data_ptr().compare_exchange_deleter(
1066         t->raw_deleter(), &c10::detail::deleteNothing);
1067 
1068     TORCH_CHECK(
1069         succeeded,
1070         "Unexpected deleter function on storage, could not swap function");
1071   }
1072 }
1073 
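// Counterpart of the above: hand the data pointers re-allocated by a
// checkpoint restore back to the storages that still reference them; pointers
// with no matching storage simply release their deleter context.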
1074 void addStorageDeleterFns(
1075     std::vector<c10::StorageImpl*>& storages_to_add_deleters_to,
1076     c10::cuda::CUDACachingAllocator::CheckpointDelta& delta) {
1077   std::unordered_map<void*, c10::StorageImpl*> storages;
1078   for (auto& storage : storages_to_add_deleters_to) {
1079     storages[storage->data_ptr().get()] = storage;
1080   }
1081 
1082   for (auto& data_ptr : delta.dataptrs_allocd) {
1083     auto storage_pair = storages.find(data_ptr.get());
1084     if (storage_pair != storages.end()) {
1085       auto ctx = storage_pair->second->data_ptr().get_context();
1086       TORCH_CHECK(ctx == nullptr, " Not expecting deleter function");
1087       storage_pair->second->set_data_ptr_noswap(std::move(data_ptr));
1088     } else {
1089       data_ptr.release_context();
1090     }
1091   }
1092 }
1093 
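// Bind the pluggable allocator API: the _cuda_CUDAAllocator base class, the
// _CUDAPluggableAllocator wrapper whose hooks are supplied from Python as raw
// function pointers, and the checkpoint/pool helpers built on top of it.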
1094 static void registerCudaPluggableAllocator(PyObject* module) {
1095   auto m = py::handle(module).cast<py::module>();
1096 
1097   // NOLINTNEXTLINE(bugprone-unused-raii)
1098   py::class_<
1099       c10::cuda::CUDACachingAllocator::CUDAAllocator,
1100       std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>>(
1101       m, "_cuda_CUDAAllocator");
1102   m.def("_cuda_getAllocator", []() {
1103     return py::cast(torch::cuda::CUDAPluggableAllocator::getCurrentAllocator());
1104   });
1105 
1106   m.def(
1107       "_cuda_changeCurrentAllocator",
1108       [](const std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>&
1109              allocator) {
1110         torch::cuda::CUDAPluggableAllocator::changeCurrentAllocator(allocator);
1111       });
1112   py::class_<
1113       torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator,
1114       c10::cuda::CUDACachingAllocator::CUDAAllocator,
1115       std::shared_ptr<
1116           torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator>>(
1117       m, "_CUDAPluggableAllocator")
1118       .def(
1119           "set_init_fn",
1120           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1121              uint64_t func_ptr) {
1122             using FuncType = void(int);
1123             std::function<FuncType> func =
1124                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1125                 reinterpret_cast<FuncType*>(func_ptr);
1126             self.set_init_fn(func);
1127           })
1128       .def(
1129           "set_reset_fn",
1130           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1131              uint64_t func_ptr) {
1132             using FuncType = void();
1133             std::function<FuncType> func =
1134                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1135                 reinterpret_cast<FuncType*>(func_ptr);
1136             self.set_reset_fn(func);
1137           })
1138       .def(
1139           "set_memory_fraction_fn",
1140           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1141              uint64_t func_ptr) {
1142             using FuncType = void(double, int);
1143             std::function<FuncType> func =
1144                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1145                 reinterpret_cast<FuncType*>(func_ptr);
1146             self.set_memory_fraction_fn(func);
1147           })
1148       .def(
1149           "set_base_alloc_fn",
1150           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1151              uint64_t func_ptr) {
1152             using FuncType = void*(void*, size_t*);
1153             std::function<FuncType> func =
1154                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1155                 reinterpret_cast<FuncType*>(func_ptr);
1156             self.set_base_alloc_fn(func);
1157           })
1158       .def(
1159           "set_record_stream_fn",
1160           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1161              uint64_t func_ptr) {
1162             using FuncType = void(void*, cudaStream_t);
1163             std::function<FuncType> func =
1164                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1165                 reinterpret_cast<FuncType*>(func_ptr);
1166             self.set_record_stream_fn(func);
1167           })
1168       .def(
1169           "set_begin_allocate_to_pool",
1170           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1171              uint64_t func_ptr) {
1172             using FuncType = void(
1173                 int, c10::cuda::MempoolId_t, std::function<bool(cudaStream_t)>);
1174             std::function<FuncType> func =
1175                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1176                 reinterpret_cast<FuncType*>(func_ptr);
1177             self.set_begin_allocate_to_pool(func);
1178           })
1179       .def(
1180           "set_end_allocate_to_pool_fn",
1181           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1182              uint64_t func_ptr) {
1183             using FuncType = void(int, c10::cuda::MempoolId_t);
1184             std::function<FuncType> func =
1185                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1186                 reinterpret_cast<FuncType*>(func_ptr);
1187             self.set_end_allocate_to_pool_fn(func);
1188           })
1189       .def(
1190           "set_release_pool",
1191           [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
1192              uint64_t func_ptr) {
1193             using FuncType = void(int, c10::cuda::MempoolId_t);
1194             std::function<FuncType> func =
1195                 // NOLINTNEXTLINE(performance-no-int-to-ptr)
1196                 reinterpret_cast<FuncType*>(func_ptr);
1197             self.set_release_pool(func);
1198           });
1199   m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
1200     using namespace torch::cuda::CUDAPluggableAllocator;
1201     std::function<MallocFuncType> malloc_fn =
1202         // NOLINTNEXTLINE(performance-no-int-to-ptr)
1203         reinterpret_cast<MallocFuncType*>(malloc_ptr);
1204     std::function<FreeFuncType> free_fn =
1205         // NOLINTNEXTLINE(performance-no-int-to-ptr)
1206         reinterpret_cast<FreeFuncType*>(free_ptr);
1207     return createCustomAllocator(malloc_fn, free_fn);
1208   });
1209 
1210   // NOLINTNEXTLINE(bugprone-unused-raii)
1211   py::class_<
1212       c10::cuda::CUDACachingAllocator::AllocatorState,
1213       std::shared_ptr<c10::cuda::CUDACachingAllocator::AllocatorState>>(
1214       m, "_cuda_CUDAAllocator_AllocatorState");
1215 
1216   m.def(
1217       "_cuda_getCheckpointState",
1218       [](c10::DeviceIndex device, c10::cuda::MempoolId_t id) {
1219         return c10::cuda::CUDACachingAllocator::getCheckpointState(device, id);
1220       });
1221 
1222   m.def("_free_And_Remove_DeleterFn", [](size_t storage_impl_ptr) {
1223     // NOLINTNEXTLINE(performance-no-int-to-ptr)
1224     c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
1225     auto alloc = c10::cuda::CUDACachingAllocator::get();
1226     auto data_ptr = storage_impl->data_ptr().get();
1227     bool succeeded = storage_impl->mutable_data_ptr().compare_exchange_deleter(
1228         alloc->raw_deleter(), c10::detail::deleteNothing);
1229     TORCH_CHECK(succeeded, "Expected standard deleter");
1230     c10::cuda::CUDACachingAllocator::raw_delete(data_ptr);
1231   });
1232 
1233   m.def(
1234       "_set_storage_access_error_msg", [](const at::Tensor& t, std::string s) {
1235         t.unsafeGetTensorImpl()
1236             ->release_storage_and_set_meta_custom_data_ptr_error_msg_(s);
1237       });
1238 
1239   m.def("_has_Standard_Deleter", [](size_t storage_impl_ptr) {
1240     // NOLINTNEXTLINE(performance-no-int-to-ptr)
1241     c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
1242     auto alloc = c10::cuda::CUDACachingAllocator::get();
1243     return (storage_impl->data_ptr().get_deleter() == alloc->raw_deleter());
1244   });
1245 
1246   m.def("_storage_Use_Count", [](size_t storage_impl_ptr) {
1247     // NOLINTNEXTLINE(performance-no-int-to-ptr)
1248     c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
1249     return c10::raw::weak_intrusive_ptr::use_count(storage_impl);
1250   });
1251 
1252   m.def(
1253       "_tensors_data_ptrs_at_indices_equal",
1254       [](py::list& tensors, py::list& data_ptrs, py::list& indices) {
1255         for (size_t i = 0, end = indices.size(); i < end; ++i) {
1256           auto index = indices[i].cast<int64_t>();
1257           auto t = tensors[index].cast<at::Tensor>();
1258           auto data_ptr = data_ptrs[index].cast<int64_t>();
1259           if (reinterpret_cast<int64_t>(t.data_ptr()) != data_ptr) {
1260             return false;
1261           }
1262         }
1263         return true;
1264       });
1265 
1266   m.def(
1267       "_construct_CUDA_Tensor_From_Storage_And_Metadata",
1268       [](py::dict& metadata, c10::Storage s) {
1269         auto dtype_arg = metadata["dtype"].ptr();
1270         auto meta = scalarTypeToTypeMeta(toScalarType(dtype_arg));
1271 
1272         constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA);
1273         at::Tensor tensor = at::detail::make_tensor_base<c10::TensorImpl>(
1274             std::move(s), cuda_dks, meta);
1275 
1276         tensor.unsafeGetTensorImpl()->set_sizes_and_strides(
1277             metadata["size"].cast<std::vector<int64_t>>(),
1278             metadata["stride"].cast<std::vector<int64_t>>());
1279         tensor.unsafeGetTensorImpl()->set_storage_offset(
1280             metadata["storage_offset"].cast<int64_t>());
1281         return tensor;
1282       });
1283 
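  // Route allocations made on the current stream of `device` into the given
  // memory pool until the matching end call (typically while a CUDA graph
  // capture is under way, as the TORCH_CHECK message below suggests).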
1284   m.def(
1285       "_cuda_beginAllocateCurrentStreamToPool",
1286       [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
1287         auto stream = at::cuda::getCurrentCUDAStream(device);
1288         TORCH_CHECK(stream, "Expected stream capture to be under way");
1289         c10::cuda::CUDACachingAllocator::beginAllocateToPool(
1290             device, mempool_id, [stream](cudaStream_t target) {
1291               return target == stream;
1292             });
1293       });
1294 
1295   m.def(
1296       "_cuda_beginAllocateToPool",
1297       [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
1298         c10::cuda::CUDACachingAllocator::beginAllocateToPool(
1299             device, mempool_id, [](cudaStream_t) { return true; });
1300       });
1301 
1302   m.def(
1303       "_cuda_endAllocateCurrentStreamToPool",
1304       [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
1305         c10::cuda::CUDACachingAllocator::endAllocateToPool(device, mempool_id);
1306       });
1307 
1308   m.def(
1309       "_cuda_releasePool",
1310       [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
1311         c10::cuda::CUDACachingAllocator::releasePool(device, mempool_id);
1312       });
1313 
1314   m.def(
1315       "_cuda_checkPoolLiveAllocations",
1316       [](c10::DeviceIndex device,
1317          at::cuda::MempoolId_t mempool_id,
1318          const py::set& expected_live_allocations) {
1319         std::unordered_set<void*> allocations;
1320         allocations.reserve(expected_live_allocations.size());
1321         for (auto& elem : expected_live_allocations) {
1322           // NOLINTNEXTLINE(performance-no-int-to-ptr)
1323           allocations.insert(reinterpret_cast<void*>(py::cast<size_t>(elem)));
1324         }
1325         return c10::cuda::CUDACachingAllocator::checkPoolLiveAllocations(
1326             device, mempool_id, allocations);
1327       });
1328 
1329   m.def(
1330       "_cuda_setCheckpointPoolState",
1331       [](c10::DeviceIndex device,
1332          std::shared_ptr<c10::cuda::CUDACachingAllocator::AllocatorState> pps,
1333          const std::vector<size_t>& stale_storages_ptr,
1334          const std::vector<size_t>& storages_to_add_deleters_to_ptr = {}) {
1335         std::unordered_set<c10::StorageImpl*> ptr_set;
1336         // iterate on std::vector for determinism
1337         std::vector<c10::StorageImpl*> ptrs;
1338         for (size_t ptr_int : stale_storages_ptr) {
1339           // NOLINTNEXTLINE(performance-no-int-to-ptr)
1340           c10::StorageImpl* ptr = (c10::StorageImpl*)ptr_int;
1341           if (!ptr_set.count(ptr)) {
1342             ptrs.push_back(ptr);
1343             ptr_set.insert(ptr);
1344           }
1345         }
1346         auto delta = c10::cuda::CUDACachingAllocator::setCheckpointPoolState(
1347             device, std::move(pps));
1348         auto& freed_pointers = delta.ptrs_freed;
1349 
1350         std::unordered_set<void*> allocd_set;
1351         for (auto& data_ptr : delta.dataptrs_allocd) {
1352           allocd_set.insert(data_ptr.get());
1353         }
1354         std::unordered_set<void*> freed_pointer_set;
1355         size_t definite_freed_count = 0;
1356         for (void* ptr : freed_pointers) {
1357           if (!allocd_set.count(ptr)) {
1358             definite_freed_count += 1;
1359           }
1360           freed_pointer_set.insert((ptr));
1361         }
1362         // that block has already been freed,
1363         // so even though this check will error, so too will the allocator
1364         // when the corresponding tensor dies, because there is no
1365         // live tensor corresponding to it
1366         TORCH_CHECK(
1367             ptr_set.size() >= definite_freed_count,
1368             "Any stale tensors which are being manually freed"
1369             " must be passed to set checkpoint");
1370 
1371         removeStorageDeleterFns(ptrs, freed_pointer_set);
1372         std::vector<c10::StorageImpl*> storages_to_add_deleters_to;
1373         storages_to_add_deleters_to.reserve(
1374             storages_to_add_deleters_to_ptr.size());
1375         for (size_t ptr_int : storages_to_add_deleters_to_ptr) {
1376           // NOLINTNEXTLINE(performance-no-int-to-ptr)
1377           storages_to_add_deleters_to.push_back((c10::StorageImpl*)ptr_int);
1378         }
1379 
1380         addStorageDeleterFns(storages_to_add_deleters_to, delta);
1381       });
1382 }
1383 
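// _get_device_properties returns the cached cudaDeviceProp* by reference
// (py::return_value_policy::reference), so Python receives a non-owning view
// of the struct cached on the C++ side. Hedged usage sketch, assuming the
// binding ends up on torch._C and the usual property fields are exposed:
//   props = torch._C._get_device_properties(0)
//   print(props.name)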
1384 static void bindGetDeviceProperties(PyObject* module) {
1385   // Add method to torch.cuda
1386   auto m = py::handle(module).cast<py::module>();
1387   m.def(
1388       "_get_device_properties",
1389       [](c10::DeviceIndex device) -> cudaDeviceProp* {
1390         return at::cuda::getDeviceProperties(device);
1391       },
1392       py::return_value_policy::reference);
1393 }
1394 
1395 // Callback for the Python side. Used for additional initialization of Python
1396 // classes.
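// Registered as _cuda_init in the method table below; on the Python side it is
// what torch.cuda's lazy initialization calls (hedged: the caller lives outside
// this file). It imports torch.cuda, builds the per-device default_generators
// tuple, and installs the _get_device_properties binding.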
1397 static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) {
1398 #if C10_ASAN_ENABLED
1399   TORCH_WARN(
1400       "torch.cuda: your pytorch binary has address sanitizer (asan) built in, "
1401       "asan is currently not compatible with torch.cuda module, "
1402       "you might get unexpected behavior (eg. out of memory, crash, etc.), "
1403       "please rebuild pytorch without asan if you need to use this module");
1404 #endif
1405   HANDLE_TH_ERRORS
1406   TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
1407   poison_fork();
1408   at::globalContext().lazyInitCUDA();
1409 
1410   auto m = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
1411   if (!m)
1412     throw python_error();
1413 
1414   auto set_module_attr = [&](const char* name, PyObject* v) {
1415     // PyObject_SetAttrString doesn't steal reference. So no need to incref.
1416     if (PyObject_SetAttrString(m, name, v) < 0) {
1417       throw python_error();
1418     }
1419   };
1420 
1421   auto num_gpus = c10::cuda::device_count();
1422   auto default_cuda_generators = PyTuple_New(static_cast<Py_ssize_t>(num_gpus));
1423   for (const auto i : c10::irange(num_gpus)) {
1424     auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(
1425         at::cuda::detail::getDefaultCUDAGenerator(i));
1426     // This reference is meant to be given away, so no need to incref here.
1427     PyTuple_SetItem(default_cuda_generators, i, (PyObject*)cast_gen);
1428   }
1429   set_module_attr("default_generators", default_cuda_generators);
1430   bindGetDeviceProperties(m);
1431 
1432   Py_RETURN_NONE;
1433   END_HANDLE_TH_ERRORS
1434 }
1435 
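// _cuda_getCurrentBlasHandle returns the current cuBLAS handle as a Python int
// holding the raw pointer value (via PyLong_FromVoidPtr). Hedged sketch,
// assuming the method table below is attached to torch._C:
//   handle_addr = torch._C._cuda_getCurrentBlasHandle()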
1436 PyObject* THCPModule_getCurrentBlasHandle_wrap(
1437     PyObject* self,
1438     PyObject* noargs) {
1439   HANDLE_TH_ERRORS
1440   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
1441   cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
1442   return PyLong_FromVoidPtr(handle);
1443   END_HANDLE_TH_ERRORS
1444 }
1445 
1446 static PyObject* THCPModule_clearBlasWorkspaces_wrap(
1447     PyObject* self,
1448     PyObject* noargs) {
1449   HANDLE_TH_ERRORS
1450   at::cuda::clearCublasWorkspaces();
1451   Py_RETURN_NONE;
1452   END_HANDLE_TH_ERRORS
1453 }
1454 
1455 PyObject* THCPModule_rocm_is_backward_pass(
1456     PyObject* _unused,
1457     PyObject* noargs) {
1458   HANDLE_TH_ERRORS
1459 #if defined(USE_ROCM)
1460   if (at::ROCmBackwardPassGuard::is_backward_pass()) {
1461     Py_RETURN_TRUE;
1462   } else {
1463     Py_RETURN_FALSE;
1464   }
1465 #else
1466   Py_RETURN_FALSE;
1467 #endif
1468   END_HANDLE_TH_ERRORS
1469 }
1470 
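// The THCPModule_cuda_tunableop_* wrappers below are thin shims around
// at::cuda::tunable::getTuningContext(); each one validates its Python
// argument and forwards to the corresponding TuningContext setter or getter.
// They are registered in _THCPModule_methods further down. Hedged sketch,
// assuming the table is attached to torch._C:
//   torch._C._cuda_tunableop_enable(True)
//   torch._C._cuda_tunableop_set_max_tuning_iterations(30)
// or, equivalently, from C++:
//   at::cuda::tunable::getTuningContext()->EnableTunableOp(true);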
1471 PyObject* THCPModule_cuda_tunableop_enable(PyObject* _unused, PyObject* arg) {
1472   HANDLE_TH_ERRORS
1473   TORCH_CHECK(
1474       THPUtils_checkBool(arg),
1475       "cuda_tunableop_enable expects a bool, but got ",
1476       THPUtils_typename(arg));
1477   at::cuda::tunable::getTuningContext()->EnableTunableOp(
1478       THPUtils_unpackBool(arg));
1479   Py_RETURN_NONE;
1480   END_HANDLE_TH_ERRORS
1481 }
1482 
1483 PyObject* THCPModule_cuda_tunableop_is_enabled(
1484     PyObject* _unused,
1485     PyObject* noarg) {
1486   HANDLE_TH_ERRORS
1487   if (at::cuda::tunable::getTuningContext()->IsTunableOpEnabled()) {
1488     Py_RETURN_TRUE;
1489   } else {
1490     Py_RETURN_FALSE;
1491   }
1492   END_HANDLE_TH_ERRORS
1493 }
1494 
1495 PyObject* THCPModule_cuda_tunableop_tuning_enable(
1496     PyObject* _unused,
1497     PyObject* arg) {
1498   HANDLE_TH_ERRORS
1499   TORCH_CHECK(
1500       THPUtils_checkBool(arg),
1501       "cuda_tunableop_tuning_enable expects a bool, but got ",
1502       THPUtils_typename(arg));
1503   at::cuda::tunable::getTuningContext()->EnableTuning(THPUtils_unpackBool(arg));
1504   Py_RETURN_NONE;
1505   END_HANDLE_TH_ERRORS
1506 }
1507 
1508 PyObject* THCPModule_cuda_tunableop_tuning_is_enabled(
1509     PyObject* _unused,
1510     PyObject* noarg) {
1511   HANDLE_TH_ERRORS
1512   if (at::cuda::tunable::getTuningContext()->IsTuningEnabled()) {
1513     Py_RETURN_TRUE;
1514   } else {
1515     Py_RETURN_FALSE;
1516   }
1517   END_HANDLE_TH_ERRORS
1518 }
1519 
1520 PyObject* THCPModule_cuda_tunableop_write_file_on_exit(
1521     PyObject* _unused,
1522     PyObject* arg) {
1523   HANDLE_TH_ERRORS
1524   TORCH_CHECK(
1525       THPUtils_checkBool(arg),
1526       "cuda_tunableop_write_file_on_exit expects a bool, but got ",
1527       THPUtils_typename(arg));
1528   at::cuda::tunable::getTuningContext()->WriteFileOnExit(
1529       THPUtils_unpackBool(arg));
1530   Py_RETURN_NONE;
1531   END_HANDLE_TH_ERRORS
1532 }
1533 
1534 PyObject* THCPModule_cuda_tunableop_set_max_tuning_duration(
1535     PyObject* _unused,
1536     PyObject* arg) {
1537   HANDLE_TH_ERRORS
1538   TORCH_CHECK(
1539       THPUtils_checkLong(arg),
1540       "cuda_tunableop_set_max_tuning_duration expects an int, but got ",
1541       THPUtils_typename(arg));
1542   auto duration = static_cast<int>(THPUtils_unpackLong(arg));
1543   at::cuda::tunable::getTuningContext()->SetMaxTuningDurationMs(duration);
1544   Py_RETURN_NONE;
1545   END_HANDLE_TH_ERRORS
1546 }
1547 
1548 PyObject* THCPModule_cuda_tunableop_get_max_tuning_duration(
1549     PyObject* _unused,
1550     PyObject* noargs) {
1551   HANDLE_TH_ERRORS
1552   return THPUtils_packInt32(
1553       at::cuda::tunable::getTuningContext()->GetMaxTuningDurationMs());
1554   END_HANDLE_TH_ERRORS
1555 }
1556 
1557 PyObject* THCPModule_cuda_tunableop_set_max_tuning_iterations(
1558     PyObject* _unused,
1559     PyObject* arg) {
1560   HANDLE_TH_ERRORS
1561   TORCH_CHECK(
1562       THPUtils_checkLong(arg),
1563       "cuda_tunableop_set_max_tuning_iterations expects an int, but got ",
1564       THPUtils_typename(arg));
1565   auto iterations = static_cast<int>(THPUtils_unpackLong(arg));
1566   at::cuda::tunable::getTuningContext()->SetMaxTuningIterations(iterations);
1567   Py_RETURN_NONE;
1568   END_HANDLE_TH_ERRORS
1569 }
1570 
1571 PyObject* THCPModule_cuda_tunableop_get_max_tuning_iterations(
1572     PyObject* _unused,
1573     PyObject* noargs) {
1574   HANDLE_TH_ERRORS
1575   return THPUtils_packInt32(
1576       at::cuda::tunable::getTuningContext()->GetMaxTuningIterations());
1577   END_HANDLE_TH_ERRORS
1578 }
1579 
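// _cuda_tunableop_set_filename takes a filename string plus an optional bool
// that is simply forwarded as the second argument of TuningContext::SetFilename
// (named `dev` here). Hedged sketch; the filename is purely illustrative:
//   torch._C._cuda_tunableop_set_filename("my_tunableop_results.csv")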
1580 PyObject* THCPModule_cuda_tunableop_set_filename(
1581     PyObject* _unused,
1582     PyObject* args) {
1583   HANDLE_TH_ERRORS
1584   PyObject* obj_str = nullptr;
1585   PyObject* obj_ord = nullptr;
1586   if (!PyArg_ParseTuple(args, "O|O", &obj_str, &obj_ord)) {
         // PyArg_ParseTuple has already set a Python exception; propagate it
         // instead of continuing with obj_str == nullptr.
         return nullptr;
1587   }
1588   TORCH_CHECK(
1589       THPUtils_checkString(obj_str),
1590       "cuda_tunableop_set_filename expects a string, but got ",
1591       THPUtils_typename(obj_str));
1592   auto filename = THPUtils_unpackString(obj_str);
1593   bool dev = false;
1594   if (obj_ord) {
1595     TORCH_CHECK(
1596         THPUtils_checkBool(obj_ord),
1597         "cuda_tunableop_set_filename expects a bool, but got ",
1598         THPUtils_typename(obj_ord));
1599     dev = THPUtils_unpackBool(obj_ord);
1600   }
1601   at::cuda::tunable::getTuningContext()->SetFilename(filename, dev);
1602   Py_RETURN_NONE;
1603   END_HANDLE_TH_ERRORS
1604 }
1605 
1606 PyObject* THCPModule_cuda_tunableop_get_filename(
1607     PyObject* _unused,
1608     PyObject* noargs) {
1609   HANDLE_TH_ERRORS
1610   return THPUtils_packString(
1611       at::cuda::tunable::getTuningContext()->GetFilename());
1612   END_HANDLE_TH_ERRORS
1613 }
1614 
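// _cuda_tunableop_write_file and _cuda_tunableop_read_file take an optional
// filename; without one, the TuningContext presumably falls back to its own
// configured filename. Both return True/False for success. Hedged sketch:
//   if not torch._C._cuda_tunableop_write_file():
//       torch._C._cuda_tunableop_write_file("fallback_results.csv")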
1615 PyObject* THCPModule_cuda_tunableop_write_file(
1616     PyObject* _unused,
1617     PyObject* args) {
1618   HANDLE_TH_ERRORS
1619   PyObject* str = nullptr;
1620   bool success = false;
1621   if (!PyArg_ParseTuple(args, "|O", &str)) {
         // propagate the parse error set by PyArg_ParseTuple
         return nullptr;
1622   }
1623   if (str) {
1624     TORCH_CHECK(
1625         THPUtils_checkString(str),
1626         "cuda_tunableop_write_file expects a string, but got ",
1627         THPUtils_typename(str));
1628     auto filename = THPUtils_unpackString(str);
1629     success = at::cuda::tunable::getTuningContext()->WriteFile(filename);
1630   } else {
1631     success = at::cuda::tunable::getTuningContext()->WriteFile();
1632   }
1633   if (success) {
1634     Py_RETURN_TRUE;
1635   } else {
1636     Py_RETURN_FALSE;
1637   }
1638   END_HANDLE_TH_ERRORS
1639 }
1640 
1641 PyObject* THCPModule_cuda_tunableop_read_file(
1642     PyObject* _unused,
1643     PyObject* args) {
1644   HANDLE_TH_ERRORS
1645   PyObject* str = nullptr;
1646   bool success = false;
1647   if (!PyArg_ParseTuple(args, "|O", &str)) {
         // propagate the parse error set by PyArg_ParseTuple
         return nullptr;
1648   }
1649   if (str) {
1650     TORCH_CHECK(
1651         THPUtils_checkString(str),
1652         "cuda_tunableop_read_file expects a string, but got ",
1653         THPUtils_typename(str));
1654     auto filename = THPUtils_unpackString(str);
1655     success = at::cuda::tunable::getTuningContext()->ReadFile(filename);
1656   } else {
1657     success = at::cuda::tunable::getTuningContext()->ReadFile();
1658   }
1659   if (success) {
1660     Py_RETURN_TRUE;
1661   } else {
1662     Py_RETURN_FALSE;
1663   }
1664   END_HANDLE_TH_ERRORS
1665 }
1666 
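// _cuda_tunableop_get_results flattens the results-manager dump into a tuple
// of 4-tuples: (op signature, param signature, result key, result time), e.g.
// (hedged, assuming the torch._C export):
//   for op_sig, param_sig, key, t in torch._C._cuda_tunableop_get_results():
//       ...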
1667 PyObject* THCPModule_cuda_tunableop_get_results(
1668     PyObject* _unused,
1669     PyObject* noargs) {
1670   HANDLE_TH_ERRORS
1671   auto results =
1672       at::cuda::tunable::getTuningContext()->GetTuningResultsManager().Dump();
1673   size_t result_size = 0;
1674   for (const auto& [op_sig, kernelmap] : results) {
1675     result_size += kernelmap.size();
1676   }
1677   THPObjectPtr outer_tuple(PyTuple_New(result_size));
1678   if (!outer_tuple)
1679     throw python_error();
1680   size_t result_index = 0;
1681   for (const auto& [op_sig, kernelmap] : results) {
1682     for (const auto& [param_sig, result] : kernelmap) {
1683       THPObjectPtr inner_tuple(PyTuple_New(4));
1684       if (!inner_tuple)
1685         throw python_error();
1686       PyObject* obj_op_sig = THPUtils_packString(op_sig);
1687       if (!obj_op_sig)
1688         throw python_error();
1689       PyObject* obj_param_sig = THPUtils_packString(param_sig);
1690       if (!obj_param_sig)
1691         throw python_error();
1692       PyObject* obj_result_key = THPUtils_packString(result.GetKey());
1693       if (!obj_result_key)
1694         throw python_error();
1695       PyObject* obj_result_time = PyFloat_FromDouble(result.GetTime());
1696       if (!obj_result_time)
1697         throw python_error();
1698       PyTuple_SET_ITEM(inner_tuple.get(), 0, obj_op_sig);
1699       PyTuple_SET_ITEM(inner_tuple.get(), 1, obj_param_sig);
1700       PyTuple_SET_ITEM(inner_tuple.get(), 2, obj_result_key);
1701       PyTuple_SET_ITEM(inner_tuple.get(), 3, obj_result_time);
1702       PyTuple_SET_ITEM(
1703           outer_tuple.get(), result_index++, inner_tuple.release());
1704     }
1705   }
1706   return outer_tuple.release();
1707   END_HANDLE_TH_ERRORS
1708 }
1709 
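// _cuda_tunableop_get_validators returns the validator table as a tuple of
// (key, value) string pairs, mirroring the structure built below.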
1710 PyObject* THCPModule_cuda_tunableop_get_validators(
1711     PyObject* _unused,
1712     PyObject* noargs) {
1713   HANDLE_TH_ERRORS
1714   auto validators = at::cuda::tunable::getTuningContext()
1715                         ->GetTuningResultsValidator()
1716                         .GetAllValidators();
1717   THPObjectPtr outer_tuple(PyTuple_New(validators.size()));
1718   if (!outer_tuple)
1719     throw python_error();
1720   size_t validator_index = 0;
1721   for (const auto& [key, val] : validators) {
1722     THPObjectPtr inner_tuple(PyTuple_New(2));
1723     if (!inner_tuple)
1724       throw python_error();
1725     PyObject* obj_key = THPUtils_packString(key);
1726     if (!obj_key)
1727       throw python_error();
1728     PyObject* obj_val = THPUtils_packString(val);
1729     if (!obj_val)
1730       throw python_error();
1731     PyTuple_SET_ITEM(inner_tuple.get(), 0, obj_key);
1732     PyTuple_SET_ITEM(inner_tuple.get(), 1, obj_val);
1733     PyTuple_SET_ITEM(
1734         outer_tuple.get(), validator_index++, inner_tuple.release());
1735   }
1736   return outer_tuple.release();
1737   END_HANDLE_TH_ERRORS
1738 }
1739 
1740 static PyObject* THCPModule_isCurrentStreamCapturing_wrap(
1741     PyObject* self,
1742     PyObject* noargs) {
1743   HANDLE_TH_ERRORS
1744   // If there's no cuda context, at::cuda::currentStreamCaptureStatus returns
1745   // CaptureStatus::None without initializing a context.
1746   if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) {
1747     Py_RETURN_FALSE;
1748   } else {
1749     Py_RETURN_TRUE;
1750   }
1751   END_HANDLE_TH_ERRORS
1752 }
1753 
1754 PyObject* THCPModule_setBenchmarkLimitCuDNN(PyObject* _unused, PyObject* arg) {
1755   HANDLE_TH_ERRORS
1756   TORCH_CHECK(
1757       THPUtils_checkLong(arg),
1758       "set_benchmark_limit_cudnn expects an int, "
1759       "but got ",
1760       THPUtils_typename(arg));
1761 #if defined(USE_ROCM)
1762   TORCH_WARN_ONCE(
1763       "cuDNN Benchmark limit is not supported in MIOpen and will have no effect.");
1764 #endif
1765   auto benchmark_limit = static_cast<int>(THPUtils_unpackLong(arg));
1766   at::globalContext().setBenchmarkLimitCuDNN(benchmark_limit);
1767   Py_RETURN_NONE;
1768   END_HANDLE_TH_ERRORS
1769 }
1770 
1771 PyObject* THCPModule_benchmarkLimitCuDNN(PyObject* _unused, PyObject* noargs) {
1772   return THPUtils_packInt32(at::globalContext().benchmarkLimitCuDNN());
1773 }
1774 
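// Python method table returned by THCPModule_methods(). Each entry becomes a
// callable on whatever module this table is attached to during initialization
// (hedged: the attachment happens outside this file, typically torch._C), e.g.:
//   torch._C._cuda_getDeviceCount()
//   torch._C._cuda_synchronize()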
1775 // NOLINTNEXTLINE(*-c-arrays*, *-global-variables)
1776 static struct PyMethodDef _THCPModule_methods[] = {
1777     {"_cuda_init", THCPModule_initExtension, METH_NOARGS, nullptr},
1778     {"_cuda_setDevice", THCPModule_setDevice_wrap, METH_O, nullptr},
1779     {"_cuda_exchangeDevice", THCPModule_exchangeDevice, METH_O, nullptr},
1780     {"_cuda_maybeExchangeDevice",
1781      THCPModule_maybeExchangeDevice,
1782      METH_O,
1783      nullptr},
1784     {"_cuda_getDevice", THCPModule_getDevice_wrap, METH_NOARGS, nullptr},
1785     {"_cuda_getDeviceCount",
1786      THCPModule_getDeviceCount_wrap,
1787      METH_NOARGS,
1788      nullptr},
1789     {"_cuda_canDeviceAccessPeer",
1790      THCPModule_canDeviceAccessPeer_wrap,
1791      METH_VARARGS,
1792      nullptr},
1793     {"_cuda_getArchFlags", THCPModule_getArchFlags, METH_NOARGS, nullptr},
1794     {"_cuda_isInBadFork", THCPModule_isInBadFork, METH_NOARGS, nullptr},
1795     {"_cuda_getCurrentStream",
1796      THCPModule_getCurrentStream_wrap,
1797      METH_O,
1798      nullptr},
1799     {"_cuda_getCurrentRawStream",
1800      THCPModule_getCurrentStream_raw,
1801      METH_O,
1802      nullptr},
1803     {"_cuda_getDefaultStream",
1804      THCPModule_getDefaultStream_wrap,
1805      METH_O,
1806      nullptr},
1807     {"_cuda_getCurrentBlasHandle",
1808      THCPModule_getCurrentBlasHandle_wrap,
1809      METH_NOARGS,
1810      nullptr},
1811     {"_cuda_clearCublasWorkspaces",
1812      THCPModule_clearBlasWorkspaces_wrap,
1813      METH_NOARGS,
1814      nullptr},
1815     {"_cuda_isCurrentStreamCapturing",
1816      THCPModule_isCurrentStreamCapturing_wrap,
1817      METH_NOARGS,
1818      nullptr},
1819     {"_cuda_setStream",
1820      castPyCFunctionWithKeywords(THCPModule_setStream_wrap),
1821      METH_VARARGS | METH_KEYWORDS,
1822      nullptr},
1823     {"_cuda_getCompiledVersion",
1824      THCPModule_getCompiledVersion,
1825      METH_NOARGS,
1826      nullptr},
1827     {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr},
1828     {"_cuda_setMemoryFraction",
1829      THCPModule_setMemoryFraction,
1830      METH_VARARGS,
1831      nullptr},
1832     {"_cuda_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr},
1833     {"_cuda_memoryStats", THCPModule_memoryStats, METH_O, nullptr},
1834     {"_cuda_resetAccumulatedMemoryStats",
1835      THCPModule_resetAccumulatedMemoryStats,
1836      METH_O,
1837      nullptr},
1838     {"_cuda_resetPeakMemoryStats",
1839      THCPModule_resetPeakMemoryStats,
1840      METH_O,
1841      nullptr},
1842     {"_cuda_memorySnapshot", THCPModule_memorySnapshot, METH_NOARGS, nullptr},
1843     {"_cuda_attach_out_of_memory_observer",
1844      THCPModule_attachOutOfMemoryObserver,
1845      METH_O,
1846      nullptr},
1847     {"_cuda_cudaHostAllocator",
1848      THCPModule_cudaHostAllocator,
1849      METH_NOARGS,
1850      nullptr},
1851     {"_host_emptyCache", THCPModule_hostEmptyCache, METH_NOARGS, nullptr},
1852     {"_cuda_cudaCachingAllocator_raw_alloc",
1853      THCPModule_cudaCachingAllocator_raw_alloc,
1854      METH_VARARGS,
1855      nullptr},
1856     {"_cuda_cudaCachingAllocator_raw_delete",
1857      THCPModule_cudaCachingAllocator_raw_delete,
1858      METH_O,
1859      nullptr},
1860     {"_cuda_cudaCachingAllocator_set_allocator_settings",
1861      THCPModule_cudaCachingAllocator_set_allocator_settings,
1862      METH_O,
1863      nullptr},
1864     {"_cuda_getAllocatorBackend",
1865      THCPModule_getAllocatorBackend,
1866      METH_NOARGS,
1867      nullptr},
1868     {"_cuda_synchronize", THCPModule_cudaSynchronize, METH_NOARGS, nullptr},
1869     {"_cuda_ipc_collect", THCPModule_cudaIPCCollect, METH_NOARGS, nullptr},
1870     {"_cuda_sleep", THCPModule_cudaSleep, METH_O, nullptr},
1871     {"_cuda_lock_mutex", THCPModule_cudaLockMutex, METH_NOARGS, nullptr},
1872     {"_cuda_unlock_mutex", THCPModule_cudaUnlockMutex, METH_NOARGS, nullptr},
1873     {"_cuda_set_sync_debug_mode",
1874      THCPModule_cudaSetSyncDebugMode,
1875      METH_O,
1876      nullptr},
1877     {"_cuda_get_sync_debug_mode",
1878      THCPModule_cudaGetSyncDebugMode,
1879      METH_NOARGS,
1880      nullptr},
1881     {"_cuda_jiterator_compile_and_launch_kernel",
1882      THCPModule_cudaJiteratorCompileAndLaunchKernel,
1883      METH_VARARGS,
1884      nullptr},
1885     {"_cuda_get_cudnn_benchmark_limit",
1886      THCPModule_benchmarkLimitCuDNN,
1887      METH_NOARGS,
1888      nullptr},
1889     {"_cuda_set_cudnn_benchmark_limit",
1890      THCPModule_setBenchmarkLimitCuDNN,
1891      METH_O,
1892      nullptr},
1893 #ifdef USE_NCCL
1894     {"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr},
1895     {"_nccl_version_suffix",
1896      THCPModule_nccl_version_suffix,
1897      METH_NOARGS,
1898      nullptr},
1899     {"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr},
1900     {"_nccl_init_rank", THCPModule_nccl_init_rank, METH_VARARGS, nullptr},
1901     {"_nccl_reduce", THCPModule_nccl_reduce, METH_VARARGS, nullptr},
1902     {"_nccl_all_reduce", THCPModule_nccl_all_reduce, METH_VARARGS, nullptr},
1903     {"_nccl_broadcast", THCPModule_nccl_broadcast, METH_VARARGS, nullptr},
1904     {"_nccl_all_gather", THCPModule_nccl_all_gather, METH_VARARGS, nullptr},
1905     {"_nccl_reduce_scatter",
1906      THCPModule_nccl_reduce_scatter,
1907      METH_VARARGS,
1908      nullptr},
1909 #endif
1910     {"_rocm_is_backward_pass",
1911      THCPModule_rocm_is_backward_pass,
1912      METH_NOARGS,
1913      nullptr},
1914     {"_cuda_tunableop_enable",
1915      THCPModule_cuda_tunableop_enable,
1916      METH_O,
1917      nullptr},
1918     {"_cuda_tunableop_is_enabled",
1919      THCPModule_cuda_tunableop_is_enabled,
1920      METH_NOARGS,
1921      nullptr},
1922     {"_cuda_tunableop_tuning_enable",
1923      THCPModule_cuda_tunableop_tuning_enable,
1924      METH_O,
1925      nullptr},
1926     {"_cuda_tunableop_tuning_is_enabled",
1927      THCPModule_cuda_tunableop_tuning_is_enabled,
1928      METH_NOARGS,
1929      nullptr},
1930     {"_cuda_tunableop_write_file_on_exit",
1931      THCPModule_cuda_tunableop_write_file_on_exit,
1932      METH_O,
1933      nullptr},
1934     {"_cuda_tunableop_set_max_tuning_duration",
1935      THCPModule_cuda_tunableop_set_max_tuning_duration,
1936      METH_O,
1937      nullptr},
1938     {"_cuda_tunableop_get_max_tuning_duration",
1939      THCPModule_cuda_tunableop_get_max_tuning_duration,
1940      METH_NOARGS,
1941      nullptr},
1942     {"_cuda_tunableop_set_max_tuning_iterations",
1943      THCPModule_cuda_tunableop_set_max_tuning_iterations,
1944      METH_O,
1945      nullptr},
1946     {"_cuda_tunableop_get_max_tuning_iterations",
1947      THCPModule_cuda_tunableop_get_max_tuning_iterations,
1948      METH_NOARGS,
1949      nullptr},
1950     {"_cuda_tunableop_set_filename",
1951      THCPModule_cuda_tunableop_set_filename,
1952      METH_VARARGS,
1953      nullptr},
1954     {"_cuda_tunableop_get_filename",
1955      THCPModule_cuda_tunableop_get_filename,
1956      METH_NOARGS,
1957      nullptr},
1958     {"_cuda_tunableop_write_file",
1959      THCPModule_cuda_tunableop_write_file,
1960      METH_VARARGS,
1961      nullptr},
1962     {"_cuda_tunableop_read_file",
1963      THCPModule_cuda_tunableop_read_file,
1964      METH_VARARGS,
1965      nullptr},
1966     {"_cuda_tunableop_get_results",
1967      THCPModule_cuda_tunableop_get_results,
1968      METH_NOARGS,
1969      nullptr},
1970     {"_cuda_tunableop_get_validators",
1971      THCPModule_cuda_tunableop_get_validators,
1972      METH_NOARGS,
1973      nullptr},
1974     {nullptr}};
1975 
1976 PyMethodDef* THCPModule_methods() {
1977   return _THCPModule_methods;
1978 }
1979 
1980 namespace torch::cuda {
1981 
1982 namespace shared {
1983 
1984 void initCudartBindings(PyObject* module);
1985 void initNvtxBindings(PyObject* module);
1986 void initGdsBindings(PyObject* module);
1987 #if defined(USE_CUDNN) || defined(USE_ROCM)
1988 void initCudnnBindings(PyObject* module);
1989 #endif
1990 #if defined(USE_CUSPARSELT)
1991 void initCusparseltBindings(PyObject* module);
1992 #endif
1993 
1994 } // namespace shared
1995 
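// Entry point called from the main torch._C module initialization (hedged: the
// caller lives outside this file). It wires up the shared cudart/nvtx/gds (and,
// when built, cuDNN/cuSPARSELt) bindings and registers the device-property and
// pluggable-allocator bindings defined above.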
1996 void initModule(PyObject* module) {
1997   python::initCommMethods(module);
1998   // As weird as it seems, this file is also compiled for ROCm,
1999   // so the CUDA-specific assumptions below might not always hold...
2000   shared::initCudartBindings(module);
2001   shared::initNvtxBindings(module);
2002 #if defined(USE_CUDNN) || defined(USE_ROCM)
2003   shared::initCudnnBindings(module);
2004 #endif
2005 #if defined(USE_CUSPARSELT)
2006   shared::initCusparseltBindings(module);
2007 #endif
2008   shared::initGdsBindings(module);
2009   registerCudaDeviceProperties(module);
2010   registerCudaPluggableAllocator(module);
2011 }
2012 
2013 } // namespace torch::cuda
2014