#include <ATen/ATen.h>
#include <ATen/core/TensorBody.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/native/ConvUtils.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/UniqueVoidPtr.h>
#include <fmt/core.h>
#include <pybind11/pytypes.h>
#include <torch/csrc/utils/python_arg_parser.h>
#include <unordered_set>

#if AT_CUDNN_ENABLED()

#endif
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CachingHostAllocator.h>
#include <ATen/cuda/Sleep.h>
#include <ATen/cuda/detail/CUDAHooks.h>
#include <ATen/cuda/jiterator.h>
#include <ATen/cuda/tunable/Tunable.h>
#include <c10/core/StorageImpl.h>
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>

#ifdef USE_NCCL
#include <torch/csrc/cuda/python_nccl.h>
#endif
#include <c10/util/CallOnce.h>
#include <c10/util/irange.h>

#include <torch/csrc/CudaIPCTypes.h>
#include <torch/csrc/Generator.h>
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
#include <torch/csrc/cuda/GdsFile.h>
#include <torch/csrc/cuda/THCP.h>
#include <torch/csrc/cuda/memory_snapshot.h>
#include <torch/csrc/cuda/python_comm.h>
#include <torch/csrc/profiler/python/combined_traceback.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
#include <torch/csrc/utils/python_numbers.h>
#include <torch/csrc/utils/python_strings.h>
#include <array>
#include <chrono>
#include <iostream>
#include <sstream>
#include <thread>
#include <unordered_map>
#ifndef WIN32
#include <pthread.h>
#endif

using namespace torch;

static bool in_bad_fork = false; // True for children forked after cuda init

#ifndef WIN32
// Called in the forked child if cuda has already been initialized
static void forked_child() {
  in_bad_fork = true;
  torch::utils::set_requires_device_init(at::kCUDA, true);
}
#endif

// Should be called before the first cuda call.
// Note: This is distinct from initExtension because a stub cuda implementation
// has some working functions (e.g. device_count) but cannot fully initialize.
static void poison_fork() {
#ifndef WIN32
  static c10::once_flag flag;
  c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
#endif
}
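
// Illustrative sketch (not part of the original file): the Python side
// consults the bad-fork flag during lazy initialization. The binding name
// used below is an assumption for illustration; see THCPModule_isInBadFork.
//
//   import os, torch
//   torch.cuda.init()
//   if os.fork() == 0:                    # forked child
//       torch._C._cuda_isInBadFork()      # -> True
//       torch.zeros(1, device="cuda")     # lazy init raises instead of
//                                         # reusing the poisoned CUDA context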

////////////////////////////////////////////////////////////////////////////////
// CUDA management methods
////////////////////////////////////////////////////////////////////////////////

PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
  auto device = THPUtils_unpackLong(arg);

  torch::utils::device_lazy_init(at::kCUDA);
  c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_exchangeDevice(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice");
  auto device_index = THPUtils_unpackDeviceIndex(arg);
  if (device_index < 0) {
    return THPUtils_packInt32(-1);
  }

  torch::utils::device_lazy_init(at::kCUDA);
  auto current_device = c10::cuda::ExchangeDevice(device_index);

  return THPUtils_packDeviceIndex(current_device);
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice");
  auto device_index = THPUtils_unpackDeviceIndex(arg);
  if (device_index < 0) {
    return THPUtils_packInt32(-1);
  }

  torch::utils::device_lazy_init(at::kCUDA);
  auto current_device = c10::cuda::MaybeExchangeDevice(device_index);

  return THPUtils_packDeviceIndex(current_device);
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  torch::utils::device_lazy_init(at::kCUDA);
  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
  auto device = static_cast<int32_t>(c10::cuda::current_device());
  return THPUtils_packInt32(device);
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* arg1 = nullptr;
  PyObject* arg2 = nullptr;
  if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
    THPUtils_invalidArguments(
        args,
        nullptr,
        "can_device_peer_access",
        1,
        "(int device, int peer_device);");
    return nullptr;
  }
  TORCH_CHECK(
      THPUtils_checkLong(arg1), "invalid argument to canDeviceAccessPeer");
  TORCH_CHECK(
      THPUtils_checkLong(arg2), "invalid argument to canDeviceAccessPeer");
  int64_t device = THPUtils_unpackLong(arg1);
  int64_t peer_device = THPUtils_unpackLong(arg2);

  torch::utils::device_lazy_init(at::kCUDA);
  auto can_access = at::cuda::canDeviceAccessPeer(device, peer_device);
  return PyBool_FromLong(can_access);
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  poison_fork();
  return THPUtils_packUInt64(at::cuda::device_count());
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  poison_fork();
#ifdef CUDA_ARCH_FLAGS
  static const char* flags = C10_STRINGIZE(CUDA_ARCH_FLAGS);
  return THPUtils_packString(flags);
#else
  Py_RETURN_NONE;
#endif
  END_HANDLE_TH_ERRORS
}

static PyObject* THCPModule_isInBadFork(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  return PyBool_FromLong(in_bad_fork);
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getCurrentStream_wrap(
    PyObject* /* unused */,
    PyObject* device_index) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(device_index), "invalid argument to getCurrentStream");
  auto c10_device_index = THPUtils_unpackDeviceIndex(device_index);
  auto stream = at::cuda::getCurrentCUDAStream(c10_device_index);
  PyObject* output_tuple = PyTuple_New(3);
  PyTuple_SetItem(
      output_tuple, 0, THPUtils_packInt64(static_cast<int64_t>(stream.id())));
  PyTuple_SetItem(
      output_tuple, 1, THPUtils_packDeviceIndex(stream.device_index()));
  PyTuple_SetItem(
      output_tuple,
      2,
      THPUtils_packInt64(static_cast<int64_t>(stream.device_type())));
  return output_tuple;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getCurrentStream_raw(
    PyObject* /* unused */,
    PyObject* device_index) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(device_index), "invalid argument to getCurrentStream");
  auto c10_device_index = THPUtils_unpackDeviceIndex(device_index);
  return PyLong_FromVoidPtr(
      at::cuda::getCurrentCUDAStream(c10_device_index).stream());
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getDefaultStream_wrap(
    PyObject* /* unused */,
    PyObject* device_index) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(device_index), "invalid argument to getDefaultStream");
  auto c10_device_index = THPUtils_unpackDeviceIndex(device_index);
  auto stream = at::cuda::getDefaultCUDAStream(c10_device_index);
  PyObject* output_tuple = PyTuple_New(3);
  PyTuple_SetItem(
      output_tuple, 0, THPUtils_packInt64(static_cast<int64_t>(stream.id())));
  PyTuple_SetItem(
      output_tuple, 1, THPUtils_packDeviceIndex(stream.device_index()));
  PyTuple_SetItem(
      output_tuple,
      2,
      THPUtils_packInt64(static_cast<int64_t>(stream.device_type())));
  return output_tuple;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_setStream_wrap(
    PyObject* self,
    PyObject* args,
    PyObject* kwargs) {
  HANDLE_TH_ERRORS
  int64_t stream_id = 0;
  int64_t device_index = 0;
  int64_t device_type = 0;

  // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
  constexpr const char* kwlist[] = {
      "stream_id", "device_index", "device_type", nullptr};
  if (!PyArg_ParseTupleAndKeywords(
          args,
          kwargs,
          "|LLL",
          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
          const_cast<char**>(kwlist),
          &stream_id,
          &device_index,
          &device_type)) {
  }

  auto stream = at::cuda::CUDAStream::unpack3(
      stream_id,
      static_cast<c10::DeviceIndex>(device_index),
      static_cast<c10::DeviceType>(device_type));

  auto device = c10::cuda::current_device();
  if (device != stream.device_index()) {
    c10::cuda::set_device(stream.device_index());
  }
  at::cuda::setCurrentCUDAStream(stream);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getCompiledVersion(PyObject* self, PyObject* noargs) {
#if defined(USE_ROCM)
  return THPUtils_packInt64((int64_t)ROCM_VERSION);
#else
  return THPUtils_packInt64((int64_t)CUDA_VERSION);
#endif
}

PyObject* THCPModule_cudaHostAllocator(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS
  c10::Allocator* allocator = at::cuda::getCachingHostAllocator();
  return PyLong_FromVoidPtr(allocator);
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaCachingAllocator_raw_alloc(
    PyObject* _unused,
    PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* size_o = nullptr;
  PyObject* stream_o = nullptr;
  if (!PyArg_ParseTuple(args, "OO", &size_o, &stream_o)) {
    THPUtils_invalidArguments(
        args,
        nullptr,
        "caching_allocator_alloc",
        1,
        "(ssize_t size, intptr_t stream);");
    return nullptr;
  }
  auto size = PyLong_AsSsize_t(size_o);
  cudaStream_t stream = static_cast<cudaStream_t>(PyLong_AsVoidPtr(stream_o));
  void* mem = nullptr;
  {
    pybind11::gil_scoped_release no_gil;
    mem = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(size, stream);
  }
  return PyLong_FromVoidPtr(mem);
  END_HANDLE_TH_ERRORS
}

// Unpack a PyObject to at::Scalar, throw an exception if it fails
at::Scalar as_scalar(PyObject* arg) {
  // Zero-dim tensors are converted to Scalars as-is. Note this doesn't
  // currently handle most NumPy scalar types except np.float64.
  if (THPVariable_Check(arg)) {
    return THPVariable_Unpack(arg).item();
  }

  if (THPUtils_checkLong(arg)) {
    return at::Scalar(static_cast<int64_t>(THPUtils_unpackLong(arg)));
  }

  if (PyBool_Check(arg)) {
    return at::Scalar(THPUtils_unpackBool(arg));
  }

  if (PyComplex_Check(arg)) {
    return at::Scalar(THPUtils_unpackComplexDouble(arg));
  }
  return at::Scalar(THPUtils_unpackDouble(arg));
}
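
// Worked examples of the mapping performed by as_scalar above (values as they
// would arrive from Python, e.g. as jiterator kwargs):
//   torch.tensor(4.0)  (zero-dim tensor) -> Scalar(4.0) via .item()
//   3                                    -> Scalar(int64_t(3))
//   True                                 -> Scalar(bool(true))
//   1 + 2j                               -> Scalar(c10::complex<double>(1, 2))
//   2.5 (anything else numeric)          -> Scalar(double(2.5))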

// Entrypoint for the callable created by torch.cuda.jiterator
// See jiterator.py for more details
PyObject* THCPModule_cudaJiteratorCompileAndLaunchKernel(
    PyObject* _unused,
    PyObject* args) {
  HANDLE_TH_ERRORS

  PyObject* code_string_o = nullptr;
  PyObject* kernel_name_o = nullptr;
  PyObject* return_by_ref_o = nullptr;
  PyObject* num_outputs_o = nullptr;
  PyObject* tensors_o = nullptr;
  PyObject* kwargs_o = nullptr;
  if (!PyArg_ParseTuple(
          args,
          "OOOOO|O",
          &code_string_o,
          &kernel_name_o,
          &return_by_ref_o,
          &num_outputs_o,
          &tensors_o,
          &kwargs_o)) {
    return nullptr;
  }

  const std::string code_string = THPUtils_unpackString(code_string_o);
  const std::string kernel_name = THPUtils_unpackString(kernel_name_o);
  const bool return_by_ref = THPUtils_unpackBool(return_by_ref_o);
  const int num_outputs = static_cast<int>(THPUtils_unpackLong(num_outputs_o));

  TORCH_CHECK(
      PyTuple_Check(tensors_o),
      "tensors argument is expected to "
      "be a tuple, but got ",
      THPUtils_typename(tensors_o));
  Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors_o);

  c10::SmallVector<at::Tensor> tensors;
  for (const auto i : c10::irange(num_tensors)) {
    PyObject* _tensor = PyTuple_GET_ITEM(tensors_o, i);
    TORCH_CHECK(
        THPVariable_Check(_tensor),
        i,
        " of input tensors tuple is not a Tensor");

    tensors.emplace_back(THPVariable_Unpack(_tensor));
  }

  c10::SmallVector<at::Scalar> extra_args;
  PyObject* key = nullptr;
  PyObject* value = nullptr;
  Py_ssize_t pos = 0;
  while (PyDict_Next(kwargs_o, &pos, &key, &value)) {
    extra_args.emplace_back(as_scalar(value));
  }

  c10::SmallVector<at::Tensor> outputs = at::cuda::CompileAndLaunchKernel(
      code_string,
      kernel_name,
      num_outputs,
      tensors,
      extra_args,
      return_by_ref);

  if (num_outputs == 1) {
    return THPVariable_Wrap(outputs[0]);
  } else {
    PyObject* output_tuple = PyTuple_New(num_outputs);
    for (int i = 0; i < num_outputs; ++i) {
      PyTuple_SetItem(output_tuple, i, THPVariable_Wrap(outputs[i]));
    }
    return output_tuple;
  }

  END_HANDLE_TH_ERRORS
}
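
// Python-side usage sketch (illustrative; see jiterator.py for the supported
// surface — this assumes the private helper torch.cuda.jiterator._create_jit_fn):
//
//   code = "template <typename T> T my_op(T x, T y, T alpha) { return x + alpha * y; }"
//   jitted_fn = torch.cuda.jiterator._create_jit_fn(code, alpha=1.0)
//   out = jitted_fn(a, b)  # eventually dispatches into
//                          # THCPModule_cudaJiteratorCompileAndLaunchKernel above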

PyObject* THCPModule_cudaCachingAllocator_raw_delete(
    PyObject* _unused,
    PyObject* obj) {
  HANDLE_TH_ERRORS
  void* mem_ptr = PyLong_AsVoidPtr(obj);
  {
    pybind11::gil_scoped_release no_gil;
    c10::cuda::CUDACachingAllocator::raw_delete(mem_ptr);
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
    PyObject* _unused,
    PyObject* env) {
  HANDLE_TH_ERRORS
  c10::cuda::CUDACachingAllocator::setAllocatorSettings(
      THPUtils_unpackString(env));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_getAllocatorBackend(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS
  return THPUtils_packString(c10::cuda::CUDACachingAllocator::name());
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaSynchronize(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS {
    pybind11::gil_scoped_release no_gil;
    c10::cuda::device_synchronize();
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaIPCCollect(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS
  torch::CudaIPCCollect();
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaSleep(PyObject* _unused, PyObject* cycles) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(cycles), "torch.cuda._sleep(): expected 'int'");
  int64_t unpacked_cycles = THPUtils_unpackLong(cycles);
  {
    pybind11::gil_scoped_release no_gil;
    at::cuda::sleep(unpacked_cycles);
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

// We need to ensure that a thread NEVER loses the GIL as long as it holds the
// CUDA mutex. Otherwise another thread might be scheduled and try to e.g.
// allocate a new tensor which will cause a deadlock. It's enough to have a
// single global, because it can be only set once (cudaMutex is not recursive)
// by the thread that owns the mutex (obviously there can be only one such
// thread).
static PyGILState_STATE cudaMutexGILState;

PyObject* THCPModule_cudaLockMutex(PyObject* module, PyObject* noargs) {
  auto mutex = c10::cuda::getFreeMutex();
  // This has to be a busy loop because we **absolutely need to** hold the GIL
  // or it's a recipe for a deadlock otherwise (if we let other Python threads
  // run while we have the cudaMutex, but not the GIL, they might try to e.g.
  // free a CUDA tensor and acquire the cudaMutex without giving up the GIL,
  // because it happens deep within THC).
  while (true) {
    if (mutex->try_lock())
      break;
    {
      pybind11::gil_scoped_release no_gil;
      std::this_thread::sleep_for(std::chrono::microseconds(10));
    }
  }

  cudaMutexGILState = PyGILState_Ensure();
  Py_RETURN_NONE;
}

PyObject* THCPModule_cudaUnlockMutex(PyObject* module, PyObject* noargs) {
  auto mutex = c10::cuda::getFreeMutex();
  PyGILState_Release(cudaMutexGILState);
  mutex->unlock();
  Py_RETURN_NONE;
}

PyObject* THCPModule_hasPrimaryContext(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg), "invalid argument to has_primary_context");
  auto device_index = THPUtils_unpackDeviceIndex(arg);
  if (c10::cuda::hasPrimaryContext(device_index)) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_setMemoryFraction(PyObject* _unused, PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* fraction_o = nullptr;
  PyObject* device_o = nullptr;
  if (!PyArg_ParseTuple(args, "OO", &fraction_o, &device_o)) {
    THPUtils_invalidArguments(
        args,
        nullptr,
        "set_memory_fraction",
        1,
        "(double fraction, int device);");
    return nullptr;
  }
  double fraction = PyFloat_AsDouble(fraction_o);
  auto device_index = THPUtils_unpackDeviceIndex(device_o);

  c10::cuda::CUDACachingAllocator::setMemoryFraction(fraction, device_index);
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}

PyObject* THCPModule_hostEmptyCache(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS {
    pybind11::gil_scoped_release no_gil;
    at::cuda::CachingHostAllocator_emptyCache();
  }
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}

PyObject* THCPModule_emptyCache(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS {
    pybind11::gil_scoped_release no_gil;
    c10::cuda::CUDACachingAllocator::emptyCache();
  }
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}

PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to memory_allocated");
  const auto device_index = THPUtils_unpackDeviceIndex(arg);

  using c10::CachingDeviceAllocator::DeviceStats;
  using c10::CachingDeviceAllocator::Stat;
  using c10::CachingDeviceAllocator::StatArray;
  using c10::CachingDeviceAllocator::StatType;

  const auto statToDict = [](const Stat& stat) {
    py::dict dict;

    dict["current"] = stat.current;
    dict["peak"] = stat.peak;
    dict["allocated"] = stat.allocated;
    dict["freed"] = stat.freed;
    return dict;
  };

  const auto statArrayToDict = [=](const StatArray& statArray) {
    const std::array<const char*, static_cast<size_t>(StatType::NUM_TYPES)>
        statTypeNames = {"all", "small_pool", "large_pool"};
    py::dict dict;
    for (const auto i : c10::irange(statTypeNames.size())) {
      dict[statTypeNames[i]] = statToDict(statArray[i]);
    }
    return dict;
  };

  const DeviceStats stats =
      c10::cuda::CUDACachingAllocator::getDeviceStats(device_index);

  py::dict result;
  result["num_alloc_retries"] = stats.num_alloc_retries;
  result["num_ooms"] = stats.num_ooms;
  result["max_split_size"] = stats.max_split_size;
  result["num_sync_all_streams"] = stats.num_sync_all_streams;
  result["num_device_alloc"] = stats.num_device_alloc;
  result["num_device_free"] = stats.num_device_free;
  result["allocation"] = statArrayToDict(stats.allocation);
  result["segment"] = statArrayToDict(stats.segment);
  result["active"] = statArrayToDict(stats.active);
  result["inactive_split"] = statArrayToDict(stats.inactive_split);
  result["allocated_bytes"] = statArrayToDict(stats.allocated_bytes);
  result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes);
  result["active_bytes"] = statArrayToDict(stats.active_bytes);
  result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes);
  result["requested_bytes"] = statArrayToDict(stats.requested_bytes);
  result["oversize_allocations"] = statToDict(stats.oversize_allocations);
  result["oversize_segments"] = statToDict(stats.oversize_segments);

  return result.release().ptr();
  END_HANDLE_TH_ERRORS
}
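
// Shape of the dict returned above (illustrative):
//   {
//     "num_alloc_retries": int, "num_ooms": int, "max_split_size": int, ...,
//     "allocated_bytes": {
//       "all":        {"current": int, "peak": int, "allocated": int, "freed": int},
//       "small_pool": {...}, "large_pool": {...},
//     },
//     ... same nesting for "allocation", "segment", "active", etc. ...
//   }
// On the Python side, torch.cuda.memory_stats() flattens this nesting into
// dotted keys such as "allocated_bytes.all.current".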

PyObject* THCPModule_resetAccumulatedMemoryStats(
    PyObject* _unused,
    PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg),
      "invalid argument to reset_accumulated_memory_stats");
  const auto device_index = THPUtils_unpackDeviceIndex(arg);
  c10::cuda::CUDACachingAllocator::resetAccumulatedStats(device_index);
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}

PyObject* THCPModule_resetPeakMemoryStats(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg), "invalid argument to reset_peak_memory_stats");
  const auto device_index = THPUtils_unpackDeviceIndex(arg);
  c10::cuda::CUDACachingAllocator::resetPeakStats(device_index);
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}

CapturedTraceback* getFromContext(
    const std::shared_ptr<c10::GatheredContext>& x) {
  if (CapturedTraceback* sc = dynamic_cast<CapturedTraceback*>(x.get())) {
    return sc;
  }
  TORCH_CHECK(
      false,
      "attempting to gather stack context from the wrong StackContext type.");
}

PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS

  using c10::cuda::CUDACachingAllocator::BlockInfo;
  using c10::cuda::CUDACachingAllocator::SegmentInfo;

  py::str device_s = "device";
  py::str address_s = "address";
  py::str total_size_s = "total_size";
  py::str allocated_size_s = "allocated_size";
  py::str active_size_s = "active_size";
  py::str requested_size_s = "requested_size";
  py::str stream_s = "stream";
  py::str segment_type_s = "segment_type";
  py::str segment_pool_id = "segment_pool_id";
  py::str large_s = "large";
  py::str small_s = "small";
  py::str size_s = "size";
  py::str state_s = "state";
  py::str active_allocated_s = "active_allocated";
  py::str active_pending_free_s = "active_pending_free";
  py::str inactive_s = "inactive";
  py::str addr_s = "addr";
  py::str cpp_frames_s = "cpp_frames";
  py::str blocks_s = "blocks";
  py::str is_expandable_s = "is_expandable";
  py::str frames_s = "frames";
  py::str time_us_s = "time_us";

  py::list empty_frames;
  std::vector<CapturedTraceback*> to_gather_frames;
  std::vector<py::dict> to_gather_dest;

  auto add_frame_key = [&](const py::dict& d,
                           const std::shared_ptr<c10::GatheredContext>& ctx) {
    if (ctx) {
      auto sc = getFromContext(ctx);
      to_gather_frames.emplace_back(sc);
      to_gather_dest.emplace_back(d);
    } else {
      d[frames_s] = empty_frames;
    }
  };

  const auto segmentInfoToDict = [&](const SegmentInfo& segmentInfo) {
    py::dict segmentDict;
    segmentDict[device_s] = segmentInfo.device;
    segmentDict[address_s] = segmentInfo.address;
    segmentDict[total_size_s] = segmentInfo.total_size;
    segmentDict[allocated_size_s] = segmentInfo.allocated_size;
    segmentDict[active_size_s] = segmentInfo.active_size;
    segmentDict[requested_size_s] = segmentInfo.requested_size;
    // we want the python objects to pickle easily so use an int to
    // represent the stream rather than a torch.cuda.stream object
    segmentDict[stream_s] = int64_t(segmentInfo.stream);
    segmentDict[segment_type_s] = (segmentInfo.is_large ? large_s : small_s);
    segmentDict[segment_pool_id] = segmentInfo.owner_private_pool_id;
    segmentDict[is_expandable_s] = segmentInfo.is_expandable;
    add_frame_key(segmentDict, segmentInfo.context_when_allocated);

    auto address = segmentInfo.address;
    py::list blocks;
    for (const auto& blockInfo : segmentInfo.blocks) {
      py::dict blockDict;
      blockDict[address_s] = address;
      blockDict[size_s] = blockInfo.size;
      blockDict[requested_size_s] = blockInfo.requested_size;
      blockDict[state_s] =
          (blockInfo.allocated
               ? active_allocated_s
               : (blockInfo.active ? active_pending_free_s : inactive_s));
      add_frame_key(blockDict, blockInfo.context_when_allocated);
      blocks.append(blockDict);
      address += blockInfo.size;
    }
    segmentDict[blocks_s] = blocks;

    return segmentDict;
  };

  auto snapshot = c10::cuda::CUDACachingAllocator::snapshot();

  py::list segments;

  for (const auto& segmentInfo : snapshot.segments) {
    segments.append(segmentInfoToDict(segmentInfo));
  }

  py::list traces;
  py::str action_s = "action";
  py::str alloc_s = "alloc";
  py::str free_requested_s = "free_requested";
  py::str free_completed_s = "free_completed";
  py::str segment_alloc_s = "segment_alloc";
  py::str segment_free_s = "segment_free";
  py::str segment_map_s = "segment_map";
  py::str segment_unmap_s = "segment_unmap";

  py::str snapshot_s = "snapshot";
  py::str oom_s = "oom";
  py::str device_free_s = "device_free";

  using namespace c10::cuda::CUDACachingAllocator;

  auto action_to_str = [&](TraceEntry::Action action) {
    switch (action) {
      case TraceEntry::ALLOC:
        return alloc_s;
      case TraceEntry::FREE_REQUESTED:
        return free_requested_s;
      case TraceEntry::FREE_COMPLETED:
        return free_completed_s;
      case TraceEntry::SEGMENT_ALLOC:
        return segment_alloc_s;
      case TraceEntry::SEGMENT_FREE:
        return segment_free_s;
      case TraceEntry::OOM:
        return oom_s;
      case TraceEntry::SNAPSHOT:
        return snapshot_s;
      case TraceEntry::SEGMENT_UNMAP:
        return segment_unmap_s;
      case TraceEntry::SEGMENT_MAP:
        return segment_map_s;
    }
    throw std::runtime_error("unreachable");
  };

  for (const auto& traceInfo : snapshot.device_traces) {
    py::list trace;
    for (const auto& te : traceInfo) {
      py::dict trace_entry;
      if (te.context_) {
        // without further compression frames can get really large on dump
        auto sc = getFromContext(te.context_);
        to_gather_frames.emplace_back(sc);
        to_gather_dest.emplace_back(trace_entry);
      }
      trace_entry[action_s] = action_to_str(te.action_);
      trace_entry[TraceEntry::OOM == te.action_ ? device_free_s : addr_s] =
          te.addr_;
      trace_entry[size_s] = te.size_;
      trace_entry[stream_s] = int64_t(te.stream_);
      trace_entry[time_us_s] = te.time_.t_;
      trace.append(trace_entry);
    }
    traces.append(trace);
  }

  py::list external_annotations;
  for (const auto& ae : snapshot.external_annotations) {
    py::dict annotation_entry;
    for (const auto& md : ae.metadata_) {
      annotation_entry[(py::str)md.first] = md.second;
    }
    annotation_entry[device_s] = ae.device_;
    annotation_entry[time_us_s] = ae.time_.t_;
    external_annotations.append(annotation_entry);
  }

  py::dict allocator_settings;
  py::str last_allocator_settings_s = "PYTORCH_CUDA_ALLOC_CONF";
  py::str max_split_size_s = "max_split_size";
  py::str garbage_collection_threshold_s = "garbage_collection_threshold";
  py::str expandable_segments_s = "expandable_segments";
  py::str pinned_num_register_threads_s = "pinned_num_register_threads";
  py::str release_lock_on_malloc_s = "release_lock_on_cudamalloc";
  py::str pinned_use_host_register_s = "pinned_use_cuda_host_register";
  py::str roundup_power2_divisions_s = "roundup_power2_divisions";

  allocator_settings[last_allocator_settings_s] =
      snapshot.config_metadata.last_allocator_settings;
  allocator_settings[max_split_size_s] =
      int64_t(snapshot.config_metadata.max_split_size);
  allocator_settings[garbage_collection_threshold_s] =
      snapshot.config_metadata.garbage_collection_threshold;
  allocator_settings[expandable_segments_s] =
      snapshot.config_metadata.expandable_segments;
  allocator_settings[pinned_num_register_threads_s] =
      int64_t(snapshot.config_metadata.pinned_num_register_threads);
  allocator_settings[release_lock_on_malloc_s] =
      snapshot.config_metadata.release_lock_on_malloc;
  allocator_settings[pinned_use_host_register_s] =
      snapshot.config_metadata.pinned_use_host_register;
  unsigned int roundup_key = 1;
  py::dict roundup_settings;
  for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) {
    py::str roundup_key_s = std::to_string(roundup_key);
    roundup_settings[roundup_key_s] = int64_t(v);
    roundup_key *= 2;
  }
  allocator_settings[roundup_power2_divisions_s] = roundup_settings;

  py::dict result;
  result["segments"] = segments;
  result["device_traces"] = traces;
  result["allocator_settings"] = allocator_settings;
  result["external_annotations"] = external_annotations;

  auto frames = py_symbolize(to_gather_frames);
  for (auto i : c10::irange(frames.size())) {
    to_gather_dest.at(i)[frames_s] = frames.at(i);
  }

  return result.release().ptr();
  END_HANDLE_TH_ERRORS
}
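
// Top-level shape of the snapshot dict assembled above (illustrative):
//   {
//     "segments":             [{"device", "address", "total_size", ..., "blocks": [...]}, ...],
//     "device_traces":        [[{"action", "addr"/"device_free", "size", "stream", "time_us", ...}, ...], ...],
//     "allocator_settings":   {"PYTORCH_CUDA_ALLOC_CONF": str, "max_split_size": int, ...},
//     "external_annotations": [{<metadata...>, "device", "time_us"}, ...],
//   }
// This is the structure torch.cuda.memory._snapshot() hands back to Python.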

PyObject* THCPModule_attachOutOfMemoryObserver(
    PyObject* _unused,
    PyObject* observer) {
  HANDLE_TH_ERRORS
  Py_XINCREF(observer);
  auto obs = [observer](
                 int64_t device,
                 int64_t alloc,
                 int64_t device_allocated,
                 int64_t device_free) {
    py::gil_scoped_acquire g;
    PyObject* result = PyObject_CallFunction(
        observer, "LLLL", device, alloc, device_allocated, device_free);
    if (!result) {
      throw py::error_already_set();
    }
    Py_XDECREF(result);
  };
  at::globalContext().lazyInitCUDA();
  c10::cuda::CUDACachingAllocator::attachOutOfMemoryObserver(std::move(obs));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaSetSyncDebugMode(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_WARN_ONCE(
      "Synchronization debug mode is a prototype feature and does not yet detect all "
      "synchronizing operations");
  TORCH_CHECK(
      THPUtils_checkLong(arg), "invalid argument to set_sync_debug_mode");
  int64_t debug_mode = THPUtils_unpackLong(arg);
  TORCH_CHECK(
      debug_mode >= 0 && debug_mode <= 2,
      "invalid value of debug_mode, expected one of 0,1,2");
  c10::cuda::SyncDebugMode l = c10::cuda::SyncDebugMode::L_DISABLED;
  switch (debug_mode) {
    case 0:
      l = c10::cuda::SyncDebugMode::L_DISABLED;
      break;
    case 1:
      l = c10::cuda::SyncDebugMode::L_WARN;
      break;
    case 2:
      l = c10::cuda::SyncDebugMode::L_ERROR;
      break;
    default:
      break; // can't happen
  }
  c10::cuda::warning_state().set_sync_debug_mode(l);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaGetSyncDebugMode(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  auto debug_mode = c10::cuda::warning_state().get_sync_debug_mode();
  switch (debug_mode) {
    case c10::cuda::SyncDebugMode::L_DISABLED:
      return THPUtils_packInt32(0);
    case c10::cuda::SyncDebugMode::L_WARN:
      return THPUtils_packInt32(1);
    case c10::cuda::SyncDebugMode::L_ERROR:
      return THPUtils_packInt32(2);
    default:
      return THPUtils_packInt32(-1); // can't happen
  }
  END_HANDLE_TH_ERRORS
}

std::string uuid_to_string(const char* uuid_bytes) {
  // UUIDs are a 128-bit label. CUDA and HIP store this as char[16].
  // For string representation, the code here expands this to
  // 8-4-4-4-12 hex format, so each byte becomes 2 hex characters.
  return fmt::format(
      "{:02x}{:02x}{:02x}{:02x}-"
      "{:02x}{:02x}-"
      "{:02x}{:02x}-"
      "{:02x}{:02x}-"
      "{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
      (uint8_t)uuid_bytes[0],
      (uint8_t)uuid_bytes[1],
      (uint8_t)uuid_bytes[2],
      (uint8_t)uuid_bytes[3],
      (uint8_t)uuid_bytes[4],
      (uint8_t)uuid_bytes[5],
      (uint8_t)uuid_bytes[6],
      (uint8_t)uuid_bytes[7],
      (uint8_t)uuid_bytes[8],
      (uint8_t)uuid_bytes[9],
      (uint8_t)uuid_bytes[10],
      (uint8_t)uuid_bytes[11],
      (uint8_t)uuid_bytes[12],
      (uint8_t)uuid_bytes[13],
      (uint8_t)uuid_bytes[14],
      (uint8_t)uuid_bytes[15]);
}
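
// Worked example for uuid_to_string above: the 16 raw bytes
//   {0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
//    0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88}
// format as "12345678-9abc-def0-1122-334455667788".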

////////////////////////////////////////////////////////////////////////////////
// Cuda module initialization
////////////////////////////////////////////////////////////////////////////////

static void registerCudaDeviceProperties(PyObject* module) {
  // Add _CudaDeviceProperties class to torch._C
  auto m = py::handle(module).cast<py::module>();
  // until the internal build uses a rocm version with the uuid attr
#ifndef FBCODE_CAFFE2
  // CUuuid is defined in either cuda.h or driver_types.h
  // hipified to hipUUID which is defined in hip_runtime_api.h
  py::class_<CUuuid>(m, "_CUuuid")
      .def_property_readonly(
          "bytes",
          [](const CUuuid& uuid) {
            return std::vector<uint8_t>(uuid.bytes, uuid.bytes + 16);
          })
      .def("__str__", [](const CUuuid& uuid) {
        return uuid_to_string(uuid.bytes);
      });
#endif
  py::class_<cudaDeviceProp>(m, "_CudaDeviceProperties")
      .def_readonly("name", &cudaDeviceProp::name)
      .def_readonly("major", &cudaDeviceProp::major)
      .def_readonly("minor", &cudaDeviceProp::minor)
      .def_readonly("is_multi_gpu_board", &cudaDeviceProp::isMultiGpuBoard)
      .def_readonly("is_integrated", &cudaDeviceProp::integrated)
      .def_readonly(
          "multi_processor_count", &cudaDeviceProp::multiProcessorCount)
      .def_readonly("total_memory", &cudaDeviceProp::totalGlobalMem)
      .def_readonly(
          "max_threads_per_multi_processor",
          &cudaDeviceProp::maxThreadsPerMultiProcessor)
      .def_readonly("warp_size", &cudaDeviceProp::warpSize)
#if !USE_ROCM
      // NVIDIA-only property
      .def_readonly(
          "regs_per_multiprocessor", &cudaDeviceProp::regsPerMultiprocessor)
#endif // USE_ROCM
      // HIP-only property; reuse name attribute for CUDA builds
      .def_readonly(
          "gcnArchName",
#if USE_ROCM
          &cudaDeviceProp::gcnArchName
#else
          &cudaDeviceProp::name
#endif // USE_ROCM
          )
#ifndef FBCODE_CAFFE2
      .def_readonly("uuid", &cudaDeviceProp::uuid)
#endif
      .def_readonly("L2_cache_size", &cudaDeviceProp::l2CacheSize)
      .def("__repr__", [](const cudaDeviceProp& prop) {
        std::ostringstream stream;
        stream << "_CudaDeviceProperties(name='" << prop.name
               << "', major=" << prop.major << ", minor=" << prop.minor
#if USE_ROCM
               << ", gcnArchName='" << prop.gcnArchName << "'"
#endif // USE_ROCM
               << ", total_memory=" << prop.totalGlobalMem / (1024ull * 1024)
               << "MB, multi_processor_count=" << prop.multiProcessorCount
#ifndef FBCODE_CAFFE2
               << ", uuid=" << uuid_to_string(prop.uuid.bytes)
#endif
               << ", L2_cache_size=" << prop.l2CacheSize / (1024ull * 1024)
               << "MB)";
        return stream.str();
      });

  m.def(
      "_cuda_record_memory_history_legacy",
      static_cast<void (*)(bool, bool, int64_t, bool, bool)>(
          torch::cuda::_record_memory_history));

  m.def(
      "_cuda_record_memory_history",
      static_cast<void (*)(
          std::optional<std::string>,
          std::optional<std::string>,
          const std::string&,
          size_t)>(torch::cuda::_record_memory_history));

  m.def("_cuda_isHistoryEnabled", []() {
    return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
  });

  m.def("_cuda_get_conv_benchmark_empty_cache", []() {
    return at::native::_cudnn_get_conv_benchmark_empty_cache();
  });

  m.def("_cudnn_set_conv_benchmark_empty_cache", [](bool enable) {
    return at::native::_cudnn_set_conv_benchmark_empty_cache(enable);
  });
}

// We choose to ignore certain blocks that are currently allocated
// when we set the pool to its checkpoint. For those blocks, we need
// to swap out the deleter function of their corresponding blocks
// so that a deallocation is not triggered when they die.
void removeStorageDeleterFns(
    const std::vector<c10::StorageImpl*>& stale_live_storages,
    std::unordered_set<void*> definitely_stale_pointers) {
  for (c10::StorageImpl* stale_storage : stale_live_storages) {
    auto ptr = stale_storage->data_ptr().get();
    auto allocated_pointer = definitely_stale_pointers.find(ptr);
    TORCH_CHECK(allocated_pointer != definitely_stale_pointers.end());
    auto t = c10::cuda::CUDACachingAllocator::get();
    bool succeeded = stale_storage->mutable_data_ptr().compare_exchange_deleter(
        t->raw_deleter(), &c10::detail::deleteNothing);

    TORCH_CHECK(
        succeeded,
        "Unexpected deleter function on storage, could not swap function");
  }
}

void addStorageDeleterFns(
    std::vector<c10::StorageImpl*>& storages_to_add_deleters_to,
    c10::cuda::CUDACachingAllocator::CheckpointDelta& delta) {
  std::unordered_map<void*, c10::StorageImpl*> storages;
  for (auto& storage : storages_to_add_deleters_to) {
    storages[storage->data_ptr().get()] = storage;
  }

  for (auto& data_ptr : delta.dataptrs_allocd) {
    auto storage_pair = storages.find(data_ptr.get());
    if (storage_pair != storages.end()) {
      auto ctx = storage_pair->second->data_ptr().get_context();
      TORCH_CHECK(ctx == nullptr, " Not expecting deleter function");
      storage_pair->second->set_data_ptr_noswap(std::move(data_ptr));
    } else {
      data_ptr.release_context();
    }
  }
}
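
// Usage sketch (illustrative): these two helpers bracket the
// _cuda_setCheckpointPoolState binding below. removeStorageDeleterFns swaps
// the deleters of stale live storages to deleteNothing so that restoring the
// pool checkpoint does not trigger a second free when those storages die;
// addStorageDeleterFns then hands ownership of the data pointers allocated by
// the checkpoint restore back to the storages that should own them.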

static void registerCudaPluggableAllocator(PyObject* module) {
  auto m = py::handle(module).cast<py::module>();

  // NOLINTNEXTLINE(bugprone-unused-raii)
  py::class_<
      c10::cuda::CUDACachingAllocator::CUDAAllocator,
      std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>>(
      m, "_cuda_CUDAAllocator");
  m.def("_cuda_getAllocator", []() {
    return py::cast(torch::cuda::CUDAPluggableAllocator::getCurrentAllocator());
  });

  m.def(
      "_cuda_changeCurrentAllocator",
      [](const std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>&
             allocator) {
        torch::cuda::CUDAPluggableAllocator::changeCurrentAllocator(allocator);
      });
  py::class_<
      torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator,
      c10::cuda::CUDACachingAllocator::CUDAAllocator,
      std::shared_ptr<
          torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator>>(
      m, "_CUDAPluggableAllocator")
      .def(
          "set_init_fn",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void(int);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_init_fn(func);
          })
      .def(
          "set_reset_fn",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void();
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_reset_fn(func);
          })
      .def(
          "set_memory_fraction_fn",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void(double, int);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_memory_fraction_fn(func);
          })
      .def(
          "set_base_alloc_fn",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void*(void*, size_t*);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_base_alloc_fn(func);
          })
      .def(
          "set_record_stream_fn",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void(void*, cudaStream_t);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_record_stream_fn(func);
          })
      .def(
          "set_begin_allocate_to_pool",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void(
                int, c10::cuda::MempoolId_t, std::function<bool(cudaStream_t)>);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_begin_allocate_to_pool(func);
          })
      .def(
          "set_end_allocate_to_pool_fn",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void(int, c10::cuda::MempoolId_t);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_end_allocate_to_pool_fn(func);
          })
      .def(
          "set_release_pool",
          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
             uint64_t func_ptr) {
            using FuncType = void(int, c10::cuda::MempoolId_t);
            std::function<FuncType> func =
                // NOLINTNEXTLINE(performance-no-int-to-ptr)
                reinterpret_cast<FuncType*>(func_ptr);
            self.set_release_pool(func);
          });
  m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
    using namespace torch::cuda::CUDAPluggableAllocator;
    std::function<MallocFuncType> malloc_fn =
        // NOLINTNEXTLINE(performance-no-int-to-ptr)
        reinterpret_cast<MallocFuncType*>(malloc_ptr);
    std::function<FreeFuncType> free_fn =
        // NOLINTNEXTLINE(performance-no-int-to-ptr)
        reinterpret_cast<FreeFuncType*>(free_ptr);
    return createCustomAllocator(malloc_fn, free_fn);
  });

  // NOLINTNEXTLINE(bugprone-unused-raii)
  py::class_<
      c10::cuda::CUDACachingAllocator::AllocatorState,
      std::shared_ptr<c10::cuda::CUDACachingAllocator::AllocatorState>>(
      m, "_cuda_CUDAAllocator_AllocatorState");

  m.def(
      "_cuda_getCheckpointState",
      [](c10::DeviceIndex device, c10::cuda::MempoolId_t id) {
        return c10::cuda::CUDACachingAllocator::getCheckpointState(device, id);
      });

  m.def("_free_And_Remove_DeleterFn", [](size_t storage_impl_ptr) {
    // NOLINTNEXTLINE(performance-no-int-to-ptr)
    c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
    auto alloc = c10::cuda::CUDACachingAllocator::get();
    auto data_ptr = storage_impl->data_ptr().get();
    bool succeeded = storage_impl->mutable_data_ptr().compare_exchange_deleter(
        alloc->raw_deleter(), c10::detail::deleteNothing);
    TORCH_CHECK(succeeded, "Expected standard deleter");
    c10::cuda::CUDACachingAllocator::raw_delete(data_ptr);
  });

  m.def(
      "_set_storage_access_error_msg", [](const at::Tensor& t, std::string s) {
        t.unsafeGetTensorImpl()
            ->release_storage_and_set_meta_custom_data_ptr_error_msg_(s);
      });

  m.def("_has_Standard_Deleter", [](size_t storage_impl_ptr) {
    // NOLINTNEXTLINE(performance-no-int-to-ptr)
    c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
    auto alloc = c10::cuda::CUDACachingAllocator::get();
    return (storage_impl->data_ptr().get_deleter() == alloc->raw_deleter());
  });

  m.def("_storage_Use_Count", [](size_t storage_impl_ptr) {
    // NOLINTNEXTLINE(performance-no-int-to-ptr)
    c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr;
    return c10::raw::weak_intrusive_ptr::use_count(storage_impl);
  });

  m.def(
      "_tensors_data_ptrs_at_indices_equal",
      [](py::list& tensors, py::list& data_ptrs, py::list& indices) {
        for (size_t i = 0, end = indices.size(); i < end; ++i) {
          auto index = indices[i].cast<int64_t>();
          auto t = tensors[index].cast<at::Tensor>();
          auto data_ptr = data_ptrs[index].cast<int64_t>();
          if (reinterpret_cast<int64_t>(t.data_ptr()) != data_ptr) {
            return false;
          }
        }
        return true;
      });

  m.def(
      "_construct_CUDA_Tensor_From_Storage_And_Metadata",
      [](py::dict& metadata, c10::Storage s) {
        auto dtype_arg = metadata["dtype"].ptr();
        auto meta = scalarTypeToTypeMeta(toScalarType(dtype_arg));

        constexpr c10::DispatchKeySet cuda_dks(c10::DispatchKey::CUDA);
        at::Tensor tensor = at::detail::make_tensor_base<c10::TensorImpl>(
            std::move(s), cuda_dks, meta);

        tensor.unsafeGetTensorImpl()->set_sizes_and_strides(
            metadata["size"].cast<std::vector<int64_t>>(),
            metadata["stride"].cast<std::vector<int64_t>>());
        tensor.unsafeGetTensorImpl()->set_storage_offset(
            metadata["storage_offset"].cast<int64_t>());
        return tensor;
      });

  m.def(
      "_cuda_beginAllocateCurrentStreamToPool",
      [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
        auto stream = at::cuda::getCurrentCUDAStream(device);
        TORCH_CHECK(stream, "Expected stream capture to be under way");
        c10::cuda::CUDACachingAllocator::beginAllocateToPool(
            device, mempool_id, [stream](cudaStream_t target) {
              return target == stream;
            });
      });

  m.def(
      "_cuda_beginAllocateToPool",
      [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
        c10::cuda::CUDACachingAllocator::beginAllocateToPool(
            device, mempool_id, [](cudaStream_t) { return true; });
      });

  m.def(
      "_cuda_endAllocateCurrentStreamToPool",
      [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
        c10::cuda::CUDACachingAllocator::endAllocateToPool(device, mempool_id);
      });

  m.def(
      "_cuda_releasePool",
      [](c10::DeviceIndex device, at::cuda::MempoolId_t mempool_id) {
        c10::cuda::CUDACachingAllocator::releasePool(device, mempool_id);
      });

  m.def(
      "_cuda_checkPoolLiveAllocations",
      [](c10::DeviceIndex device,
         at::cuda::MempoolId_t mempool_id,
         const py::set& expected_live_allocations) {
        std::unordered_set<void*> allocations;
        allocations.reserve(expected_live_allocations.size());
        for (auto& elem : expected_live_allocations) {
          // NOLINTNEXTLINE(performance-no-int-to-ptr)
          allocations.insert(reinterpret_cast<void*>(py::cast<size_t>(elem)));
        }
        return c10::cuda::CUDACachingAllocator::checkPoolLiveAllocations(
            device, mempool_id, allocations);
      });

  m.def(
      "_cuda_setCheckpointPoolState",
      [](c10::DeviceIndex device,
         std::shared_ptr<c10::cuda::CUDACachingAllocator::AllocatorState> pps,
         const std::vector<size_t>& stale_storages_ptr,
         const std::vector<size_t>& storages_to_add_deleters_to_ptr = {}) {
        std::unordered_set<c10::StorageImpl*> ptr_set;
        // iterate on std::vector for determinism
        std::vector<c10::StorageImpl*> ptrs;
        for (size_t ptr_int : stale_storages_ptr) {
          // NOLINTNEXTLINE(performance-no-int-to-ptr)
          c10::StorageImpl* ptr = (c10::StorageImpl*)ptr_int;
          if (!ptr_set.count(ptr)) {
            ptrs.push_back(ptr);
            ptr_set.insert(ptr);
          }
        }
        auto delta = c10::cuda::CUDACachingAllocator::setCheckpointPoolState(
            device, std::move(pps));
        auto& freed_pointers = delta.ptrs_freed;

        std::unordered_set<void*> allocd_set;
        for (auto& data_ptr : delta.dataptrs_allocd) {
          allocd_set.insert(data_ptr.get());
        }
        std::unordered_set<void*> freed_pointer_set;
        size_t definite_freed_count = 0;
        for (void* ptr : freed_pointers) {
          if (!allocd_set.count(ptr)) {
            definite_freed_count += 1;
          }
          freed_pointer_set.insert((ptr));
        }
        // Those blocks have already been freed, so even though this check
        // will error, so too would the allocator when the corresponding
        // tensor dies, because there is no live tensor corresponding to it.
        TORCH_CHECK(
            ptr_set.size() >= definite_freed_count,
            "Any stale tensors which are being manually freed"
            " must be passed to set checkpoint");

        removeStorageDeleterFns(ptrs, freed_pointer_set);
        std::vector<c10::StorageImpl*> storages_to_add_deleters_to;
        storages_to_add_deleters_to.reserve(
            storages_to_add_deleters_to_ptr.size());
        for (size_t ptr_int : storages_to_add_deleters_to_ptr) {
          // NOLINTNEXTLINE(performance-no-int-to-ptr)
          storages_to_add_deleters_to.push_back((c10::StorageImpl*)ptr_int);
        }

        addStorageDeleterFns(storages_to_add_deleters_to, delta);
      });
}

static void bindGetDeviceProperties(PyObject* module) {
  // Add method to torch.cuda
  auto m = py::handle(module).cast<py::module>();
  m.def(
      "_get_device_properties",
      [](c10::DeviceIndex device) -> cudaDeviceProp* {
        return at::cuda::getDeviceProperties(device);
      },
      py::return_value_policy::reference);
}

// Callback for python part. Used for additional initialization of python
// classes
static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) {
#if C10_ASAN_ENABLED
  TORCH_WARN(
      "torch.cuda: your pytorch binary has address sanitizer (asan) built in, "
      "asan is currently not compatible with torch.cuda module, "
      "you might get unexpected behavior (eg. out of memory, crash, etc.), "
      "please rebuild pytorch without asan if you need to use this module");
#endif
  HANDLE_TH_ERRORS
  TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
  poison_fork();
  at::globalContext().lazyInitCUDA();

  auto m = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
  if (!m)
    throw python_error();

  auto set_module_attr = [&](const char* name, PyObject* v) {
    // PyObject_SetAttrString doesn't steal reference. So no need to incref.
    if (PyObject_SetAttrString(m, name, v) < 0) {
      throw python_error();
    }
  };

  auto num_gpus = c10::cuda::device_count();
  auto default_cuda_generators = PyTuple_New(static_cast<Py_ssize_t>(num_gpus));
  for (const auto i : c10::irange(num_gpus)) {
    auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(
        at::cuda::detail::getDefaultCUDAGenerator(i));
    // This reference is meant to be given away, so no need to incref here.
    PyTuple_SetItem(default_cuda_generators, i, (PyObject*)cast_gen);
  }
  set_module_attr("default_generators", default_cuda_generators);
  bindGetDeviceProperties(m);

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
1435
THCPModule_getCurrentBlasHandle_wrap(PyObject * self,PyObject * noargs)1436 PyObject* THCPModule_getCurrentBlasHandle_wrap(
1437 PyObject* self,
1438 PyObject* noargs) {
1439 HANDLE_TH_ERRORS
1440 // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
1441 cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
1442 return PyLong_FromVoidPtr(handle);
1443 END_HANDLE_TH_ERRORS
1444 }
1445
THCPModule_clearBlasWorkspaces_wrap(PyObject * self,PyObject * noargs)1446 static PyObject* THCPModule_clearBlasWorkspaces_wrap(
1447 PyObject* self,
1448 PyObject* noargs) {
1449 HANDLE_TH_ERRORS
1450 at::cuda::clearCublasWorkspaces();
1451 Py_RETURN_NONE;
1452 END_HANDLE_TH_ERRORS
1453 }
1454
THCPModule_rocm_is_backward_pass(PyObject * _unused,PyObject * noargs)1455 PyObject* THCPModule_rocm_is_backward_pass(
1456 PyObject* _unused,
1457 PyObject* noargs) {
1458 HANDLE_TH_ERRORS
1459 #if USE_ROCM
1460 if (at::ROCmBackwardPassGuard::is_backward_pass()) {
1461 Py_RETURN_TRUE;
1462 } else {
1463 Py_RETURN_FALSE;
1464 }
1465 #else
1466 Py_RETURN_FALSE;
1467 #endif
1468 END_HANDLE_TH_ERRORS
1469 }
1470
THCPModule_cuda_tunableop_enable(PyObject * _unused,PyObject * arg)1471 PyObject* THCPModule_cuda_tunableop_enable(PyObject* _unused, PyObject* arg) {
1472 HANDLE_TH_ERRORS
1473 TORCH_CHECK(
1474 THPUtils_checkBool(arg),
1475 "cuda_tunableop_enable expects a bool, but got ",
1476 THPUtils_typename(arg));
1477 at::cuda::tunable::getTuningContext()->EnableTunableOp(
1478 THPUtils_unpackBool(arg));
1479 Py_RETURN_NONE;
1480 END_HANDLE_TH_ERRORS
1481 }
1482
THCPModule_cuda_tunableop_is_enabled(PyObject * _unused,PyObject * noarg)1483 PyObject* THCPModule_cuda_tunableop_is_enabled(
1484 PyObject* _unused,
1485 PyObject* noarg) {
1486 HANDLE_TH_ERRORS
1487 if (at::cuda::tunable::getTuningContext()->IsTunableOpEnabled()) {
1488 Py_RETURN_TRUE;
1489 } else {
1490 Py_RETURN_FALSE;
1491 }
1492 END_HANDLE_TH_ERRORS
1493 }
1494
THCPModule_cuda_tunableop_tuning_enable(PyObject * _unused,PyObject * arg)1495 PyObject* THCPModule_cuda_tunableop_tuning_enable(
1496 PyObject* _unused,
1497 PyObject* arg) {
1498 HANDLE_TH_ERRORS
1499 TORCH_CHECK(
1500 THPUtils_checkBool(arg),
1501 "cuda_tunableop_tuning_enable expects a bool, but got ",
1502 THPUtils_typename(arg));
1503 at::cuda::tunable::getTuningContext()->EnableTuning(THPUtils_unpackBool(arg));
1504 Py_RETURN_NONE;
1505 END_HANDLE_TH_ERRORS
1506 }
1507
THCPModule_cuda_tunableop_tuning_is_enabled(PyObject * _unused,PyObject * noarg)1508 PyObject* THCPModule_cuda_tunableop_tuning_is_enabled(
1509 PyObject* _unused,
1510 PyObject* noarg) {
1511 HANDLE_TH_ERRORS
1512 if (at::cuda::tunable::getTuningContext()->IsTuningEnabled()) {
1513 Py_RETURN_TRUE;
1514 } else {
1515 Py_RETURN_FALSE;
1516 }
1517 END_HANDLE_TH_ERRORS
1518 }
1519
THCPModule_cuda_tunableop_write_file_on_exit(PyObject * _unused,PyObject * arg)1520 PyObject* THCPModule_cuda_tunableop_write_file_on_exit(
1521 PyObject* _unused,
1522 PyObject* arg) {
1523 HANDLE_TH_ERRORS
1524 TORCH_CHECK(
1525 THPUtils_checkBool(arg),
1526 "cuda_tunableop_write_file_on_exit expects a bool, but got ",
1527 THPUtils_typename(arg));
1528 at::cuda::tunable::getTuningContext()->WriteFileOnExit(
1529 THPUtils_unpackBool(arg));
1530 Py_RETURN_NONE;
1531 END_HANDLE_TH_ERRORS
1532 }
1533
PyObject* THCPModule_cuda_tunableop_set_max_tuning_duration(
    PyObject* _unused,
    PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg),
      "cuda_tunableop_set_max_tuning_duration expects an int, but got ",
      THPUtils_typename(arg));
  auto duration = static_cast<int>(THPUtils_unpackLong(arg));
  at::cuda::tunable::getTuningContext()->SetMaxTuningDurationMs(duration);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cuda_tunableop_get_max_tuning_duration(
    PyObject* _unused,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
  return THPUtils_packInt32(
      at::cuda::tunable::getTuningContext()->GetMaxTuningDurationMs());
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cuda_tunableop_set_max_tuning_iterations(
    PyObject* _unused,
    PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg),
      "cuda_tunableop_set_max_tuning_iterations expects an int, but got ",
      THPUtils_typename(arg));
  auto iterations = static_cast<int>(THPUtils_unpackLong(arg));
  at::cuda::tunable::getTuningContext()->SetMaxTuningIterations(iterations);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cuda_tunableop_get_max_tuning_iterations(
    PyObject* _unused,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
  return THPUtils_packInt32(
      at::cuda::tunable::getTuningContext()->GetMaxTuningIterations());
  END_HANDLE_TH_ERRORS
}

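// set_filename takes the results filename plus an optional bool that is
// forwarded to TuningContext::SetFilename; when true, the device ordinal is
// expected to be folded into the filename so each GPU writes its own file.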
PyObject* THCPModule_cuda_tunableop_set_filename(
    PyObject* _unused,
    PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* obj_str = nullptr;
  PyObject* obj_ord = nullptr;
  if (!PyArg_ParseTuple(args, "O|O", &obj_str, &obj_ord)) {
    // Parse failure: PyArg_ParseTuple has already set a Python exception.
    return nullptr;
  }
  TORCH_CHECK(
      THPUtils_checkString(obj_str),
      "cuda_tunableop_set_filename expects a string, but got ",
      THPUtils_typename(obj_str));
  auto filename = THPUtils_unpackString(obj_str);
  bool dev = false;
  if (obj_ord) {
    TORCH_CHECK(
        THPUtils_checkBool(obj_ord),
        "cuda_tunableop_set_filename expects a bool, but got ",
        THPUtils_typename(obj_ord));
    dev = THPUtils_unpackBool(obj_ord);
  }
  at::cuda::tunable::getTuningContext()->SetFilename(filename, dev);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cuda_tunableop_get_filename(
    PyObject* _unused,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
  return THPUtils_packString(
      at::cuda::tunable::getTuningContext()->GetFilename());
  END_HANDLE_TH_ERRORS
}

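// write_file/read_file take an optional filename; when omitted, the
// TuningContext falls back to its configured (or default) filename. Both
// return True on success and False otherwise.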
PyObject* THCPModule_cuda_tunableop_write_file(
    PyObject* _unused,
    PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* str = nullptr;
  bool success = false;
  if (!PyArg_ParseTuple(args, "|O", &str)) {
    // Parse failure: PyArg_ParseTuple has already set a Python exception.
    return nullptr;
  }
  if (str) {
    TORCH_CHECK(
        THPUtils_checkString(str),
        "cuda_tunableop_write_file expects a string, but got ",
        THPUtils_typename(str));
    auto filename = THPUtils_unpackString(str);
    success = at::cuda::tunable::getTuningContext()->WriteFile(filename);
  } else {
    success = at::cuda::tunable::getTuningContext()->WriteFile();
  }
  if (success) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cuda_tunableop_read_file(
    PyObject* _unused,
    PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* str = nullptr;
  bool success = false;
  if (!PyArg_ParseTuple(args, "|O", &str)) {
    // Parse failure: PyArg_ParseTuple has already set a Python exception.
    return nullptr;
  }
  if (str) {
    TORCH_CHECK(
        THPUtils_checkString(str),
        "cuda_tunableop_read_file expects a string, but got ",
        THPUtils_typename(str));
    auto filename = THPUtils_unpackString(str);
    success = at::cuda::tunable::getTuningContext()->ReadFile(filename);
  } else {
    success = at::cuda::tunable::getTuningContext()->ReadFile();
  }
  if (success) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

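// Returns the tuning results as a flat tuple of 4-tuples, one per recorded
// (operator, parameter) combination:
//   (op_signature, param_signature, result_key, result_time)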
PyObject* THCPModule_cuda_tunableop_get_results(
    PyObject* _unused,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
  auto results =
      at::cuda::tunable::getTuningContext()->GetTuningResultsManager().Dump();
  size_t result_size = 0;
  for (const auto& [op_sig, kernelmap] : results) {
    result_size += kernelmap.size();
  }
  THPObjectPtr outer_tuple(PyTuple_New(result_size));
  if (!outer_tuple)
    throw python_error();
  size_t result_index = 0;
  for (const auto& [op_sig, kernelmap] : results) {
    for (const auto& [param_sig, result] : kernelmap) {
      THPObjectPtr inner_tuple(PyTuple_New(4));
      if (!inner_tuple)
        throw python_error();
      PyObject* obj_op_sig = THPUtils_packString(op_sig);
      if (!obj_op_sig)
        throw python_error();
      PyObject* obj_param_sig = THPUtils_packString(param_sig);
      if (!obj_param_sig)
        throw python_error();
      PyObject* obj_result_key = THPUtils_packString(result.GetKey());
      if (!obj_result_key)
        throw python_error();
      PyObject* obj_result_time = PyFloat_FromDouble(result.GetTime());
      if (!obj_result_time)
        throw python_error();
      PyTuple_SET_ITEM(inner_tuple.get(), 0, obj_op_sig);
      PyTuple_SET_ITEM(inner_tuple.get(), 1, obj_param_sig);
      PyTuple_SET_ITEM(inner_tuple.get(), 2, obj_result_key);
      PyTuple_SET_ITEM(inner_tuple.get(), 3, obj_result_time);
      PyTuple_SET_ITEM(
          outer_tuple.get(), result_index++, inner_tuple.release());
    }
  }
  return outer_tuple.release();
  END_HANDLE_TH_ERRORS
}

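// Returns the validation table as a tuple of (key, value) string pairs
// maintained by the TuningResultsValidator (version stamps and similar data
// used to decide whether previously saved results are still applicable).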
PyObject* THCPModule_cuda_tunableop_get_validators(
    PyObject* _unused,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
  auto validators = at::cuda::tunable::getTuningContext()
                        ->GetTuningResultsValidator()
                        .GetAllValidators();
  THPObjectPtr outer_tuple(PyTuple_New(validators.size()));
  if (!outer_tuple)
    throw python_error();
  size_t validator_index = 0;
  for (const auto& [key, val] : validators) {
    THPObjectPtr inner_tuple(PyTuple_New(2));
    if (!inner_tuple)
      throw python_error();
    PyObject* obj_key = THPUtils_packString(key);
    if (!obj_key)
      throw python_error();
    PyObject* obj_val = THPUtils_packString(val);
    if (!obj_val)
      throw python_error();
    PyTuple_SET_ITEM(inner_tuple.get(), 0, obj_key);
    PyTuple_SET_ITEM(inner_tuple.get(), 1, obj_val);
    PyTuple_SET_ITEM(
        outer_tuple.get(), validator_index++, inner_tuple.release());
  }
  return outer_tuple.release();
  END_HANDLE_TH_ERRORS
}

static PyObject* THCPModule_isCurrentStreamCapturing_wrap(
    PyObject* self,
    PyObject* noargs) {
  HANDLE_TH_ERRORS
  // If there's no cuda context, at::cuda::currentStreamCaptureStatus returns
  // CaptureStatus::None without initializing a context.
  if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) {
    Py_RETURN_FALSE;
  } else {
    Py_RETURN_TRUE;
  }
  END_HANDLE_TH_ERRORS
}

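// cuDNN benchmark-limit accessors: the limit caps how many algorithms cuDNN
// benchmark mode will try before picking one. On ROCm the setter merely warns
// (the value is stored but has no effect), since MIOpen has no equivalent.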
PyObject* THCPModule_setBenchmarkLimitCuDNN(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg),
      "set_benchmark_limit_cudnn expects an int, but got ",
      THPUtils_typename(arg));
#if defined(USE_ROCM)
  TORCH_WARN_ONCE(
      "cuDNN Benchmark limit is not supported in MIOpen and will have no effect.");
#endif
  auto benchmark_limit = static_cast<int>(THPUtils_unpackLong(arg));
  at::globalContext().setBenchmarkLimitCuDNN(benchmark_limit);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_benchmarkLimitCuDNN(PyObject* _unused, PyObject* noargs) {
  return THPUtils_packInt32(at::globalContext().benchmarkLimitCuDNN());
}

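// Python method table: exposes the wrappers above as private torch._C
// functions (_cuda_*, _nccl_*, _rocm_*, _host_*); the torch.cuda Python layer
// re-exports them under friendlier names.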
// NOLINTNEXTLINE(*-c-arrays*, *-global-variables)
static struct PyMethodDef _THCPModule_methods[] = {
    {"_cuda_init", THCPModule_initExtension, METH_NOARGS, nullptr},
    {"_cuda_setDevice", THCPModule_setDevice_wrap, METH_O, nullptr},
    {"_cuda_exchangeDevice", THCPModule_exchangeDevice, METH_O, nullptr},
    {"_cuda_maybeExchangeDevice",
     THCPModule_maybeExchangeDevice,
     METH_O,
     nullptr},
    {"_cuda_getDevice", THCPModule_getDevice_wrap, METH_NOARGS, nullptr},
    {"_cuda_getDeviceCount",
     THCPModule_getDeviceCount_wrap,
     METH_NOARGS,
     nullptr},
    {"_cuda_canDeviceAccessPeer",
     THCPModule_canDeviceAccessPeer_wrap,
     METH_VARARGS,
     nullptr},
    {"_cuda_getArchFlags", THCPModule_getArchFlags, METH_NOARGS, nullptr},
    {"_cuda_isInBadFork", THCPModule_isInBadFork, METH_NOARGS, nullptr},
    {"_cuda_getCurrentStream",
     THCPModule_getCurrentStream_wrap,
     METH_O,
     nullptr},
    {"_cuda_getCurrentRawStream",
     THCPModule_getCurrentStream_raw,
     METH_O,
     nullptr},
    {"_cuda_getDefaultStream",
     THCPModule_getDefaultStream_wrap,
     METH_O,
     nullptr},
    {"_cuda_getCurrentBlasHandle",
     THCPModule_getCurrentBlasHandle_wrap,
     METH_NOARGS,
     nullptr},
    {"_cuda_clearCublasWorkspaces",
     THCPModule_clearBlasWorkspaces_wrap,
     METH_NOARGS,
     nullptr},
    {"_cuda_isCurrentStreamCapturing",
     THCPModule_isCurrentStreamCapturing_wrap,
     METH_NOARGS,
     nullptr},
    {"_cuda_setStream",
     castPyCFunctionWithKeywords(THCPModule_setStream_wrap),
     METH_VARARGS | METH_KEYWORDS,
     nullptr},
    {"_cuda_getCompiledVersion",
     THCPModule_getCompiledVersion,
     METH_NOARGS,
     nullptr},
    {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr},
    {"_cuda_setMemoryFraction",
     THCPModule_setMemoryFraction,
     METH_VARARGS,
     nullptr},
    {"_cuda_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr},
    {"_cuda_memoryStats", THCPModule_memoryStats, METH_O, nullptr},
    {"_cuda_resetAccumulatedMemoryStats",
     THCPModule_resetAccumulatedMemoryStats,
     METH_O,
     nullptr},
    {"_cuda_resetPeakMemoryStats",
     THCPModule_resetPeakMemoryStats,
     METH_O,
     nullptr},
    {"_cuda_memorySnapshot", THCPModule_memorySnapshot, METH_NOARGS, nullptr},
    {"_cuda_attach_out_of_memory_observer",
     THCPModule_attachOutOfMemoryObserver,
     METH_O,
     nullptr},
    {"_cuda_cudaHostAllocator",
     THCPModule_cudaHostAllocator,
     METH_NOARGS,
     nullptr},
    {"_host_emptyCache", THCPModule_hostEmptyCache, METH_NOARGS, nullptr},
    {"_cuda_cudaCachingAllocator_raw_alloc",
     THCPModule_cudaCachingAllocator_raw_alloc,
     METH_VARARGS,
     nullptr},
    {"_cuda_cudaCachingAllocator_raw_delete",
     THCPModule_cudaCachingAllocator_raw_delete,
     METH_O,
     nullptr},
    {"_cuda_cudaCachingAllocator_set_allocator_settings",
     THCPModule_cudaCachingAllocator_set_allocator_settings,
     METH_O,
     nullptr},
    {"_cuda_getAllocatorBackend",
     THCPModule_getAllocatorBackend,
     METH_NOARGS,
     nullptr},
    {"_cuda_synchronize", THCPModule_cudaSynchronize, METH_NOARGS, nullptr},
    {"_cuda_ipc_collect", THCPModule_cudaIPCCollect, METH_NOARGS, nullptr},
    {"_cuda_sleep", THCPModule_cudaSleep, METH_O, nullptr},
    {"_cuda_lock_mutex", THCPModule_cudaLockMutex, METH_NOARGS, nullptr},
    {"_cuda_unlock_mutex", THCPModule_cudaUnlockMutex, METH_NOARGS, nullptr},
    {"_cuda_set_sync_debug_mode",
     THCPModule_cudaSetSyncDebugMode,
     METH_O,
     nullptr},
    {"_cuda_get_sync_debug_mode",
     THCPModule_cudaGetSyncDebugMode,
     METH_NOARGS,
     nullptr},
    {"_cuda_jiterator_compile_and_launch_kernel",
     THCPModule_cudaJiteratorCompileAndLaunchKernel,
     METH_VARARGS,
     nullptr},
    {"_cuda_get_cudnn_benchmark_limit",
     THCPModule_benchmarkLimitCuDNN,
     METH_NOARGS,
     nullptr},
    {"_cuda_set_cudnn_benchmark_limit",
     THCPModule_setBenchmarkLimitCuDNN,
     METH_O,
     nullptr},
#ifdef USE_NCCL
    {"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr},
    {"_nccl_version_suffix",
     THCPModule_nccl_version_suffix,
     METH_NOARGS,
     nullptr},
    {"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr},
    {"_nccl_init_rank", THCPModule_nccl_init_rank, METH_VARARGS, nullptr},
    {"_nccl_reduce", THCPModule_nccl_reduce, METH_VARARGS, nullptr},
    {"_nccl_all_reduce", THCPModule_nccl_all_reduce, METH_VARARGS, nullptr},
    {"_nccl_broadcast", THCPModule_nccl_broadcast, METH_VARARGS, nullptr},
    {"_nccl_all_gather", THCPModule_nccl_all_gather, METH_VARARGS, nullptr},
    {"_nccl_reduce_scatter",
     THCPModule_nccl_reduce_scatter,
     METH_VARARGS,
     nullptr},
#endif
    {"_rocm_is_backward_pass",
     THCPModule_rocm_is_backward_pass,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_enable",
     THCPModule_cuda_tunableop_enable,
     METH_O,
     nullptr},
    {"_cuda_tunableop_is_enabled",
     THCPModule_cuda_tunableop_is_enabled,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_tuning_enable",
     THCPModule_cuda_tunableop_tuning_enable,
     METH_O,
     nullptr},
    {"_cuda_tunableop_tuning_is_enabled",
     THCPModule_cuda_tunableop_tuning_is_enabled,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_write_file_on_exit",
     THCPModule_cuda_tunableop_write_file_on_exit,
     METH_O,
     nullptr},
    {"_cuda_tunableop_set_max_tuning_duration",
     THCPModule_cuda_tunableop_set_max_tuning_duration,
     METH_O,
     nullptr},
    {"_cuda_tunableop_get_max_tuning_duration",
     THCPModule_cuda_tunableop_get_max_tuning_duration,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_set_max_tuning_iterations",
     THCPModule_cuda_tunableop_set_max_tuning_iterations,
     METH_O,
     nullptr},
    {"_cuda_tunableop_get_max_tuning_iterations",
     THCPModule_cuda_tunableop_get_max_tuning_iterations,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_set_filename",
     THCPModule_cuda_tunableop_set_filename,
     METH_VARARGS,
     nullptr},
    {"_cuda_tunableop_get_filename",
     THCPModule_cuda_tunableop_get_filename,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_write_file",
     THCPModule_cuda_tunableop_write_file,
     METH_VARARGS,
     nullptr},
    {"_cuda_tunableop_read_file",
     THCPModule_cuda_tunableop_read_file,
     METH_VARARGS,
     nullptr},
    {"_cuda_tunableop_get_results",
     THCPModule_cuda_tunableop_get_results,
     METH_NOARGS,
     nullptr},
    {"_cuda_tunableop_get_validators",
     THCPModule_cuda_tunableop_get_validators,
     METH_NOARGS,
     nullptr},
    {nullptr}};

PyMethodDef* THCPModule_methods() {
  return _THCPModule_methods;
}

namespace torch::cuda {

namespace shared {

void initCudartBindings(PyObject* module);
void initNvtxBindings(PyObject* module);
void initGdsBindings(PyObject* module);
#if defined(USE_CUDNN) || defined(USE_ROCM)
void initCudnnBindings(PyObject* module);
#endif
#if defined(USE_CUSPARSELT)
void initCusparseltBindings(PyObject* module);
#endif

} // namespace shared

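// initModule wires up the CUDA Python submodule: comm helpers, the per-library
// shared bindings declared above (cudart, nvtx, cudnn, cuSPARSELt, GDS), and
// the device-property and pluggable-allocator registrations.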
void initModule(PyObject* module) {
  python::initCommMethods(module);
  // As weird as it seems, this file is also compiled for ROCm, so the
  // nominally CUDA-specific bindings below are registered for ROCm builds as
  // well.
  shared::initCudartBindings(module);
  shared::initNvtxBindings(module);
#if defined(USE_CUDNN) || defined(USE_ROCM)
  shared::initCudnnBindings(module);
#endif
#if defined(USE_CUSPARSELT)
  shared::initCusparseltBindings(module);
#endif
  shared::initGdsBindings(module);
  registerCudaDeviceProperties(module);
  registerCudaPluggableAllocator(module);
}

} // namespace torch::cuda