// xref: /aosp_15_r20/external/pytorch/torch/csrc/CudaIPCTypes.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#pragma once
#ifdef USE_CUDA
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/util/Logging.h>
#include <cuda_runtime_api.h>
#include <torch/csrc/Export.h>
#include <cstddef>
namespace torch {

TORCH_CUDA_CU_API bool CudaIPCCollect();

// Receiver-side holder for a CUDA tensor shared over IPC: keeps shared
// ownership of the (opaque) received memory block alive for the lifetime
// of this object.
struct CudaIPCReceivedData final {
  CudaIPCReceivedData() = default;
  // Takes shared ownership of the received block.
  explicit CudaIPCReceivedData(std::shared_ptr<void> ptr)
      : shared_ptr_(std::move(ptr)) {}
  std::shared_ptr<void> shared_ptr_;
};

// Sender-side bookkeeping for one CUDA allocation exported over IPC.
// Constructor, destructor and counter_value() are defined out of line
// (CudaIPCTypes.cpp); this header only carries the data layout and the
// trivial accessors.
struct CudaIPCSentData final {
  std::string handle_; // IPC handle string identifying the shared block
  uint64_t offset_; // Slot index of this block's counter (see offset())
  uint64_t* counter_ptr_; // Reference counter shared memory block
  at::DataPtr original_ptr_; // Original mem allocation
  cudaEvent_t event_; // Sync cuEventDestroy
  bool event_sync_required_;
  at::Device device_;

  CudaIPCSentData(
      std::string handle,
      uint64_t offset,
      uint64_t* counter_ptr,
      at::Device device);
  ~CudaIPCSentData();

  // Current value of the shared reference counter (defined out of line).
  uint64_t counter_value();
  std::string handle() {
    return handle_;
  }
  uint64_t offset() {
    return offset_;
  }
  // Takes ownership of the original allocation so it is released together
  // with this record.
  void set_original_ptr(at::DataPtr data_ptr) {
    original_ptr_ = std::move(data_ptr);
  }
};

TORCH_CUDA_CU_API at::DataPtr GetNewRefCountedSentData(
    void* data,
    at::Device device);

namespace {

// Number of counter slots carried by one ref-counter shared-memory file
// (see CudaIPCRefCountersFile below).
inline constexpr int64_t CUDA_IPC_REF_COUNTER_FILE_SIZE = 10000;
// Warn once more than this many to-be-deleted blocks sit in limbo waiting
// for their remote reference counts to drop.
inline constexpr int64_t CUDA_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000;
// It was determined empirically that CUDA (v10.1 and below) has a limit on
// the number of recorded blocking interprocess events — around ~22,000.
// To give us leeway we picked 1000, which still provides enough events to
// share tensors effectively.
inline constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000;

// All to-be-deleted data blocks with a non-zero reference counter go here:
// they are kept alive until the receiving side releases its references.
// All member functions are defined out of line (CudaIPCTypes.cpp).
struct CudaIPCSentDataLimbo final {
  ~CudaIPCSentDataLimbo();
  // Attempts to release limbo blocks; presumably frees those whose counters
  // reached zero — confirm exact semantics in the .cpp definition.
  bool collect();
  // Takes ownership of a block whose deletion must be deferred.
  void add(std::unique_ptr<CudaIPCSentData> shared_block);
  // Number of blocks currently held in limbo.
  uint64_t size();

 private:
  // TODO: Can be changed to FIFO in order to avoid full traverse on every
  // collect()
  std::vector<std::unique_ptr<CudaIPCSentData>> shared_blocks_;
  // NOTE(review): presumably guards shared_blocks_ — confirm in the .cpp.
  std::mutex limbo_mutex_;
};

77 struct CudaIPCRefCountersFile final {
CudaIPCRefCountersFilefinal78   CudaIPCRefCountersFile(
79       std::string handle,
80       uint64_t size,
81       at::DataPtr data_ptr)
82       : size_(size),
83 
84         handle_(std::move(handle)),
85         refcounted_shared_mem_(std::move(data_ptr)) {}
86 
counter_ptrfinal87   uint64_t* counter_ptr() {
88     return static_cast<uint64_t*>(refcounted_shared_mem_.get()) + next_offset_;
89   }
90 
set_counterfinal91   void set_counter(uint64_t value) {
92     *counter_ptr() = value;
93   }
94 
have_offsetsfinal95   bool have_offsets() {
96     return next_offset_ < size_;
97   }
98 
offsets_in_usefinal99   bool offsets_in_use() {
100     return used_slots_;
101   }
102 
get_offsetfinal103   uint64_t get_offset() {
104     return next_offset_;
105   }
106 
rotate_offsetfinal107   void rotate_offset() {
108     next_offset_++;
109     used_slots_++;
110   }
111 
return_offsetfinal112   void return_offset(uint64_t offset /* unused */) {
113     used_slots_--;
114   }
115 
handlefinal116   std::string handle() {
117     return handle_;
118   }
119 
120  private:
121   uint64_t next_offset_{0};
122   uint64_t size_;
123   uint64_t used_slots_{0};
124   std::string handle_;
125   at::DataPtr refcounted_shared_mem_;
126 };

} // namespace
} // namespace torch

namespace c10 {
namespace {
// Bridges the c10 allocator's FreeMemoryCallback mechanism to
// torch::CudaIPCCollect(). NOTE(review): presumably invoked when the CUDA
// caching allocator needs to reclaim memory — confirm against the
// FreeMemoryCallback contract in c10.
class CudaIPCCollectCallback : public FreeMemoryCallback {
 public:
  bool Execute() override {
    return torch::CudaIPCCollect();
  }
};
} // namespace

} // namespace c10

#endif
