#pragma once
#ifdef USE_CUDA
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/util/Logging.h>
#include <cuda_runtime_api.h>
#include <torch/csrc/Export.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

namespace torch {

TORCH_CUDA_CU_API bool CudaIPCCollect();

struct CudaIPCReceivedData final {
  CudaIPCReceivedData() = default;
  explicit CudaIPCReceivedData(std::shared_ptr<void> shared_ptr)
      : shared_ptr_(std::move(shared_ptr)) {}
  std::shared_ptr<void> shared_ptr_;
};

struct CudaIPCSentData final {
  std::string handle_;
  uint64_t offset_;
  uint64_t* counter_ptr_; // Reference counter shared memory block
  at::DataPtr original_ptr_; // Original memory allocation
  cudaEvent_t event_; // Sync cuEventDestroy
  bool event_sync_required_;
  at::Device device_;

  CudaIPCSentData(
      std::string handle,
      uint64_t offset,
      uint64_t* counter_ptr,
      at::Device device);
  ~CudaIPCSentData();

  uint64_t counter_value();
  std::string handle() {
    return handle_;
  }
  uint64_t offset() {
    return offset_;
  }
  void set_original_ptr(at::DataPtr data_ptr) {
    original_ptr_ = std::move(data_ptr);
  }
};

TORCH_CUDA_CU_API at::DataPtr GetNewRefCountedSentData(
    void* data,
    at::Device device);
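
// Illustrative sketch (not part of this header's API surface): one way a
// sender could share a CUDA allocation across processes is to swap the
// storage's DataPtr for a ref-counted one obtained from
// GetNewRefCountedSentData(). `storage` below is a hypothetical at::Storage
// backed by CUDA memory; the real call site lives in the sharing code
// (e.g. StorageSharing.cpp), not here.
//
//   at::DataPtr sent_ptr = torch::GetNewRefCountedSentData(
//       storage.mutable_data(), storage.device());
//   at::DataPtr old_ptr = storage.set_data_ptr(std::move(sent_ptr));
//
// If the returned DataPtr is destroyed while consumers still hold the block,
// its CudaIPCSentData context is parked in CudaIPCSentDataLimbo (below) until
// CudaIPCCollect() can reclaim it.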

namespace {

inline constexpr int64_t CUDA_IPC_REF_COUNTER_FILE_SIZE = 10000;
inline constexpr int64_t CUDA_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000;
// It was determined empirically that CUDA (v10.1 and below) has a limit on the
// number of recorded blocking interprocess events, at around ~22,000. To give
// ourselves leeway, we picked 1000, which gives us enough events to share
// tensors effectively.
inline constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000;

// Data blocks that are scheduled for deletion but still have a non-zero
// reference counter end up here.
struct CudaIPCSentDataLimbo final {
  ~CudaIPCSentDataLimbo();
  bool collect();
  void add(std::unique_ptr<CudaIPCSentData> shared_block);
  uint64_t size();

 private:
  // TODO: Can be changed to FIFO in order to avoid full traverse on every
  // collect()
  std::vector<std::unique_ptr<CudaIPCSentData>> shared_blocks_;
  std::mutex limbo_mutex_;
};

struct CudaIPCRefCountersFile final {
  CudaIPCRefCountersFile(
      std::string handle,
      uint64_t size,
      at::DataPtr data_ptr)
      : size_(size),
        handle_(std::move(handle)),
        refcounted_shared_mem_(std::move(data_ptr)) {}

  uint64_t* counter_ptr() {
    return static_cast<uint64_t*>(refcounted_shared_mem_.get()) + next_offset_;
  }

  void set_counter(uint64_t value) {
    *counter_ptr() = value;
  }

  bool have_offsets() {
    return next_offset_ < size_;
  }

  bool offsets_in_use() {
    return used_slots_ > 0;
  }

  uint64_t get_offset() {
    return next_offset_;
  }

  void rotate_offset() {
    next_offset_++;
    used_slots_++;
  }

  void return_offset(uint64_t offset /* unused */) {
    used_slots_--;
  }

  std::string handle() {
    return handle_;
  }

 private:
  uint64_t next_offset_{0};
  uint64_t size_;
  uint64_t used_slots_{0};
  std::string handle_;
  at::DataPtr refcounted_shared_mem_;
};

} // namespace
} // namespace torch

namespace c10 {
namespace {
class CudaIPCCollectCallback : public FreeMemoryCallback {
 public:
  bool Execute() override {
    return torch::CudaIPCCollect();
  }
};
} // namespace

} // namespace c10

#endif
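
// Rough lifecycle of a ref-counter slot, as these helpers appear intended to
// be driven from the corresponding .cpp. This is an illustrative assumption,
// not a contract of this header; verify against the implementation.
//
//   CudaIPCRefCountersFile& rc = ...;   // backed by a shared-memory file
//   rc.set_counter(1);                  // a freshly sent block starts at 1
//   uint64_t* counter = rc.counter_ptr();  // receivers bump/drop this value
//   uint64_t offset = rc.get_offset();
//   rc.rotate_offset();                 // reserve the slot for this block
//   // ... later, when the sent block is finally destroyed:
//   rc.return_offset(offset);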