1 #pragma once 2 3 #include <ATen/core/CachingHostAllocator.h> 4 #include <c10/core/Allocator.h> 5 #include <c10/cuda/CUDAStream.h> 6 7 namespace at::cuda { 8 9 // 10 // A caching allocator for CUDA host allocations (pinned memory). 11 // 12 // This provides a drop-in replacement for THCudaHostAllocator, which re-uses 13 // freed pinned (page-locked) memory allocations. This avoids device 14 // synchronizations due to cudaFreeHost calls. 15 // 16 // To ensure correct behavior, THCCachingHostAllocator_recordEvent must be 17 // called anytime a pointer from this allocator is used in a cudaMemcpyAsync 18 // call between host and device, and passed the corresponding context from the 19 // allocation. This is currently invoked by at::native::copy_kernel_cuda. 20 // 21 TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator(); 22 23 // Records an event in the specified stream. The allocation corresponding to the 24 // input `ptr`/`ctx` will not be re-used until the event has occurred. 25 TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent( 26 void* ptr, 27 void* ctx, 28 c10::cuda::CUDAStream stream); 29 30 // Releases cached pinned memory allocations via cudaHostFree 31 TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache(); 32 HostAlloc(size_t size)33inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) { 34 return getCachingHostAllocator()->allocate(size); 35 } 36 37 } // namespace at::cuda 38