// xref: /aosp_15_r20/external/pytorch/aten/src/ATen/cuda/CachingHostAllocator.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#pragma once

#include <ATen/core/CachingHostAllocator.h>
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAStream.h>

7 namespace at::cuda {
8 
9 //
10 // A caching allocator for CUDA host allocations (pinned memory).
11 //
12 // This provides a drop-in replacement for THCudaHostAllocator, which re-uses
13 // freed pinned (page-locked) memory allocations. This avoids device
14 // synchronizations due to cudaFreeHost calls.
15 //
16 // To ensure correct behavior, THCCachingHostAllocator_recordEvent must be
17 // called anytime a pointer from this allocator is used in a cudaMemcpyAsync
18 // call between host and device, and passed the corresponding context from the
19 // allocation. This is currently invoked by at::native::copy_kernel_cuda.
20 //
21 TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator();
22 
23 // Records an event in the specified stream. The allocation corresponding to the
24 // input `ptr`/`ctx` will not be re-used until the event has occurred.
25 TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
26     void* ptr,
27     void* ctx,
28     c10::cuda::CUDAStream stream);
29 
30 // Releases cached pinned memory allocations via cudaHostFree
31 TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache();
32 
HostAlloc(size_t size)33 inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
34   return getCachingHostAllocator()->allocate(size);
35 }
36 
37 } // namespace at::cuda
38