// Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSEvent.h>
#include <ATen/mps/MPSStream.h>

#include <cstdio>
#include <mutex>
#include <set>
#include <unordered_set>
#include <mach/vm_page_size.h>
#include <c10/util/flat_hash_map.h>

// This implementation is based on CUDACachingAllocator.
// It utilizes Metal Heaps to improve buffer allocation performance.
// Do not include this header. Use MPSAllocatorInterface.h instead.
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
namespace at::mps::HeapAllocator {

static const size_t kMaxSmallAlloc = MB(1);    // largest "small" allocation is 1 MiB
static const size_t kMinLargeAlloc = MB(10);   // allocations between 1 and 10 MiB may use kLargeHeap
static const size_t kRoundLarge    = MB(2);    // round up large allocations to 2 MiB
static const size_t kSmallHeap     = MB(8);    // "small" allocations are packed in 8 MiB heaps
static const size_t kLargeHeap     = MB(32);   // "large" allocations may be packed in 32 MiB heaps
static const size_t kXLargeHeapD   = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
static const size_t kXLargeHeapU   = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation

// buffer pools can be customized with a combination of usage flags
enum UsageFlags : uint32_t {
  PRIVATE = 0,
  SMALL   = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
  SHARED  = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
  MANAGED = (1 << 2), // managed storage mode
  HAZARD  = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
  SCALAR  = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
};
// debug verbosity flags
enum DebugVerbosity : uint32_t {
  SILENT      = 0,
  PROFILING   = (1 << 0), // print generic profiling data for total system memory usage
  ALLOCATIONS = (1 << 1), // print buffer allocations
  RECYCLES    = (1 << 2), // print buffer recycling
  RELEASES    = (1 << 3), // print buffer releases
  LARGE_ONLY  = (1 << 4), // only log large buffer pool transactions
};

struct HeapBlock;

struct BufferBlock {
  id<MTLBuffer> buffer;
  void* cpu_ptr = nullptr; // stores the pointer to the CPU mapping of a Shared MTLBuffer
  size_t size; // size after alignment
  size_t requested_size; // requested size (before alignment)
  // buffer shape is used for retrieving the base of views in cached graphs
  std::vector<int64_t> shape;
  bool in_use = false;
  HeapBlock* heap;
  id_t buf_id;
  // counter used to nominate least recently used buffers as candidates for garbage collection
  uint32_t gc_count = 0;
  uint32_t use_count = 0;
  // counter to assign unique ids to buffer blocks
  static uint64_t buffer_counter;
  // Metal events used to sync GPU/CPU operations on shared-storage buffers
  MPSEventPtr event;

  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
              HeapBlock* Heap = nullptr) :
              buffer(Buffer), size(Size), requested_size(RequestedSize),
              heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }

  static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
    return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
  }
  static size_t alignUp(size_t Size, size_t Alignment) {
    assert(((Alignment - 1) & Alignment) == 0);
    return ((Size + Alignment - 1) & ~(Alignment - 1));
  }
  uint32_t retainCount() const { return [buffer retainCount]; }
};
typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
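
// Illustrative note (not part of the interface): alignUp() rounds a size up to a
// power-of-two alignment with the usual bit trick, and Comparator() orders blocks
// by (size, buffer address). For example:
//   BufferBlock::alignUp(10, 8);         // == 16
//   BufferBlock::alignUp(10000, 16384);  // == 16384 (one 16 KiB page)
//   BufferBlock::alignUp(16384, 16384);  // == 16384 (already aligned)
// The assert admits only power-of-two alignments: ((A - 1) & A) == 0 holds exactly
// when A has at most one bit set.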

struct BufferPool;
struct AllocParams {
  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
      search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
  size_t size() const { return search_key.size; }

  BufferBlock search_key;
  BufferPool* pool;
  BufferBlock* buffer_block = nullptr;
  size_t requested_size;
  // true if we exceed the low watermark limit. In this case
  // we apply strategies to relieve the pressure before allocation.
  bool has_memory_pressure = false;
  // true if we're allocating on a unified memory device
  bool has_unified_memory = true;
};

struct HeapBlock {
  id<MTLHeap> heap;
  struct { size_t total, available; } size;
  BufferPool* pool;
  unsigned int n_buffers = 0;
  id_t heap_id;
  // indicates whether this heap is split to sub-allocate several buffers (otherwise it holds a single buffer)
  bool is_split;
  // counter to assign unique ids to heap blocks
  static uint64_t heap_counter;

  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
      heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
      heap_id(Heap ? ++heap_counter : 0), is_split(true) { }

  static MTLResourceOptions getOptions(uint32_t usage) {
    // TODO: check the caching performance of write-combined mode
    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;

    if (usage & UsageFlags::MANAGED)
      options |= MTLResourceStorageModeManaged;
    else if (usage & UsageFlags::SHARED)
      options |= MTLResourceStorageModeShared;
    else
      options |= MTLResourceStorageModePrivate;

    options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;

    return options;
  }
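
  // Illustrative note (not part of the interface): getOptions() simply folds the
  // usage flags into MTLResourceOptions. For example:
  //   getOptions(UsageFlags::SHARED | UsageFlags::HAZARD)
  //     -> MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared |
  //        MTLResourceHazardTrackingModeTracked
  //   getOptions(UsageFlags::PRIVATE)  // i.e., usage == 0
  //     -> MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModePrivate |
  //        MTLResourceHazardTrackingModeUntracked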

  static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
    HeapBlock *heapBlock = nullptr;
    bool is_split = true;
    const size_t size = params.size();
    MTLHeapDescriptor *d = [MTLHeapDescriptor new];
    if (d) {
      const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
      if (size <= kMaxSmallAlloc) {
        d.size = kSmallHeap;
      } else if (size < kMinLargeAlloc) {
        d.size = kLargeHeap;
      } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
        d.size = kXLargeHeap;
      } else {
        d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
        is_split = false;
      }
      d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
      d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
      // this automatically handles Metal buffer access synchronizations at the
      // cost of slightly lower performance.
      d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
      d.resourceOptions = getOptions(usage);
      d.type = MTLHeapTypeAutomatic;
      id<MTLHeap> heap = [device newHeapWithDescriptor: d];
      if (heap) {
        [heap setPurgeableState:MTLPurgeableStateNonVolatile];
        const size_t heap_size = heapAvailableSize(heap);
        heapBlock = new HeapBlock(heap_size, heap, params.pool);
        if (heapBlock) {
          heapBlock->is_split = is_split;
        }
      }
      [d release];
    }
    return heapBlock;
  }
  static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
    return (a->size.available != b->size.available) ? a->size.available < b->size.available :
                                                      (uintptr_t)a->heap < (uintptr_t)b->heap;
  }
  static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
    return [heap maxAvailableSizeWithAlignment:Alignment];
  }
  NSUInteger Size() {
    return [heap size];
  }
  id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) {
    id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
    if (buf) {
      updateAvailableSize();
      n_buffers++;
    }
    return buf;
  }
  // returns the retainCount before releasing the buffer
  uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
    const uint32_t retainCount = [buffer retainCount];
    [buffer release];
    buffer = nil;
    updateAvailableSize();
    n_buffers--;
    return retainCount;
  }
  // returns the retainCount before releasing the heap
  uint32_t releaseMTLHeap() {
    const uint32_t retainCount = [heap retainCount];
    TORCH_INTERNAL_ASSERT(!n_buffers); // the heap must be empty before it's released
    [heap setPurgeableState:MTLPurgeableStateEmpty];
    [heap release];
    heap = nil;
    size.available = 0;
    return retainCount;
  }
  uint32_t retainCount() const { return [heap retainCount]; }
  void updateAvailableSize() { size.available = heapAvailableSize(heap); }
};
typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
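
// Illustrative note (not part of the interface): createHeapBlock() chooses the heap
// size from the requested allocation size roughly as follows:
//   size <= 1 MiB (kMaxSmallAlloc)                 -> 8 MiB heap (kSmallHeap), split
//   1 MiB < size < 10 MiB (kMinLargeAlloc)         -> 32 MiB heap (kLargeHeap), split
//   size < kXLargeHeap / 2, no memory pressure     -> XLarge heap (128 MiB discrete, 1 GiB unified), split
//   otherwise                                      -> dedicated heap rounded up to a 2 MiB
//                                                     multiple (kRoundLarge), not split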

struct BufferPool {
  enum class Kind {
    PRIVATE_SMALL,
    PRIVATE_LARGE,
    SHARED_SMALL,
    SHARED_LARGE,
    SCALAR,
  };

  BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
      device(Device), usage(Usage),
      heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }

  const id<MTLDevice> device;
  // usage flags to customize the pool for various purposes (see UsageFlags enum)
  const uint32_t usage;
  // total number of buffers in the pool
  uint32_t n_buffers = 0;
  // total size of allocations in this pool
  size_t allocated_size = 0;
  // total memory available in the pool
  size_t available_size = 0;
  // list of heaps ordered by their "available" (not total) memory size
  std::set<HeapBlock*, HeapComparison> heaps;
  // list of only "available" buffers in the pool (i.e., buffers not in-use)
  std::set<BufferBlock*, BufferComparison> available_buffers;
  // list of buffers in a state of "limbo": they have already been freed on the
  // PyTorch side, but could not be returned to the pool because command buffers
  // still hold them with retainCount > 1. In this state, a buffer is neither
  // ready to be recycled nor can it be returned to the pool as available.
  // These buffers are returned to the pool once the command buffer's
  // completionHandler callbacks are called.
  std::unordered_set<BufferBlock*> buffers_pending_free;
  // list of heaps pending size update
  std::unordered_set<HeapBlock*> heaps_pending_update;
};
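
// Illustrative sketch (an assumption about the .mm implementation, not guaranteed by
// this header): since available_buffers is ordered by BufferBlock::Comparator
// (size first, then address), a best-fit lookup can be done with the search_key
// carried in AllocParams, e.g.:
//   BufferBlock key(aligned_size);
//   auto it = pool.available_buffers.lower_bound(&key);
//   // if it != end(), *it is the smallest cached buffer with size >= aligned_size
// A similar ordered lookup over 'heaps' (sorted by available size) can be used to
// pick a heap with just enough free space.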

class MPSHeapAllocatorImpl {
public:
  explicit MPSHeapAllocatorImpl() :
      m_device(at::mps::MPSDevice::getInstance()->device()),
      m_max_buffer_size([m_device maxBufferLength]),
      m_stream(getDefaultMPSStream()),
      m_event_pool(getMPSEventPool()) {
    init_allocator();
  }
  ~MPSHeapAllocatorImpl() {
    emptyCache();
  }
  // interface exposed to at::Allocator
  id<MTLBuffer> malloc(size_t size, uint32_t usage);
  // frees a buffer and returns it into the buffer pool
  void free(void* ptr);
  // releases all the cached buffers and their associated heaps
  void emptyCache();
  // frees inactive buffers that are pending to be freed
  void freeInactiveBuffers();
  // returns true if the buffer was allocated from the shared pool
  bool isSharedBuffer(const void* ptr);
  // gets the requested, unaligned size of an MTLBuffer
  ssize_t getUnalignedBufferSize(const void* ptr);
  // sets the shape of a base tensor from a view tensor
  void setBufferShape(const void* ptr, const IntArrayRef& shape);
  // retrieves the shape of a base tensor from a view tensor
  IntArrayRef getBufferShape(const void* ptr);
  // gets the unique ID of the buffer
  id_t getBufferId(const void* ptr);
  // allocates a buffer from a specialized pool to import CPU scalars into the GPU
  id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
  // returns a CPU-mapping of the input buffer and its retainCount,
  // but only if it has Shared storage mode and was allocated by MPSAllocator
  std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
  // records events for a list of MTLBuffers (a list is used to lock the mutex once);
  // returns true if any event was recorded (i.e., the passed buffers exist and are shared-storage)
  bool recordEvents(c10::ArrayRef<const void*> buffers);
  // waits for the events to signal the completion of GPU execution
  // on the passed shared buffers (a list is used to lock the mutex once);
  // returns true if it actually waited on any event
  bool waitForEvents(c10::ArrayRef<const void*> buffers);
  // indicates how far (in Megabytes) the current total allocations are from the
  // low watermark limit, which is used to detect if we're under memory pressure.
  // This returns zero if we've reached the low watermark limit.
  ssize_t getLowWatermarkValue();
  // (see m_low_watermark_ratio for description)
  void setLowWatermarkRatio(double ratio);
  // (see m_high_watermark_ratio for description)
  void setHighWatermarkRatio(double ratio);
  // (see m_low_watermark_limit for description)
  size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
  // (see m_max_total_allowed_size for description)
  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
  // (see m_total_allocated_memory for description)
  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
  // (see m_current_allocated_memory for description)
  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
  // total GPU memory allocated in the process by the Metal driver, including
  // implicit allocations from the MPS/MPSGraph frameworks and MPSHeapAllocatorImpl
  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
  // recommended maximum memory for Metal
  size_t getRecommendedMaxMemory() const { return max_device_size(); }
  // (see enum DebugVerbosity for description)
  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
  // returns the device that we allocate from
  inline id<MTLDevice> Device() const { return m_device; }

  // TODO: make a common function to do size unit conversions in PyTorch.
  inline std::string format_size(uint64_t size) const;

private:
  // (see m_high_watermark_ratio for description)
  constexpr static double default_high_watermark_ratio = 1.7;
  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
  constexpr static double default_high_watermark_upper_bound = 2.0;
  // (see m_low_watermark_ratio for description)
  // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
  constexpr static double default_low_watermark_ratio_unified  = 1.4;
  constexpr static double default_low_watermark_ratio_discrete = 1.0;

  const id<MTLDevice> m_device;
  std::recursive_mutex m_mutex;
  // allocated buffers by device pointer
  ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
  // using a container for pools to simplify iterating them
  ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
  // total memory allocated by HeapAllocator (including blocks in pools)
  size_t m_total_allocated_memory = 0;
  // currently active memory allocations in use (i.e., blocks not in pools)
  size_t m_current_allocated_memory = 0;
  // max buffer size allowed by Metal
  size_t m_max_buffer_size = 0;
  // maximum total size allowed to be allocated
  size_t m_max_total_allowed_size = 0;
  // high watermark ratio is a hard limit for the total allowed allocations
  // 0. : disables the high watermark limit (may cause system failure if a system-wide OOM occurs)
  // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
  // >1.: allows limits beyond device.recommendedMaxWorkingSetSize
  // e.g., a value of 0.95 means we allocate up to 95% of the recommended maximum
  // allocation size; beyond that, allocations fail with an OOM error.
  double m_high_watermark_ratio;
  // low watermark ratio is a soft limit: we attempt to keep allocations below the low watermark
  // level by garbage collection or by committing command buffers more frequently (a.k.a. adaptive commit).
  // Value between 0 and m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
  // e.g., a value of 0.9 means we 'attempt' to limit allocations to 90% of the recommended maximum
  // allocation size.
  double m_low_watermark_ratio;
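
  // Illustrative example (assuming the limits are derived from max_device_size() and the
  // ratios above; numbers are hypothetical): if recommendedMaxWorkingSetSize reports 16 GB,
  //   m_max_total_allowed_size ~= 1.7 * 16 GB = 27.2 GB  (hard limit; allocations beyond this OOM)
  //   m_low_watermark_limit    ~= 1.4 * 16 GB = 22.4 GB  (soft limit on unified memory; triggers
  //                                                       garbage collection / adaptive commit)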

  // low watermark size limit (in Bytes) at the time we initialize the allocator
  size_t m_low_watermark_limit;
  // use the "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
  uint32_t m_debug_verbosity;
  // default MPS stream
  MPSStream* m_stream;
  // we hold a reference to MPSEventPool so that it gets destroyed after MPSAllocator
  std::shared_ptr<MPSEventPool> m_event_pool;

  void init_allocator();
  void init_buffer_pools();
  HeapBlock* get_free_heap(AllocParams& params);
  bool get_free_buffer(AllocParams& params);
  BufferBlock* get_allocated_buffer_block(const void* ptr);
  BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
  bool alloc_buffer(AllocParams& params);
  void free_buffer(BufferBlock* buffer_block);
  // returns true if the containing heap is also released
  bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
  void release_buffers(BufferPool& pool);
  bool release_available_cached_buffers(AllocParams& params);
  bool release_cached_buffers();
  // frees unused cached blocks to reclaim GPU memory if memory pressure is high
  void garbage_collect_cached_buffers(AllocParams& params);
  // returns the suitable buffer pool type for the usage or the
  // requested/allocated sizes
  BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
  // returns the aligned allocation size that is optimized
  // for the buffers to get reused frequently
  size_t get_allocation_size(size_t size, uint32_t usage) const;
  // maximum size of device memory available for allocation in the current process
  // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
  size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
  // there are implicit allocations from the MPS backend, so we need to query the 'device'
  // for the total allocated size instead of manually tracking it in MPSAllocator
  size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }

  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
    for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
    }
    return true;
  }
};

} // namespace at::mps::HeapAllocator
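
// Illustrative sketch (an assumption based on trigger_memory_callbacks() above and the
// callback interface declared in MPSAllocatorInterface.h; names outside this header are
// not guaranteed): observers can hook allocator events by registering an
// IMpsAllocatorCallback subclass with MPSAllocatorCallbacksRegistry, e.g.:
//   struct MyAllocatorObserver : public at::mps::IMpsAllocatorCallback {
//     void executeMPSAllocatorCallback(void* ptr, EventType event) override {
//       // e.g., record 'ptr' and 'event' for memory profiling
//     }
//   };
//   // Once registered through the registry (see MPSAllocatorInterface.h), the observer
//   // is invoked for every allocator event via trigger_memory_callbacks().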