#include <c10/mobile/CPUProfilingAllocator.h>

#include <c10/core/impl/alloc_cpu.h>
#include <c10/util/irange.h>

#include <map>
#include <set>

namespace c10 {

namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};

struct MemBlock {
  uint64_t start_offset, end_offset;
  MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
  bool operator<(const MemBlock& other) const {
    return start_offset < other.start_offset;
  }
};

enum class EventType { Allocate = 0, Free, Invalid };

struct MemEvent {
  uint64_t time;
  uint64_t allocation_id;
  uint64_t size;
  EventType type{EventType::Invalid};
  MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
      : time(t), allocation_id(id), size(s), type(e) {}
};

bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if
  // |---a--------|--------------b--------|
  // start_a     end_a <= start_b       end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}

bool validate_allocation_plan(
    const std::vector<MemEvent>& alloc_events,
    const std::vector<uint64_t>& allocation_offsets) {
  std::set<MemBlock> allocations;
  for (const auto& event : alloc_events) {
    auto alloc_id = event.allocation_id;
    // Skip allocations not managed by the AllocationPlan.
    if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto start_offset = allocation_offsets[alloc_id];
    auto end_offset = allocation_offsets[alloc_id] + event.size;
    MemBlock mem_block(start_offset, end_offset);
    if (event.type == EventType::Allocate) {
      auto it = allocations.lower_bound(mem_block);
      if (it != allocations.end()) {
        auto next_block = *it;
        if (overlaps(next_block, mem_block)) {
          return false;
        }
      }
      if (it != allocations.begin()) {
        auto prev_block = *(--it);
        if (overlaps(prev_block, mem_block)) {
          return false;
        }
      }
      allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          (*it).end_offset == end_offset,
          "End offset of the allocation being freed must match the one recorded.");
      allocations.erase(it);
    } else {
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
    }
  }
  return true;
}

std::vector<MemEvent> create_and_sort_mem_events(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  std::vector<MemEvent> events;
  for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then the allocation is not managed by the
    // AllocationPlan.
    if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
    events.emplace_back(
        allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
  }
  std::sort(
      events.begin(),
      events.end(),
      [](const MemEvent& a, const MemEvent& b) -> bool {
        return a.time < b.time;
      });
  return events;
}
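// Illustrative example (hypothetical sizes and lifetimes, not from a recorded
// trace): for allocation_sizes = {64, 32} and allocation_lifetimes = {3, 2},
// the helper above produces, after sorting by time:
//   Allocate(t=0, id=0, size=64), Allocate(t=1, id=1, size=32),
//   Free(t=2, id=1, size=32), Free(t=3, id=0, size=64).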
std::vector<uint64_t> formulate_greedy_allocation_plan(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //   2.1 If allocate event:
  //       Find all candidates in the free_size_to_offset map.
  //       Greedily pick the first one.
  //       Remove the entry from the free_size_to_offset map.
  //       new_offset = offset + request_size
  //       new_size = size - request_size
  //       Add new entries to both maps.
  //   2.2 If free event:
  //       Check if the returned offset merges with another chunk.
  //       If so, merge until no more merging is possible.
  //       If the returned offset does not merge, then
  //       just return it as a chunk.

  // lower_bound on this map will get all candidates of
  // the right size for allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_start_offset_to_size_iter;
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_end_offset_to_size_iter;
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists, merge the freed allocation:
  // find the corresponding entry in free_size_to_offset,
  // remove it, and reinsert it with the new size and offset.
  // If end_ptr does not exist, then just insert (offset, size)
  // in the map and correspondingly (size, offset) in the other map.
  // Merging should always be done recursively until no more chunks
  // that can be merged are found.
  // After the last free we should have only one entry left in these maps.
  std::vector<uint64_t> allocation_offsets(
      allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
  auto mem_events =
      create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
  uint64_t max_offset{0};
  for (const auto& mem_event : mem_events) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t alloc_offset;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t new_offset, new_size;
    if (mem_event.type == EventType::Allocate) {
      auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no contiguous free block of the size requested,
        // allocate a new one.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // We have found a block of the size we want.
        // 1. Consume the block by allocating out of it:
        //    1.1 Erase the entire block.
        //    1.2 Erase the reverse map entries.
        // 2. If the block still has space left, insert the remainder
        //    back in the map, including the reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Erase the reverse map entries before erasing `it`, which
        // invalidates it.
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
      allocation_offsets[mem_event.allocation_id] = alloc_offset;
    } else {
      // 1. Check if the freed block is adjacent to an existing free block
      //    at its end boundary, i.e. a free block that starts where the
      //    freed block ends. This is done by checking
      //    free_start_offset_to_size_iter.
      //    If we find such a block, remove it and grow the block being freed.
      // 2. Similarly, check if the freed block is adjacent to an existing
      //    free block at its start boundary, i.e. a free block that ends
      //    where the freed block starts. This is done by checking
      //    free_end_offset_to_size_iter.
      //    If we find such a block, remove it and grow the block being freed.
      // 3. Insert the (possibly merged) freed block in the maps.
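      // Illustrative example (hypothetical offsets): if [0, 32) and [64, 96)
      // are currently free and the block [32, 64) is freed, step 1 merges it
      // with [64, 96), step 2 then merges the result with [0, 32), and step 3
      // inserts a single free block [0, 96).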
      auto freed_offset = allocation_offsets[mem_event.allocation_id];
      auto freed_size = mem_event.size;
      auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
      auto end_it = free_start_offset_to_size_iter.find(end_offset);
      if (end_it != free_start_offset_to_size_iter.end()) {
        auto merge_block_iter = end_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_start_offset_to_size_iter.erase(end_it);
        // If the block is being merged, then also remove it from
        // free_end_offset_to_size_iter.
        free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
      }
      // Merge when the freed block sits at the end of another free block.
      auto start_it = free_end_offset_to_size_iter.find(freed_offset);
      if (start_it != free_end_offset_to_size_iter.end()) {
        auto merge_block_iter = start_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        freed_offset -= merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_end_offset_to_size_iter.erase(start_it);
        // If the block is being merged, then also remove it from
        // free_start_offset_to_size_iter.
        free_start_offset_to_size_iter.erase(freed_offset);
      }
      auto freed_block_it =
          free_size_to_offset.emplace(freed_size, freed_offset).first;
      free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
      free_end_offset_to_size_iter.emplace(
          freed_offset + freed_size, freed_block_it);
    }
  }
  TORCH_CHECK(
      validate_allocation_plan(mem_events, allocation_offsets),
      "ProfilingAllocator: Allocation plan invalid.");
  return allocation_offsets;
}

} // namespace

void AllocationPlan::clear() {
  allocation_sizes.clear();
  allocation_lifetimes.clear();
  allocation_offsets.clear();
}

void AllocationPlanner::record_allocation(
    const uint64_t size,
    const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_allocation(size, ptr);
    return;
  }
  allocation_plan_->allocation_sizes.push_back(size);
  allocation_plan_->allocation_lifetimes.push_back(
      std::numeric_limits<uint64_t>::max());
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
}

void AllocationPlanner::record_free(const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_free(ptr);
    return;
  }
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // The free being recorded is for an allocation made outside of
    // WithProfileAllocationsGuard.
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during record_allocation.");
  allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}

bool AllocationPlanner::validate_allocation(
    const uint64_t size,
    const void* ptr) {
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Number of recorded allocations:",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Recorded size of the requested allocation:",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got:",
        size);
    return false;
  }
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return true;
}
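// Illustrative example (hypothetical sizes): if the plan recorded allocation
// sizes {64, 32} and a validation run requests 64 and then 16 bytes, the
// second request fails the size check above, validate_allocation returns
// false, and the planner's validation_success flag becomes false.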
bool AllocationPlanner::validate_free(const void* ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // An allocation made outside the validation scope is being freed here.
    return true;
  }
  auto id = (*it).second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during validate_allocation.");
  auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
  return (lifetime_id == allocation_id_);
}

void AllocationPlanner::formulate_plan() {
  allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
      allocation_plan_->allocation_sizes,
      allocation_plan_->allocation_lifetimes);
  allocation_plan_->total_size = 0;
  for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
    if (allocation_plan_->allocation_lifetimes[i] ==
        std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto limit = allocation_plan_->allocation_offsets[i] +
        allocation_plan_->allocation_sizes[i];
    allocation_plan_->total_size =
        std::max(allocation_plan_->total_size, limit);
  }
}

void AllocationPlanner::clear() {
  allocation_plan_->clear();
  allocation_ptr_to_id_.clear();
}

void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
  plan_ = plan;
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  if (current_size_ < plan->total_size) {
    // Free the existing memory and reallocate for the larger size.
    c10::free_cpu(blob_);
    blob_ = c10::alloc_cpu(plan->total_size);
    current_size_ = plan->total_size;
  }
}

void CPUProfilingAllocator::unset_plan() {
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  plan_ = nullptr;
}

void* CPUProfilingAllocator::allocate(const size_t bytes) {
  TORCH_CHECK(
      bytes == plan_->allocation_sizes[allocation_id_],
      "Got allocation request that does not match with the plan.");
  if (plan_->allocation_lifetimes[allocation_id_] ==
      std::numeric_limits<uint64_t>::max()) {
    // This allocation is not managed by the ProfilingAllocator.
    allocation_id_++;
    return c10::alloc_cpu(bytes);
  }
  void* ptr = reinterpret_cast<uint8_t*>(blob_) +
      plan_->allocation_offsets[allocation_id_];
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return ptr;
}

void CPUProfilingAllocator::free(void* const ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation made outside the scope of WithProfilingAllocatorGuard
    //    is being freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed. Example of the second type:
    //      Tensor out;
    //      for (....) {
    //        {
    //          // CPUProfilingAllocator is in scope here.
    //          out = ...some op (this also frees the previous memory held by out)
    //        }
    //        // out is used...
    //      }
    c10::free_cpu(ptr);
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing allocation that is not according to the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected:",
      lifetime_id,
      ", got:",
      allocation_id_);
}

CPUProfilingAllocator::~CPUProfilingAllocator() {
  c10::free_cpu(blob_);
}
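// Note on the replay contract: while a plan is set, CPUProfilingAllocator
// expects allocations and frees to arrive in exactly the same order and with
// exactly the same sizes as during the profiling run; the checks in
// allocate() and free() above enforce this.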
WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan);
  planner_->clear();
  allocation_planner = planner_.get();
}

WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
  planner_->formulate_plan();
  allocation_planner = nullptr;
}

WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
    AllocationPlan* plan,
    bool* success) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan, true);
  success_ = success;
  allocation_planner = planner_.get();
}

WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
  *success_ = planner_->validation_success;
  allocation_planner = nullptr;
}

AllocationPlanner* GetThreadLocalAllocationPlanner() {
  return allocation_planner;
}

WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
    CPUProfilingAllocator* allocator,
    const AllocationPlan* plan) {
  // Nesting of profiling allocators is not supported.
  TORCH_CHECK(
      profiling_allocator == nullptr,
      "Nesting profiling allocators is not supported.");
  profiling_allocator = allocator;
  profiling_allocator->set_plan(plan);
}

WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
  profiling_allocator->unset_plan();
  profiling_allocator = nullptr;
}

CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
  return profiling_allocator;
}

} // namespace c10
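// A minimal usage sketch of the three guards above (illustrative only; it
// assumes the active CPU allocator consults GetThreadLocalAllocationPlanner()
// and GetThreadLocalProfilingAllocator() on every allocation and free, and
// run_model() is a hypothetical workload whose allocation pattern is stable
// across iterations):
//
//   c10::AllocationPlan plan;
//   {
//     c10::WithProfileAllocationsGuard profile_guard(&plan);
//     run_model();  // Records allocation sizes and lifetimes into the plan.
//   }
//   bool success{false};
//   {
//     c10::WithValidateAllocationPlanGuard validate_guard(&plan, &success);
//     run_model();  // Replays the workload to check the plan still matches.
//   }
//   if (success) {
//     c10::CPUProfilingAllocator profiling_allocator;
//     c10::WithProfilingAllocatorGuard allocator_guard(
//         &profiling_allocator, &plan);
//     run_model();  // Managed allocations are now served from one blob.
//   }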