/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_

#include <algorithm>
#include <array>
#include <atomic>
#include <cstdint>
#include <deque>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

class MemoryDump;

// A memory allocator that implements a 'best-fit with coalescing'
// algorithm. This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing. One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
class BFCAllocator : public Allocator {
 public:
  struct Options {
    bool allow_growth = true;

    // If true, the allocator may sleep for a period of time when it can't
    // fulfill an allocation request, in the hopes that another thread will
    // free up memory in the meantime.
    //
    // If false, the allocator will never sleep, even if
    // AllocationAttributes::attr_retry_on_failure is true.
    bool allow_retry_on_failure = true;

    // Whether the allocator will deallocate free regions to avoid OOM due to
    // memory fragmentation.
    bool garbage_collection = false;

    // Controls when a chunk should be split, if its size exceeds the
    // requested allocation size.
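    // Rough illustration (the precise split policy lives in the allocator's
    // implementation, not this header): a value such as 0.05 is intended to
    // let a chunk carry at most about 5% of the memory limit as unused
    // padding before it is split down to the requested size; 0 leaves the
    // implementation's default threshold in place.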
    double fragmentation_fraction = 0;
  };

  BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator, size_t total_memory,
               const string& name, const Options& opts);

  ~BFCAllocator() override;

  string Name() override { return name_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override;

  void DeallocateRaw(void* ptr) override;

  bool TracksAllocationSizes() const override;

  size_t RequestedSize(const void* ptr) const override;

  size_t AllocatedSize(const void* ptr) const override;

  int64_t AllocationId(const void* ptr) const override;

  absl::optional<AllocatorStats> GetStats() override;

  bool ClearStats() override;

  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }

  void SetSafeFrontier(uint64 count) override;

  AllocatorMemoryType GetMemoryType() const override;

  bool ShouldRecordOpName() const { return true; }

  MemoryDump RecordMemoryMap();

 protected:
  // This setting controls when a chunk should be split, if its size exceeds
  // the requested allocation size. It is not expected to be changed after
  // initialization.
  void SetInternalFragmentationFraction(double fraction) {
    internal_fragmentation_fraction_ = fraction;
  }

 private:
  struct Bin;

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure,
                            uint64 freed_before_count);

  void* AllocateRawInternalWithRetry(
      size_t alignment, size_t num_bytes,
      const AllocationAttributes& allocation_attr);

  void DeallocateRawInternal(void* ptr);

  // Chunks whose freed_at_count is later than the safe frontier value are
  // kept on a special list and not subject to merging immediately upon being
  // freed.
  //
  // This function sweeps that list looking for Chunks whose timestamp is now
  // safe. When found, their freed_at_count is set to 0 and we attempt to
  // merge them with their neighbors.
  //
  // If required_bytes > 0 then this function is being called in the context
  // of a need for this many bytes that could not be satisfied without merging
  // unsafe chunks, so we go ahead and merge the unsafe chunks too, just up to
  // the point that a free chunk of required_bytes is produced. Note that
  // unsafe merged chunks adopt the most conservative timestamp from their
  // constituents, so they're only useful for allocations not requiring a
  // particular timestamp.
  bool MergeTimestampedChunks(size_t required_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns the size in bytes of the largest free chunk in the largest bin,
  // in constant time. The free chunks within a bin are sorted by size (and
  // then by address).
  int64_t LargestFreeChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds a TraceMe (in memory allocation and deallocation) for memory stats
  // profiling. The chunk that owns 'ptr' is looked up to get information such
  // as its address, chunk size and requested_size.
  void AddTraceMe(absl::string_view traceme_name, const void* ptr)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Overloaded AddTraceMe function with chunk information.
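  // The requested and allocated byte counts (req_bytes, alloc_bytes) are
  // supplied by the caller rather than read from the chunk's metadata.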
  void AddTraceMe(absl::string_view traceme_name, const void* chunk_ptr,
                  int64_t req_bytes, int64_t alloc_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // A ChunkHandle is an index into the chunks_ vector in BFCAllocator.
  // kInvalidChunkHandle means an invalid chunk handle.
  typedef size_t ChunkHandle;
  static constexpr ChunkHandle kInvalidChunkHandle = SIZE_MAX;

  typedef int BinNum;
  static constexpr int kInvalidBinNum = -1;
  // With 21 bins and a smallest bin size of 256 bytes, the largest bin's size
  // threshold is 256 << 20 = 256MiB; chunks of that size or larger all fall
  // into the last bin.
  static constexpr int kNumBins = 21;

  // A Chunk points to a piece of memory that's either entirely free or
  // entirely in use by one user memory allocation.
  //
  // An AllocationRegion's memory is split up into one or more disjoint
  // Chunks, which together cover the whole region without gaps. Chunks
  // participate in a doubly-linked list, and the prev/next pointers point to
  // the physically adjacent chunks.
  //
  // Since a chunk cannot be partially in use, we may need to split a free
  // chunk in order to service a user allocation. We always merge adjacent
  // free chunks.
  //
  // Chunks contain information about whether they are in use or whether they
  // are free, and contain a pointer to the bin they are in.
  struct Chunk {
    size_t size = 0;  // Full size of buffer.

    // We sometimes give chunks that are larger than needed to reduce
    // fragmentation. requested_size keeps track of what the client
    // actually wanted so we can understand whether our splitting
    // strategy is efficient.
    size_t requested_size = 0;

    // allocation_id is set to -1 when the chunk is not in use. It is assigned
    // a value greater than zero before the chunk is returned from
    // AllocateRaw, and this value is unique among values assigned by
    // the parent allocator.
    int64_t allocation_id = -1;
    void* ptr = nullptr;  // Pointer to granted subbuffer.

    // If not kInvalidChunkHandle, the memory referred to by 'prev' directly
    // precedes the memory used by this chunk, i.e., it should start at
    // 'ptr - prev->size'.
    ChunkHandle prev = kInvalidChunkHandle;

    // If not kInvalidChunkHandle, the memory referred to by 'next' directly
    // follows the memory used by this chunk, i.e., it should start at
    // 'ptr + size'.
    ChunkHandle next = kInvalidChunkHandle;

    // What bin are we in?
    BinNum bin_num = kInvalidBinNum;

    // Optional count when this chunk was most recently made free.
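    // A nonzero value is compared against the allocator's safe frontier:
    // chunks freed at a later count are kept on a special list
    // (timestamped_chunks_) and are not coalesced until the frontier advances
    // past them (see MergeTimestampedChunks above).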
    uint64 freed_at_count = 0;

    bool in_use() const { return allocation_id != -1; }

#ifdef TENSORFLOW_MEM_DEBUG
    // Optional debugging info.
    const char* op_name = nullptr;
    uint64 step_id = 0;
    int64 action_count = 0;
#endif

    string DebugString(BFCAllocator* a,
                       bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
      string dbg;
      strings::StrAppend(
          &dbg, " Size: ", strings::HumanReadableNumBytes(size),
          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
          " | in_use: ", in_use(), " | bin_num: ", bin_num);
      if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
        Chunk* p = a->ChunkFromHandle(prev);
        strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
      }
      if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
        Chunk* n = a->ChunkFromHandle(next);
        strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
      }
#ifdef TENSORFLOW_MEM_DEBUG
      strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN",
                         ", stepid: ", step_id,
                         ", last_action: ", action_count);
#endif
      return dbg;
    }
  };

  // A Bin is a collection of similar-sized free chunks.
  // Allocated chunks are never in a Bin.
  struct Bin {
    // All chunks in this bin have >= bin_size memory.
    size_t bin_size = 0;

    class ChunkComparator {
     public:
      explicit ChunkComparator(BFCAllocator* allocator)
          : allocator_(allocator) {}
      // Sort first by size and then use pointer address as a tie breaker.
      bool operator()(const ChunkHandle ha,
                      const ChunkHandle hb) const TF_NO_THREAD_SAFETY_ANALYSIS {
        const Chunk* a = allocator_->ChunkFromHandle(ha);
        const Chunk* b = allocator_->ChunkFromHandle(hb);
        if (a->size != b->size) {
          return a->size < b->size;
        }
        return a->ptr < b->ptr;
      }

     private:
      BFCAllocator* allocator_;  // The parent allocator.
    };

    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
    // Set of free chunks within the bin, sorted by chunk size.
    // Chunks are not owned.
    FreeChunkSet free_chunks;
    Bin(BFCAllocator* allocator, size_t bs)
        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
  };

  static constexpr size_t kMinAllocationBits = 8;
  static constexpr size_t kMinAllocationSize = 1 << kMinAllocationBits;

  // BFCAllocator allocates memory into a collection of disjoint
  // AllocationRegions. Each AllocationRegion corresponds to one call to
  // SubAllocator::Alloc(). (Actually, if a subsequent call to
  // SubAllocator::Alloc() returns another region immediately adjacent to the
  // last, it will be used to extend the first AllocationRegion, not create a
  // separate one.)
  //
  // An AllocationRegion contains one or more Chunks, covering all of its
  // memory. Its primary job is to map pointers to ChunkHandles.
  //
  // This class is thread-compatible.
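  // For example: with kMinAllocationSize = 256, an AllocationRegion keeps one
  // ChunkHandle slot per 256-byte offset from its base pointer, so the handle
  // for a chunk starting at ptr_ + 1024 is stored in handles_[4].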
  class AllocationRegion {
   public:
    AllocationRegion(void* ptr, size_t memory_size)
        : ptr_(ptr),
          memory_size_(memory_size),
          end_ptr_(
              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
      DCHECK_EQ(0, memory_size % kMinAllocationSize);
      const size_t n_handles =
          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.resize(n_handles, kInvalidChunkHandle);
    }

    AllocationRegion() = default;
    AllocationRegion(AllocationRegion&& other) { Swap(&other); }
    AllocationRegion& operator=(AllocationRegion&& other) {
      Swap(&other);
      return *this;
    }

    void* ptr() const { return ptr_; }
    void* end_ptr() const { return end_ptr_; }
    size_t memory_size() const { return memory_size_; }
    void extend(size_t size) {
      memory_size_ += size;
      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);

      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
      const size_t n_handles =
          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.resize(n_handles, kInvalidChunkHandle);
    }
    ChunkHandle get_handle(const void* p) const {
      return handles_[IndexFor(p)];
    }
    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }

   private:
    void Swap(AllocationRegion* other) {
      std::swap(ptr_, other->ptr_);
      std::swap(memory_size_, other->memory_size_);
      std::swap(end_ptr_, other->end_ptr_);
      std::swap(handles_, other->handles_);
    }

    size_t IndexFor(const void* p) const {
      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
      DCHECK_GE(p_int, base_int);
      DCHECK_LT(p_int, base_int + memory_size_);
      return static_cast<size_t>(((p_int - base_int) >> kMinAllocationBits));
    }

    // Metadata about the allocation region.
    void* ptr_ = nullptr;
    size_t memory_size_ = 0;
    void* end_ptr_ = nullptr;

    // Array of size "memory_size / kMinAllocationSize". It is indexed by
    // (p - base) / kMinAllocationSize and contains the ChunkHandle for the
    // memory allocation represented by "p".
    std::vector<ChunkHandle> handles_;

    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
  };

  // RegionManager aggregates one or more "AllocationRegions" and provides
  // a layer of indirection from pointers to the underlying ChunkHandle,
  // allowing allocation across multiple discontiguous memory regions.
  //
  // This class is thread-compatible.
  class RegionManager {
   public:
    RegionManager() {}
    ~RegionManager() {}

    void AddAllocationRegion(void* ptr, size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
    }

    // Adds an allocation region for the given ptr and size, potentially
    // extending a region if ptr matches the end_ptr of an existing region.
    // If a region is extended, returns a pointer to the extended region so
    // that the BFC allocator can reason about chunkification.
    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
                                                  size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      // Check if it can be coalesced with the preceding region.
      if (entry != regions_.begin()) {
        auto preceding_region = entry - 1;
        if (preceding_region->end_ptr() == ptr) {
          if (VLOG_IS_ON(1)) {
            LOG(INFO) << "Extending region " << preceding_region->ptr()
                      << " of "
                      << strings::HumanReadableNumBytes(
                             preceding_region->memory_size())
                      << " by " << strings::HumanReadableNumBytes(memory_size)
                      << " bytes";
          }
          preceding_region->extend(memory_size);
          return &*preceding_region;
        }
      }
      VLOG(1) << "Inserting new region " << ptr << " of "
              << strings::HumanReadableNumBytes(memory_size);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
      return nullptr;
    }

    std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
        std::vector<AllocationRegion>::iterator it) {
      return regions_.erase(it);
    }

    ChunkHandle get_handle(const void* p) const {
      return RegionFor(p)->get_handle(p);
    }

    void set_handle(const void* p, ChunkHandle h) {
      return MutableRegionFor(p)->set_handle(p, h);
    }
    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }

    const std::vector<AllocationRegion>& regions() const { return regions_; }

   private:
    static bool Comparator(const void* ptr, const AllocationRegion& other) {
      return ptr < other.end_ptr();
    }

    AllocationRegion* MutableRegionFor(const void* p) {
      return const_cast<AllocationRegion*>(RegionFor(p));
    }

    const AllocationRegion* RegionFor(const void* p) const {
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);

      if (entry != regions_.end()) {
        return &(*entry);
      }

      LOG(FATAL) << "Could not find Region for " << p;
      return nullptr;
    }

    std::vector<AllocationRegion> regions_;
  };

  // Returns 'bytes' rounded up to the next highest multiple of
  // kMinAllocationSize.
  static size_t RoundedBytes(size_t bytes);

  // Try to add a new memory region that can satisfy an allocation of
  // 'rounded_bytes' bytes. Returns true on success and false on
  // failure.
  bool Extend(size_t alignment, size_t rounded_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Deallocate free regions to give the memory back to the sub-allocator, so
  // that we can re-allocate a larger region. The main use scenario of this
  // function is when an OOM happens but we have free regions, and the sum of
  // the sizes of the free regions and unallocated bytes is larger than the
  // requested size, implying (external) memory fragmentation. Returns true if
  // any free regions are found and freed; false otherwise.
  bool DeallocateFreeRegions(size_t rounded_bytes);

  // Helper function to deallocate regions.
  void DeallocateRegions(const absl::flat_hash_set<void*>& region_ptrs)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns a pointer to an underlying allocated chunk of size
  // 'rounded_bytes'.
  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
                     uint64 freed_before) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Splits the chunk specified by 'h' into two chunks, one at least
  // of size 'num_bytes'.
  void SplitChunk(ChunkHandle h, size_t num_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Merges the two chunk handles. Requires that the chunks are
  // contiguous in their allocation.
  void Merge(ChunkHandle h, ChunkHandle h2) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds the chunk 'h' to the proper free bin.
  void InsertFreeChunkIntoBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the free chunk pointed to by 'c' from the set free_chunks.
  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
                                  const Bin::FreeChunkSet::iterator& c)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes a free chunk from the bin.
  void RemoveFreeChunkFromBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void MaybeRemoveFreeChunkFromBin(ChunkHandle h)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the chunk metadata represented by 'h'.
  void DeleteChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  string RenderOccupancy() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DumpMemoryLog(size_t num_bytes) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  MemoryDump RecordMemoryMapInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void MaybeWriteMemoryMap() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle AllocateChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DeallocateChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  Chunk* ChunkFromHandle(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  const Chunk* ChunkFromHandle(ChunkHandle h) const
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  void MarkFree(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Fragmentation is calculated as one minus the ratio of the largest free
  // chunk size to total free memory; the result is a value within [0, 1].
  double GetFragmentation() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Information about a Bin that is useful for debugging.
  struct BinDebugInfo {
    size_t total_bytes_in_use = 0;
    size_t total_bytes_in_bin = 0;
    size_t total_requested_bytes_in_use = 0;
    size_t total_chunks_in_use = 0;
    size_t total_chunks_in_bin = 0;
  };

  // Computes and returns a BinDebugInfo for each Bin.
  std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  AllocatorRetry retry_helper_;

  // Structures immutable after construction.
  size_t memory_limit_ = 0;

  inline int Log2FloorNonZeroSlow(uint64 n) {
    int r = 0;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r - 1;
  }

  // Returns floor(log2(n)).
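  // For example, Log2FloorNonZero(1) == 0, Log2FloorNonZero(255) == 7, and
  // Log2FloorNonZero(256) == 8. The argument must be nonzero.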
  inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
    return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS) && (_WIN64)
    unsigned long index;
    _BitScanReverse64(&index, n);
    return index;
#else
    return Log2FloorNonZeroSlow(n);
#endif
  }

  // Map from bin size to Bin.
  Bin* BinFromIndex(BinNum index) {
    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
  }
  size_t BinNumToSize(BinNum index) {
    return static_cast<size_t>(256) << index;
  }
  BinNum BinNumForSize(size_t bytes) {
    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
    return b;
  }
  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }

  char bins_space_[sizeof(Bin) * kNumBins];

  const Options opts_;

  // The size of the current region allocation.
  size_t curr_region_allocation_bytes_;

  // The total number of bytes allocated by the allocator.
  size_t total_region_allocated_bytes_ = 0;

  // An indicator that expansion of a region has hit the limits
  // of the available memory.
  bool started_backpedal_ = false;

  // Whether the allocator will coalesce adjacent sub-allocator-provided
  // AllocationRegions. This may be disabled if discrete sub-allocator regions
  // can't be treated as contiguous (e.g. if the allocation refers to
  // device-visible memory which is not adjacent to the other region in the
  // device's address space).
  const bool coalesce_regions_;

  std::unique_ptr<SubAllocator> sub_allocator_;
  string name_;
  SharedCounter* timing_counter_ = nullptr;
  std::deque<ChunkHandle> timestamped_chunks_;

  double internal_fragmentation_fraction_ = {0.0};

  std::atomic<uint64> safe_frontier_ = {0};

  // Structures mutable after construction.
  mutable mutex lock_;
  RegionManager region_manager_ TF_GUARDED_BY(lock_);

  std::vector<Chunk> chunks_ TF_GUARDED_BY(lock_);

  // Pointer to head of linked list of free Chunks.
  ChunkHandle free_chunks_list_ TF_GUARDED_BY(lock_);

  // Counter containing the next unique identifier to assign to a
  // newly-created chunk.
  int64_t next_allocation_id_ TF_GUARDED_BY(lock_);

  // Stats.
  AllocatorStats stats_ TF_GUARDED_BY(lock_);
#ifdef TENSORFLOW_MEM_DEBUG
  int64 action_counter_ TF_GUARDED_BY(lock_) = 0;
#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
  int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
#endif

  friend class GPUBFCAllocatorPrivateMethodsTest;
  friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_