xref: /aosp_15_r20/external/tensorflow/tensorflow/core/common_runtime/bfc_allocator.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/common_runtime/bfc_allocator.h"
17 
18 #include <algorithm>
19 #include <atomic>
20 #include <utility>
21 
22 #include "absl/strings/string_view.h"
23 #include "tensorflow/core/common_runtime/allocator_retry.h"
24 #include "tensorflow/core/lib/core/bits.h"
25 #include "tensorflow/core/lib/strings/numbers.h"
26 #include "tensorflow/core/lib/strings/str_util.h"
27 #include "tensorflow/core/lib/strings/strcat.h"
28 #include "tensorflow/core/platform/file_system.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/platform/mutex.h"
31 #ifdef TENSORFLOW_MEM_DEBUG
32 #include "tensorflow/core/platform/stacktrace.h"
33 #endif
34 #include "tensorflow/core/platform/types.h"
35 #include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h"
36 #include "tensorflow/core/profiler/lib/traceme.h"
37 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
38 
39 namespace tensorflow {
40 
41 constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;
42 
43 BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator,
44                            size_t total_memory, const string& name,
45                            const Options& opts)
46     : opts_(opts),
47       coalesce_regions_(sub_allocator->SupportsCoalescing()),
48       sub_allocator_(std::move(sub_allocator)),
49       name_(name),
50       free_chunks_list_(kInvalidChunkHandle),
51       next_allocation_id_(1) {
52   if (opts.allow_growth) {
53     // 2MiB smallest initial allocation, unless total memory available
54     // is less.
55     curr_region_allocation_bytes_ =
56         RoundedBytes(std::min(total_memory, size_t{2 << 20}));
57   } else {
58     curr_region_allocation_bytes_ = RoundedBytes(total_memory);
59   }
60 
61   // Allocate the requested amount of memory.
62   memory_limit_ = total_memory;
63   stats_.bytes_limit = static_cast<int64_t>(total_memory);
64 
65   // Create a bunch of bins of various good sizes.
66 
67   // We create bins to fit all possible ranges that cover the
68   // memory_limit_ starting from allocations up to 256 bytes to
69   // allocations up to (and including) the memory limit.
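  // Each bin b holds free chunks whose size falls in
  // [BinNumToSize(b), 2 * BinNumToSize(b)); with BinNumToSize(b) == 256 << b
  // (see bfc_allocator.h), bin 0 covers [256, 512), bin 1 covers [512, 1024),
  // and so on, with the last bin catching everything larger. The CHECKs below
  // verify those boundaries.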
70   VLOG(1) << "Creating new BFCAllocator named: " << name;
71   for (BinNum b = 0; b < kNumBins; b++) {
72     size_t bin_size = BinNumToSize(b);
73     VLOG(1) << "Creating bin of max chunk size "
74             << strings::HumanReadableNumBytes(bin_size);
75     new (BinFromIndex(b)) Bin(this, bin_size);
76     CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
77     CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
78     CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
79     if (b + 1 < kNumBins) {
80       CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
81     }
82   }
83 }
84 
85 BFCAllocator::~BFCAllocator() {
86   // Return memory back.
87   VLOG(2) << "Number of regions allocated: "
88           << region_manager_.regions().size();
89   for (const auto& region : region_manager_.regions()) {
90     sub_allocator_->Free(region.ptr(), region.memory_size());
91   }
92 
93   for (BinNum b = 0; b < kNumBins; b++) {
94     BinFromIndex(b)->~Bin();
95   }
96 }
97 
98 BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
99   DCHECK_GE(h, 0);
100   DCHECK_LT(h, static_cast<int>(chunks_.size()));
101   return &(chunks_[h]);
102 }
103 
104 const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const {
105   DCHECK_GE(h, 0);
106   DCHECK_LT(h, static_cast<int>(chunks_.size()));
107   return &(chunks_[h]);
108 }
109 
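// Grows the pool by requesting one more region from sub_allocator_. The region
// size starts at curr_region_allocation_bytes_ (roughly 2MiB when allow_growth
// is set, otherwise the full memory limit) and is typically doubled after each
// successful extension; if the sub-allocator cannot satisfy a request, the size
// is backed off in ~10% steps (kBackpedalFactor below) before giving up.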
110 bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
111   size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
112   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
113   available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
114 
115   // Do we have enough space to handle the client's request?
116   // If not, fail immediately.
117   if (rounded_bytes > available_bytes) {
118     return false;
119   }
120 
121   // If curr_region_allocation_bytes_ is not enough to satisfy the
122   // allocation, keep multiplying by a power of two until that is
123   // sufficient.
124   bool increased_allocation = false;
125   while (rounded_bytes > curr_region_allocation_bytes_) {
126     curr_region_allocation_bytes_ *= 2;
127     increased_allocation = true;
128   }
129 
130   // Try allocating.
131   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
132   size_t bytes_received;
133   void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
134   if (mem_addr == nullptr && !started_backpedal_) {
135     // Only backpedal once.
136     started_backpedal_ = true;
137 
138     static constexpr float kBackpedalFactor = 0.9;
139 
140     // Try allocating less memory.
141     while (mem_addr == nullptr) {
142       bytes = RoundedBytes(bytes * kBackpedalFactor);
143       if (bytes < rounded_bytes) break;
144       mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
145     }
146   }
147 
148   if (mem_addr == nullptr) {
149     return false;
150   }
151 
152   if (!increased_allocation) {
153     // Increase the region size of the next required allocation.
154     curr_region_allocation_bytes_ *= 2;
155   }
156 
157   VLOG(1) << "Extending allocation by "
158           << strings::HumanReadableNumBytes(bytes_received) << " bytes for "
159           << Name() << ".";
160 
161   total_region_allocated_bytes_ += bytes_received;
162   VLOG(1) << "Total allocated bytes: "
163           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);
164 
165   VLOG(1) << "Allocated memory at " << mem_addr << " to "
166           << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
167 
168   AllocationRegion* maybe_extended_region = nullptr;
169   if (coalesce_regions_) {
170     maybe_extended_region =
171         region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
172   } else {
173     region_manager_.AddAllocationRegion(mem_addr, bytes_received);
174   }
175 
176   // Create one large chunk for the whole memory space that will
177   // be chunked later.
178   ChunkHandle h = AllocateChunk();
179   BFCAllocator::Chunk* c = ChunkFromHandle(h);
180   c->ptr = mem_addr;
181   c->size = bytes_received;
182   c->allocation_id = -1;
183   c->prev = kInvalidChunkHandle;
184   c->next = kInvalidChunkHandle;
185   c->freed_at_count = 0;
186 
187   region_manager_.set_handle(c->ptr, h);
188 
189   // If the region was extended, then there exists a previous chunk that should
190   // be linked to the new chunk.
191   if (maybe_extended_region != nullptr) {
192     ChunkHandle prev =
193         maybe_extended_region->get_handle(maybe_extended_region->ptr());
194     BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
195     // Find the last recorded chunk in the extended region.
196     while (prev_chunk->next != kInvalidChunkHandle) {
197       prev = prev_chunk->next;
198       prev_chunk = ChunkFromHandle(prev);
199     }
200     c->prev = prev;
201     prev_chunk->next = h;
202   }
203 
204   // Maybe merge adjacent chunks and insert the chunk into the right bin.
205   InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
206 
207   return true;
208 }
209 
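// Chunk metadata lives in the chunks_ vector and is addressed by ChunkHandle
// (an index into it). Recycled descriptors are kept on free_chunks_list_, a
// singly linked free list threaded through Chunk::next, so handing out a new
// descriptor is either a list pop or a one-element vector grow.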
210 BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
211   if (free_chunks_list_ != kInvalidChunkHandle) {
212     ChunkHandle h = free_chunks_list_;
213     Chunk* c = ChunkFromHandle(h);
214     free_chunks_list_ = c->next;
215     return h;
216   } else {
217     ChunkHandle h = chunks_.size();
218     chunks_.resize(h + 1);
219     return h;
220   }
221 }
222 
223 void BFCAllocator::DeallocateChunk(ChunkHandle h) {
224   Chunk* c = ChunkFromHandle(h);
225   c->allocation_id = -1;
226   c->bin_num = kInvalidBinNum;
227   c->next = free_chunks_list_;
228   free_chunks_list_ = h;
229 }
230 
231 void* BFCAllocator::AllocateRawInternalWithRetry(
232     size_t unused_alignment, size_t num_bytes,
233     const AllocationAttributes& allocation_attr) {
234   // Fast path: Try once to allocate without getting the retry_helper_ involved
235   uint64 freed_by_count = 0;
236   if (allocation_attr.freed_by_func != nullptr) {
237     freed_by_count = (*allocation_attr.freed_by_func)();
238   }
239   void* r =
240       AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
241   if (r != nullptr) {
242     return r;
243   } else {
244     static const int64_t kMaxMillisToWait = 10000;  // 10 seconds
245     r = retry_helper_.AllocateRaw(
246         [this, &allocation_attr](size_t a, size_t nb, bool v) {
247           uint64 freed_by_count = 0;
248           if (allocation_attr.freed_by_func != nullptr) {
249             freed_by_count = (*allocation_attr.freed_by_func)();
250           }
251           return AllocateRawInternal(a, nb, v, freed_by_count);
252         },
253         kMaxMillisToWait, unused_alignment, num_bytes);
254     return r;
255   }
256 }
257 
258 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
259                                 const AllocationAttributes& allocation_attr) {
260   VLOG(3) << "AllocateRaw " << Name() << "  " << num_bytes;
261   void* result = [&] {
262     if (!opts_.allow_retry_on_failure || !allocation_attr.retry_on_failure) {
263       // If we have globally disabled retry-on-failure and fail to allocate an
264       // "important" alloc, we want to print a log, because the program may be
265       // about to fail due to OOM.
266       //
267       // Bit of a hack: We deem "important" allocs as those which are retryable.
268       // In TF, *non*-retryable allocations are usually those which we can
269       // tolerate failing.  For example, we allocate convolution scratch memory
270       // as non-retryable; if it fails, we'll just use a fallback algorithm that
271       // uses no scratch.
272       static std::atomic<int32> log_counter{0};
273       constexpr int kMaxFailureLogs = 10;
274       bool dump_log_on_failure =
275           (/*retry is globally disabled*/ !opts_.allow_retry_on_failure &&
276            /*alloc is "important"*/ allocation_attr.retry_on_failure &&
277            log_counter.load(std::memory_order_relaxed) < kMaxFailureLogs) ||
278           VLOG_IS_ON(2);
279 
280       uint64 freed_by_count = 0;
281       if (allocation_attr.freed_by_func != nullptr) {
282         freed_by_count = (*allocation_attr.freed_by_func)();
283       }
284       void* res = AllocateRawInternal(unused_alignment, num_bytes,
285                                       dump_log_on_failure, freed_by_count);
286       if (res == nullptr) {
287         int32 counter_value = log_counter.load(std::memory_order_relaxed);
288         if (counter_value < kMaxFailureLogs) {
289           log_counter.store(counter_value + 1, std::memory_order_relaxed);
290           LOG(WARNING)
291               << "Allocator (" << Name() << ") ran out of memory trying "
292               << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
293               << " with freed_by_count=" << freed_by_count << "."
294               << (!allocation_attr.retry_on_failure
295                       ? " The caller indicates that this is not a failure, but"
296                         " this may mean that there could be performance gains "
297                         "if more memory were available."
298                       : "");
299         }
300       }
301       return res;
302     } else {
303       return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
304                                           allocation_attr);
305     }
306   }();
307   VLOG(3) << "AllocateRaw " << Name() << "  " << num_bytes << " " << result;
308   return result;
309 }
310 
311 // static
312 size_t BFCAllocator::RoundedBytes(size_t bytes) {
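  // Rounds up to the next multiple of kMinAllocationSize. For example, with
  // kMinAllocationSize == 256 (its value in bfc_allocator.h): 1 -> 256,
  // 256 -> 256, 257 -> 512.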
313   size_t rounded_bytes =
314       (kMinAllocationSize *
315        ((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
316   DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
317   return rounded_bytes;
318 }
319 
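// Best-effort garbage collection: frees whole allocation regions in which no
// chunk is currently in use, so that a following Extend() can ask the
// sub-allocator for one larger contiguous region instead of failing on a
// fragmented pool.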
320 bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
321     TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
322   // Do nothing if garbage collection is off.
323   if (!opts_.garbage_collection) {
324     return false;
325   }
326 
327   // Searching for free regions.
328   absl::flat_hash_set<void*> free_region_ptrs;
329   size_t total_free_bytes = 0;
330   for (const AllocationRegion& region : region_manager_.regions()) {
331     ChunkHandle h = region_manager_.get_handle(region.ptr());
332     bool any_use = false;
333     while (h != kInvalidChunkHandle) {
334       const Chunk* c = ChunkFromHandle(h);
335       if (c->in_use()) {
336         any_use = true;
337         break;
338       }
339       h = c->next;
340     }
341 
342     if (!any_use) {
343       VLOG(2) << "Found free region with ptr = " << region.ptr();
344       free_region_ptrs.insert(region.ptr());
345       total_free_bytes += region.memory_size();
346     }
347   }
348 
349   if (total_free_bytes == 0) {
350     return false;
351   }
352 
353   // Rough estimation to check whether deallocation can help.
354   size_t available_bytes =
355       memory_limit_ - total_region_allocated_bytes_ + total_free_bytes;
356   if (rounded_bytes > available_bytes) {
357     return false;
358   }
359 
360   LOG(WARNING) << "Garbage collection: deallocate free memory regions"
361                << " (i.e., allocations) so that we can re-allocate a larger"
362                << " region to avoid OOM due to memory fragmentation. If you"
363                << " see this message frequently, you are running near the"
364                << " threshold of the available device memory and re-allocation"
365                << " may incur great performance overhead. You may try smaller"
366                << " batch sizes to observe the performance impact."
367                << " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
368                << " disable this feature.";
369 
370   // Deallocate free regions.
371   DeallocateRegions(free_region_ptrs);
372 
373   return true;
374 }
375 
376 void BFCAllocator::DeallocateRegions(
377     const absl::flat_hash_set<void*>& region_ptrs)
378     TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
379   // Explicitly remove the const qualifier as some compilers disallow passing
380   // const_iterator to std::vector::erase(), which is used in
381   // RemoveAllocationRegion().
382   auto regions =
383       const_cast<std::vector<AllocationRegion>*>(&region_manager_.regions());
384   auto it = regions->begin();
385   while (it != regions->end()) {
386     if (!region_ptrs.contains(it->ptr())) {
387       ++it;
388       continue;
389     }
390 
391     VLOG(2) << "Deallocate region with ptr = " << it->ptr();
392     // Remove all chunk registrations from Bins.
393     ChunkHandle h = region_manager_.get_handle(it->ptr());
394     while (h != kInvalidChunkHandle) {
395       const Chunk* c = ChunkFromHandle(h);
396       if (c->bin_num != kInvalidBinNum) {
397         RemoveFreeChunkFromBin(h);
398       }
399       auto h_to_delete = h;
400       h = c->next;
401       DeleteChunk(h_to_delete);
402     }
403 
404     // Deallocate the memory.
405     sub_allocator_->Free(it->ptr(), it->memory_size());
406     total_region_allocated_bytes_ -= it->memory_size();
407     it = region_manager_.RemoveAllocationRegion(it);
408   }
409 }
410 
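// Core allocation path. Fallbacks are tried in order of increasing cost:
// 1) find a best-fit free chunk in the bins (FindChunkPtr);
// 2) Extend() the pool with a new region and search again;
// 3) merge timestamped chunks that are still waiting on the safe frontier and
//    search again;
// 4) garbage-collect fully free regions, Extend(), and search once more;
// otherwise give up and optionally dump the allocator state.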
411 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
412                                         size_t num_bytes,
413                                         bool dump_log_on_failure,
414                                         uint64 freed_before) {
415   if (num_bytes == 0) {
416     VLOG(2) << "tried to allocate 0 bytes";
417     return nullptr;
418   }
419   // First, always allocate memory of at least kMinAllocationSize
420   // bytes, and always allocate multiples of kMinAllocationSize bytes
421   // so all memory addresses are nicely byte aligned.
422   size_t rounded_bytes = RoundedBytes(num_bytes);
423 
424   // The BFC allocator tries to find the best fit first.
425   BinNum bin_num = BinNumForSize(rounded_bytes);
426 
427   mutex_lock l(lock_);
428   if (!timestamped_chunks_.empty()) {
429     // Merge timestamped chunks whose counts have become safe for general use.
430     MergeTimestampedChunks(0);
431   }
432   void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
433   if (ptr != nullptr) {
434     AddTraceMe("MemoryAllocation", ptr);
435     return ptr;
436   }
437 
438   // Try to extend
439   if (Extend(unused_alignment, rounded_bytes)) {
440     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
441     if (ptr != nullptr) {
442       AddTraceMe("MemoryAllocation", ptr);
443       return ptr;
444     }
445   }
446 
447   if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
448     // We're unable to satisfy an allocation request without a specific
449     // timestamp requirement.  Rather than fail, try merging any held-out
450     // timestamped chunks more aggressively until a free chunk of the necessary
451     // size is formed.
452     if (MergeTimestampedChunks(rounded_bytes)) {
453       ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
454       if (ptr != nullptr) {
455         AddTraceMe("MemoryAllocation", ptr);
456         return ptr;
457       }
458     }
459   }
460 
461   // Reaching this point means that no chunks can satisfy the request. Also,
462   // the unallocated bytes cannot satisfy the request. Before giving up, let's
463   // try deallocating free regions so that the suballocator can combine them with
464   // the unallocated bytes and form a larger region.
465   if (DeallocateFreeRegions(rounded_bytes) &&
466       Extend(unused_alignment, rounded_bytes)) {
467     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
468     if (ptr != nullptr) {
469       AddTraceMe("MemoryAllocation", ptr);
470       return ptr;
471     }
472   }
473 
474   // We searched all bins for an existing free chunk to use and
475   // couldn't find one.  This means we must have run out of memory.
476   // Dump the memory log for analysis.
477   MaybeWriteMemoryMap();
478   if (dump_log_on_failure) {
479     LOG(WARNING)
480         << "Allocator (" << Name() << ") ran out of memory trying "
481         << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
482         << " (rounded to " << rounded_bytes << ")"
483         << "requested by op "
484         << profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation()
485                .pending_op_name
486         << "\nIf the cause is memory fragmentation maybe the environment "
487         << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
488         << "improve the situation. \nCurrent allocation summary follows."
489         << "\nCurrent allocation summary follows.";
490     DumpMemoryLog(rounded_bytes);
491     LOG(WARNING) << RenderOccupancy();
492   }
493   return nullptr;
494 }
495 
496 int64_t BFCAllocator::LargestFreeChunk() {
497   for (int i = kNumBins - 1; i >= 0; i--) {
498     if (!BinFromIndex(i)->free_chunks.empty()) {
499       return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
500     }
501   }
502   return 0;
503 }
504 
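// Fragmentation is reported as the fraction of free memory that is not part of
// the single largest free chunk: 0.0 means all free bytes are contiguous, while
// values near 1.0 mean free memory is scattered across many small chunks.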
505 double BFCAllocator::GetFragmentation() {
506   int64_t bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use;
507   DCHECK_GT(bytes_available, 0);
508   return static_cast<double>(bytes_available - LargestFreeChunk()) /
509          bytes_available;
510 }
511 
512 void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) {
513   BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr));
514   AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size);
515 }
516 
517 void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
518                               const void* chunk_ptr, int64_t req_bytes,
519                               int64_t alloc_bytes) {
520   tensorflow::profiler::TraceMe::InstantActivity(
521       [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
522           TF_NO_THREAD_SAFETY_ANALYSIS {
523             int64_t bytes_available =
524                 memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
525             const auto& annotation =
526                 profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation();
527             const auto op_name = annotation.pending_op_name
528                                      ? annotation.pending_op_name
529                                      : "(null)";
530             const auto region_type = annotation.pending_region_type
531                                          ? annotation.pending_region_type
532                                          : "(null)";
533             return tensorflow::profiler::TraceMeEncode(
534                 traceme_name, {{"allocator_name", name_},
535                                {"bytes_reserved", stats_.bytes_reserved},
536                                {"bytes_allocated", stats_.bytes_in_use},
537                                {"bytes_available", bytes_available},
538                                {"fragmentation", GetFragmentation()},
539                                {"peak_bytes_in_use", stats_.peak_bytes_in_use},
540                                {"requested_bytes", req_bytes},
541                                {"allocation_bytes", alloc_bytes},
542                                {"addr", reinterpret_cast<uint64>(chunk_ptr)},
543                                {"tf_op", op_name},
544                                {"id", annotation.pending_step_id},
545                                {"region_type", region_type},
546                                {"data_type", annotation.pending_data_type},
547                                {"shape", annotation.pending_shape_func()}});
548           },
549       /*level=*/profiler::TraceMeLevel::kInfo);
550 }
551 
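// Walks the bins starting from the smallest bin that can hold rounded_bytes.
// Within a bin the free chunks are kept ordered by increasing size (see
// Bin::FreeChunkSet in bfc_allocator.h), so the first chunk that fits is the
// best fit. Chunks freed more recently than freed_before are skipped when
// freed_before is nonzero, and oversized chunks are split to bound internal
// fragmentation.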
552 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
553                                  size_t num_bytes, uint64 freed_before) {
554   // First identify the first bin that could satisfy rounded_bytes.
555   for (; bin_num < kNumBins; bin_num++) {
556     // Start searching from the first bin for the smallest chunk that fits
557     // rounded_bytes.
558     Bin* b = BinFromIndex(bin_num);
559     for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
560          ++citer) {
561       const BFCAllocator::ChunkHandle h = (*citer);
562       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
563       DCHECK(!chunk->in_use());
564       if (freed_before > 0 && freed_before < chunk->freed_at_count) {
565         continue;
566       }
567       if (chunk->size >= rounded_bytes) {
568         // We found an existing chunk that fits us that wasn't in use, so remove
569         // it from the free bin structure prior to using.
570         RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
571 
572         // If we can break the size of the chunk into two reasonably large
573         // pieces, do so, but don't waste more than max_internal_fragmentation_bytes on
574         // padding. If this threshold is not set by the user, then use 128MB as
575         // the default.
576         const int64_t max_internal_fragmentation_bytes =
577             (opts_.fragmentation_fraction > 0.0)
578                 ? opts_.fragmentation_fraction * memory_limit_
579                 : 128 << 20;
580 
581         if (chunk->size >= rounded_bytes * 2 ||
582             static_cast<int64_t>(chunk->size) - rounded_bytes >=
583                 max_internal_fragmentation_bytes) {
584           SplitChunk(h, rounded_bytes);
585           chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
586         }
587 
588         // The requested size of the returned chunk is what the user
589         // has allocated.
590         chunk->requested_size = num_bytes;
591         // Assign a unique id and increment the id counter, marking the
592         // chunk as being in use.
593         chunk->allocation_id = next_allocation_id_++;
594 
595         // Update stats.
596         ++stats_.num_allocs;
597         stats_.bytes_in_use += chunk->size;
598         if (stats_.bytes_in_use > stats_.peak_bytes_in_use) {
599           VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use
600                   << " bytes for " << Name();
601         }
602         stats_.peak_bytes_in_use =
603             std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
604         stats_.largest_alloc_size =
605             std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
606 
607 #ifdef TENSORFLOW_MEM_DEBUG
608         if (ShouldRecordOpName()) {
609           const auto& annotation =
610               profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation();
611           if (annotation.pending_op_name != nullptr) {
612             chunk->op_name = annotation.pending_op_name;
613           } else {
614             LOG(INFO) << "missing pending_op_name for " << Name()
615                       << " reading addr "
616                       << static_cast<const void*>(&annotation.pending_op_name)
617                       << "\n"
618                       << CurrentStackTrace();
619             chunk->op_name = nullptr;
620           }
621           chunk->action_count = ++action_counter_;
622           chunk->step_id = annotation.pending_step_id;
623           int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
624           size_history_[slot] = stats_.bytes_in_use;
625         }
626 #endif
627 
628         VLOG(4) << "Returning: " << chunk->ptr;
629         if (VLOG_IS_ON(4)) {
630           LOG(INFO) << "A: " << RenderOccupancy();
631         }
632         return chunk->ptr;
633       }
634     }
635   }
636 
637   return nullptr;
638 }
639 
640 void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
641   // Allocate the new chunk before we do any ChunkFromHandle
642   ChunkHandle h_new_chunk = AllocateChunk();
643 
644   Chunk* c = ChunkFromHandle(h);
645   CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
646 
647   // Create a new chunk starting num_bytes after c
648   BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
649   new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
650   region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
651 
652   // Set the new sizes of the chunks.
653   new_chunk->size = c->size - num_bytes;
654   c->size = num_bytes;
655 
656   // The new chunk is not in use.
657   new_chunk->allocation_id = -1;
658 
659   // It inherits the freed time.
660   new_chunk->freed_at_count = c->freed_at_count;
661 
662   // Maintain the pointers.
663   // c <-> c_neighbor becomes
664   // c <-> new_chunk <-> c_neighbor
665   BFCAllocator::ChunkHandle h_neighbor = c->next;
666   new_chunk->prev = h;
667   new_chunk->next = h_neighbor;
668   c->next = h_new_chunk;
669   if (h_neighbor != kInvalidChunkHandle) {
670     Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
671     c_neighbor->prev = h_new_chunk;
672   }
673 
674   // Add the newly free chunk to the free bin.
675   InsertFreeChunkIntoBin(h_new_chunk);
676 }
677 
678 void BFCAllocator::DeallocateRaw(void* ptr) {
679   VLOG(3) << "DeallocateRaw " << Name() << " "
680           << (ptr ? RequestedSize(ptr) : 0);
681   DeallocateRawInternal(ptr);
682   retry_helper_.NotifyDealloc();
683 }
684 
685 void BFCAllocator::DeallocateRawInternal(void* ptr) {
686   if (ptr == nullptr) {
687     VLOG(2) << "tried to deallocate nullptr";
688     return;
689   }
690   mutex_lock l(lock_);
691 
692   // Find the chunk from the ptr.
693   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
694   CHECK(h != kInvalidChunkHandle);
695   // Record chunk information before it's freed.
696   Chunk* chunk = ChunkFromHandle(h);
697   void* chunk_ptr = chunk->ptr;
698   int64_t req_bytes = chunk->requested_size;
699   int64_t alloc_bytes = chunk->size;
700 
701   MarkFree(h);
702 
703   // Consider coalescing it.
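  // When a timing counter is attached, MarkFree gave the chunk a nonzero
  // freed_at_count, so it cannot be merged with its neighbors yet; park it on
  // timestamped_chunks_ and let MergeTimestampedChunks() coalesce it once the
  // safe frontier has passed that count.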
704   if (timing_counter_) {
705     InsertFreeChunkIntoBin(h);
706     timestamped_chunks_.push_back(h);
707   } else {
708     InsertFreeChunkIntoBin(TryToCoalesce(h, false));
709   }
710 
711   // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
712   // correct aggregation stats (bytes_in_use, fragmentation).
713   AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
714 
715   if (VLOG_IS_ON(4)) {
716     LOG(INFO) << "F: " << RenderOccupancy();
717   }
718 }
719 
720 // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
721 // We merge Chunk(h2) into Chunk(h1).
722 void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
723                          BFCAllocator::ChunkHandle h2) {
724   Chunk* c1 = ChunkFromHandle(h1);
725   Chunk* c2 = ChunkFromHandle(h2);
726   // We can only merge chunks that are not in use.
727   CHECK(!c1->in_use() && !c2->in_use());
728 
729   // c1's prev doesn't change, still points to the same ptr, and is
730   // still not in use.
731 
732   // Fix up neighbor pointers
733   //
734   // c1 <-> c2 <-> c3 should become
735   // c1 <-> c3
736 
737   BFCAllocator::ChunkHandle h3 = c2->next;
738   c1->next = h3;
739   CHECK(c2->prev == h1);
740   if (h3 != kInvalidChunkHandle) {
741     BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
742     c3->prev = h1;
743   }
744 
745   // Set the new size
746   c1->size += c2->size;
747 
748   // Pick latest free time.
749   c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
750 
751   DeleteChunk(h2);
752 }
753 
754 void BFCAllocator::DeleteChunk(ChunkHandle h) {
755   // Delete h and cleanup all state
756   Chunk* c = ChunkFromHandle(h);
757   //  VLOG(4) << "Removing: " << c->ptr;
758   region_manager_.erase(c->ptr);
759   DeallocateChunk(h);
760 }
761 
762 void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
763   Chunk* c = ChunkFromHandle(h);
764   CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
765   BinNum bin_num = BinNumForSize(c->size);
766   Bin* new_bin = BinFromIndex(bin_num);
767   c->bin_num = bin_num;
768   new_bin->free_chunks.insert(h);
769 }
770 
771 void BFCAllocator::RemoveFreeChunkIterFromBin(
772     BFCAllocator::Bin::FreeChunkSet* free_chunks,
773     const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
774   ChunkHandle h = *citer;
775   Chunk* c = ChunkFromHandle(h);
776   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
777   free_chunks->erase(citer);
778   c->bin_num = kInvalidBinNum;
779 }
780 
781 void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
782   Chunk* c = ChunkFromHandle(h);
783   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
784   CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
785       << "Could not find chunk in bin";
786   c->bin_num = kInvalidBinNum;
787 }
788 
789 void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
790   Chunk* c = ChunkFromHandle(h);
791   CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
792 
793   // Mark the chunk as no longer in use.
794   c->allocation_id = -1;
795 
796   // Optionally record the free time.
797   if (timing_counter_) {
798     c->freed_at_count = timing_counter_->next();
799   }
800 
801   // Updates the stats.
802   stats_.bytes_in_use -= c->size;
803 
804 #ifdef TENSORFLOW_MEM_DEBUG
805   if (ShouldRecordOpName()) {
806     c->action_count = ++action_counter_;
807     int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
808     size_history_[slot] = stats_.bytes_in_use;
809   }
810 #endif
811 }
812 
813 BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
814                                                       bool ignore_freed_at) {
815   Chunk* c = ChunkFromHandle(h);
816   if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
817   ChunkHandle coalesced_chunk = h;
818 
819   // If the next chunk is free, merge it into c and delete it.
820   if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
821     Chunk* n = ChunkFromHandle(c->next);
822     if ((n->freed_at_count == 0) || ignore_freed_at) {
823       VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
824       RemoveFreeChunkFromBin(c->next);
825       Merge(h, c->next);
826     }
827   }
828 
829   // If the previous chunk is free, merge c into it and delete c.
830   if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
831     Chunk* n = ChunkFromHandle(c->prev);
832     if ((n->freed_at_count == 0) || ignore_freed_at) {
833       VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
834       coalesced_chunk = c->prev;
835       RemoveFreeChunkFromBin(c->prev);
836       Merge(c->prev, h);
837     }
838   }
839 
840   return coalesced_chunk;
841 }
842 
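// Advances safe_frontier_ monotonically with a compare-and-swap loop. A higher
// frontier can make previously timestamped free chunks reusable, so any
// allocation waiting in retry_helper_ is woken up.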
843 void BFCAllocator::SetSafeFrontier(uint64 count) {
844   uint64 current = safe_frontier_.load(std::memory_order_relaxed);
845   while (count > current) {
846     if (safe_frontier_.compare_exchange_strong(current, count)) {
847       retry_helper_.NotifyDealloc();
848       return;
849     } else {
850       current = safe_frontier_.load(std::memory_order_relaxed);
851     }
852   }
853 }
854 
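// Revisits chunks parked on timestamped_chunks_ (freed with a nonzero
// freed_at_count). Chunks whose free timestamp is now behind safe_frontier_
// are folded back into the general pool; when required_bytes > 0, chunks with
// still-unsafe timestamps are force-merged as well, until a free chunk of at
// least required_bytes has been produced.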
855 bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
856   VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
857           << " required_bytes=" << required_bytes;
858   bool satisfied = (required_bytes == 0);
859   std::vector<void*> to_merge;
860   std::deque<ChunkHandle> new_ts_queue;
861   while (!timestamped_chunks_.empty()) {
862     ChunkHandle h = timestamped_chunks_.front();
863     timestamped_chunks_.pop_front();
864     DCHECK_NE(h, kInvalidChunkHandle);
865     Chunk* c = ChunkFromHandle(h);
866     // It's possible this chunk has already been merged so refetch and retest
867     // the handle.
868     h = region_manager_.get_handle(c->ptr);
869     if (h == kInvalidChunkHandle) {
870       continue;
871     }
872     if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
873       // This chunk has already been reallocated.
874       continue;
875     }
876     if (c->freed_at_count == 0) {
877       to_merge.push_back(c->ptr);
878       continue;
879     }
880     // Chunk should be free and assigned to a bin.
881     DCHECK_NE(c->bin_num, kInvalidBinNum);
882     if (c->freed_at_count < safe_frontier_) {
883       c->freed_at_count = 0;
884       to_merge.push_back(c->ptr);
885     } else if (required_bytes > 0) {
886       to_merge.push_back(c->ptr);
887     } else {
888       new_ts_queue.push_back(h);
889     }
890   }
891   DCHECK(timestamped_chunks_.empty());
892   std::swap(timestamped_chunks_, new_ts_queue);
893 
894   // At this point all candidate chunks have been moved from timestamped_chunks_
895   // to to_merge.  If this is a standard merge (required_bytes == 0) then
896   // merge them all, otherwise merge just until a Chunk of the required size
897   // is produced.
898   for (int ci = 0, end = to_merge.size(); ci < end; ++ci) {
899     void* ptr = to_merge[ci];
900     // It's possible that the Chunk associated with this memory location got
901     // merged and deallocated in a prior iteration so refetch the handle and
902     // retest.
903     ChunkHandle h = region_manager_.get_handle(ptr);
904     if (h == kInvalidChunkHandle) continue;
905     if (required_bytes == 0 || !satisfied) {
906       Chunk* c = ChunkFromHandle(h);
907       DCHECK_NE(c->bin_num, kInvalidBinNum);
908       DCHECK(!c->in_use());
909       RemoveFreeChunkFromBin(h);
910       ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
911       InsertFreeChunkIntoBin(new_h);
912       if (required_bytes > 0) {
913         c = ChunkFromHandle(new_h);
914         if (new_h != h && c->freed_at_count > 0) {
915           timestamped_chunks_.push_back(new_h);
916         }
917         if (c->size >= required_bytes) {
918           satisfied = true;
919         }
920       }
921     } else {
922       // We were force merging Chunks with unsafe timestamps, but managed
923       // to create a satisfying Chunk so just requeue the rest.
924       timestamped_chunks_.push_back(h);
925     }
926   }
927   return satisfied;
928 }
929 
930 bool BFCAllocator::TracksAllocationSizes() const { return true; }
931 
932 size_t BFCAllocator::RequestedSize(const void* ptr) const {
933   CHECK(ptr);
934   mutex_lock l(lock_);
935   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
936   CHECK(h != kInvalidChunkHandle)
937       << "Asked for requested size of pointer we never allocated: " << ptr;
938   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
939   return c->requested_size;
940 }
941 
942 size_t BFCAllocator::AllocatedSize(const void* ptr) const {
943   mutex_lock l(lock_);
944   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
945   CHECK(h != kInvalidChunkHandle)
946       << "Asked for allocated size of pointer we never allocated: " << ptr;
947   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
948   return c->size;
949 }
950 
951 int64_t BFCAllocator::AllocationId(const void* ptr) const {
952   mutex_lock l(lock_);
953   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
954   CHECK(h != kInvalidChunkHandle)
955       << "Asked for allocation id of pointer we never allocated: " << ptr;
956   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
957   return c->allocation_id;
958 }
959 
960 namespace {
961 
962 void RenderRegion(char* rendered, const size_t resolution,
963                   const size_t total_render_size, const size_t offset,
964                   const void* base_ptr, const void* ptr, const size_t size,
965                   const char c) {
966   const char* base_ptr_c = static_cast<const char*>(base_ptr);
967   const char* ptr_c = static_cast<const char*>(ptr);
968 
969   size_t start_location =
970       ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
971   CHECK_GE(start_location, 0);
972   CHECK_LT(start_location, resolution);
973   size_t end_location =
974       ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
975       total_render_size;
976   CHECK_GE(end_location, 0);
977   CHECK_LT(end_location, resolution);
978 
979   for (size_t i = start_location; i <= end_location; ++i) {
980     rendered[i] = c;
981   }
982 }
983 
984 }  // namespace
985 
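// Renders a 100-character ASCII summary of all regions: '_' marks free space,
// '*' marks the client-requested portion of in-use chunks, and 'x' marks the
// padding those chunks carry beyond the requested size.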
986 string BFCAllocator::RenderOccupancy() {
987   // Make a buffer for the ASCII-art representation.
988   const size_t resolution = 100;
989   char rendered[resolution];
990 
991   // Compute the total region size to render over
992   size_t total_region_size = 0;
993   for (const auto& region : region_manager_.regions()) {
994     total_region_size += region.memory_size();
995   }
996 
997   if (total_region_size == 0) {
998     return "<allocator contains no memory>";
999   }
1000 
1001   // Start out with everything empty
1002   RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
1003                total_region_size, '_');
1004 
1005   size_t region_offset = 0;
1006   for (const auto& region : region_manager_.regions()) {
1007     ChunkHandle h = region_manager_.get_handle(region.ptr());
1008     // Then render each chunk left to right.
1009     while (h != kInvalidChunkHandle) {
1010       Chunk* c = ChunkFromHandle(h);
1011       if (c->in_use()) {
1012         // Render the wasted space
1013         size_t wasted = c->size - c->requested_size;
1014         if (wasted > 0) {
1015           RenderRegion(rendered, resolution, total_region_size,
1016                        region_offset + c->requested_size, region.ptr(), c->ptr,
1017                        wasted, 'x');
1018         }
1019         // Then the occupied space
1020         RenderRegion(rendered, resolution, total_region_size, region_offset,
1021                      region.ptr(), c->ptr, c->requested_size, '*');
1022       }
1023       h = c->next;
1024     }
1025     region_offset += region.memory_size();
1026   }
1027 
1028   return string(rendered, resolution);
1029 }
1030 
1031 void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
1032   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1033   LOG(INFO) << "BFCAllocator dump for " << Name();
1034   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1035     Bin* b = BinFromIndex(bin_num);
1036     const BinDebugInfo& bin_info = bin_infos[bin_num];
1037     CHECK_EQ(b->free_chunks.size(),
1038              bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1039 
1040     LOG(INFO) << "Bin (" << b->bin_size
1041               << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin
1042               << ", Chunks in use: " << bin_info.total_chunks_in_use << ". "
1043               << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin)
1044               << " allocated for chunks. "
1045               << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use)
1046               << " in use in bin. "
1047               << strings::HumanReadableNumBytes(
1048                      bin_info.total_requested_bytes_in_use)
1049               << " client-requested in use in bin.";
1050   }
1051 
1052   // Find the bin that we would have liked to allocate in, so we
1053   // can get some further analysis about fragmentation.
1054   Bin* b = BinForSize(num_bytes);
1055 
1056   LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
1057             << " was " << strings::HumanReadableNumBytes(b->bin_size)
1058             << ", Chunk State: ";
1059 
1060   for (ChunkHandle h : b->free_chunks) {
1061     Chunk* c = ChunkFromHandle(h);
1062     LOG(INFO) << c->DebugString(this, true);
1063   }
1064 
1065   // Next show the chunks that are in use, and also summarize their
1066   // number by size.
1067   std::map<size_t, int> in_use_by_size;
1068   for (const auto& region : region_manager_.regions()) {
1069     LOG(INFO) << "Next region of size " << region.memory_size();
1070     ChunkHandle h = region_manager_.get_handle(region.ptr());
1071     while (h != kInvalidChunkHandle) {
1072       const Chunk* c = ChunkFromHandle(h);
1073       if (c->in_use()) {
1074         in_use_by_size[c->size]++;
1075       }
1076       string buf = strings::StrCat(
1077           (c->in_use() ? "InUse" : "Free "), " at ",
1078           strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
1079 #ifdef TENSORFLOW_MEM_DEBUG
1080       if (ShouldRecordOpName()) {
1081         strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
1082                            c->action_count, " step ", c->step_id);
1083       }
1084 #endif
1085       strings::StrAppend(&buf, " next ", c->next);
1086       if (timing_counter_) {
1087         strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
1088       }
1089       LOG(INFO) << buf;
1090       h = c->next;
1091     }
1092   }
1093 
1094   LOG(INFO) << "     Summary of in-use Chunks by size: ";
1095   size_t total_bytes = 0;
1096   for (auto& it : in_use_by_size) {
1097     LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
1098               << strings::HumanReadableNumBytes(it.first * it.second);
1099     total_bytes += (it.first * it.second);
1100   }
1101   LOG(INFO) << "Sum Total of in-use chunks: "
1102             << strings::HumanReadableNumBytes(total_bytes);
1103   LOG(INFO) << "total_region_allocated_bytes_: "
1104             << total_region_allocated_bytes_
1105             << " memory_limit_: " << memory_limit_ << " available bytes: "
1106             << (memory_limit_ - total_region_allocated_bytes_)
1107             << " curr_region_allocation_bytes_: "
1108             << curr_region_allocation_bytes_;
1109   LOG(INFO) << "Stats: \n" << stats_.DebugString();
1110 }
1111 
1112 void BFCAllocator::MaybeWriteMemoryMap() {
1113   const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
1114   if (gpu_memory_map_file != nullptr) {
1115     std::unique_ptr<WritableFile> dump_file;
1116     string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
1117                                        Env::Default()->NowMicros());
1118     Status status = Env::Default()->NewWritableFile(file_name, &dump_file);
1119     if (!status.ok()) {
1120       LOG(ERROR) << "Failed to open file " << file_name;
1121       return;
1122     }
1123     MemoryDump md = RecordMemoryMapInternal();
1124     status = dump_file->Append(md.SerializeAsString());
1125     if (!status.ok()) {
1126       LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": "
1127                  << status;
1128     }
1129   }
1130 }
1131 
1132 MemoryDump BFCAllocator::RecordMemoryMap() {
1133   mutex_lock l(lock_);
1134   return RecordMemoryMapInternal();
1135 }
1136 
1137 MemoryDump BFCAllocator::RecordMemoryMapInternal() {
1138   MemoryDump md;
1139   md.set_allocator_name(Name());
1140 
1141   // Record the general stats
1142   MemAllocatorStats* mas = md.mutable_stats();
1143   mas->set_num_allocs(stats_.num_allocs);
1144   mas->set_bytes_in_use(stats_.bytes_in_use);
1145   mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
1146   mas->set_largest_alloc_size(stats_.largest_alloc_size);
1147 
1148   // Record summary data for every bin.
1149   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1150   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1151     Bin* b = BinFromIndex(bin_num);
1152     const BinDebugInfo& bin_info = bin_infos[bin_num];
1153     DCHECK_EQ(b->free_chunks.size(),
1154               bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1155     BinSummary* bs = md.add_bin_summary();
1156     bs->set_bin(bin_num);
1157     bs->set_total_bytes_in_use(bin_info.total_bytes_in_use);
1158     bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin);
1159     bs->set_total_chunks_in_use(bin_info.total_chunks_in_use);
1160     bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin);
1161   }
1162 
1163   // Record state of every defined Chunk.
1164   for (const auto& region : region_manager_.regions()) {
1165     ChunkHandle h = region_manager_.get_handle(region.ptr());
1166     while (h != kInvalidChunkHandle) {
1167       const Chunk* c = ChunkFromHandle(h);
1168       MemChunk* mc = md.add_chunk();
1169       mc->set_in_use(c->in_use());
1170       mc->set_address(reinterpret_cast<uint64>(c->ptr));
1171       mc->set_size(c->size);
1172       mc->set_requested_size(c->requested_size);
1173       mc->set_bin(c->bin_num);
1174 #ifdef TENSORFLOW_MEM_DEBUG
1175       mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
1176       mc->set_step_id(c->step_id);
1177       mc->set_action_count(c->action_count);
1178 #endif
1179       if (timing_counter_) {
1180         mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
1181       }
1182       h = c->next;
1183     }
1184   }
1185 
1186   mas->set_fragmentation_metric(GetFragmentation());
1187 
1188 #ifdef TENSORFLOW_MEM_DEBUG
1189   // Record the recent size history
1190   int history_len = static_cast<int>(std::min(
1191       action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
1192   for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
1193     SnapShot* ss = md.add_snap_shot();
1194     ss->set_action_count(i);
1195     int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
1196     ss->set_size(size_history_[slot]);
1197   }
1198 #endif
1199 
1200   return md;
1201 }
1202 
1203 absl::optional<AllocatorStats> BFCAllocator::GetStats() {
1204   mutex_lock l(lock_);
1205   return stats_;
1206 }
1207 
1208 bool BFCAllocator::ClearStats() {
1209   mutex_lock l(lock_);
1210   stats_.num_allocs = 0;
1211   stats_.peak_bytes_in_use = stats_.bytes_in_use;
1212   stats_.largest_alloc_size = 0;
1213   return true;
1214 }
1215 
1216 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
1217 BFCAllocator::get_bin_debug_info() {
1218   std::array<BinDebugInfo, kNumBins> bin_infos;
1219   for (const auto& region : region_manager_.regions()) {
1220     ChunkHandle h = region_manager_.get_handle(region.ptr());
1221     while (h != kInvalidChunkHandle) {
1222       const Chunk* c = ChunkFromHandle(h);
1223       BinNum bin_num = BinNumForSize(c->size);
1224       BinDebugInfo& bin_info = bin_infos[bin_num];
1225       bin_info.total_bytes_in_bin += c->size;
1226       bin_info.total_chunks_in_bin++;
1227       if (c->in_use()) {
1228         bin_info.total_bytes_in_use += c->size;
1229         bin_info.total_requested_bytes_in_use += c->requested_size;
1230         bin_info.total_chunks_in_use++;
1231       } else {
1232         Bin* bin = BinFromIndex(bin_num);
1233         CHECK_EQ(bin->free_chunks.count(h), 1);
1234         CHECK_EQ(c->bin_num, bin_num);
1235       }
1236       h = c->next;
1237     }
1238   }
1239   return bin_infos;
1240 }
1241 
1242 AllocatorMemoryType BFCAllocator::GetMemoryType() const {
1243   return sub_allocator_->GetMemoryType();
1244 }
1245 
1246 }  // namespace tensorflow
1247