xref: /aosp_15_r20/external/pytorch/torch/csrc/profiler/unwind/unwind.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#include <c10/util/Exception.h>
#include <torch/csrc/profiler/unwind/unwind.h>
#include <torch/csrc/utils/cpp_stacktraces.h>
#include <mutex>
#include <unordered_map>
5 
6 #if !defined(__linux__) || !defined(__x86_64__) || !defined(__has_include) || \
7     !__has_include("ext/stdio_filebuf.h")
8 namespace torch::unwind {
unwind()9 std::vector<void*> unwind() {
10   TORCH_WARN_ONCE(
11       "record_context_cpp is not support on non-linux non-x86_64 platforms");
12   return {};
13 }
14 
libraryFor(void * addr)15 std::optional<std::pair<std::string, uint64_t>> libraryFor(void* addr) {
16   TORCH_WARN_ONCE(
17       "record_context_cpp is not support on non-linux non-x86_64 platforms");
18   return {};
19 }
20 
21 #ifndef FBCODE_CAFFE2
symbolize(const std::vector<void * > & frames,Mode mode)22 std::vector<Frame> symbolize(const std::vector<void*>& frames, Mode mode) {
23   TORCH_WARN_ONCE(
24       "record_context_cpp is not support on non-linux non-x86_64 platforms");
25   return {};
26 }
27 #endif
28 
stats()29 Stats stats() {
30   TORCH_WARN_ONCE(
31       "record_context_cpp is not support on non-linux non-x86_64 platforms");
32   return {};
33 }
34 
35 } // namespace torch::unwind
36 
37 #else
38 
39 #include <c10/util/flat_hash_map.h>
40 #include <dlfcn.h>
41 #include <elf.h>
42 #include <link.h>
43 #include <linux/limits.h>
44 #include <algorithm>
45 #include <climits>
46 #include <vector>
47 
48 #include <c10/util/irange.h>
49 #include <cxxabi.h>
50 #include <torch/csrc/profiler/unwind/communicate.h>
51 #include <torch/csrc/profiler/unwind/dwarf_enums.h>
52 #include <torch/csrc/profiler/unwind/eh_frame_hdr.h>
53 #include <torch/csrc/profiler/unwind/fast_symbolizer.h>
54 #include <torch/csrc/profiler/unwind/fde.h>
55 #include <torch/csrc/profiler/unwind/unwinder.h>
56 #include <shared_mutex>
57 
// C-linkage entry points, both defined at the bottom of this file:
//  - unwind_c walks the stack starting from the given rsp/rbp and appends the
//    visited instruction pointers to *result;
//  - unwind_entry is the assembly trampoline that captures rsp/rbp and jumps
//    to unwind_c.
extern "C" void unwind_c(std::vector<void*>* result, int64_t rsp, int64_t rbp);
extern "C" void unwind_entry(std::vector<void*>* result);
60 
61 namespace torch::unwind {
// Scoped helper that temporarily trades a held shared (reader) lock for
// exclusive ownership of the same mutex.
//
// Construction releases `rdlock` and then locks the underlying mutex
// exclusively; destruction releases the exclusive lock and re-acquires
// `rdlock`. The two steps are separate operations, so other threads may take
// the mutex in between: state observed under the shared lock has to be
// re-validated once the exclusive lock is held.
//
// The guard keeps a reference to `rdlock`, which must outlive it. It is
// neither copyable nor movable, since a second instance would release the
// exclusive lock twice.
struct UpgradeExclusive {
  // `rdlock` must currently own its mutex (it is unlocked unconditionally).
  explicit UpgradeExclusive(std::shared_lock<std::shared_timed_mutex>& rdlock)
      : rdlock_(rdlock) {
    rdlock_.unlock();
    rdlock_.mutex()->lock();
  }
  UpgradeExclusive(const UpgradeExclusive&) = delete;
  UpgradeExclusive& operator=(const UpgradeExclusive&) = delete;
  UpgradeExclusive(UpgradeExclusive&&) = delete;
  UpgradeExclusive& operator=(UpgradeExclusive&&) = delete;

  // Drops the exclusive lock and restores the caller's shared lock.
  ~UpgradeExclusive() {
    rdlock_.mutex()->unlock();
    rdlock_.lock();
  }

 private:
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  std::shared_lock<std::shared_timed_mutex>& rdlock_;
};
77 
78 struct LibraryInfo {
LibraryInfotorch::unwind::LibraryInfo79   LibraryInfo(
80       std::string name,
81       uint64_t load_bias,
82       uint64_t last_addr,
83       void* eh_frame_hdr_ptr_)
84       : name_(std::move(name)),
85         load_bias_(load_bias),
86         last_addr_(last_addr),
87         eh_frame_hdr_(eh_frame_hdr_ptr_) {}
88 
load_biastorch::unwind::LibraryInfo89   uint64_t load_bias() const {
90     return load_bias_;
91   }
last_addrtorch::unwind::LibraryInfo92   uint64_t last_addr() const {
93     return last_addr_;
94   }
unwinderFortorch::unwind::LibraryInfo95   Unwinder unwinderFor(uint64_t addr) const {
96     void* fde_data = eh_frame_hdr_.entryForAddr(addr);
97     FDE fde(fde_data, name().c_str(), load_bias());
98     TableState state = fde.readUpTo(addr);
99     return Unwinder(state.cfa, state.registers[D_RIP], state.registers[D_RBP]);
100   }
nametorch::unwind::LibraryInfo101   const std::string& name() const {
102     return name_;
103   }
104 
105  private:
106   std::string name_;
107   uint64_t load_bias_; // addr >= load_bias_
108   uint64_t last_addr_; // addr < last_addr_
109   EHFrameHdr eh_frame_hdr_;
110 };
111 
process_name()112 static const char* process_name() {
113   // NOLINTNEXTLINE(*-c-arrays*)
114   static char name[PATH_MAX + 1] = "";
115   if (*name == '\0') {
116     ssize_t len = readlink("/proc/self/exe", name, PATH_MAX);
117     TORCH_INTERNAL_ASSERT(len != -1, "can't get path to exe")
118     name[len] = '\0';
119   }
120   return name;
121 }
122 
// Snapshot of the dynamic loader's dlpi_adds / dlpi_subs counters, used to
// detect that the set of loaded objects changed.
//
// Both fields default to LLONG_MAX as a "never observed" sentinel, which makes
// the first comparison against a real snapshot report a change. LLONG_MAX is
// the standard <climits> spelling of the value previously written with the
// GNU-only LONG_LONG_MAX macro.
struct Version {
  uint64_t adds_ = LLONG_MAX;
  uint64_t subs_ = LLONG_MAX;
};
127 
128 struct UnwindCache {
currentVersiontorch::unwind::UnwindCache129   Version currentVersion() {
130     Version r;
131     dl_iterate_phdr(
132         [](struct dl_phdr_info* info,
133            size_t size [[maybe_unused]],
134            void* data) {
135           Version* v = (Version*)data;
136           v->adds_ = info->dlpi_adds;
137           v->subs_ = info->dlpi_subs;
138           return 1;
139         },
140         &r);
141     return r;
142   }
refreshLibrariestorch::unwind::UnwindCache143   void refreshLibraries() {
144     ++stats_.resets;
145     all_libraries_.clear();
146     ip_cache_.clear();
147     dl_iterate_phdr(
148         [](struct dl_phdr_info* info,
149            size_t size [[maybe_unused]],
150            void* data) {
151           auto self = (UnwindCache*)data;
152           uint64_t last_addr = 0;
153           auto segments = (Elf64_Phdr*)info->dlpi_phdr;
154           for (auto i : c10::irange(info->dlpi_phnum)) {
155             if (segments[i].p_type == PT_LOAD) {
156               auto begin = ((uint64_t)info->dlpi_addr + segments[i].p_vaddr);
157               auto end = (begin + segments[i].p_memsz);
158               last_addr = std::max(end, last_addr);
159             }
160             if (segments[i].p_type == PT_GNU_EH_FRAME) {
161               std::string library_name = info->dlpi_name;
162               if (library_name.empty()) {
163                 library_name = process_name();
164               }
165               auto eh_frame_hdr =
166                   // NOLINTNEXTLINE(performance-no-int-to-ptr)
167                   (void*)(segments[i].p_vaddr + info->dlpi_addr);
168               self->all_libraries_.emplace_back(
169                   std::move(library_name),
170                   info->dlpi_addr,
171                   last_addr,
172                   eh_frame_hdr);
173               return 0;
174             }
175           }
176           self->libraries_with_no_unwind_.emplace_back(info->dlpi_name);
177           return 0;
178         },
179         this);
180     std::sort(
181         all_libraries_.begin(),
182         all_libraries_.end(),
183         [](const LibraryInfo& lhs, const LibraryInfo& rhs) {
184           return lhs.load_bias() < rhs.load_bias();
185         });
186   }
checkRefreshtorch::unwind::UnwindCache187   void checkRefresh(std::shared_lock<std::shared_timed_mutex>& rdlock) {
188     Version current_version = currentVersion();
189     if (current_version.subs_ != last_version_.subs_) {
190       UpgradeExclusive lock(rdlock);
191       refreshLibraries();
192     }
193   }
194 
unwinderFortorch::unwind::UnwindCache195   const Unwinder& unwinderFor(
196       uint64_t addr,
197       std::shared_lock<std::shared_timed_mutex>& rdlock) {
198     auto it = ip_cache_.find(addr);
199     if (it != ip_cache_.end()) {
200       ++stats_.hits;
201       return it->second;
202     }
203 
204     // we are about to modify the cache
205     UpgradeExclusive lock(rdlock);
206     ++stats_.misses;
207 
208     Unwinder unwinder = Unwinder::unknown();
209     try {
210       unwinder = libraryFor(addr).unwinderFor(addr);
211     } catch (unwind::UnwindError& err) {
212       // because unwinders are cached this will only print
213       // once per frame that cannot be unwound.
214       TORCH_WARN("Unsupported unwinding pattern: ", err.what());
215     }
216     auto r = ip_cache_.insert_or_assign(addr, unwinder);
217     return r.first->second;
218   }
219 
findLibraryFortorch::unwind::UnwindCache220   const LibraryInfo* findLibraryFor(uint64_t addr) {
221     Version current_version = currentVersion();
222     if (current_version.subs_ != last_version_.subs_) {
223       refreshLibraries();
224       last_version_ = current_version;
225     }
226     auto* r = searchFor(addr);
227     if (!r) {
228       if (current_version.adds_ != last_version_.adds_) {
229         refreshLibraries();
230         last_version_ = current_version;
231       }
232       r = searchFor(addr);
233     }
234     return r;
235   }
236 
libraryFortorch::unwind::UnwindCache237   const LibraryInfo& libraryFor(uint64_t addr) {
238     auto* r = findLibraryFor(addr);
239     if (!r) {
240       for ([[maybe_unused]] const auto& l : libraries_with_no_unwind_) {
241         TORCH_WARN("Did not find a PT_GNU_EH_FRAME segment for ", l);
242       }
243       libraries_with_no_unwind_.clear();
244       throw UnwindError("addr not in range of known libraries");
245     }
246     return *r;
247   }
248 
statstorch::unwind::UnwindCache249   torch::unwind::Stats stats() {
250     return stats_;
251   }
252 
253  private:
searchFortorch::unwind::UnwindCache254   const LibraryInfo* searchFor(uint64_t addr) {
255     if (all_libraries_.empty()) {
256       return nullptr;
257     }
258     uint64_t low = 0;
259     uint64_t high = all_libraries_.size();
260     while (low + 1 < high) {
261       auto mid = (low + high) / 2;
262       if (addr < all_libraries_.at(mid).load_bias()) {
263         high = mid;
264       } else {
265         low = mid;
266       }
267     }
268     LibraryInfo* r = &all_libraries_.at(low);
269     if (addr < r->load_bias() || addr >= r->last_addr()) {
270       return nullptr;
271     }
272     return r;
273   }
274 
275   // sorted by load_bias
276   std::vector<LibraryInfo> all_libraries_;
277   ska::flat_hash_map<uint64_t, Unwinder> ip_cache_;
278 
279   torch::unwind::Stats stats_;
280 
281   // to keep track of whether we need to refresh this info
282   Version last_version_;
283 
284   std::vector<std::string> libraries_with_no_unwind_;
285 };
286 
287 static UnwindCache unwind_cache;
288 static std::shared_timed_mutex cache_mutex_;
289 
unwind()290 std::vector<void*> unwind() {
291   std::vector<void*> frames;
292   unwind_entry(&frames);
293   return frames;
294 }
295 
libraryFor(void * addr)296 std::optional<std::pair<std::string, uint64_t>> libraryFor(void* addr) {
297   if (!addr) {
298     return std::nullopt;
299   }
300   std::shared_lock lock(cache_mutex_);
301   const LibraryInfo* library_info = unwind_cache.findLibraryFor((uint64_t)addr);
302   if (!library_info) {
303     return std::nullopt;
304   }
305   return std::make_pair(
306       library_info->name(), (uint64_t)addr - library_info->load_bias());
307 }
308 
dladdr_lookup(void * addr)309 static std::string dladdr_lookup(void* addr) {
310   Dl_info dlinfo;
311   std::string funcname = "??";
312   if (dladdr(addr, &dlinfo) && dlinfo.dli_sname) {
313     funcname = demangle(dlinfo.dli_sname);
314   }
315   return funcname;
316 }
317 
318 struct Symbolizer {
Symbolizertorch::unwind::Symbolizer319   Symbolizer() {
320     auto envar = std::getenv("TORCH_ADDR2LINE_BINARY");
321     if (envar != nullptr) {
322       // currently we take user's input as is without checking
323       addr2line_binary_ = envar;
324       TORCH_WARN("Use custom addr2line binary: ", addr2line_binary_);
325     } else {
326       addr2line_binary_ = "addr2line"; // default
327     }
328   }
guardtorch::unwind::Symbolizer329   static std::lock_guard<std::mutex> guard() {
330     static std::mutex mutex;
331     return std::lock_guard<std::mutex>(mutex);
332   }
gettorch::unwind::Symbolizer333   static Symbolizer& get() {
334     static Symbolizer singleton;
335     return singleton;
336   }
337 
requesttorch::unwind::Symbolizer338   void request(void* addr) {
339     if (frame_map_.count(addr)) {
340       return;
341     }
342     auto maybe_library = libraryFor(addr);
343     if (!maybe_library) {
344       frame_map_[addr] = Frame{"??", "<unwind unsupported>", 0};
345       return;
346     }
347     has_pending_results_ = true;
348     auto& entry = getOrCreate(maybe_library->first);
349     entry.queried.push_back(addr);
350     auto libaddress = maybe_library->second - 1;
351     // NOLINTNEXTLINE(performance-no-int-to-ptr)
352     entry.comm->out() << (void*)libaddress << "\n";
353     // we need to make sure we don't write more than 64k bytes to
354     // a pipe before reading the results. Otherwise the buffer may
355     // get filled and block before we read the results.
356     // Each line is < 32 characters,
357     // so this limits us to < 32k bytes before we read rules.
358     if (entry.queried.size() - entry.completed > BLOCK) {
359       entry.comm->out().flush();
360       readPendingResults(entry);
361     }
362   }
lookuptorch::unwind::Symbolizer363   const Frame& lookup(void* addr) {
364     if (has_pending_results_) {
365       for (auto& kv : entries_) {
366         kv.second.comm->out().flush();
367       }
368       for (auto& kv : entries_) {
369         readPendingResults(kv.second);
370       }
371       has_pending_results_ = false;
372     }
373     return frame_map_.at(addr);
374   }
375 
376  private:
377   static constexpr int BLOCK = 1024;
378   const char* addr2line_binary_;
379   struct Entry {
380     std::unique_ptr<Communicate> comm;
381     std::vector<void*> queried;
382     size_t completed = 0;
383   };
384   ska::flat_hash_map<std::string, Entry> entries_;
385   ska::flat_hash_map<void*, Frame> frame_map_;
386   bool has_pending_results_ = true;
387 
getOrCreatetorch::unwind::Symbolizer388   Entry& getOrCreate(const std::string& name) {
389     auto it = entries_.find(name);
390     if (it == entries_.end()) {
391       // NOLINTNEXTLINE(*-c-arrays*)
392       const char* args[] = {
393           addr2line_binary_, "-C", "-f", "-e", name.c_str(), nullptr};
394       it = entries_
395                .insert_or_assign(
396                    name,
397                    Entry{
398                        std::make_unique<Communicate>(addr2line_binary_, args),
399                        {}})
400                .first;
401     }
402     return it->second;
403   }
readPendingResultstorch::unwind::Symbolizer404   void readPendingResults(Entry& e) {
405     size_t N = e.queried.size();
406     for (; e.completed < N; ++e.completed) {
407       Frame frame;
408       std::getline(e.comm->in(), frame.funcname);
409       std::string filename_lineno;
410       std::getline(e.comm->in(), filename_lineno);
411       auto colon = filename_lineno.find_last_of(':');
412       frame.filename = filename_lineno.substr(0, colon);
413       std::string lineno_str = filename_lineno.substr(colon + 1);
414       frame.lineno = lineno_str == "?" ? 0 : std::stoi(lineno_str);
415       frame_map_[e.queried[e.completed]] = std::move(frame);
416     }
417   }
418 };
419 
symbolize_fast(const std::vector<void * > & frames,Mode mode)420 static std::vector<Frame> symbolize_fast(
421     const std::vector<void*>& frames,
422     Mode mode) {
423   static std::mutex cache_mutex;
424   static std::array<ska::flat_hash_map<void*, Frame>, 2> frame_maps;
425   auto& frame_map = frame_maps[mode == Mode::fast ? 0 : 1];
426 
427   std::vector<uint32_t> indices_to_lookup;
428   std::vector<Frame> results;
429   results.reserve(frames.size());
430   {
431     std::lock_guard<std::mutex> lock(cache_mutex);
432     for (auto i : c10::irange(frames.size())) {
433       void* f = frames.at(i);
434       auto it = frame_map.find(f);
435       if (it == frame_map.end()) {
436         indices_to_lookup.push_back(i);
437         results.emplace_back(Frame{"??", "??", 0});
438       } else {
439         results.emplace_back(it->second);
440       }
441     }
442   }
443   if (!indices_to_lookup.empty()) {
444     // do symbolizer work
445     FastSymbolizer symbolizer;
446     for (auto i : indices_to_lookup) {
447       void* addr = frames.at(i);
448       Frame& f = results.at(i);
449       auto library = libraryFor(frames.at(i));
450       if (library) {
451         if (mode == Mode::fast) {
452           f = symbolizer.symbolize(library->first, library->second - 1);
453         } else {
454           f = Frame{library->first, "??", library->second - 1};
455         }
456       }
457       if (f.funcname == "??") {
458         f.funcname = dladdr_lookup(addr);
459       }
460     }
461     std::lock_guard<std::mutex> lock(cache_mutex);
462     for (auto i : indices_to_lookup) {
463       frame_map.emplace(frames.at(i), results.at(i));
464     }
465   }
466   return results;
467 }
468 
symbolize_addr2line(const std::vector<void * > & frames)469 static std::vector<Frame> symbolize_addr2line(
470     const std::vector<void*>& frames) {
471   auto guard = Symbolizer::guard();
472   Symbolizer& s = Symbolizer::get();
473   for (auto f : frames) {
474     s.request(f);
475   }
476   std::vector<Frame> results;
477   results.reserve(frames.size());
478   for (auto f : frames) {
479     results.emplace_back(s.lookup(f));
480   }
481   return results;
482 }
483 
484 // fbcode will use llvm symbolize since there is an llvm dependency already
485 #ifndef FBCODE_CAFFE2
symbolize(const std::vector<void * > & frames,Mode mode)486 std::vector<Frame> symbolize(const std::vector<void*>& frames, Mode mode) {
487   if (mode == Mode::addr2line) {
488     return symbolize_addr2line(frames);
489   } else {
490     return symbolize_fast(frames, mode);
491   }
492 }
493 #endif
494 
stats()495 Stats stats() {
496   return unwind_cache.stats();
497 }
498 
499 } // namespace torch::unwind
500 
unwind_c(std::vector<void * > * result,int64_t rsp,int64_t rbp)501 extern "C" void unwind_c(std::vector<void*>* result, int64_t rsp, int64_t rbp) {
502   std::shared_lock lock(torch::unwind::cache_mutex_);
503   torch::unwind::UnwindState state{};
504   // NOLINTNEXTLINE(performance-no-int-to-ptr)
505   state.rip = *(int64_t*)(rsp);
506   // +8 because we saved rsp after the return address was already pushed
507   // to the stack
508   state.rsp = rsp + 8;
509   state.rbp = rbp;
510   torch::unwind::unwind_cache.checkRefresh(lock);
511   while (true) { // unwind for _start sets rip as being undefined
512     // NOLINTNEXTLINE(performance-no-int-to-ptr)
513     result->push_back((void*)state.rip);
514     const torch::unwind::Unwinder& uw =
515         torch::unwind::unwind_cache.unwinderFor(state.rip, lock);
516     if (uw.terminator()) {
517       if (uw.isUnknown()) {
518         result->push_back(nullptr);
519       }
520       break;
521     }
522     state = uw.run(state);
523   }
524 }
525 
// unwind_entry(std::vector<void*>* result) trampoline.
//
// The calling convention passes the first three pointer/int64_t arguments in
// rdi, rsi and rdx (all caller-saved). rdi already holds the result vector,
// so the current rsp and rbp are copied into rsi and rdx to become the second
// and third arguments, and control is transferred to unwind_c with a jump
// (a tail call).
__asm__(
    ".global unwind_entry\n"
    "unwind_entry:\n"
    "mov %rsp, %rsi;\n"
    "mov %rbp, %rdx;\n"
    "jmp unwind_c;\n");
537 
538 #endif
539