1 #include <c10/util/Exception.h>
2 #include <torch/csrc/profiler/unwind/unwind.h>
3 #include <torch/csrc/utils/cpp_stacktraces.h>
4 #include <unordered_map>
5
6 #if !defined(__linux__) || !defined(__x86_64__) || !defined(__has_include) || \
7 !__has_include("ext/stdio_filebuf.h")
8 namespace torch::unwind {
unwind()9 std::vector<void*> unwind() {
10 TORCH_WARN_ONCE(
11 "record_context_cpp is not support on non-linux non-x86_64 platforms");
12 return {};
13 }
14
libraryFor(void * addr)15 std::optional<std::pair<std::string, uint64_t>> libraryFor(void* addr) {
16 TORCH_WARN_ONCE(
17 "record_context_cpp is not support on non-linux non-x86_64 platforms");
18 return {};
19 }
20
21 #ifndef FBCODE_CAFFE2
symbolize(const std::vector<void * > & frames,Mode mode)22 std::vector<Frame> symbolize(const std::vector<void*>& frames, Mode mode) {
23 TORCH_WARN_ONCE(
24 "record_context_cpp is not support on non-linux non-x86_64 platforms");
25 return {};
26 }
27 #endif
28
stats()29 Stats stats() {
30 TORCH_WARN_ONCE(
31 "record_context_cpp is not support on non-linux non-x86_64 platforms");
32 return {};
33 }
34
35 } // namespace torch::unwind
36
37 #else
38
39 #include <c10/util/flat_hash_map.h>
40 #include <dlfcn.h>
41 #include <elf.h>
42 #include <link.h>
43 #include <linux/limits.h>
#include <algorithm>
#include <cctype>
#include <climits>
#include <vector>
47
48 #include <c10/util/irange.h>
49 #include <cxxabi.h>
50 #include <torch/csrc/profiler/unwind/communicate.h>
51 #include <torch/csrc/profiler/unwind/dwarf_enums.h>
52 #include <torch/csrc/profiler/unwind/eh_frame_hdr.h>
53 #include <torch/csrc/profiler/unwind/fast_symbolizer.h>
54 #include <torch/csrc/profiler/unwind/fde.h>
55 #include <torch/csrc/profiler/unwind/unwinder.h>
56 #include <shared_mutex>
57
58 extern "C" void unwind_c(std::vector<void*>* result, int64_t rsp, int64_t rbp);
59 extern "C" void unwind_entry(std::vector<void*>* result);
60
61 namespace torch::unwind {
// RAII helper that temporarily upgrades a shared (reader) lock into an
// exclusive (writer) lock on the same mutex. On construction the shared
// lock is released and the mutex re-acquired exclusively; on destruction
// the exclusive hold is dropped and the shared lock re-taken, so the
// caller's shared_lock is valid again after the helper goes out of scope.
// NOTE(review): the upgrade is not atomic — other writers may run between
// unlock() and lock(), so callers must re-validate any state read under
// the shared lock.
struct UpgradeExclusive {
  UpgradeExclusive(std::shared_lock<std::shared_timed_mutex>& rdlock)
      : rdlock_(rdlock) {
    rdlock_.unlock();
    rdlock_.mutex()->lock();
  }
  ~UpgradeExclusive() {
    rdlock_.mutex()->unlock();
    rdlock_.lock();
  }

 private:
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  std::shared_lock<std::shared_timed_mutex>& rdlock_;
};
77
78 struct LibraryInfo {
LibraryInfotorch::unwind::LibraryInfo79 LibraryInfo(
80 std::string name,
81 uint64_t load_bias,
82 uint64_t last_addr,
83 void* eh_frame_hdr_ptr_)
84 : name_(std::move(name)),
85 load_bias_(load_bias),
86 last_addr_(last_addr),
87 eh_frame_hdr_(eh_frame_hdr_ptr_) {}
88
load_biastorch::unwind::LibraryInfo89 uint64_t load_bias() const {
90 return load_bias_;
91 }
last_addrtorch::unwind::LibraryInfo92 uint64_t last_addr() const {
93 return last_addr_;
94 }
unwinderFortorch::unwind::LibraryInfo95 Unwinder unwinderFor(uint64_t addr) const {
96 void* fde_data = eh_frame_hdr_.entryForAddr(addr);
97 FDE fde(fde_data, name().c_str(), load_bias());
98 TableState state = fde.readUpTo(addr);
99 return Unwinder(state.cfa, state.registers[D_RIP], state.registers[D_RBP]);
100 }
nametorch::unwind::LibraryInfo101 const std::string& name() const {
102 return name_;
103 }
104
105 private:
106 std::string name_;
107 uint64_t load_bias_; // addr >= load_bias_
108 uint64_t last_addr_; // addr < last_addr_
109 EHFrameHdr eh_frame_hdr_;
110 };
111
process_name()112 static const char* process_name() {
113 // NOLINTNEXTLINE(*-c-arrays*)
114 static char name[PATH_MAX + 1] = "";
115 if (*name == '\0') {
116 ssize_t len = readlink("/proc/self/exe", name, PATH_MAX);
117 TORCH_INTERNAL_ASSERT(len != -1, "can't get path to exe")
118 name[len] = '\0';
119 }
120 return name;
121 }
122
// Snapshot of the dynamic linker's library load/unload counters
// (dl_phdr_info::dlpi_adds / dlpi_subs), used to detect when shared
// objects were added or removed since the last scan. Defaults are a
// sentinel meaning "never observed" so the first comparison refreshes.
struct Version {
  uint64_t adds_ = LLONG_MAX;
  uint64_t subs_ = LLONG_MAX;
};
127
// Process-wide cache of unwind metadata: a sorted list of loaded libraries
// (rebuilt whenever the dynamic linker's version counters change) and a
// map from instruction pointer to its precomputed Unwinder. Readers hold a
// shared lock on cache_mutex_ and upgrade to exclusive (UpgradeExclusive)
// only when the cache must be mutated.
struct UnwindCache {
  // Read the dynamic linker's add/remove counters without walking every
  // object: the callback returns 1 to stop after the first entry, since
  // dlpi_adds/dlpi_subs are process-global values.
  Version currentVersion() {
    Version r;
    dl_iterate_phdr(
        [](struct dl_phdr_info* info,
           size_t size [[maybe_unused]],
           void* data) {
          Version* v = (Version*)data;
          v->adds_ = info->dlpi_adds;
          v->subs_ = info->dlpi_subs;
          return 1;
        },
        &r);
    return r;
  }
  // Rebuild all_libraries_ and drop the per-IP unwinder cache by walking
  // every loaded object. Must be called with the exclusive lock held.
  void refreshLibraries() {
    ++stats_.resets;
    all_libraries_.clear();
    ip_cache_.clear();
    dl_iterate_phdr(
        [](struct dl_phdr_info* info,
           size_t size [[maybe_unused]],
           void* data) {
          auto self = (UnwindCache*)data;
          uint64_t last_addr = 0;
          auto segments = (Elf64_Phdr*)info->dlpi_phdr;
          for (auto i : c10::irange(info->dlpi_phnum)) {
            if (segments[i].p_type == PT_LOAD) {
              // track the end of the highest PT_LOAD segment seen so far;
              // it bounds the address range attributed to this object
              auto begin = ((uint64_t)info->dlpi_addr + segments[i].p_vaddr);
              auto end = (begin + segments[i].p_memsz);
              last_addr = std::max(end, last_addr);
            }
            if (segments[i].p_type == PT_GNU_EH_FRAME) {
              // the main executable reports an empty dlpi_name
              std::string library_name = info->dlpi_name;
              if (library_name.empty()) {
                library_name = process_name();
              }
              auto eh_frame_hdr =
                  // NOLINTNEXTLINE(performance-no-int-to-ptr)
                  (void*)(segments[i].p_vaddr + info->dlpi_addr);
              self->all_libraries_.emplace_back(
                  std::move(library_name),
                  info->dlpi_addr,
                  last_addr,
                  eh_frame_hdr);
              // NOTE(review): stops at the first PT_GNU_EH_FRAME segment,
              // so PT_LOAD segments after it do not extend last_addr
              return 0;
            }
          }
          // no PT_GNU_EH_FRAME segment: remember the object so we can
          // warn about it if an address later falls inside it
          self->libraries_with_no_unwind_.emplace_back(info->dlpi_name);
          return 0;
        },
        this);
    // keep sorted by load_bias so searchFor() can binary-search
    std::sort(
        all_libraries_.begin(),
        all_libraries_.end(),
        [](const LibraryInfo& lhs, const LibraryInfo& rhs) {
          return lhs.load_bias() < rhs.load_bias();
        });
  }
  // Fast-path staleness check done at the start of an unwind: only a
  // change in subs_ (library unloads) forces an immediate refresh here;
  // newly added libraries are picked up lazily in findLibraryFor().
  void checkRefresh(std::shared_lock<std::shared_timed_mutex>& rdlock) {
    Version current_version = currentVersion();
    if (current_version.subs_ != last_version_.subs_) {
      UpgradeExclusive lock(rdlock);
      refreshLibraries();
    }
  }

  // Return the cached Unwinder for `addr`, computing and caching it on a
  // miss. Takes the caller's shared lock so it can upgrade to exclusive
  // when the cache must be modified.
  const Unwinder& unwinderFor(
      uint64_t addr,
      std::shared_lock<std::shared_timed_mutex>& rdlock) {
    auto it = ip_cache_.find(addr);
    if (it != ip_cache_.end()) {
      ++stats_.hits;
      return it->second;
    }

    // we are about to modify the cache
    UpgradeExclusive lock(rdlock);
    ++stats_.misses;

    Unwinder unwinder = Unwinder::unknown();
    try {
      unwinder = libraryFor(addr).unwinderFor(addr);
    } catch (unwind::UnwindError& err) {
      // because unwinders are cached this will only print
      // once per frame that cannot be unwound.
      TORCH_WARN("Unsupported unwinding pattern: ", err.what());
    }
    auto r = ip_cache_.insert_or_assign(addr, unwinder);
    return r.first->second;
  }

  // Find the library whose mapped range contains `addr`, refreshing the
  // library list when the linker's version counters indicate unloads
  // (always) or loads (only if the first lookup misses). Returns nullptr
  // when the address belongs to no known library.
  const LibraryInfo* findLibraryFor(uint64_t addr) {
    Version current_version = currentVersion();
    if (current_version.subs_ != last_version_.subs_) {
      // libraries were unloaded: stale entries could point at unmapped
      // memory, so refresh unconditionally
      refreshLibraries();
      last_version_ = current_version;
    }
    auto* r = searchFor(addr);
    if (!r) {
      if (current_version.adds_ != last_version_.adds_) {
        // a miss might be a newly loaded library; rescan and retry
        refreshLibraries();
        last_version_ = current_version;
      }
      r = searchFor(addr);
    }
    return r;
  }

  // Like findLibraryFor() but throws UnwindError when the address is not
  // covered, after warning (once) about objects with no unwind info.
  const LibraryInfo& libraryFor(uint64_t addr) {
    auto* r = findLibraryFor(addr);
    if (!r) {
      for ([[maybe_unused]] const auto& l : libraries_with_no_unwind_) {
        TORCH_WARN("Did not find a PT_GNU_EH_FRAME segment for ", l);
      }
      libraries_with_no_unwind_.clear();
      throw UnwindError("addr not in range of known libraries");
    }
    return *r;
  }

  // Copy of the hit/miss/reset counters accumulated so far.
  torch::unwind::Stats stats() {
    return stats_;
  }

 private:
  // Binary search over all_libraries_ (sorted by load_bias) for the entry
  // whose [load_bias, last_addr) range contains `addr`; nullptr if none.
  const LibraryInfo* searchFor(uint64_t addr) {
    if (all_libraries_.empty()) {
      return nullptr;
    }
    uint64_t low = 0;
    uint64_t high = all_libraries_.size();
    while (low + 1 < high) {
      auto mid = (low + high) / 2;
      if (addr < all_libraries_.at(mid).load_bias()) {
        high = mid;
      } else {
        low = mid;
      }
    }
    LibraryInfo* r = &all_libraries_.at(low);
    if (addr < r->load_bias() || addr >= r->last_addr()) {
      return nullptr;
    }
    return r;
  }

  // sorted by load_bias
  std::vector<LibraryInfo> all_libraries_;
  // instruction pointer -> cached unwinder
  ska::flat_hash_map<uint64_t, Unwinder> ip_cache_;

  torch::unwind::Stats stats_;

  // to keep track of whether we need to refresh this info
  Version last_version_;

  // objects that had no PT_GNU_EH_FRAME segment, kept for diagnostics
  std::vector<std::string> libraries_with_no_unwind_;
};
286
// Process-wide cache instance and the reader/writer lock protecting it.
static UnwindCache unwind_cache;
static std::shared_timed_mutex cache_mutex_;
289
unwind()290 std::vector<void*> unwind() {
291 std::vector<void*> frames;
292 unwind_entry(&frames);
293 return frames;
294 }
295
libraryFor(void * addr)296 std::optional<std::pair<std::string, uint64_t>> libraryFor(void* addr) {
297 if (!addr) {
298 return std::nullopt;
299 }
300 std::shared_lock lock(cache_mutex_);
301 const LibraryInfo* library_info = unwind_cache.findLibraryFor((uint64_t)addr);
302 if (!library_info) {
303 return std::nullopt;
304 }
305 return std::make_pair(
306 library_info->name(), (uint64_t)addr - library_info->load_bias());
307 }
308
dladdr_lookup(void * addr)309 static std::string dladdr_lookup(void* addr) {
310 Dl_info dlinfo;
311 std::string funcname = "??";
312 if (dladdr(addr, &dlinfo) && dlinfo.dli_sname) {
313 funcname = demangle(dlinfo.dli_sname);
314 }
315 return funcname;
316 }
317
318 struct Symbolizer {
Symbolizertorch::unwind::Symbolizer319 Symbolizer() {
320 auto envar = std::getenv("TORCH_ADDR2LINE_BINARY");
321 if (envar != nullptr) {
322 // currently we take user's input as is without checking
323 addr2line_binary_ = envar;
324 TORCH_WARN("Use custom addr2line binary: ", addr2line_binary_);
325 } else {
326 addr2line_binary_ = "addr2line"; // default
327 }
328 }
guardtorch::unwind::Symbolizer329 static std::lock_guard<std::mutex> guard() {
330 static std::mutex mutex;
331 return std::lock_guard<std::mutex>(mutex);
332 }
gettorch::unwind::Symbolizer333 static Symbolizer& get() {
334 static Symbolizer singleton;
335 return singleton;
336 }
337
requesttorch::unwind::Symbolizer338 void request(void* addr) {
339 if (frame_map_.count(addr)) {
340 return;
341 }
342 auto maybe_library = libraryFor(addr);
343 if (!maybe_library) {
344 frame_map_[addr] = Frame{"??", "<unwind unsupported>", 0};
345 return;
346 }
347 has_pending_results_ = true;
348 auto& entry = getOrCreate(maybe_library->first);
349 entry.queried.push_back(addr);
350 auto libaddress = maybe_library->second - 1;
351 // NOLINTNEXTLINE(performance-no-int-to-ptr)
352 entry.comm->out() << (void*)libaddress << "\n";
353 // we need to make sure we don't write more than 64k bytes to
354 // a pipe before reading the results. Otherwise the buffer may
355 // get filled and block before we read the results.
356 // Each line is < 32 characters,
357 // so this limits us to < 32k bytes before we read rules.
358 if (entry.queried.size() - entry.completed > BLOCK) {
359 entry.comm->out().flush();
360 readPendingResults(entry);
361 }
362 }
lookuptorch::unwind::Symbolizer363 const Frame& lookup(void* addr) {
364 if (has_pending_results_) {
365 for (auto& kv : entries_) {
366 kv.second.comm->out().flush();
367 }
368 for (auto& kv : entries_) {
369 readPendingResults(kv.second);
370 }
371 has_pending_results_ = false;
372 }
373 return frame_map_.at(addr);
374 }
375
376 private:
377 static constexpr int BLOCK = 1024;
378 const char* addr2line_binary_;
379 struct Entry {
380 std::unique_ptr<Communicate> comm;
381 std::vector<void*> queried;
382 size_t completed = 0;
383 };
384 ska::flat_hash_map<std::string, Entry> entries_;
385 ska::flat_hash_map<void*, Frame> frame_map_;
386 bool has_pending_results_ = true;
387
getOrCreatetorch::unwind::Symbolizer388 Entry& getOrCreate(const std::string& name) {
389 auto it = entries_.find(name);
390 if (it == entries_.end()) {
391 // NOLINTNEXTLINE(*-c-arrays*)
392 const char* args[] = {
393 addr2line_binary_, "-C", "-f", "-e", name.c_str(), nullptr};
394 it = entries_
395 .insert_or_assign(
396 name,
397 Entry{
398 std::make_unique<Communicate>(addr2line_binary_, args),
399 {}})
400 .first;
401 }
402 return it->second;
403 }
readPendingResultstorch::unwind::Symbolizer404 void readPendingResults(Entry& e) {
405 size_t N = e.queried.size();
406 for (; e.completed < N; ++e.completed) {
407 Frame frame;
408 std::getline(e.comm->in(), frame.funcname);
409 std::string filename_lineno;
410 std::getline(e.comm->in(), filename_lineno);
411 auto colon = filename_lineno.find_last_of(':');
412 frame.filename = filename_lineno.substr(0, colon);
413 std::string lineno_str = filename_lineno.substr(colon + 1);
414 frame.lineno = lineno_str == "?" ? 0 : std::stoi(lineno_str);
415 frame_map_[e.queried[e.completed]] = std::move(frame);
416 }
417 }
418 };
419
symbolize_fast(const std::vector<void * > & frames,Mode mode)420 static std::vector<Frame> symbolize_fast(
421 const std::vector<void*>& frames,
422 Mode mode) {
423 static std::mutex cache_mutex;
424 static std::array<ska::flat_hash_map<void*, Frame>, 2> frame_maps;
425 auto& frame_map = frame_maps[mode == Mode::fast ? 0 : 1];
426
427 std::vector<uint32_t> indices_to_lookup;
428 std::vector<Frame> results;
429 results.reserve(frames.size());
430 {
431 std::lock_guard<std::mutex> lock(cache_mutex);
432 for (auto i : c10::irange(frames.size())) {
433 void* f = frames.at(i);
434 auto it = frame_map.find(f);
435 if (it == frame_map.end()) {
436 indices_to_lookup.push_back(i);
437 results.emplace_back(Frame{"??", "??", 0});
438 } else {
439 results.emplace_back(it->second);
440 }
441 }
442 }
443 if (!indices_to_lookup.empty()) {
444 // do symbolizer work
445 FastSymbolizer symbolizer;
446 for (auto i : indices_to_lookup) {
447 void* addr = frames.at(i);
448 Frame& f = results.at(i);
449 auto library = libraryFor(frames.at(i));
450 if (library) {
451 if (mode == Mode::fast) {
452 f = symbolizer.symbolize(library->first, library->second - 1);
453 } else {
454 f = Frame{library->first, "??", library->second - 1};
455 }
456 }
457 if (f.funcname == "??") {
458 f.funcname = dladdr_lookup(addr);
459 }
460 }
461 std::lock_guard<std::mutex> lock(cache_mutex);
462 for (auto i : indices_to_lookup) {
463 frame_map.emplace(frames.at(i), results.at(i));
464 }
465 }
466 return results;
467 }
468
symbolize_addr2line(const std::vector<void * > & frames)469 static std::vector<Frame> symbolize_addr2line(
470 const std::vector<void*>& frames) {
471 auto guard = Symbolizer::guard();
472 Symbolizer& s = Symbolizer::get();
473 for (auto f : frames) {
474 s.request(f);
475 }
476 std::vector<Frame> results;
477 results.reserve(frames.size());
478 for (auto f : frames) {
479 results.emplace_back(s.lookup(f));
480 }
481 return results;
482 }
483
484 // fbcode will use llvm symbolize since there is an llvm dependency already
485 #ifndef FBCODE_CAFFE2
symbolize(const std::vector<void * > & frames,Mode mode)486 std::vector<Frame> symbolize(const std::vector<void*>& frames, Mode mode) {
487 if (mode == Mode::addr2line) {
488 return symbolize_addr2line(frames);
489 } else {
490 return symbolize_fast(frames, mode);
491 }
492 }
493 #endif
494
// Expose the unwinder cache's hit/miss/reset counters.
Stats stats() {
  return unwind_cache.stats();
}
498
499 } // namespace torch::unwind
500
// Called from the unwind_entry assembly thunk below. `rsp` is the stack
// pointer at entry to unwind_entry (pointing at its return address) and
// `rbp` is the caller's frame pointer. From these we reconstruct the
// caller's register state and walk frames until a terminator unwinder.
extern "C" void unwind_c(std::vector<void*>* result, int64_t rsp, int64_t rbp) {
  std::shared_lock lock(torch::unwind::cache_mutex_);
  torch::unwind::UnwindState state{};
  // the return address sits at the top of the stack on function entry
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  state.rip = *(int64_t*)(rsp);
  // +8 because we saved rsp after the return address was already pushed
  // to the stack
  state.rsp = rsp + 8;
  state.rbp = rbp;
  torch::unwind::unwind_cache.checkRefresh(lock);
  while (true) { // unwind for _start sets rip as being undefined
    // NOLINTNEXTLINE(performance-no-int-to-ptr)
    result->push_back((void*)state.rip);
    const torch::unwind::Unwinder& uw =
        torch::unwind::unwind_cache.unwinderFor(state.rip, lock);
    if (uw.terminator()) {
      if (uw.isUnknown()) {
        // a trailing nullptr marks a stack we could not fully unwind
        result->push_back(nullptr);
      }
      break;
    }
    state = uw.run(state);
  }
}
525
// System V AMD64 calling convention puts the first three pointer/int64_t
// arguments in rdi rsi rdx (all caller-saved).
// rdi already holds the pointer to the result vector;
// we add arguments for the current rsp and rbp and then tail call
// into unwind_c (jmp, not call, so rsp is unchanged).
__asm__(
    ".global unwind_entry\n"
    "unwind_entry:\n"
    "mov %rsp, %rsi;\n"
    "mov %rbp, %rdx;\n"
    "jmp unwind_c;\n");
537
538 #endif
539