// xref: /aosp_15_r20/external/pytorch/torch/csrc/profiler/kineto_shim.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #include <torch/csrc/profiler/collection.h>
2 #include <torch/csrc/profiler/kineto_shim.h>
3 
4 #ifdef USE_KINETO
5 #include <libkineto.h>
6 #endif
7 
8 #include <c10/util/Exception.h>
9 
10 namespace torch {
11 
12 namespace profiler::impl::kineto {
13 
14 // Here lies pain and `#ifdef USE_KINETO`
15 
#ifdef USE_KINETO
namespace {
// Activity types recorded on the host (CPU) side of a profile. Note that the
// runtime/driver types for each accelerator backend also appear here, since
// runtime API calls execute on the CPU.
const std::set<libkineto::ActivityType> kCpuTypes{
    libkineto::ActivityType::CPU_OP,
    libkineto::ActivityType::CPU_INSTANT_EVENT,
    libkineto::ActivityType::USER_ANNOTATION,
    libkineto::ActivityType::EXTERNAL_CORRELATION,
    libkineto::ActivityType::XPU_RUNTIME,
    libkineto::ActivityType::CUDA_RUNTIME,
    libkineto::ActivityType::CUDA_DRIVER,
    libkineto::ActivityType::PYTHON_FUNCTION,
    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
};

// Activity types recorded when profiling CUDA devices.
const std::set<libkineto::ActivityType> kCudaTypes = {
    libkineto::ActivityType::GPU_MEMCPY,
    libkineto::ActivityType::GPU_MEMSET,
    libkineto::ActivityType::GPU_USER_ANNOTATION,
    libkineto::ActivityType::CONCURRENT_KERNEL,
    // CUDA_RUNTIME appears in both kCpuTypes and kCudaTypes.
    libkineto::ActivityType::CUDA_RUNTIME,
    libkineto::ActivityType::CUDA_DRIVER,
};

// Activity types recorded when profiling XPU devices.
const std::set<libkineto::ActivityType> kXpuTypes = {
    libkineto::ActivityType::GPU_MEMCPY,
    libkineto::ActivityType::GPU_MEMSET,
    libkineto::ActivityType::CONCURRENT_KERNEL,
    // XPU_RUNTIME appears in both kCpuTypes and kXpuTypes.
    libkineto::ActivityType::XPU_RUNTIME,
};

// Activity types recorded when profiling MTIA devices.
// (MTIA_WORKLOADD is the spelling used by the libkineto enum.)
const std::set<libkineto::ActivityType> kMtiaTypes = {
    libkineto::ActivityType::MTIA_CCP_EVENTS,
    libkineto::ActivityType::MTIA_RUNTIME,
    libkineto::ActivityType::MTIA_WORKLOADD,
};

// Activity types recorded when profiling a PrivateUse1 (out-of-tree) backend.
const std::set<libkineto::ActivityType> kPrivateUse1Types = {
    libkineto::ActivityType::GPU_MEMCPY,
    libkineto::ActivityType::GPU_MEMSET,
    libkineto::ActivityType::GPU_USER_ANNOTATION,
    libkineto::ActivityType::CONCURRENT_KERNEL,
    // PRIVATEUSE1_RUNTIME appears in both kCpuTypes and kPrivateUse1Types.
    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
};
} // namespace
#endif // USE_KINETO
63 
64 static_assert(
65     c10::is_pod_v<DeviceAndResource>,
66     "Kineto specific details should be in `kineto_ids`.");
67 
kineto_ids()68 const DeviceAndResource kineto_ids() {
69 #ifdef USE_KINETO
70   return {
71       /*device=*/libkineto::processId(),
72       /*resource=*/libkineto::systemThreadId()};
73 #else
74   return {};
75 #endif // USE_KINETO
76 }
77 
addMetadata(activity_t * activity,const std::string & key,const std::string & value)78 void addMetadata(
79     activity_t* activity,
80     const std::string& key,
81     const std::string& value) {
82 #ifdef USE_KINETO
83   activity->addMetadata(key, value);
84 #endif // USE_KINETO
85 }
86 
TraceWrapper(const int64_t start_time,const std::string & name)87 TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name)
88 #ifdef USE_KINETO
89     : cpu_trace_(std::make_unique<libkineto::CpuTraceBuffer>()) {
90   cpu_trace_->span.startTime = start_time;
91   cpu_trace_->gpuOpCount = -1;
92   cpu_trace_->span.name = name;
93 }
94 #else
95 {
96 }
97 #endif // USE_KINETO
98 
99 TraceWrapper::~TraceWrapper() = default;
100 
addCPUActivity(const std::string & name,const libkineto::ActivityType type,const DeviceAndResource device_and_resource,const uint64_t correlation_id,const int64_t start_time,const int64_t end_time)101 activity_t* TraceWrapper::addCPUActivity(
102     const std::string& name,
103     const libkineto::ActivityType type,
104     const DeviceAndResource device_and_resource,
105     const uint64_t correlation_id,
106     const int64_t start_time,
107     const int64_t end_time) {
108 #ifdef USE_KINETO
109   TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
110   cpu_trace_->emplace_activity(cpu_trace_->span, type, name);
111   auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back());
112   act.device = device_and_resource.device;
113   act.resource = device_and_resource.resource;
114   act.id = static_cast<int32_t>(correlation_id);
115   act.startTime = start_time;
116   if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) {
117     act.endTime = end_time;
118   }
119   return cpu_trace_->activities.back().get();
120 #else
121   return nullptr;
122 #endif // USE_KINETO
123 }
124 
transferCpuTrace(int64_t end_time)125 void TraceWrapper::transferCpuTrace(int64_t end_time) {
126 #ifdef USE_KINETO
127   cpu_trace_->span.endTime = end_time;
128   libkineto::api().activityProfiler().transferCpuTrace(std::move(cpu_trace_));
129 #endif // USE_KINETO
130 }
131 
operator bool() const132 TraceWrapper::operator bool() const {
133 #ifdef USE_KINETO
134   return cpu_trace_ != nullptr;
135 #else
136   return false;
137 #endif // USE_KINETO
138 }
139 
ActivityTraceWrapper(std::unique_ptr<interface_trace_t> && trace)140 ActivityTraceWrapper::ActivityTraceWrapper(
141     std::unique_ptr<interface_trace_t>&& trace)
142     : trace_(std::move(trace)) {}
143 
operator bool() const144 ActivityTraceWrapper::operator bool() const {
145 #ifdef USE_KINETO
146   return trace_ != nullptr;
147 #else
148   return false;
149 #endif // USE_KINETO
150 }
151 
save(const std::string & path)152 void ActivityTraceWrapper::save(const std::string& path) {
153 #ifdef USE_KINETO
154   TORCH_CHECK(!saved_, "Trace is already saved.");
155   TORCH_CHECK(trace_ != nullptr, "Missing trace.")
156   trace_->save(path);
157   saved_ = true;
158 #else
159   TORCH_CHECK(
160       false,
161       "Saving a trace requires using torch.profiler with Kineto support (USE_KINETO=1)");
162 #endif // USE_KINETO
163 }
164 
165 namespace {
166 // Handles processing of Experimental Config options for Kineto
167 class ExperimentalConfigWrapper {
168  public:
ExperimentalConfigWrapper(const torch::profiler::impl::ExperimentalConfig & config)169   explicit ExperimentalConfigWrapper(
170       const torch::profiler::impl::ExperimentalConfig& config)
171       : config_(config) {}
172 
assertValid()173   bool assertValid() {
174     return !config_.profiler_metrics.empty();
175   }
176 
prepareTraceWithExperimentalOptions(bool add_cpu_activity)177   void prepareTraceWithExperimentalOptions(bool add_cpu_activity) {
178 #ifdef USE_KINETO
179     std::set<libkineto::ActivityType> k_activities{
180         libkineto::ActivityType::CUDA_PROFILER_RANGE};
181 
182     // Only add CPU activities if we are measuring per kernel ranges
183     if (add_cpu_activity && config_.profiler_measure_per_kernel) {
184       k_activities.insert(kCpuTypes.begin(), kCpuTypes.end());
185     }
186 
187     const size_t num_metrics = config_.profiler_metrics.size();
188     std::stringstream configss;
189 
190     LOG(INFO) << "CUPTI profiler metrics size = " << num_metrics;
191 
192     configss << "ACTIVITIES_WARMUP_PERIOD_SECS=0\n"
193              << "CUPTI_PROFILER_METRICS=";
194 
195     for (size_t i = 0; i < num_metrics; i++) {
196       configss << config_.profiler_metrics[i];
197       if (num_metrics > 1 && i < (num_metrics - 1)) {
198         configss << ",";
199       }
200     }
201     configss << "\nCUPTI_PROFILER_ENABLE_PER_KERNEL="
202              << (config_.profiler_measure_per_kernel ? "true" : "false")
203              << "\n";
204     LOG(INFO) << "Generated config = " << configss.str();
205 
206     libkineto::api().activityProfiler().prepareTrace(
207         k_activities, configss.str());
208 #endif // USE_KINETO
209   }
210 
211  private:
212   // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
213   const torch::profiler::impl::ExperimentalConfig& config_;
214 };
215 } // namespace
216 
// Reports whether this build of Kineto ships the NCCL collectives profiler
// (compile-time decision via KINETO_HAS_NCCL_PROFILER).
bool collectivesProfilerExists() {
#ifdef KINETO_HAS_NCCL_PROFILER
  return true;
#else
  return false;
#endif
}
224 
prepareTrace(const bool cpuOnly,const ActivitySet & activities,const torch::profiler::impl::ExperimentalConfig & config)225 void prepareTrace(
226     const bool cpuOnly,
227     const ActivitySet& activities,
228     const torch::profiler::impl::ExperimentalConfig& config) {
229 #ifdef USE_KINETO
230   if (!libkineto::api().isProfilerRegistered()) {
231     libkineto_init(/*cpuOnly=*/cpuOnly, /*logOnError=*/true);
232     libkineto::api().suppressLogMessages();
233   }
234 
235   if (!libkineto::api().isProfilerInitialized()) {
236     libkineto::api().initProfilerIfRegistered();
237   }
238 
239   std::set<libkineto::ActivityType> k_activities;
240   bool has_cpu_activity =
241       activities.count(torch::autograd::profiler::ActivityType::CPU);
242 
243   if (has_cpu_activity) {
244     k_activities.insert(kCpuTypes.begin(), kCpuTypes.end());
245   }
246   if (activities.count(torch::autograd::profiler::ActivityType::XPU)) {
247     k_activities.insert(kXpuTypes.begin(), kXpuTypes.end());
248   }
249   if (activities.count(torch::autograd::profiler::ActivityType::MTIA)) {
250     k_activities.insert(kMtiaTypes.begin(), kMtiaTypes.end());
251   }
252   if (activities.count(torch::autograd::profiler::ActivityType::CUDA)) {
253     k_activities.insert(kCudaTypes.begin(), kCudaTypes.end());
254     if (config.enable_cuda_sync_events || get_cuda_sync_enabled()) {
255       LOG(INFO) << "Enabling CUDA Sync Events";
256       k_activities.insert(libkineto::ActivityType::CUDA_SYNC);
257     }
258   }
259   if (collectivesProfilerExists()) {
260     k_activities.insert(libkineto::ActivityType::COLLECTIVE_COMM);
261   }
262   if (activities.count(torch::autograd::profiler::ActivityType::PrivateUse1)) {
263     k_activities.insert(kPrivateUse1Types.begin(), kPrivateUse1Types.end());
264   }
265 
266   ExperimentalConfigWrapper configWrap(config);
267 
268   // Experimental Configuration options are present
269   if (config && configWrap.assertValid()) {
270     configWrap.prepareTraceWithExperimentalOptions(has_cpu_activity);
271     return;
272   }
273 
274   libkineto::api().activityProfiler().prepareTrace(k_activities);
275 #endif // USE_KINETO
276 }
277 
// Toggles Kineto's dynamic collection on or off. No-op without Kineto.
// TODO: We may want to consider adding another input arg for this function
// if we want to support turning off certain devices and keeping others on.
// For now, we keep it simple and have it turn off all tracing of "CUDA"
// devices.
void toggleCollectionDynamic(const bool enable) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().toggleCollectionDynamic(enable);
#endif // USE_KINETO
}
287 
// Begins activity collection on the Kineto profiler. No-op without Kineto.
void startTrace() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().startTrace();
#endif // USE_KINETO
}
293 
stopTrace()294 ActivityTraceWrapper stopTrace() {
295   return ActivityTraceWrapper{
296 #ifdef USE_KINETO
297       libkineto::api().activityProfiler().stopTrace()
298 #else
299       std::make_unique<interface_trace_t>()
300 #endif // USE_KINETO
301   };
302 }
303 
// Pushes a correlation id onto Kineto's per-thread correlation stack.
// No-op without Kineto.
void pushCorrelationId(uint64_t correlation_id) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().pushCorrelationId(correlation_id);
#endif // USE_KINETO
}
309 
// Pushes a user-scope correlation id (separate stack from pushCorrelationId).
// No-op without Kineto.
void pushUserCorrelationId(uint64_t correlation_id) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().pushUserCorrelationId(correlation_id);
#endif // USE_KINETO
}
315 
// Pops the top correlation id pushed by pushCorrelationId.
// No-op without Kineto.
void popCorrelationId() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().popCorrelationId();
#endif // USE_KINETO
}
321 
// Pops the top user-scope correlation id pushed by pushUserCorrelationId.
// No-op without Kineto.
void popUserCorrelationId() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().popUserCorrelationId();
#endif // USE_KINETO
}
327 
// Records the calling thread's identity with Kineto so its events can be
// attributed correctly. No-op without Kineto.
void recordThreadInfo() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().recordThreadInfo();
#endif // USE_KINETO
}
333 
// Forwards an invariant-violation report to Kineto's logger, but only when
// the profiler has been initialized. No-op without Kineto.
// Note: Kineto's API takes (profile_id, assertion, error, group_profile_id),
// a different order than this shim's parameter list.
void logInvariantViolation(
    const std::string& assertion,
    const std::string& error,
    const std::string& profile_id,
    const std::string& group_profile_id) {
#ifdef USE_KINETO
  if (libkineto::api().isProfilerInitialized()) {
    libkineto::api().activityProfiler().logInvariantViolation(
        profile_id, assertion, error, group_profile_id);
  }
#endif // USE_KINETO
}
346 
347 } // namespace profiler::impl::kineto
348 
349 namespace autograd::profiler {
deviceTypeFromActivity(libkineto::ActivityType activity_type)350 c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
351   // fallthrough
352   switch (activity_type) {
353     case libkineto::ActivityType::GPU_MEMCPY:
354     case libkineto::ActivityType::GPU_MEMSET:
355     case libkineto::ActivityType::CONCURRENT_KERNEL:
356     case libkineto::ActivityType::CUDA_SYNC:
357     case libkineto::ActivityType::GPU_USER_ANNOTATION:
358     case libkineto::ActivityType::CUDA_PROFILER_RANGE: {
359       // PrivateUse1 kineto backend reuse above ActivityTypes,
360       // If PrivateUse1 backend enabled, this should return
361       // c10::DeviceType::PrivateUse1.
362       c10::DeviceType device_type = []() {
363         if (c10::get_privateuse1_backend() != "privateuseone") {
364           return c10::DeviceType::PrivateUse1;
365         }
366         return c10::DeviceType::CUDA;
367       }();
368       return device_type;
369     }
370     // TODO: T151322015
371     case libkineto::ActivityType::MTIA_CCP_EVENTS:
372     case libkineto::ActivityType::MTIA_WORKLOADD: {
373       // PrivateUse1 kineto backend reuse above ActivityTypes,
374       // If PrivateUse1 backend enabled, this should return
375       // c10::DeviceType::PrivateUse1.
376       c10::DeviceType device_type = []() {
377         if (c10::get_privateuse1_backend() != "privateuseone") {
378           return c10::DeviceType::PrivateUse1;
379         }
380         return c10::DeviceType::MTIA;
381       }();
382       return device_type;
383     }
384     case libkineto::ActivityType::CPU_OP:
385     case libkineto::ActivityType::USER_ANNOTATION:
386     case libkineto::ActivityType::EXTERNAL_CORRELATION:
387     case libkineto::ActivityType::CUDA_RUNTIME:
388     case libkineto::ActivityType::XPU_RUNTIME:
389     case libkineto::ActivityType::CPU_INSTANT_EVENT:
390     case libkineto::ActivityType::GLOW_RUNTIME:
391     case libkineto::ActivityType::MTIA_RUNTIME:
392     case libkineto::ActivityType::PYTHON_FUNCTION:
393     case libkineto::ActivityType::CUDA_DRIVER:
394     case libkineto::ActivityType::PRIVATEUSE1_RUNTIME:
395     case libkineto::ActivityType::PRIVATEUSE1_DRIVER:
396       return c10::DeviceType::CPU;
397     default: {
398       TORCH_WARN(
399           "Unknown activity type (",
400           (uint8_t)activity_type,
401           "), assuming CPU device");
402       return c10::DeviceType::CPU;
403     }
404   }
405 }
406 
addMetadataJson(const std::string & key,const std::string & value)407 void addMetadataJson(const std::string& key, const std::string& value) {
408 #ifdef USE_KINETO
409   if (libkineto::api().isProfilerInitialized()) {
410     libkineto::api().activityProfiler().addMetadata(key, value);
411   } else {
412     LOG(WARNING) << "Profiler is not initialized: skipping profiling metadata";
413   }
414 #else
415   LOG(WARNING) << "Adding profiling metadata requires using "
416                << "torch.profiler with Kineto support (USE_KINETO=1)";
417 #endif // USE_KINETO
418 }
419 
// Advances Kineto's step counter (used for step-based on-demand tracing),
// initializing the profiler first if it is registered but not yet set up.
// No-op without Kineto.
void profilerStep() {
#ifdef USE_KINETO
  libkineto::api().initProfilerIfRegistered();

  if (!libkineto::api().isProfilerInitialized()) {
    VLOG(1) << "Profiler is not initialized: skipping step() invocation";
    return;
  }
  libkineto::api().activityProfiler().step();
#endif // USE_KINETO
}
431 
432 } // namespace autograd::profiler
433 
434 } // namespace torch
435