1 #include <torch/csrc/profiler/collection.h>
2 #include <torch/csrc/profiler/kineto_shim.h>
3
4 #ifdef USE_KINETO
5 #include <libkineto.h>
6 #endif
7
8 #include <c10/util/Exception.h>
9
10 namespace torch {
11
12 namespace profiler::impl::kineto {
13
14 // Here lies pain and `#ifdef USE_KINETO`
15
16 #ifdef USE_KINETO
17 namespace {
18 const std::set<libkineto::ActivityType> kCpuTypes{
19 libkineto::ActivityType::CPU_OP,
20 libkineto::ActivityType::CPU_INSTANT_EVENT,
21 libkineto::ActivityType::USER_ANNOTATION,
22 libkineto::ActivityType::EXTERNAL_CORRELATION,
23 libkineto::ActivityType::XPU_RUNTIME,
24 libkineto::ActivityType::CUDA_RUNTIME,
25 libkineto::ActivityType::CUDA_DRIVER,
26 libkineto::ActivityType::PYTHON_FUNCTION,
27 libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
28 libkineto::ActivityType::PRIVATEUSE1_DRIVER,
29 };
30
31 const std::set<libkineto::ActivityType> kCudaTypes = {
32 libkineto::ActivityType::GPU_MEMCPY,
33 libkineto::ActivityType::GPU_MEMSET,
34 libkineto::ActivityType::GPU_USER_ANNOTATION,
35 libkineto::ActivityType::CONCURRENT_KERNEL,
36 // CUDA_RUNTIME appears in both kCpuTypes and kCudaTypes.
37 libkineto::ActivityType::CUDA_RUNTIME,
38 libkineto::ActivityType::CUDA_DRIVER,
39 };
40 const std::set<libkineto::ActivityType> kXpuTypes = {
41 libkineto::ActivityType::GPU_MEMCPY,
42 libkineto::ActivityType::GPU_MEMSET,
43 libkineto::ActivityType::CONCURRENT_KERNEL,
44 // XPU_RUNTIME appears in both kCpuTypes and kXpuTypes.
45 libkineto::ActivityType::XPU_RUNTIME,
46 };
47 const std::set<libkineto::ActivityType> kMtiaTypes = {
48 libkineto::ActivityType::MTIA_CCP_EVENTS,
49 libkineto::ActivityType::MTIA_RUNTIME,
50 libkineto::ActivityType::MTIA_WORKLOADD,
51 };
52 const std::set<libkineto::ActivityType> kPrivateUse1Types = {
53 libkineto::ActivityType::GPU_MEMCPY,
54 libkineto::ActivityType::GPU_MEMSET,
55 libkineto::ActivityType::GPU_USER_ANNOTATION,
56 libkineto::ActivityType::CONCURRENT_KERNEL,
57 // PRIVATEUSE1_RUNTIME appears in both kCpuTypes and kPrivateUse1Types.
58 libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
59 libkineto::ActivityType::PRIVATEUSE1_DRIVER,
60 };
61 } // namespace
62 #endif // USE_KINETO
63
// Compile-time guard: `DeviceAndResource` must stay a POD type. Anything
// Kineto-specific belongs in `kineto_ids` rather than in the struct itself.
static_assert(
    c10::is_pod_v<DeviceAndResource>,
    "Kineto specific details should be in `kineto_ids`.");
67
kineto_ids()68 const DeviceAndResource kineto_ids() {
69 #ifdef USE_KINETO
70 return {
71 /*device=*/libkineto::processId(),
72 /*resource=*/libkineto::systemThreadId()};
73 #else
74 return {};
75 #endif // USE_KINETO
76 }
77
// Attaches a key/value metadata pair to `activity` by forwarding to the
// activity's own `addMetadata`. The body is empty when USE_KINETO is not
// defined.
// NOTE: `activity` is dereferenced without a null check.
void addMetadata(
    activity_t* activity,
    const std::string& key,
    const std::string& value) {
#ifdef USE_KINETO
  activity->addMetadata(key, value);
#endif // USE_KINETO
}
86
TraceWrapper(const int64_t start_time,const std::string & name)87 TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name)
88 #ifdef USE_KINETO
89 : cpu_trace_(std::make_unique<libkineto::CpuTraceBuffer>()) {
90 cpu_trace_->span.startTime = start_time;
91 cpu_trace_->gpuOpCount = -1;
92 cpu_trace_->span.name = name;
93 }
94 #else
95 {
96 }
97 #endif // USE_KINETO
98
// Defaulted here, in the implementation file, rather than in the class
// definition.
TraceWrapper::~TraceWrapper() = default;
100
addCPUActivity(const std::string & name,const libkineto::ActivityType type,const DeviceAndResource device_and_resource,const uint64_t correlation_id,const int64_t start_time,const int64_t end_time)101 activity_t* TraceWrapper::addCPUActivity(
102 const std::string& name,
103 const libkineto::ActivityType type,
104 const DeviceAndResource device_and_resource,
105 const uint64_t correlation_id,
106 const int64_t start_time,
107 const int64_t end_time) {
108 #ifdef USE_KINETO
109 TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
110 cpu_trace_->emplace_activity(cpu_trace_->span, type, name);
111 auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back());
112 act.device = device_and_resource.device;
113 act.resource = device_and_resource.resource;
114 act.id = static_cast<int32_t>(correlation_id);
115 act.startTime = start_time;
116 if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) {
117 act.endTime = end_time;
118 }
119 return cpu_trace_->activities.back().get();
120 #else
121 return nullptr;
122 #endif // USE_KINETO
123 }
124
transferCpuTrace(int64_t end_time)125 void TraceWrapper::transferCpuTrace(int64_t end_time) {
126 #ifdef USE_KINETO
127 cpu_trace_->span.endTime = end_time;
128 libkineto::api().activityProfiler().transferCpuTrace(std::move(cpu_trace_));
129 #endif // USE_KINETO
130 }
131
operator bool() const132 TraceWrapper::operator bool() const {
133 #ifdef USE_KINETO
134 return cpu_trace_ != nullptr;
135 #else
136 return false;
137 #endif // USE_KINETO
138 }
139
// Takes ownership of `trace` by moving it into the wrapper.
ActivityTraceWrapper::ActivityTraceWrapper(
    std::unique_ptr<interface_trace_t>&& trace)
    : trace_(std::move(trace)) {}
143
operator bool() const144 ActivityTraceWrapper::operator bool() const {
145 #ifdef USE_KINETO
146 return trace_ != nullptr;
147 #else
148 return false;
149 #endif // USE_KINETO
150 }
151
save(const std::string & path)152 void ActivityTraceWrapper::save(const std::string& path) {
153 #ifdef USE_KINETO
154 TORCH_CHECK(!saved_, "Trace is already saved.");
155 TORCH_CHECK(trace_ != nullptr, "Missing trace.")
156 trace_->save(path);
157 saved_ = true;
158 #else
159 TORCH_CHECK(
160 false,
161 "Saving a trace requires using torch.profiler with Kineto support (USE_KINETO=1)");
162 #endif // USE_KINETO
163 }
164
165 namespace {
166 // Handles processing of Experimental Config options for Kineto
167 class ExperimentalConfigWrapper {
168 public:
ExperimentalConfigWrapper(const torch::profiler::impl::ExperimentalConfig & config)169 explicit ExperimentalConfigWrapper(
170 const torch::profiler::impl::ExperimentalConfig& config)
171 : config_(config) {}
172
assertValid()173 bool assertValid() {
174 return !config_.profiler_metrics.empty();
175 }
176
prepareTraceWithExperimentalOptions(bool add_cpu_activity)177 void prepareTraceWithExperimentalOptions(bool add_cpu_activity) {
178 #ifdef USE_KINETO
179 std::set<libkineto::ActivityType> k_activities{
180 libkineto::ActivityType::CUDA_PROFILER_RANGE};
181
182 // Only add CPU activities if we are measuring per kernel ranges
183 if (add_cpu_activity && config_.profiler_measure_per_kernel) {
184 k_activities.insert(kCpuTypes.begin(), kCpuTypes.end());
185 }
186
187 const size_t num_metrics = config_.profiler_metrics.size();
188 std::stringstream configss;
189
190 LOG(INFO) << "CUPTI profiler metrics size = " << num_metrics;
191
192 configss << "ACTIVITIES_WARMUP_PERIOD_SECS=0\n"
193 << "CUPTI_PROFILER_METRICS=";
194
195 for (size_t i = 0; i < num_metrics; i++) {
196 configss << config_.profiler_metrics[i];
197 if (num_metrics > 1 && i < (num_metrics - 1)) {
198 configss << ",";
199 }
200 }
201 configss << "\nCUPTI_PROFILER_ENABLE_PER_KERNEL="
202 << (config_.profiler_measure_per_kernel ? "true" : "false")
203 << "\n";
204 LOG(INFO) << "Generated config = " << configss.str();
205
206 libkineto::api().activityProfiler().prepareTrace(
207 k_activities, configss.str());
208 #endif // USE_KINETO
209 }
210
211 private:
212 // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
213 const torch::profiler::impl::ExperimentalConfig& config_;
214 };
215 } // namespace
216
// Reports whether the build defines KINETO_HAS_NCCL_PROFILER, i.e. whether
// Kineto's collective-communication profiler is compiled in.
bool collectivesProfilerExists() {
#ifdef KINETO_HAS_NCCL_PROFILER
  constexpr bool kHasNcclProfiler = true;
#else
  constexpr bool kHasNcclProfiler = false;
#endif
  return kHasNcclProfiler;
}
224
prepareTrace(const bool cpuOnly,const ActivitySet & activities,const torch::profiler::impl::ExperimentalConfig & config)225 void prepareTrace(
226 const bool cpuOnly,
227 const ActivitySet& activities,
228 const torch::profiler::impl::ExperimentalConfig& config) {
229 #ifdef USE_KINETO
230 if (!libkineto::api().isProfilerRegistered()) {
231 libkineto_init(/*cpuOnly=*/cpuOnly, /*logOnError=*/true);
232 libkineto::api().suppressLogMessages();
233 }
234
235 if (!libkineto::api().isProfilerInitialized()) {
236 libkineto::api().initProfilerIfRegistered();
237 }
238
239 std::set<libkineto::ActivityType> k_activities;
240 bool has_cpu_activity =
241 activities.count(torch::autograd::profiler::ActivityType::CPU);
242
243 if (has_cpu_activity) {
244 k_activities.insert(kCpuTypes.begin(), kCpuTypes.end());
245 }
246 if (activities.count(torch::autograd::profiler::ActivityType::XPU)) {
247 k_activities.insert(kXpuTypes.begin(), kXpuTypes.end());
248 }
249 if (activities.count(torch::autograd::profiler::ActivityType::MTIA)) {
250 k_activities.insert(kMtiaTypes.begin(), kMtiaTypes.end());
251 }
252 if (activities.count(torch::autograd::profiler::ActivityType::CUDA)) {
253 k_activities.insert(kCudaTypes.begin(), kCudaTypes.end());
254 if (config.enable_cuda_sync_events || get_cuda_sync_enabled()) {
255 LOG(INFO) << "Enabling CUDA Sync Events";
256 k_activities.insert(libkineto::ActivityType::CUDA_SYNC);
257 }
258 }
259 if (collectivesProfilerExists()) {
260 k_activities.insert(libkineto::ActivityType::COLLECTIVE_COMM);
261 }
262 if (activities.count(torch::autograd::profiler::ActivityType::PrivateUse1)) {
263 k_activities.insert(kPrivateUse1Types.begin(), kPrivateUse1Types.end());
264 }
265
266 ExperimentalConfigWrapper configWrap(config);
267
268 // Experimental Configuration options are present
269 if (config && configWrap.assertValid()) {
270 configWrap.prepareTraceWithExperimentalOptions(has_cpu_activity);
271 return;
272 }
273
274 libkineto::api().activityProfiler().prepareTrace(k_activities);
275 #endif // USE_KINETO
276 }
277
// Forwards `enable` to the Kineto activity profiler's
// `toggleCollectionDynamic`. The body is empty without Kineto support.
void toggleCollectionDynamic(const bool enable) {
#ifdef USE_KINETO
  // TODO: We may want to consider adding another input arg for this function
  // if we want to support turning off certain devices and keeping others on.
  // For now, we can keep it simple and have it turn off all tracing of "CUDA"
  // devices
  libkineto::api().activityProfiler().toggleCollectionDynamic(enable);
#endif // USE_KINETO
}
287
// Calls `startTrace` on the Kineto activity profiler. The body is empty
// without Kineto support.
void startTrace() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().startTrace();
#endif // USE_KINETO
}
293
stopTrace()294 ActivityTraceWrapper stopTrace() {
295 return ActivityTraceWrapper{
296 #ifdef USE_KINETO
297 libkineto::api().activityProfiler().stopTrace()
298 #else
299 std::make_unique<interface_trace_t>()
300 #endif // USE_KINETO
301 };
302 }
303
// Forwards `correlation_id` to the Kineto activity profiler's
// `pushCorrelationId`. The body is empty without Kineto support.
void pushCorrelationId(uint64_t correlation_id) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().pushCorrelationId(correlation_id);
#endif // USE_KINETO
}
309
// Forwards `correlation_id` to the Kineto activity profiler's
// `pushUserCorrelationId`. The body is empty without Kineto support.
void pushUserCorrelationId(uint64_t correlation_id) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().pushUserCorrelationId(correlation_id);
#endif // USE_KINETO
}
315
// Calls `popCorrelationId` on the Kineto activity profiler; the counterpart
// of `pushCorrelationId`. The body is empty without Kineto support.
void popCorrelationId() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().popCorrelationId();
#endif // USE_KINETO
}
321
// Calls `popUserCorrelationId` on the Kineto activity profiler; the
// counterpart of `pushUserCorrelationId`. The body is empty without Kineto
// support.
void popUserCorrelationId() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().popUserCorrelationId();
#endif // USE_KINETO
}
327
// Calls `recordThreadInfo` on the Kineto activity profiler. The body is empty
// without Kineto support.
void recordThreadInfo() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().recordThreadInfo();
#endif // USE_KINETO
}
333
// Reports an invariant violation through the Kineto activity profiler.
// Nothing happens if the profiler is not initialized or if PyTorch was built
// without Kineto support.
//
// Note the argument order: Kineto is called with `profile_id` first, followed
// by `assertion`, `error` and `group_profile_id`.
void logInvariantViolation(
    const std::string& assertion,
    const std::string& error,
    const std::string& profile_id,
    const std::string& group_profile_id) {
#ifdef USE_KINETO
  if (!libkineto::api().isProfilerInitialized()) {
    return;
  }
  libkineto::api().activityProfiler().logInvariantViolation(
      profile_id, assertion, error, group_profile_id);
#endif // USE_KINETO
}
346
347 } // namespace profiler::impl::kineto
348
349 namespace autograd::profiler {
deviceTypeFromActivity(libkineto::ActivityType activity_type)350 c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
351 // fallthrough
352 switch (activity_type) {
353 case libkineto::ActivityType::GPU_MEMCPY:
354 case libkineto::ActivityType::GPU_MEMSET:
355 case libkineto::ActivityType::CONCURRENT_KERNEL:
356 case libkineto::ActivityType::CUDA_SYNC:
357 case libkineto::ActivityType::GPU_USER_ANNOTATION:
358 case libkineto::ActivityType::CUDA_PROFILER_RANGE: {
359 // PrivateUse1 kineto backend reuse above ActivityTypes,
360 // If PrivateUse1 backend enabled, this should return
361 // c10::DeviceType::PrivateUse1.
362 c10::DeviceType device_type = []() {
363 if (c10::get_privateuse1_backend() != "privateuseone") {
364 return c10::DeviceType::PrivateUse1;
365 }
366 return c10::DeviceType::CUDA;
367 }();
368 return device_type;
369 }
370 // TODO: T151322015
371 case libkineto::ActivityType::MTIA_CCP_EVENTS:
372 case libkineto::ActivityType::MTIA_WORKLOADD: {
373 // PrivateUse1 kineto backend reuse above ActivityTypes,
374 // If PrivateUse1 backend enabled, this should return
375 // c10::DeviceType::PrivateUse1.
376 c10::DeviceType device_type = []() {
377 if (c10::get_privateuse1_backend() != "privateuseone") {
378 return c10::DeviceType::PrivateUse1;
379 }
380 return c10::DeviceType::MTIA;
381 }();
382 return device_type;
383 }
384 case libkineto::ActivityType::CPU_OP:
385 case libkineto::ActivityType::USER_ANNOTATION:
386 case libkineto::ActivityType::EXTERNAL_CORRELATION:
387 case libkineto::ActivityType::CUDA_RUNTIME:
388 case libkineto::ActivityType::XPU_RUNTIME:
389 case libkineto::ActivityType::CPU_INSTANT_EVENT:
390 case libkineto::ActivityType::GLOW_RUNTIME:
391 case libkineto::ActivityType::MTIA_RUNTIME:
392 case libkineto::ActivityType::PYTHON_FUNCTION:
393 case libkineto::ActivityType::CUDA_DRIVER:
394 case libkineto::ActivityType::PRIVATEUSE1_RUNTIME:
395 case libkineto::ActivityType::PRIVATEUSE1_DRIVER:
396 return c10::DeviceType::CPU;
397 default: {
398 TORCH_WARN(
399 "Unknown activity type (",
400 (uint8_t)activity_type,
401 "), assuming CPU device");
402 return c10::DeviceType::CPU;
403 }
404 }
405 }
406
addMetadataJson(const std::string & key,const std::string & value)407 void addMetadataJson(const std::string& key, const std::string& value) {
408 #ifdef USE_KINETO
409 if (libkineto::api().isProfilerInitialized()) {
410 libkineto::api().activityProfiler().addMetadata(key, value);
411 } else {
412 LOG(WARNING) << "Profiler is not initialized: skipping profiling metadata";
413 }
414 #else
415 LOG(WARNING) << "Adding profiling metadata requires using "
416 << "torch.profiler with Kineto support (USE_KINETO=1)";
417 #endif // USE_KINETO
418 }
419
// Signals a profiler step to Kineto. The profiler is first initialized if it
// has been registered; if it is still not initialized the step is skipped
// with a verbose log message. The body is empty without Kineto support.
void profilerStep() {
#ifdef USE_KINETO
  libkineto::api().initProfilerIfRegistered();

  if (!libkineto::api().isProfilerInitialized()) {
    VLOG(1) << "Profiler is not initialized: skipping step() invocation";
    return;
  }
  libkineto::api().activityProfiler().step();
#endif // USE_KINETO
}
431
432 } // namespace autograd::profiler
433
434 } // namespace torch
435