xref: /aosp_15_r20/external/tensorflow/tensorflow/core/profiler/convert/xplane_to_step_stats.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
#include "tensorflow/core/profiler/convert/xplane_to_step_stats.h"

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/gpu_event_stats.h"
#include "tensorflow/core/profiler/utils/math_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 namespace {
37 
// Host-side launch information for a GPU activity, looked up by the
// driver-assigned correlation id. Default-initialized so a partially
// filled or default-constructed instance never carries garbage.
struct CorrelationInfo {
  uint32_t thread_id = 0;        // Host thread that issued the launch call.
  uint64_t enqueue_time_ns = 0;  // Timestamp (ns) of the launch API event.
};
42 
// Kind of device activity a GPU trace event represents. Used (paired with
// the stream id) to bucket NodeExecStats into per-stream DeviceStepStats.
// NOTE: kept as a plain enum so DCHECK_NE can stream the values.
enum GpuEventType {
  kUnknown,
  kKernel,     // Device kernel execution.
  kMemcpyH2D,  // Host-to-device memory copy.
  kMemcpyD2H,  // Device-to-host memory copy.
  kMemcpyD2D,  // Device-to-device memory copy.
  kMemcpyP2P,  // Peer-to-peer memory copy between devices.
};
51 
ParseMemcpyName(absl::string_view memcpy_name)52 GpuEventType ParseMemcpyName(absl::string_view memcpy_name) {
53   if (absl::ConsumePrefix(&memcpy_name, "Memcpy")) {
54     if (memcpy_name == "H2D") return GpuEventType::kMemcpyH2D;
55     if (memcpy_name == "D2H") return GpuEventType::kMemcpyD2H;
56     if (memcpy_name == "D2D") return GpuEventType::kMemcpyD2D;
57     if (memcpy_name == "P2P") return GpuEventType::kMemcpyP2P;
58   }
59   return GpuEventType::kUnknown;
60 }
61 
SetNodeTimes(const XEventVisitor & event,NodeExecStats * ns)62 void SetNodeTimes(const XEventVisitor& event, NodeExecStats* ns) {
63   ns->set_all_start_micros(NanoToMicro(event.TimestampNs()));
64   ns->set_op_start_rel_micros(0);
65   ns->set_op_end_rel_micros(NanoToMicro(event.DurationNs()));
66   ns->set_all_end_rel_micros(NanoToMicro(event.DurationNs()));
67 }
68 
69 }  // namespace
70 
ConvertGpuXSpaceToStepStats(const XSpace & xspace,StepStats * step_stats)71 void ConvertGpuXSpaceToStepStats(const XSpace& xspace, StepStats* step_stats) {
72   std::vector<const XPlane*> device_planes =
73       FindPlanesWithPrefix(xspace, kGpuPlanePrefix);
74   if (device_planes.empty()) {
75     LOG(WARNING) << "GPU trace was not collected.";
76     return;
77   }
78   std::vector<const XPlane*> host_planes = FindPlanesWithNames(
79       xspace, {kCuptiDriverApiPlaneName, kRoctracerApiPlaneName});
80   DCHECK_LE(host_planes.size(), 1);
81 
82   absl::flat_hash_map<int64_t /*correlation_id*/, CorrelationInfo>
83       correlation_info_map;
84   for (const XPlane* host_plane : host_planes) {
85     absl::flat_hash_map<uint32_t /*device_id*/, DeviceStepStats*>
86         sync_dev_stats_map;
87     XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
88     plane.ForEachLine([&](const XLineVisitor& line) {
89       uint32_t thread_id = line.Id();
90       line.ForEachEvent([&](const XEventVisitor& event) {
91         LaunchEventStats stats(&event);
92         if (event.Name() == "cuStreamSynchronize") {
93           if (stats.device_id.has_value()) {
94             uint32_t device_ordinal = stats.device_id.value();
95             DeviceStepStats* sync_dev_stats =
96                 sync_dev_stats_map[device_ordinal];
97             if (sync_dev_stats == nullptr) {
98               sync_dev_stats = step_stats->add_dev_stats();
99               sync_dev_stats->set_device(
100                   absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
101             }
102             NodeExecStats* ns = sync_dev_stats->add_node_stats();
103             SetNodeTimes(event, ns);
104             ns->set_node_name(std::string(event.Name()));
105             ns->set_timeline_label(absl::StrCat("ThreadId ", thread_id));
106             ns->set_thread_id(thread_id);
107           }
108         } else {
109           if (stats.correlation_id.has_value()) {
110             int64_t correlation_id = stats.correlation_id.value();
111             uint64_t enqueue_time_ns = event.TimestampNs();
112             correlation_info_map[correlation_id] = {thread_id, enqueue_time_ns};
113           }
114         }
115       });
116     });
117   }
118   for (const XPlane* device_plane : device_planes) {
119     absl::flat_hash_map<std::pair<int64_t /*stream_id*/, GpuEventType>,
120                         DeviceStepStats*>
121         stream_dev_stats_map;
122     DeviceStepStats* unknown_stream_dev_stats = nullptr;
123     DeviceStepStats* all_streams_dev_stats = nullptr;
124     DeviceStepStats* memcpy_dev_stats = nullptr;
125     XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
126     uint32_t device_ordinal = plane.Id();
127     plane.ForEachLine([&](const XLineVisitor& line) {
128       uint32_t stream_id = line.Id();
129       line.ForEachEvent([&](const XEventVisitor& event) {
130         GpuEventStats stats(&event);
131 
132         auto ns = absl::make_unique<NodeExecStats>();
133         SetNodeTimes(event, ns.get());
134 
135         // Get launch information if available.
136         if (stats.correlation_id.has_value()) {
137           auto it = correlation_info_map.find(stats.correlation_id.value());
138           if (it != correlation_info_map.end()) {
139             const CorrelationInfo& correlation_info = it->second;
140             ns->set_scheduled_micros(
141                 NanoToMicro(correlation_info.enqueue_time_ns));
142             ns->set_thread_id(correlation_info.thread_id);
143           }
144         }
145 
146         absl::string_view node_name =
147             stats.IsTfOp() ? stats.tf_op_fullname : event.Name();
148         ns->set_node_name(std::string(node_name));
149 
150         if (stats.IsKernel()) {
151           absl::string_view kernel_name = event.Name();
152           ns->set_timeline_label(
153               absl::StrCat(kernel_name, " ", stats.kernel_details));
154           DeviceStepStats*& stream_dev_stats =
155               stream_dev_stats_map[{stream_id, GpuEventType::kKernel}];
156           if (stream_dev_stats == nullptr) {
157             stream_dev_stats = step_stats->add_dev_stats();
158             stream_dev_stats->set_device(absl::StrCat(
159                 "/device:GPU:", device_ordinal, "/stream:", stream_id));
160           }
161           *stream_dev_stats->add_node_stats() = *ns;
162           if (all_streams_dev_stats == nullptr) {
163             all_streams_dev_stats = step_stats->add_dev_stats();
164             all_streams_dev_stats->set_device(
165                 absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
166           }
167           all_streams_dev_stats->add_node_stats()->Swap(ns.get());
168 
169         } else if (stats.IsMemCpy()) {
170           absl::string_view memcpy_name = event.Name();
171           ns->set_timeline_label(
172               absl::StrCat(memcpy_name, " ", stats.memcpy_details));
173           GpuEventType gpu_event_type = ParseMemcpyName(memcpy_name);
174           DCHECK_NE(gpu_event_type, GpuEventType::kUnknown);
175           DeviceStepStats*& stream_dev_stats =
176               stream_dev_stats_map[{stream_id, gpu_event_type}];
177           if (stream_dev_stats == nullptr) {
178             stream_dev_stats = step_stats->add_dev_stats();
179             stream_dev_stats->set_device(
180                 absl::StrCat("/device:GPU:", device_ordinal,
181                              "/stream:", stream_id, "<", memcpy_name, ">"));
182           }
183           *stream_dev_stats->add_node_stats() = *ns;
184           if (memcpy_dev_stats == nullptr) {
185             memcpy_dev_stats = step_stats->add_dev_stats();
186             memcpy_dev_stats->set_device(
187                 absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
188           }
189           memcpy_dev_stats->add_node_stats()->Swap(ns.get());
190 
191         } else {
192           ns->set_timeline_label(std::string(node_name));
193           if (unknown_stream_dev_stats == nullptr) {
194             unknown_stream_dev_stats = step_stats->add_dev_stats();
195             unknown_stream_dev_stats->set_device(
196                 absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
197           }
198           unknown_stream_dev_stats->add_node_stats()->Swap(ns.get());
199         }
200       });
201     });
202   }
203 }
204 
205 }  // namespace profiler
206 }  // namespace tensorflow
207