/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/xplane_to_step_stats.h"

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/gpu_event_stats.h"
#include "tensorflow/core/profiler/utils/math_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"

namespace tensorflow {
namespace profiler {
namespace {

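// Host-side launch information for a GPU activity, keyed by correlation id:
// the thread that issued the launch via the traced driver API call, and the
// time the work was enqueued.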
struct CorrelationInfo {
  uint32_t thread_id;
  uint64_t enqueue_time_ns;
};

enum GpuEventType {
  kUnknown,
  kKernel,
  kMemcpyH2D,
  kMemcpyD2H,
  kMemcpyD2D,
  kMemcpyP2P,
};

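// Maps a memcpy event name such as "MemcpyH2D" to the corresponding
// GpuEventType; returns kUnknown for names that do not follow this pattern.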
GpuEventType ParseMemcpyName(absl::string_view memcpy_name) {
  if (absl::ConsumePrefix(&memcpy_name, "Memcpy")) {
    if (memcpy_name == "H2D") return GpuEventType::kMemcpyH2D;
    if (memcpy_name == "D2H") return GpuEventType::kMemcpyD2H;
    if (memcpy_name == "D2D") return GpuEventType::kMemcpyD2D;
    if (memcpy_name == "P2P") return GpuEventType::kMemcpyP2P;
  }
  return GpuEventType::kUnknown;
}

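// Fills in the NodeExecStats time fields from an XEvent, converting the
// event's nanosecond timestamps to the microsecond granularity of StepStats.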
void SetNodeTimes(const XEventVisitor& event, NodeExecStats* ns) {
  ns->set_all_start_micros(NanoToMicro(event.TimestampNs()));
  ns->set_op_start_rel_micros(0);
  ns->set_op_end_rel_micros(NanoToMicro(event.DurationNs()));
  ns->set_all_end_rel_micros(NanoToMicro(event.DurationNs()));
}

}  // namespace

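// Converts the GPU device planes of an XSpace (plus the CUPTI/roctracer driver
// API plane, when present) into StepStats: one DeviceStepStats per GPU stream
// and event type, with aggregate "/stream:all", "/memcpy", and "/sync"
// pseudo-devices.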
void ConvertGpuXSpaceToStepStats(const XSpace& xspace, StepStats* step_stats) {
  std::vector<const XPlane*> device_planes =
      FindPlanesWithPrefix(xspace, kGpuPlanePrefix);
  if (device_planes.empty()) {
    LOG(WARNING) << "GPU trace was not collected.";
    return;
  }
  std::vector<const XPlane*> host_planes = FindPlanesWithNames(
      xspace, {kCuptiDriverApiPlaneName, kRoctracerApiPlaneName});
  DCHECK_LE(host_planes.size(), 1);

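  // First pass: walk the host (driver API) plane to record, per correlation
  // id, the thread that launched each GPU activity and the enqueue time.
  // Explicit cuStreamSynchronize calls are emitted directly under a
  // per-device "/sync" pseudo-device.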
  absl::flat_hash_map<int64_t /*correlation_id*/, CorrelationInfo>
      correlation_info_map;
  for (const XPlane* host_plane : host_planes) {
    absl::flat_hash_map<uint32_t /*device_id*/, DeviceStepStats*>
        sync_dev_stats_map;
    XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
    plane.ForEachLine([&](const XLineVisitor& line) {
      uint32_t thread_id = line.Id();
      line.ForEachEvent([&](const XEventVisitor& event) {
        LaunchEventStats stats(&event);
        if (event.Name() == "cuStreamSynchronize") {
          if (stats.device_id.has_value()) {
            uint32_t device_ordinal = stats.device_id.value();
            // Take a reference so that a newly created DeviceStepStats is
            // stored back into the map.
            DeviceStepStats*& sync_dev_stats =
                sync_dev_stats_map[device_ordinal];
            if (sync_dev_stats == nullptr) {
              sync_dev_stats = step_stats->add_dev_stats();
              sync_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
            }
            NodeExecStats* ns = sync_dev_stats->add_node_stats();
            SetNodeTimes(event, ns);
            ns->set_node_name(std::string(event.Name()));
            ns->set_timeline_label(absl::StrCat("ThreadId ", thread_id));
            ns->set_thread_id(thread_id);
          }
        } else if (stats.correlation_id.has_value()) {
          int64_t correlation_id = stats.correlation_id.value();
          uint64_t enqueue_time_ns = event.TimestampNs();
          correlation_info_map[correlation_id] = {thread_id, enqueue_time_ns};
        }
      });
    });
  }
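  // Second pass: walk each GPU device plane, converting events into
  // NodeExecStats grouped per stream and event type, with aggregate views of
  // all kernels ("/stream:all") and all memcpys ("/memcpy").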
  for (const XPlane* device_plane : device_planes) {
    absl::flat_hash_map<std::pair<int64_t /*stream_id*/, GpuEventType>,
                        DeviceStepStats*>
        stream_dev_stats_map;
    DeviceStepStats* unknown_stream_dev_stats = nullptr;
    DeviceStepStats* all_streams_dev_stats = nullptr;
    DeviceStepStats* memcpy_dev_stats = nullptr;
    XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
    uint32_t device_ordinal = plane.Id();
    plane.ForEachLine([&](const XLineVisitor& line) {
      uint32_t stream_id = line.Id();
      line.ForEachEvent([&](const XEventVisitor& event) {
        GpuEventStats stats(&event);

        auto ns = std::make_unique<NodeExecStats>();
        SetNodeTimes(event, ns.get());

        // Get launch information if available.
        if (stats.correlation_id.has_value()) {
          auto it = correlation_info_map.find(stats.correlation_id.value());
          if (it != correlation_info_map.end()) {
            const CorrelationInfo& correlation_info = it->second;
            ns->set_scheduled_micros(
                NanoToMicro(correlation_info.enqueue_time_ns));
            ns->set_thread_id(correlation_info.thread_id);
          }
        }

        absl::string_view node_name =
            stats.IsTfOp() ? stats.tf_op_fullname : event.Name();
        ns->set_node_name(std::string(node_name));

        if (stats.IsKernel()) {
          absl::string_view kernel_name = event.Name();
          ns->set_timeline_label(
              absl::StrCat(kernel_name, " ", stats.kernel_details));
          DeviceStepStats*& stream_dev_stats =
              stream_dev_stats_map[{stream_id, GpuEventType::kKernel}];
          if (stream_dev_stats == nullptr) {
            stream_dev_stats = step_stats->add_dev_stats();
            stream_dev_stats->set_device(absl::StrCat(
                "/device:GPU:", device_ordinal, "/stream:", stream_id));
          }
          // Copy into the per-stream entry, then move into "/stream:all".
          *stream_dev_stats->add_node_stats() = *ns;
          if (all_streams_dev_stats == nullptr) {
            all_streams_dev_stats = step_stats->add_dev_stats();
            all_streams_dev_stats->set_device(
                absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
          }
          all_streams_dev_stats->add_node_stats()->Swap(ns.get());
        } else if (stats.IsMemCpy()) {
          absl::string_view memcpy_name = event.Name();
          ns->set_timeline_label(
              absl::StrCat(memcpy_name, " ", stats.memcpy_details));
          GpuEventType gpu_event_type = ParseMemcpyName(memcpy_name);
          DCHECK_NE(gpu_event_type, GpuEventType::kUnknown);
          DeviceStepStats*& stream_dev_stats =
              stream_dev_stats_map[{stream_id, gpu_event_type}];
          if (stream_dev_stats == nullptr) {
            stream_dev_stats = step_stats->add_dev_stats();
            stream_dev_stats->set_device(
                absl::StrCat("/device:GPU:", device_ordinal,
                             "/stream:", stream_id, "<", memcpy_name, ">"));
          }
          // Copy into the per-stream entry, then move into "/memcpy".
          *stream_dev_stats->add_node_stats() = *ns;
          if (memcpy_dev_stats == nullptr) {
            memcpy_dev_stats = step_stats->add_dev_stats();
            memcpy_dev_stats->set_device(
                absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
          }
          memcpy_dev_stats->add_node_stats()->Swap(ns.get());
        } else {
          ns->set_timeline_label(std::string(node_name));
          if (unknown_stream_dev_stats == nullptr) {
            unknown_stream_dev_stats = step_stats->add_dev_stats();
            unknown_stream_dev_stats->set_device(
                absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
          }
          unknown_stream_dev_stats->add_node_stats()->Swap(ns.get());
        }
      });
    });
  }
}

}  // namespace profiler
}  // namespace tensorflow