1 /* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/convert/op_profile_builder.h"
17
18 #include <algorithm>
19 #include <cstdint>
20 #include <memory>
21 #include <string>
22 #include <vector>
23
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/container/node_hash_map.h"
26 #include "absl/strings/ascii.h"
27 #include "absl/strings/str_cat.h"
28 #include "tensorflow/core/lib/gtl/top_n.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
31 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
32 #include "tensorflow/core/profiler/convert/xla_op_utils.h"
33 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
34 #include "tensorflow/core/profiler/protobuf/op_profile.pb.h"
35 #include "tensorflow/core/profiler/utils/math_utils.h"
36 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
37
38 namespace tensorflow {
39 namespace profiler {
40 namespace {
41
42 using op_profile::Metrics;
43 using op_profile::Node;
44
45 // Fill symbol details into a node.
PopulateSymbolNode(const OpMetrics & op_metrics,Node * node)46 void PopulateSymbolNode(const OpMetrics& op_metrics, Node* node) {
47 node->set_name(op_metrics.name());
48 Node::XLAInstruction& xla = *node->mutable_xla();
49 xla.set_expression(op_metrics.long_name());
50 xla.set_category(op_metrics.category());
51 xla.set_provenance(op_metrics.provenance());
52 if (op_metrics.has_layout()) {
53 for (const auto& dimension : op_metrics.layout().dimensions()) {
54 auto* dim = xla.mutable_layout()->add_dimensions();
55 dim->set_size(dimension.size());
56 dim->set_alignment(dimension.alignment());
57 dim->set_semantics(absl::AsciiStrToLower(
58 LayoutDimensionSemantics_Name(dimension.semantics())));
59 }
60 }
61 xla.set_computation_primitive_size(op_metrics.computation_primitive_size());
62 }
63
64 // Sort the children and only keep the top K children.
65 template <typename Cmp>
TopKChildren(const Node * root,int k,Cmp cmp)66 Node TopKChildren(const Node* root, int k, Cmp cmp) {
67 tensorflow::gtl::TopN<const Node*, decltype(cmp)> top_n(k, cmp);
68 for (const Node& node : root->children()) {
69 top_n.push(&node);
70 }
71 Node output;
72 std::unique_ptr<std::vector<const Node*>> extracted_nodes(top_n.Extract());
73 for (const Node* node : *extracted_nodes) {
74 *output.add_children() = *node;
75 }
76 return output;
77 }
78
79 // Copy symbol details into a deduplicated node from the top child node.
CopySymbolDetailsToDeduplicatedNode(Node * top_child_node,Node * deduplicated_node)80 void CopySymbolDetailsToDeduplicatedNode(Node* top_child_node,
81 Node* deduplicated_node) {
82 deduplicated_node->set_name(
83 absl::StrCat(top_child_node->name(), " and its duplicate(s)"));
84 Node::XLAInstruction& xla = *deduplicated_node->mutable_xla();
85 const Node::XLAInstruction& top_child_node_xla = top_child_node->xla();
86 xla.set_expression(top_child_node_xla.expression());
87 xla.set_category(top_child_node_xla.category());
88 if (IsFusion(top_child_node_xla.category())) return;
89 xla.set_provenance(top_child_node_xla.provenance());
90 *xla.mutable_layout() = top_child_node_xla.layout();
91 }
92
SortAndPruneChildren(int k,int level,Node * root)93 void SortAndPruneChildren(int k, int level, Node* root) {
94 // Set the total number of children before pruning.
95 root->set_num_children(root->children_size());
96 for (Node& node : *root->mutable_children()) {
97 SortAndPruneChildren(k, level - 1, &node);
98 }
99 k = level > 0 ? root->children_size() : k;
100
101 if (root->children_size() > 1) {
102 if (root->has_xla() && IsFusion(root->xla().category())) {
103 // Sort the children under fusion node by raw flops.
104 *root->mutable_children() =
105 TopKChildren(root, k, [](const Node* a, const Node* b) {
106 return a->metrics().raw_flops() > b->metrics().raw_flops();
107 }).children();
108 } else {
109 *root->mutable_children() =
110 TopKChildren(root, k, [](const Node* a, const Node* b) {
111 return a->metrics().time() > b->metrics().time();
112 }).children();
113 }
114 }
115 }
116
117 // Finalize deduplicated nodes by copying symbol details from the top child
118 // node.
FinalizeDeduplicatedNodes(bool by_program,Node * root)119 void FinalizeDeduplicatedNodes(bool by_program, Node* root) {
120 if (by_program) {
121 for (Node& program_node : *root->mutable_children()) {
122 for (Node& category_node : *program_node.mutable_children()) {
123 for (Node& deduplicated_node : *category_node.mutable_children()) {
124 // Skip for non deduplicated nodes. Those nodes already have name set.
125 if (!deduplicated_node.name().empty() ||
126 deduplicated_node.children().empty())
127 continue;
128 CopySymbolDetailsToDeduplicatedNode(
129 deduplicated_node.mutable_children(0), &deduplicated_node);
130 }
131 }
132 }
133 } else {
134 for (Node& category_node : *root->mutable_children()) {
135 for (Node& deduplicated_node : *category_node.mutable_children()) {
136 // Skip for non deduplicated nodes. Those nodes already have name set.
137 if (!deduplicated_node.name().empty() ||
138 deduplicated_node.children().empty())
139 continue;
140 CopySymbolDetailsToDeduplicatedNode(
141 deduplicated_node.mutable_children(0), &deduplicated_node);
142 }
143 }
144 }
145 }
146
147 // Fills op metrics into a node.
PopulateOpMetricsNode(const OpMetrics & op_metrics,double peak_gigaflops_per_second_per_core,double peak_gibibytes_per_second_per_core,uint64_t total_time_ps,Node * node)148 void PopulateOpMetricsNode(const OpMetrics& op_metrics,
149 double peak_gigaflops_per_second_per_core,
150 double peak_gibibytes_per_second_per_core,
151 uint64_t total_time_ps, Node* node) {
152 DCHECK_EQ(ChildrenTimePs(op_metrics), 0);
153
154 Metrics* metrics = node->mutable_metrics();
155 // The UI computes flops_rate = raw_flops / raw_time
156 // and memory_bandwidth = raw_bytes_accessed / raw_time. See:
157 // https://github.com/tensorflow/profiler/blob/master/frontend/app/common/utils/utils.ts
158 metrics->set_raw_time(op_metrics.time_ps());
159 metrics->set_raw_flops(op_metrics.flops());
160 metrics->set_raw_bytes_accessed(op_metrics.bytes_accessed());
161
162 // "time" is the op or category fraction of total time.
163 metrics->set_time(SafeDivide(op_metrics.time_ps(), total_time_ps));
164
165 double flops_utilization = SafeDivide(GigaFlopsPerSecondPerCore(op_metrics),
166 peak_gigaflops_per_second_per_core);
167 // The UI expects flops_utilization = flops / time. See:
168 // https://github.com/tensorflow/profiler/blob/master/frontend/app/common/utils/utils.ts
169 metrics->set_flops(flops_utilization * metrics->time());
170
171 // TODO(b/219984562): Use hierarchical roofline.
172 double mem_bw_utilization = SafeDivide(GibiBytesPerSecondPerCore(op_metrics),
173 peak_gibibytes_per_second_per_core);
174 metrics->set_memory_bandwidth(mem_bw_utilization);
175 }
176
177 // Sets the total time on the root node metrics.
SetTotalTime(uint64_t total_time_ps,Node * root)178 void SetTotalTime(uint64_t total_time_ps, Node* root) {
179 Metrics* metrics = root->mutable_metrics();
180 metrics->set_raw_time(total_time_ps);
181 metrics->set_time(1.0);
182 }
183
184 // Recursively insert "fused instruction" nodes (with raw flops).
InsertFusedInstructions(const OpMetrics & op_metrics,Node * node)185 void InsertFusedInstructions(const OpMetrics& op_metrics, Node* node) {
186 if (!op_metrics.has_children()) return;
187 for (const auto& child : op_metrics.children().metrics_db()) {
188 Node* new_node = node->add_children();
189 PopulateSymbolNode(child, new_node);
190 new_node->mutable_metrics()->set_raw_flops(child.flops());
191 if (child.has_children()) {
192 InsertFusedInstructions(child, new_node);
193 }
194 }
195 }
196
197 } // namespace
198
GenerateProgramName(uint64_t program_id) const199 std::string OpProfileBuilder::GenerateProgramName(uint64_t program_id) const {
200 DCHECK(program_name_map_ != nullptr);
201 auto iter = program_name_map_->find(program_id);
202 if (iter == program_name_map_->end()) return "main";
203 return HloModuleNameWithProgramId(iter->second, program_id);
204 }
205
AddOpNode(const OpMetrics & op_metrics,Category * category,Node * deduplicated_node)206 Node* OpProfileBuilder::AddOpNode(const OpMetrics& op_metrics,
207 Category* category, Node* deduplicated_node) {
208 Node* leaf;
209 if (deduplicated_node != nullptr) {
210 leaf = deduplicated_node->add_children();
211 } else if (category != nullptr) {
212 leaf = category->node->add_children();
213 } else {
214 leaf = root_->add_children();
215 }
216 PopulateSymbolNode(op_metrics, leaf);
217 InsertFusedInstructions(op_metrics, leaf);
218 return leaf;
219 }
220
LookupOrAddDeduplicatedNode(const OpMetrics & op_metrics,Category * category)221 Node* OpProfileBuilder::LookupOrAddDeduplicatedNode(const OpMetrics& op_metrics,
222 Category* category) {
223 Node*& deduplicated_node =
224 category->deduplicated_nodes[op_metrics.deduplicated_name()];
225 if (deduplicated_node == nullptr) {
226 deduplicated_node = category->node->add_children();
227 }
228 return deduplicated_node;
229 }
230
LookupOrAddCategoryNode(const OpMetrics & op_metrics,Program * program)231 OpProfileBuilder::Category* OpProfileBuilder::LookupOrAddCategoryNode(
232 const OpMetrics& op_metrics, Program* program) {
233 Category* category;
234 Node* category_parent;
235 if (program != nullptr) {
236 category = &program->categories[op_metrics.category()];
237 category_parent = program->node;
238 } else {
239 category = &category_map_[op_metrics.category()];
240 category_parent = root_;
241 }
242 if (category->node == nullptr) {
243 category->node = category_parent->add_children();
244 category->node->set_name(op_metrics.category());
245 }
246 return category;
247 }
248
LookupOrAddProgramNode(const OpMetrics & op_metrics)249 OpProfileBuilder::Program* OpProfileBuilder::LookupOrAddProgramNode(
250 const OpMetrics& op_metrics) {
251 uint64_t program_id = op_metrics.hlo_module_id();
252 Program* program = &programs_map_[program_id];
253 if (program->node == nullptr) {
254 program->node = root_->add_children();
255 program->node->set_name(GenerateProgramName(program_id));
256 }
257 return program;
258 }
259
AddOp(const OpMetrics & op_metrics)260 void OpProfileBuilder::AddOp(const OpMetrics& op_metrics) {
261 // Exclude ops with children ops to avoid double counting of flops, bytes and
262 // time from children ops.
263 if (ChildrenTimePs(op_metrics) > 0) return;
264
265 // The path from the root to the leaf node:
266 // e.g. by_program -> cluster_xx -> convolution -> convolution.1 and its
267 // deduplicates -> convolution.1
268 // We will aggregate the metrics of convolution.1 to all its parent nodes.
269 std::vector<Node*> all_paths = {root_};
270
271 if (IsIdleOp(op_metrics)) {
272 Node* leaf = AddOpNode(op_metrics);
273 all_paths.push_back(leaf);
274 } else {
275 Program* program = nullptr;
276 if (options_.group_by_program) {
277 program = LookupOrAddProgramNode(op_metrics);
278 all_paths.push_back(program->node);
279 }
280
281 Category* category = LookupOrAddCategoryNode(op_metrics, program);
282 all_paths.push_back(category->node);
283
284 Node* deduplicated_node = nullptr;
285 if (options_.group_by_deduplicated_name &&
286 !op_metrics.deduplicated_name().empty()) {
287 deduplicated_node = LookupOrAddDeduplicatedNode(op_metrics, category);
288 all_paths.push_back(deduplicated_node);
289 }
290
291 Node* leaf = AddOpNode(op_metrics, category, deduplicated_node);
292 all_paths.push_back(leaf);
293 }
294
295 for (auto* node : all_paths) {
296 // Per program combiner does not need to update OpMetrics.num_cores
297 CombineOpMetrics(op_metrics, &metrics_[node], /*update_num_cores=*/false);
298 }
299 }
300
Finalize(double peak_gigaflops_per_second_per_core,double peak_gibibytes_per_second_per_core,uint64_t total_time_ps)301 void OpProfileBuilder::Finalize(double peak_gigaflops_per_second_per_core,
302 double peak_gibibytes_per_second_per_core,
303 uint64_t total_time_ps) {
304 for (const auto& [node, op_metrics] : metrics_) {
305 PopulateOpMetricsNode(op_metrics, peak_gigaflops_per_second_per_core,
306 peak_gibibytes_per_second_per_core, total_time_ps,
307 node);
308 }
309 SetTotalTime(total_time_ps, root_);
310 // If grouping by program, we build a two-level pruned tree: the first level
311 // is per program and the second level is per category. Otherwise we build a
312 // single-level per category pruned tree.
313 int level = options_.group_by_program ? 2 : 1;
314 SortAndPruneChildren(options_.children_per_node, level, root_);
315 if (options_.group_by_deduplicated_name) {
316 FinalizeDeduplicatedNodes(options_.group_by_program, root_);
317 }
318 }
319
OpProfileBuilder(const OpProfileOptions & options,tensorflow::profiler::op_profile::Node * root,const tensorflow::protobuf::Map<uint64_t,std::string> * program_name_map)320 OpProfileBuilder::OpProfileBuilder(
321 const OpProfileOptions& options,
322 tensorflow::profiler::op_profile::Node* root,
323 const tensorflow::protobuf::Map<uint64_t, std::string>* program_name_map)
324 : options_(options), root_(root), program_name_map_(program_name_map) {
325 CHECK(root != nullptr);
326 DCHECK(!options_.group_by_program || program_name_map_ != nullptr);
327 root->set_name(options_.group_by_program ? "by_program" : "by_category");
328 }
329 } // namespace profiler
330 } // namespace tensorflow
331